summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Karel Kočí <karel.koci@nic.cz> 2020-02-03 18:08:30 +0100
committer Karel Kočí <karel.koci@nic.cz> 2020-02-03 18:08:30 +0100
commit 9bf065ea78812d4e2975ef64dcbe7f309b2e872c (patch)
tree ccc6b7ab5954f30311c9267358b4127e22b08c16
parent ee30d32cd67d3dc1c704d1ef4ceb1b38959e3c49 (diff)
download alpine-personal-pkgs-9bf065ea78812d4e2975ef64dcbe7f309b2e872c.tar.gz
alpine-personal-pkgs-9bf065ea78812d4e2975ef64dcbe7f309b2e872c.tar.bz2
alpine-personal-pkgs-9bf065ea78812d4e2975ef64dcbe7f309b2e872c.zip
ffmpeg-rpi: try to use upstream version instead of LibreELEC one
-rw-r--r-- ffmpeg-rpi/0001-libavutil-clean-up-unused-FF_SYMVER-macro.patch 55
-rw-r--r-- ffmpeg-rpi/APKBUILD 59
-rw-r--r-- ffmpeg-rpi/ffmpeg-99.1001-Call-get_format-to-fix-an-issue-with-MMAL-ren.patch 55
-rw-r--r-- ffmpeg-rpi/ffmpeg-99.1002-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch 48
-rw-r--r-- ffmpeg-rpi/ffmpeg-99.1003-pfcd_hevc_optimisations.patch 44620
-rw-r--r-- ffmpeg-rpi/ffmpeg-99.1004-added_upstream_mvc_patches.patch 284
-rw-r--r-- ffmpeg-rpi/ffmpeg-99.1008-dav1d-enable-av1.patch 407
-rw-r--r-- ffmpeg-rpi/ffmpeg-99.1009-dav1d-fix-multithreaded-av1-sw-decoding.patch 60
-rw-r--r-- ffmpeg-rpi/ffmpeg-99.1010-yuv2rgb-logspam.patch 13
9 files changed, 78 insertions, 45523 deletions
diff --git a/ffmpeg-rpi/0001-libavutil-clean-up-unused-FF_SYMVER-macro.patch b/ffmpeg-rpi/0001-libavutil-clean-up-unused-FF_SYMVER-macro.patch
new file mode 100644
index 0000000..9cc6fdf
--- /dev/null
+++ b/ffmpeg-rpi/0001-libavutil-clean-up-unused-FF_SYMVER-macro.patch
@@ -0,0 +1,55 @@
+From ab11be0becb90542f10d5713659b559842c53af2 Mon Sep 17 00:00:00 2001
+From: Natanael Copa <ncopa@alpinelinux.org>
+Date: Tue, 29 Mar 2016 15:15:17 +0200
+Subject: [PATCH] libavutil: clean up unused FF_SYMVER macro
+
+There is nothing using it since commit d63443b9 (lavc: drop the
+av_fast_{re,m}alloc compatibility wrappers).
+
+Signed-off-by: Natanael Copa <ncopa@alpinelinux.org>
+---
+ libavutil/internal.h | 28 ----------------------------
+ 1 file changed, 28 deletions(-)
+
+diff --git a/libavutil/internal.h b/libavutil/internal.h
+index 61784b5..69d63d5 100644
+--- a/libavutil/internal.h
++++ b/libavutil/internal.h
+@@ -187,34 +187,6 @@
+ #endif
+
+ /**
+- * Define a function with only the non-default version specified.
+- *
+- * On systems with ELF shared libraries, all symbols exported from
+- * FFmpeg libraries are tagged with the name and major version of the
+- * library to which they belong. If a function is moved from one
+- * library to another, a wrapper must be retained in the original
+- * location to preserve binary compatibility.
+- *
+- * Functions defined with this macro will never be used to resolve
+- * symbols by the build-time linker.
+- *
+- * @param type return type of function
+- * @param name name of function
+- * @param args argument list of function
+- * @param ver version tag to assign function
+- */
+-#if HAVE_SYMVER_ASM_LABEL
+-# define FF_SYMVER(type, name, args, ver) \
+- type ff_##name args __asm__ (EXTERN_PREFIX #name "@" ver); \
+- type ff_##name args
+-#elif HAVE_SYMVER_GNU_ASM
+-# define FF_SYMVER(type, name, args, ver) \
+- __asm__ (".symver ff_" #name "," EXTERN_PREFIX #name "@" ver); \
+- type ff_##name args; \
+- type ff_##name args
+-#endif
+-
+-/**
+ * Return NULL if a threading library has not been enabled.
+ * Used to disable threading functions in AVCodec definitions
+ * when not needed.
+--
+2.7.4
+
diff --git a/ffmpeg-rpi/APKBUILD b/ffmpeg-rpi/APKBUILD
index 1cdc921..ff65c73 100644
--- a/ffmpeg-rpi/APKBUILD
+++ b/ffmpeg-rpi/APKBUILD
@@ -1,17 +1,13 @@
-# Contributor: Sergei Lukin <sergej.lukin@gmail.com>
-# Contributor: Łukasz Jendrysik <scadu@yandex.com>
-# Contributor: Jakub Skrzypnik <j.skrzypnik@openmailbox.org>
-# Maintainer: Natanael Copa <ncopa@alpinelinux.org>
+# Maintainer: Karel Kočí <cynerd@email.cz>
pkgname=ffmpeg-rpi
-pkgver="4.0.4"
+pkgver=4.2.2
pkgrel=0
pkgdesc="Complete and free Internet live audio and video broadcasting solution for Linux/Unix - Raspberry Pi"
url="https://ffmpeg.org/"
-arch="aarch64"
+arch="armhf aarch64"
license="GPL-2.0-or-later AND LGPL-2.1-or-later"
-options="!check" # tests/data/hls-lists.append.m3u8 fails
+options="!check"
subpackages="$pkgname-dev $pkgname-doc $pkgname-libs"
-provides="ffmpeg"
makedepends="
alsa-lib-dev
coreutils
@@ -25,35 +21,29 @@ makedepends="
libvorbis-dev
libvpx-dev
libxfixes-dev
+ opus-dev
perl-dev
+ raspberrypi-dev
sdl2-dev
- v4l-utils-dev
x264-dev
x265-dev
xvidcore-dev
yasm
zlib-dev
- raspberrypi-dev
"
-srcver="4.0.4-Leia-18.4"
-source="https://github.com/xbmc/FFmpeg/archive/$srcver.tar.gz
- ffmpeg-99.1001-Call-get_format-to-fix-an-issue-with-MMAL-ren.patch
- ffmpeg-99.1002-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch
- ffmpeg-99.1003-pfcd_hevc_optimisations.patch
- ffmpeg-99.1004-added_upstream_mvc_patches.patch
- ffmpeg-99.1008-dav1d-enable-av1.patch
- ffmpeg-99.1009-dav1d-fix-multithreaded-av1-sw-decoding.patch
- ffmpeg-99.1010-yuv2rgb-logspam.patch
-"
+checkdepends="rsync"
+source="https://ffmpeg.org/releases/ffmpeg-$pkgver.tar.xz
+ 0001-libavutil-clean-up-unused-FF_SYMVER-macro.patch
+ "
# add support for AV1 codec for all archies except armhf and armv7
# as aom is not available on them
_aom="";
-unpack() {
- default_unpack
- mv "FFmpeg-$srcver" "$builddir"
-}
+case "$CARCH" in
+ aarch64)
+ _aom="--enable-libaom"; makedepends="$makedepends aom-dev" ;;
+esac
build() {
local _dbg="--disable-debug"
@@ -61,6 +51,7 @@ build() {
./configure \
--prefix=/usr \
+ --enable-avresample \
--enable-avfilter \
--enable-gnutls \
--enable-gpl \
@@ -72,7 +63,6 @@ build() {
--enable-libx264 \
--enable-libx265 \
--enable-libtheora \
- --enable-libv4l2 \
--enable-postproc \
--enable-pic \
--enable-pthreads \
@@ -85,11 +75,14 @@ build() {
--disable-vaapi \
--disable-vdpau \
--disable-libopus \
- --enable-rpi \
- --extra-cflags="-I/opt/vc/include -Wno-error" \
+ --disable-v4l2_m2m \
+ --enable-omx \
+ --enable-omx-rpi \
+ --enable-mmal \
+ --extra-cflags="-I/opt/vc/include -I/opt/vc/include/IL" \
--extra-ldflags="-L/opt/vc/lib" \
--extra-libs="-lbcm_host -lvcos -lvchiq_arm -lmmal -lmmal_core -lmmal_util -lvcsm" \
- $_dbg
+ $_aom $_dbg
make
${CC:-gcc} -o tools/qt-faststart $CFLAGS tools/qt-faststart.c
make doc/ffmpeg.1 doc/ffplay.1
@@ -107,11 +100,5 @@ libs() {
mv "$pkgdir"/usr/lib "$subpkgdir"/usr
}
-sha512sums="c6dd75dbff7119adeeda246cfb640b5e8d3b4c242ef83e5ba070207b60f0c715c3fe3cb328d87687b70a133f122a03aba990f0e95f3aac7d5dbcee25bec59078 4.0.4-Leia-18.4.tar.gz
-b0d3441b8dcb457254dcb965fbfedcb5bd2bc3bb9b6ae18aacba12d314e01d39a42bc25a4d927413e2783dd9e900a00801a9de254e9fbceb5e6e8b5532fdf31e ffmpeg-99.1001-Call-get_format-to-fix-an-issue-with-MMAL-ren.patch
-4ff179c139ec878a9022ebfba69a515bff4c6f8dcfbe1c9b65e7be818a4a4a087589cc33e9164030cff71dda81ce3751038b90f778feba4b7817c5f8341641dd ffmpeg-99.1002-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch
-9c946a401283b7d2b4e4a9691c25a52ca988fba75414f8453a38998747496377f0556994564388ad18453282b003c70bbd11433bf79d65cd12197a213028848e ffmpeg-99.1003-pfcd_hevc_optimisations.patch
-5a34fe204e7e488db6b6abca01518a946739c1a57e67e8e56c8fa3c78395f04447780773f7b7f5713a009945b34c6993ac5231073961ae514223c1f351919764 ffmpeg-99.1004-added_upstream_mvc_patches.patch
-eb7e71371f395a37c179876f500857132f8b4e5dc3bc8641cbfbd897302808fc31c4fec329fcb1974aba19a5f64fcbf23d6f6a21b9e6fdac5beb94e6aab31c8d ffmpeg-99.1008-dav1d-enable-av1.patch
-36c8814d90cff693dbe6a7c6eb50682a0edceef6dfa435e4ef24f42c7d478693bdae9294178ab3e91b7eb2365bed6ef3580d3eec229392d27c83c610eebb3362 ffmpeg-99.1009-dav1d-fix-multithreaded-av1-sw-decoding.patch
-19b4d4bbdaa0ed64f37eae87d9b68e593e645569df11692fbbb3d3d2d53f627d3820a28a643e36216ea5780f8c5c18ed24cc66ef6ef1b898716fe57750decfbd ffmpeg-99.1010-yuv2rgb-logspam.patch"
+sha512sums="381cd6732fa699eb89000621cf34256920596ed1f9de3c2194dbad35fdf2165269eb7d3a147a0eb75dc18fbb6d601382b5801750e09fc63547766842f84208e3 ffmpeg-4.2.2.tar.xz
+1047a23eda51b576ac200d5106a1cd318d1d5291643b3a69e025c0a7b6f3dbc9f6eb0e1e6faa231b7e38c8dd4e49a54f7431f87a93664da35825cc2e9e8aedf4 0001-libavutil-clean-up-unused-FF_SYMVER-macro.patch"
diff --git a/ffmpeg-rpi/ffmpeg-99.1001-Call-get_format-to-fix-an-issue-with-MMAL-ren.patch b/ffmpeg-rpi/ffmpeg-99.1001-Call-get_format-to-fix-an-issue-with-MMAL-ren.patch
deleted file mode 100644
index 37b53e8..0000000
--- a/ffmpeg-rpi/ffmpeg-99.1001-Call-get_format-to-fix-an-issue-with-MMAL-ren.patch
+++ /dev/null
@@ -1,55 +0,0 @@
-From 7adc8f706efab65d8d7e5f960690faca3d5c190d Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Sat, 4 Mar 2017 19:24:02 +0000
-Subject: [PATCH] ffmpeg: Call get_format to fix an issue with MMAL rendering
-
----
- libavcodec/dvdec.c | 7 +++++++
- libavcodec/rv34.c | 6 +++++-
- 2 files changed, 12 insertions(+), 1 deletion(-)
-
-diff --git a/libavcodec/dvdec.c b/libavcodec/dvdec.c
-index 0b4c1bc..00081ef 100644
---- a/libavcodec/dvdec.c
-+++ b/libavcodec/dvdec.c
-@@ -49,6 +49,7 @@
- #include "internal.h"
- #include "put_bits.h"
- #include "simple_idct.h"
-+#include "thread.h"
-
- typedef struct BlockInfo {
- const uint32_t *factor_table;
-@@ -196,6 +197,12 @@ static av_cold int dvvideo_decode_init(AVCodecContext *avctx)
- s->idct_put[0] = idsp.idct_put;
- s->idct_put[1] = ff_simple_idct248_put;
-
-+ static const enum AVPixelFormat pix_fmts[] = {
-+ AV_PIX_FMT_YUV420P,
-+ AV_PIX_FMT_NONE
-+ };
-+ avctx->pix_fmt = ff_get_format(avctx, pix_fmts);
-+
- return ff_dvvideo_init(avctx);
- }
-
-diff --git a/libavcodec/rv34.c b/libavcodec/rv34.c
-index aca8382..f473f6c 100644
---- a/libavcodec/rv34.c
-+++ b/libavcodec/rv34.c
-@@ -1493,7 +1493,11 @@ av_cold int ff_rv34_decode_init(AVCodecContext *avctx)
- ff_mpv_decode_init(s, avctx);
- s->out_format = FMT_H263;
-
-- avctx->pix_fmt = AV_PIX_FMT_YUV420P;
-+ static const enum AVPixelFormat pix_fmts[] = {
-+ AV_PIX_FMT_YUV420P,
-+ AV_PIX_FMT_NONE
-+ };
-+ avctx->pix_fmt = ff_get_format(avctx, pix_fmts);
- avctx->has_b_frames = 1;
- s->low_delay = 0;
-
---
-2.7.4
-
diff --git a/ffmpeg-rpi/ffmpeg-99.1002-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch b/ffmpeg-rpi/ffmpeg-99.1002-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch
deleted file mode 100644
index 6721c8d..0000000
--- a/ffmpeg-rpi/ffmpeg-99.1002-mpeg4video-Signal-unsupported-GMC-with-more-than-one.patch
+++ /dev/null
@@ -1,48 +0,0 @@
-From d8bdcc8791c501921ee8961f3b0de0bd47668ebf Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Fri, 5 Jun 2015 22:48:33 +0100
-Subject: [PATCH] mpeg4video: Signal unsupported GMC with more than one warp
- point
-
----
- libavcodec/avcodec.h | 1 +
- libavcodec/mpeg4videodec.c | 4 ++++
- 2 files changed, 5 insertions(+)
-
-diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
-index c26b6d607c..6c4b011b5c 100644
---- a/libavcodec/avcodec.h
-+++ b/libavcodec/avcodec.h
-@@ -2965,6 +2965,7 @@ typedef struct AVCodecContext {
- #define FF_BUG_MS 8192 ///< Work around various bugs in Microsoft's broken decoders.
- #define FF_BUG_TRUNCATED 16384
- #define FF_BUG_IEDGE 32768
-+#define FF_BUG_GMC_UNSUPPORTED (1<<30)
-
- /**
- * strictly follow the standard (MPEG-4, ...).
-diff --git a/libavcodec/mpeg4videodec.c b/libavcodec/mpeg4videodec.c
-index cd39131d55..d8c8227cb4 100644
---- a/libavcodec/mpeg4videodec.c
-+++ b/libavcodec/mpeg4videodec.c
-@@ -2250,6 +2250,9 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx)
-
- if (ctx->divx_version >= 0)
- s->workaround_bugs |= FF_BUG_HPEL_CHROMA;
-+
-+ if (ctx->num_sprite_warping_points > 1)
-+ s->workaround_bugs |= FF_BUG_GMC_UNSUPPORTED;
- }
-
- if (s->workaround_bugs & FF_BUG_STD_QPEL) {
-@@ -2274,6 +2277,7 @@ int ff_mpeg4_workaround_bugs(AVCodecContext *avctx)
- s->workaround_bugs, ctx->lavc_build, ctx->xvid_build,
- ctx->divx_version, ctx->divx_build, s->divx_packed ? "p" : "");
-
-+ avctx->workaround_bugs = s->workaround_bugs;
- if (CONFIG_MPEG4_DECODER && ctx->xvid_build >= 0 &&
- s->codec_id == AV_CODEC_ID_MPEG4 &&
- avctx->idct_algo == FF_IDCT_AUTO) {
---
-2.14.1
-
diff --git a/ffmpeg-rpi/ffmpeg-99.1003-pfcd_hevc_optimisations.patch b/ffmpeg-rpi/ffmpeg-99.1003-pfcd_hevc_optimisations.patch
deleted file mode 100644
index c64d5e5..0000000
--- a/ffmpeg-rpi/ffmpeg-99.1003-pfcd_hevc_optimisations.patch
+++ /dev/null
@@ -1,44620 +0,0 @@
-diff --git a/.gitignore b/.gitignore
-index 0e57cb0b4c..b2e3374fea 100644
---- a/.gitignore
-+++ b/.gitignore
-@@ -1,6 +1,7 @@
- *.a
- *.o
- *.o.*
-+*.bin
- *.d
- *.def
- *.dll
-@@ -26,6 +27,7 @@
- .\#*
- /.config
- /.version
-+/build/
- /ffmpeg
- /ffplay
- /ffprobe
-diff --git a/configure b/configure
-index 827abfe694..28f630068e 100755
---- a/configure
-+++ b/configure
-@@ -318,6 +318,7 @@ External library support:
- --enable-libmfx enable Intel MediaSDK (AKA Quick Sync Video) code via libmfx [no]
- --enable-libnpp enable Nvidia Performance Primitives-based code [no]
- --enable-mmal enable Broadcom Multi-Media Abstraction Layer (Raspberry Pi) via MMAL [no]
-+ --enable-rpi enable other rpi specific stuff [no]
- --disable-nvdec disable Nvidia video decoding acceleration (via hwaccel) [autodetect]
- --disable-nvenc disable Nvidia video encoding code [autodetect]
- --enable-omx enable OpenMAX IL code [no]
-@@ -1776,6 +1777,7 @@ FEATURE_LIST="
- gray
- hardcoded_tables
- omx_rpi
-+ rpi
- runtime_cpudetect
- safe_bitstream_reader
- shared
-@@ -2293,6 +2295,7 @@ CONFIG_EXTRA="
- rtpdec
- rtpenc_chain
- rv34dsp
-+ sand
- sinewin
- snappy
- srtp
-@@ -2610,6 +2613,8 @@ hap_decoder_select="snappy texturedsp"
- hap_encoder_deps="libsnappy"
- hap_encoder_select="texturedspenc"
- hevc_decoder_select="bswapdsp cabac golomb hevcparse videodsp"
-+hevc_rpi_decoder_deps="rpi"
-+hevc_rpi_decoder_select="hevc_decoder sand"
- huffyuv_decoder_select="bswapdsp huffyuvdsp llviddsp"
- huffyuv_encoder_select="bswapdsp huffman huffyuvencdsp llvidencdsp"
- iac_decoder_select="imc_decoder"
-@@ -3393,6 +3398,8 @@ tinterlace_filter_deps="gpl"
- tinterlace_merge_test_deps="tinterlace_filter"
- tinterlace_pad_test_deps="tinterlace_filter"
- tonemap_filter_deps="const_nan"
-+unsand_filter_deps="rpi"
-+unsand_filter_select="sand"
- unsharp_opencl_filter_deps="opencl"
- uspp_filter_deps="gpl avcodec"
- vaguedenoiser_filter_deps="gpl"
-diff --git a/fftools/ffmpeg.c b/fftools/ffmpeg.c
-index c0214c42d8..faaea5772a 100644
---- a/fftools/ffmpeg.c
-+++ b/fftools/ffmpeg.c
-@@ -24,6 +24,12 @@
- */
-
- #include "config.h"
-+
-+#if CONFIG_RPI
-+#define RPI_DISPLAY
-+#define RPI_DISPLAY_ALL 0
-+#endif
-+
- #include <ctype.h>
- #include <string.h>
- #include <math.h>
-@@ -70,6 +76,25 @@
- # include "libavfilter/buffersrc.h"
- # include "libavfilter/buffersink.h"
-
-+#ifdef RPI_DISPLAY
-+#pragma GCC diagnostic push
-+// Many many redundant decls in the header files
-+#pragma GCC diagnostic ignored "-Wredundant-decls"
-+#include <bcm_host.h>
-+#include <interface/mmal/mmal.h>
-+#include <interface/mmal/mmal_parameters_camera.h>
-+#include <interface/mmal/mmal_buffer.h>
-+#include <interface/mmal/mmal_port.h>
-+#include <interface/mmal/util/mmal_util.h>
-+#include <interface/mmal/util/mmal_default_components.h>
-+#include <interface/mmal/util/mmal_connection.h>
-+#include <interface/mmal/util/mmal_util_params.h>
-+#pragma GCC diagnostic pop
-+#include "libavcodec/rpi_qpu.h"
-+#include "libavutil/rpi_sand_fns.h"
-+#include "libavcodec/rpi_zc.h"
-+#endif
-+
- #if HAVE_SYS_RESOURCE_H
- #include <sys/time.h>
- #include <sys/types.h>
-@@ -162,6 +187,241 @@ static int restore_tty;
- static void free_input_threads(void);
- #endif
-
-+#ifdef RPI_DISPLAY
-+
-+#define NUM_BUFFERS 4
-+
-+
-+typedef struct rpi_display_env_s
-+{
-+ MMAL_COMPONENT_T* display;
-+ MMAL_COMPONENT_T* isp;
-+ MMAL_PORT_T * port_in; // Input port of either isp or display depending on pipe setup
-+ MMAL_CONNECTION_T * conn;
-+
-+ MMAL_POOL_T *rpi_pool;
-+ volatile int rpi_display_count;
-+ enum AVPixelFormat avfmt;
-+} rpi_display_env_t;
-+
-+static rpi_display_env_t * rpi_display_env = NULL;
-+
-+
-+static MMAL_POOL_T* display_alloc_pool(MMAL_PORT_T* port)
-+{
-+ MMAL_POOL_T* pool;
-+ mmal_port_parameter_set_boolean(port, MMAL_PARAMETER_ZERO_COPY, MMAL_TRUE); // Does this mark that the buffer contains a vc_handle? Would have expected a vc_image?
-+ pool = mmal_port_pool_create(port, NUM_BUFFERS, 0);
-+ assert(pool);
-+
-+ return pool;
-+}
-+
-+static void display_cb_input(MMAL_PORT_T *port, MMAL_BUFFER_HEADER_T *buffer) {
-+ rpi_display_env_t *const de = (rpi_display_env_t *)port->userdata;
-+ av_rpi_zc_unref(buffer->user_data);
-+ atomic_fetch_add(&de->rpi_display_count, -1);
-+ mmal_buffer_header_release(buffer);
-+}
-+
-+static void display_cb_control(MMAL_PORT_T *port,MMAL_BUFFER_HEADER_T *buffer) {
-+ mmal_buffer_header_release(buffer);
-+}
-+
-+#define DISPLAY_PORT_DEPTH 4
-+
-+static rpi_display_env_t *
-+display_init(const enum AVPixelFormat req_fmt, size_t x, size_t y, size_t w, size_t h)
-+{
-+ MMAL_STATUS_T err;
-+ MMAL_DISPLAYREGION_T region =
-+ {
-+ .hdr = {MMAL_PARAMETER_DISPLAYREGION, sizeof(region)},
-+ .set = MMAL_DISPLAY_SET_LAYER | MMAL_DISPLAY_SET_FULLSCREEN | MMAL_DISPLAY_SET_DEST_RECT,
-+ .layer = 2,
-+ .fullscreen = 0,
-+ .dest_rect = {x, y, w, h}
-+ };
-+#if RPI_ZC_SAND_8_IN_10_BUF
-+ const enum AVPixelFormat fmt = (req_fmt == AV_PIX_FMT_YUV420P10 || av_rpi_is_sand_format(req_fmt)) ? AV_PIX_FMT_SAND128 : req_fmt;
-+#else
-+ const enum AVPixelFormat fmt = (req_fmt == AV_PIX_FMT_YUV420P10) ? AV_PIX_FMT_SAND128 : req_fmt;
-+#endif
-+ const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(fmt, w, h);
-+ rpi_display_env_t * de;
-+ int isp_req = (fmt == AV_PIX_FMT_SAND64_10);
-+
-+ bcm_host_init(); // Needs to be done by someone...
-+
-+ if ((de = av_mallocz(sizeof(*de))) == NULL) {
-+ return NULL;
-+ }
-+
-+ mmal_component_create(MMAL_COMPONENT_DEFAULT_VIDEO_RENDERER, &de->display);
-+ av_assert0(de->display);
-+ de->port_in = de->display->input[0];
-+
-+ if (isp_req)
-+ {
-+ mmal_component_create("vc.ril.isp", &de->isp);
-+ de->port_in = de->isp->input[0];
-+ }
-+
-+ mmal_port_parameter_set(de->display->input[0], &region.hdr);
-+
-+ {
-+ MMAL_PORT_T * const port = de->port_in;
-+ MMAL_ES_FORMAT_T* const format = port->format;
-+ port->userdata = (struct MMAL_PORT_USERDATA_T *)de;
-+ port->buffer_num = DISPLAY_PORT_DEPTH;
-+ format->encoding = fmt == AV_PIX_FMT_SAND128 ? MMAL_ENCODING_YUVUV128 :
-+ fmt == AV_PIX_FMT_SAND64_10 ? MMAL_ENCODING_YUVUV64_16 :
-+ MMAL_ENCODING_I420;
-+ format->es->video.width = geo.stride_y;
-+ format->es->video.height = (fmt == AV_PIX_FMT_SAND128 || fmt == AV_PIX_FMT_SAND64_10) ?
-+ (h + 15) & ~15 : geo.height_y; // Magic
-+ format->es->video.crop.x = 0;
-+ format->es->video.crop.y = 0;
-+ format->es->video.crop.width = w;
-+ format->es->video.crop.height = h;
-+ mmal_port_format_commit(port);
-+ }
-+
-+ de->rpi_pool = display_alloc_pool(de->port_in);
-+ mmal_port_enable(de->port_in,display_cb_input);
-+
-+ if (isp_req) {
-+ MMAL_PORT_T * const port_out = de->isp->output[0];
-+ mmal_log_dump_port(de->port_in);
-+ mmal_format_copy(port_out->format, de->port_in->format);
-+ if (fmt == AV_PIX_FMT_SAND64_10) {
-+ if ((err = mmal_port_parameter_set_int32(de->port_in, MMAL_PARAMETER_CCM_SHIFT, 5)) != MMAL_SUCCESS ||
-+ (err = mmal_port_parameter_set_int32(port_out, MMAL_PARAMETER_OUTPUT_SHIFT, 1)) != MMAL_SUCCESS)
-+ {
-+ av_log(NULL, AV_LOG_WARNING, "Failed to set ISP output port shift\n");
-+ }
-+ else
-+ av_log(NULL, AV_LOG_WARNING, "Set ISP output port shift OK\n");
-+
-+ }
-+ port_out->format->encoding = MMAL_ENCODING_I420;
-+ mmal_log_dump_port(port_out);
-+ if ((err = mmal_port_format_commit(port_out)) != MMAL_SUCCESS)
-+ {
-+ av_log(NULL, AV_LOG_ERROR, "Failed to set ISP output port format\n");
-+ goto fail;
-+ }
-+ if ((err = mmal_connection_create(&de->conn, port_out, de->display->input[0], MMAL_CONNECTION_FLAG_TUNNELLING)) != MMAL_SUCCESS) {
-+ av_log(NULL, AV_LOG_ERROR, "Failed to create connection\n");
-+ goto fail;
-+ }
-+ if ((err = mmal_connection_enable(de->conn)) != MMAL_SUCCESS) {
-+ av_log(NULL, AV_LOG_ERROR, "Failed to enable connection\n");
-+ goto fail;
-+ }
-+ mmal_port_enable(de->isp->control,display_cb_control);
-+ mmal_component_enable(de->isp);
-+ }
-+
-+ mmal_component_enable(de->display);
-+ mmal_port_enable(de->display->control,display_cb_control);
-+ de->avfmt = fmt;
-+
-+ printf("Allocated display %dx%d in %dx%d, fmt=%d\n", w, h, geo.stride_y, geo.height_y, fmt);
-+
-+ return de;
-+
-+fail:
-+ // **** Free stuff
-+ return NULL;
-+}
-+
-+static void display_frame(struct AVCodecContext * const s, rpi_display_env_t * const de, const AVFrame* const fr)
-+{
-+ MMAL_BUFFER_HEADER_T* buf;
-+
-+ if (de == NULL)
-+ return;
-+
-+ if (atomic_load(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) {
-+ av_log(s, AV_LOG_VERBOSE, "Frame dropped\n");
-+ return;
-+ }
-+
-+ buf = mmal_queue_get(de->rpi_pool->queue);
-+ if (!buf) {
-+ // Running too fast so drop the frame
-+ printf("Q alloc failure\n");
-+ return;
-+ }
-+ assert(buf);
-+ buf->cmd = 0;
-+ buf->offset = 0; // Offset to valid data
-+ buf->flags = 0;
-+ {
-+ const AVRpiZcRefPtr fr_buf = av_rpi_zc_ref(s, fr, de->avfmt, 1);
-+ if (fr_buf == NULL) {
-+ mmal_buffer_header_release(buf);
-+ return;
-+ }
-+
-+ buf->user_data = fr_buf;
-+ buf->data = (uint8_t *)av_rpi_zc_vc_handle(fr_buf); // Cast our handle to a pointer for mmal
-+ buf->offset = av_rpi_zc_offset(fr_buf);
-+ buf->length = av_rpi_zc_length(fr_buf);
-+ buf->alloc_size = av_rpi_zc_numbytes(fr_buf);
-+ atomic_fetch_add(&de->rpi_display_count, 1);
-+ }
-+#if RPI_DISPLAY_ALL
-+ while (atomic_load(&de->rpi_display_count) >= DISPLAY_PORT_DEPTH - 1) {
-+ usleep(5000);
-+ }
-+#endif
-+
-+ if (mmal_port_send_buffer(de->port_in, buf) != MMAL_SUCCESS)
-+ {
-+ av_log(s, AV_LOG_ERROR, "mmal_port_send_buffer failed: depth=%d\n", de->rpi_display_count);
-+ display_cb_input(de->port_in, buf);
-+ }
-+}
-+
-+static void display_exit(rpi_display_env_t ** const pde)
-+{
-+ rpi_display_env_t * const de = *pde;
-+ *pde = NULL;
-+
-+ if (de != NULL) {
-+// sleep(120);
-+
-+ if (de->port_in != NULL) {
-+ mmal_port_disable(de->port_in);
-+ }
-+
-+ // The above disable should kick out all buffers - check that
-+ if (atomic_load(&de->rpi_display_count) != 0) {
-+ av_log(NULL, AV_LOG_WARNING, "Exiting with display count non-zero:%d\n", atomic_load(&de->rpi_display_count));
-+ }
-+
-+ if (de->conn != NULL) {
-+ mmal_connection_destroy(de->conn);
-+ }
-+ if (de->rpi_pool != NULL) {
-+ mmal_port_pool_destroy(de->display->input[0], de->rpi_pool);
-+ }
-+ if (de->isp != NULL) {
-+ mmal_component_destroy(de->isp);
-+ }
-+ if (de->display != NULL) {
-+ mmal_component_destroy(de->display);
-+ }
-+
-+ av_free(de);
-+ }
-+}
-+
-+#endif
-+
-+
- /* sub2video hack:
- Convert subtitles to video with alpha to insert them in filter graphs.
- This is a temporary solution until libavfilter gets real subtitles support.
-@@ -583,6 +843,11 @@ static void ffmpeg_cleanup(int ret)
- avformat_close_input(&input_files[i]->ctx);
- av_freep(&input_files[i]);
- }
-+
-+#ifdef RPI_DISPLAY
-+ display_exit(&rpi_display_env);
-+#endif
-+
- for (i = 0; i < nb_input_streams; i++) {
- InputStream *ist = input_streams[i];
-
-@@ -594,7 +859,9 @@ static void ffmpeg_cleanup(int ret)
- av_freep(&ist->filters);
- av_freep(&ist->hwaccel_device);
- av_freep(&ist->dts_buffer);
--
-+#ifdef RPI_DISPLAY
-+ av_rpi_zc_uninit(ist->dec_ctx);
-+#endif
- avcodec_free_context(&ist->dec_ctx);
-
- av_freep(&input_streams[i]);
-@@ -625,6 +892,7 @@ static void ffmpeg_cleanup(int ret)
- }
- term_exit();
- ffmpeg_exited = 1;
-+
- }
-
- void remove_avoptions(AVDictionary **a, AVDictionary *b)
-@@ -1060,6 +1328,17 @@ static void do_video_out(OutputFile *of,
- if (ost->source_index >= 0)
- ist = input_streams[ost->source_index];
-
-+#ifdef RPI_DISPLAY
-+ if (next_picture && ist != NULL)
-+ {
-+ if (rpi_display_env == NULL)
-+ rpi_display_env = display_init(next_picture->format, 0, 0,
-+ next_picture->width - next_picture->crop_right,
-+ next_picture->height - next_picture->crop_bottom);
-+ display_frame(ist->dec_ctx, rpi_display_env, next_picture);
-+ }
-+#endif
-+
- frame_rate = av_buffersink_get_frame_rate(filter);
- if (frame_rate.num > 0 && frame_rate.den > 0)
- duration = 1/(av_q2d(frame_rate) * av_q2d(enc->time_base));
-@@ -2132,8 +2411,8 @@ static int ifilter_send_frame(InputFilter *ifilter, AVFrame *frame)
- ifilter->channel_layout != frame->channel_layout;
- break;
- case AVMEDIA_TYPE_VIDEO:
-- need_reinit |= ifilter->width != frame->width ||
-- ifilter->height != frame->height;
-+ need_reinit |= ifilter->width != av_frame_cropped_width(frame) ||
-+ ifilter->height != av_frame_cropped_height(frame);
- break;
- }
-
-@@ -2891,6 +3170,12 @@ static int init_input_stream(int ist_index, char *error, int error_len)
- ist->dec_ctx->opaque = ist;
- ist->dec_ctx->get_format = get_format;
- ist->dec_ctx->get_buffer2 = get_buffer;
-+
-+#ifdef RPI_DISPLAY
-+ // Overrides the above get_buffer2
-+ av_rpi_zc_init(ist->dec_ctx);
-+#endif
-+
- ist->dec_ctx->thread_safe_callbacks = 1;
-
- av_opt_set_int(ist->dec_ctx, "refcounted_frames", 1, 0);
-diff --git a/fftools/ffmpeg_filter.c b/fftools/ffmpeg_filter.c
-index 877fd670e6..1efd3a43a8 100644
---- a/fftools/ffmpeg_filter.c
-+++ b/fftools/ffmpeg_filter.c
-@@ -1179,8 +1179,8 @@ int ifilter_parameters_from_frame(InputFilter *ifilter, const AVFrame *frame)
-
- ifilter->format = frame->format;
-
-- ifilter->width = frame->width;
-- ifilter->height = frame->height;
-+ ifilter->width = av_frame_cropped_width(frame);
-+ ifilter->height = av_frame_cropped_height(frame);
- ifilter->sample_aspect_ratio = frame->sample_aspect_ratio;
-
- ifilter->sample_rate = frame->sample_rate;
-diff --git a/fftools/ffmpeg_opt.c b/fftools/ffmpeg_opt.c
-index d7a7eb0662..3949c9e76b 100644
---- a/fftools/ffmpeg_opt.c
-+++ b/fftools/ffmpeg_opt.c
-@@ -684,11 +684,19 @@ static AVCodec *choose_decoder(OptionsContext *o, AVFormatContext *s, AVStream *
-
- MATCH_PER_STREAM_OPT(codec_names, str, codec_name, s, st);
- if (codec_name) {
-+ if (strcmp("hevc_rpi", codec_name) == 0) {
-+ return avcodec_find_decoder_by_id_and_fmt(AV_CODEC_ID_HEVC, st->codecpar->format);
-+ }
- AVCodec *codec = find_codec_or_die(codec_name, st->codecpar->codec_type, 0);
- st->codecpar->codec_id = codec->id;
- return codec;
- } else
-+ {
-+ if (st->codecpar->codec_id == AV_CODEC_ID_HEVC) {
-+ return avcodec_find_decoder_by_id_and_fmt(st->codecpar->codec_id, st->codecpar->format);
-+ }
- return avcodec_find_decoder(st->codecpar->codec_id);
-+ }
- }
-
- /* Add all the streams from the given input file to the global
-diff --git a/libavcodec/Makefile b/libavcodec/Makefile
-index 4b8ad121db..f6e6784e5a 100644
---- a/libavcodec/Makefile
-+++ b/libavcodec/Makefile
-@@ -6,6 +6,7 @@ HEADERS = ac3_parser.h \
- avcodec.h \
- avdct.h \
- avfft.h \
-+ rpi_zc.h \
- d3d11va.h \
- dirac.h \
- dv_profile.h \
-@@ -128,6 +129,7 @@ OBJS-$(CONFIG_QSVDEC) += qsvdec.o
- OBJS-$(CONFIG_QSVENC) += qsvenc.o
- OBJS-$(CONFIG_RANGECODER) += rangecoder.o
- OBJS-$(CONFIG_RDFT) += rdft.o
-+OBJS-$(CONFIG_RPI) += rpi_qpu.o rpi_mailbox.o rpi_zc.o
- OBJS-$(CONFIG_RV34DSP) += rv34dsp.o
- OBJS-$(CONFIG_SHARED) += log2_tab.o reverse.o
- OBJS-$(CONFIG_SINEWIN) += sinewin.o sinewin_fixed.o
-@@ -360,6 +362,13 @@ OBJS-$(CONFIG_HAP_ENCODER) += hapenc.o hap.o
- OBJS-$(CONFIG_HEVC_DECODER) += hevcdec.o hevc_mvs.o \
- hevc_cabac.o hevc_refs.o hevcpred.o \
- hevcdsp.o hevc_filter.o hevc_data.o
-+OBJS-$(CONFIG_HEVC_RPI_DECODER) += rpi_hevcdec.o rpi_hevc_mvs.o \
-+ rpi_hevc_cabac.o rpi_hevc_refs.o rpi_hevcpred.o \
-+ rpi_hevcdsp.o rpi_hevc_filter.o rpi_hevc_data.o \
-+ rpi_hevc_shader.o rpi_hevc_shader_template.o \
-+ rpi_hevc_parse.o h2645_parse.o rpi_hevc_ps.o \
-+ rpi_hevc_sei.o rpi_hevc_data.o
-+OBJS-$(CONFIG_HEVC_CUVID_DECODER) += cuvid.o
- OBJS-$(CONFIG_HEVC_AMF_ENCODER) += amfenc_hevc.o
- OBJS-$(CONFIG_HEVC_CUVID_DECODER) += cuviddec.o
- OBJS-$(CONFIG_HEVC_MEDIACODEC_DECODER) += mediacodecdec.o
-@@ -1188,3 +1197,31 @@ $(SUBDIR)qdm2.o: $(SUBDIR)qdm2_tables.h
- $(SUBDIR)sinewin.o: $(SUBDIR)sinewin_tables.h
- $(SUBDIR)sinewin_fixed.o: $(SUBDIR)sinewin_fixed_tables.h
- endif
-+
-+ifdef CONFIG_HEVC_RPI_DECODER
-+QASM_PY := ../local/bin/qasm.py
-+VASMVIDCORE := ../local/bin/vasmvidcore_std
-+
-+ifneq ("$(wildcard $(QASM_PY))","")
-+$(SUBDIR)rpi_hevc_shader.c: $(SUBDIR)rpi_hevc_shader.qasm
-+ $(QASM_PY) -mc_c:rpi_hevc_shader,rpi_hevc_shader,ff_hevc_rpi_shader $< > $@
-+
-+$(SUBDIR)rpi_hevc_shader.h: $(SUBDIR)rpi_hevc_shader.qasm
-+ $(QASM_PY) -mc_h:rpi_hevc_shader,rpi_hevc_shader,ff_hevc_rpi_shader $< > $@
-+endif
-+
-+ifneq ("$(wildcard $(VASMVIDCORE))","")
-+$(SUBDIR)rpi_hevc_transform8.bin: $(SUBDIR)rpi_hevc_transform.s
-+ $(VASMVIDCORE) -Fbin -DBIT_DEPTH=8 $< -o $@
-+$(SUBDIR)rpi_hevc_transform10.bin: $(SUBDIR)rpi_hevc_transform.s
-+ $(VASMVIDCORE) -Fbin -DBIT_DEPTH=10 $< -o $@
-+
-+$(SUBDIR)rpi_hevc_transform8.h: $(SUBDIR)rpi_hevc_transform8.bin
-+ python pi-util/make_array.py $<
-+$(SUBDIR)rpi_hevc_transform10.h: $(SUBDIR)rpi_hevc_transform10.bin
-+ python pi-util/make_array.py $<
-+endif
-+
-+$(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_transform8.h $(SUBDIR)rpi_hevc_transform10.h
-+$(SUBDIR)rpi_hevcdec.o $(SUBDIR)rpi_shader_template.o $(SUBDIR)rpi_qpu.o: $(SUBDIR)rpi_hevc_shader.h
-+endif
-diff --git a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
-index 4d4ef530e4..fba8776c9f 100644
---- a/libavcodec/allcodecs.c
-+++ b/libavcodec/allcodecs.c
-@@ -142,6 +142,7 @@ extern AVCodec ff_h264_qsv_decoder;
- extern AVCodec ff_h264_rkmpp_decoder;
- extern AVCodec ff_hap_encoder;
- extern AVCodec ff_hap_decoder;
-+extern AVCodec ff_hevc_rpi_decoder;
- extern AVCodec ff_hevc_decoder;
- extern AVCodec ff_hevc_qsv_decoder;
- extern AVCodec ff_hevc_rkmpp_decoder;
-@@ -833,6 +834,41 @@ static enum AVCodecID remap_deprecated_codec_id(enum AVCodecID id)
- }
- }
-
-+static int codec_supports_format(const AVCodec * const p, const enum AVPixelFormat fmt)
-+{
-+ const enum AVPixelFormat *pf = p->pix_fmts;
-+
-+ // Assume good if we lack info
-+ if (pf == NULL)
-+ return 1;
-+ if (fmt == AV_PIX_FMT_NONE)
-+ return 0;
-+
-+ for (; *pf != AV_PIX_FMT_NONE; ++pf) {
-+ if (*pf == fmt)
-+ return 1;
-+ }
-+ return 0;
-+}
-+
-+AVCodec *avcodec_find_decoder_by_id_and_fmt(enum AVCodecID id, enum AVPixelFormat fmt)
-+{
-+ const AVCodec *p, *experimental = NULL;
-+ void *i = 0;
-+
-+ id= remap_deprecated_codec_id(id);
-+ while ((p = av_codec_iterate(&i))) {
-+ if (av_codec_is_decoder(p) && p->id == id && codec_supports_format(p, fmt)) {
-+ if (p->capabilities & AV_CODEC_CAP_EXPERIMENTAL && !experimental) {
-+ experimental = p;
-+ } else
-+ return (AVCodec *)p;
-+ }
-+ p = p->next;
-+ }
-+ return (AVCodec *)experimental;
-+}
-+
- static AVCodec *find_codec(enum AVCodecID id, int (*x)(const AVCodec *))
- {
- const AVCodec *p, *experimental = NULL;
-diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
-index e656011c3c..f8801dfab6 100644
---- a/libavcodec/arm/Makefile
-+++ b/libavcodec/arm/Makefile
-@@ -40,6 +40,8 @@ OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_init_arm.o \
- arm/sbrdsp_init_arm.o
- OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_init_arm.o
- OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_arm.o
-+OBJS-$(CONFIG_HEVC_RPI_DECODER) += arm/rpi_hevcdsp_init_arm.o \
-+ arm/rpi_hevcpred_init_arm.o
- OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_init_arm.o
- OBJS-$(CONFIG_RV40_DECODER) += arm/rv40dsp_init_arm.o
- OBJS-$(CONFIG_SBC_ENCODER) += arm/sbcdsp_init_arm.o
-@@ -136,10 +138,24 @@ NEON-OBJS-$(CONFIG_AAC_DECODER) += arm/aacpsdsp_neon.o \
- NEON-OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_neon.o
- NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/synth_filter_neon.o
- NEON-OBJS-$(CONFIG_HEVC_DECODER) += arm/hevcdsp_init_neon.o \
-+ arm/hevcdsp_idct_neon.o \
- arm/hevcdsp_deblock_neon.o \
- arm/hevcdsp_idct_neon.o \
- arm/hevcdsp_qpel_neon.o \
- arm/hevcdsp_sao_neon.o
-+NEON-OBJS-$(CONFIG_HEVC_RPI_DECODER) += arm/rpi_hevcdsp_init_neon.o \
-+ arm/rpi_hevc_misc_neon.o \
-+ arm/rpi_hevcdsp_deblock_neon.o \
-+ arm/rpi_hevcdsp_idct_neon.o \
-+ arm/rpi_hevcdsp_res8_neon.o \
-+ arm/rpi_hevcdsp_res16_neon.o \
-+ arm/rpi_hevcdsp_sao_neon.o \
-+ arm/rpi_hevcpred_init_neon.o \
-+ arm/rpi_hevcpred_intra_angular_neon.o \
-+ arm/rpi_hevcpred_intra_dc_neon.o \
-+ arm/rpi_hevcpred_intra_filter_neon.o \
-+ arm/rpi_hevcpred_intra_hv_neon.o \
-+ arm/rpi_hevcpred_intra_planar_neon.o
- NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o
- NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv34dsp_neon.o \
- arm/rv40dsp_neon.o
-diff --git a/libavcodec/arm/cabac.h b/libavcodec/arm/cabac.h
-index fdbf86b45e..4755f20e2e 100644
---- a/libavcodec/arm/cabac.h
-+++ b/libavcodec/arm/cabac.h
-@@ -26,83 +26,209 @@
- #include "libavutil/internal.h"
- #include "libavcodec/cabac.h"
-
-+
- #define get_cabac_inline get_cabac_inline_arm
- static av_always_inline int get_cabac_inline_arm(CABACContext *c,
-- uint8_t *const state)
-+ uint8_t *state)
- {
-- int bit;
-- void *reg_b, *reg_c, *tmp;
-+ const uint8_t *mlps_tables = ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128;
-+ int bit, ptr, low, tmp1, tmp2;
-+ __asm__ volatile (
-+ "ldr %[bit], [%[c], %[range_off]] \n\t"
-+ "ldrb %[ptr], [%[state]] \n\t"
-+ "sub %[tmp1], %[mlps_tables], %[lps_off] \n\t"
-+ "and %[tmp2], %[bit], #0xc0 \n\t"
-+ "add %[tmp1], %[tmp1], %[ptr] \n\t"
-+ "ldr %[low], [%[c], %[low_off]] \n\t"
-+ "ldrb %[tmp2], [%[tmp1], %[tmp2], lsl #1] \n\t"
-+ "sub %[bit], %[bit], %[tmp2] \n\t"
-+ "mov %[tmp1], %[bit] \n\t"
-+ "cmp %[low], %[bit], lsl #17 \n\t"
-+ "itt ge \n\t"
-+ "movge %[tmp1], %[tmp2] \n\t"
-+ "mvnge %[ptr], %[ptr] \n\t"
-+ "clz %[tmp2], %[tmp1] \n\t"
-+ "it ge \n\t"
-+ "subge %[low], %[low], %[bit], lsl #17 \n\t"
-+ "sub %[tmp2], %[tmp2], #23 \n\t"
-+ "and %[bit], %[ptr], #1 \n\t"
-+ "ldrb %[mlps_tables], [%[mlps_tables], %[ptr]] \n\t"
-+ "lsl %[low], %[low], %[tmp2] \n\t"
-+ "lsls %[ptr], %[low], #16 \n\t"
-+ "bne 1f \n\t"
-+ "ldr %[ptr], [%[c], %[ptr_off]] \n\t"
-+ "lsl %[tmp2], %[tmp1], %[tmp2] \n\t"
-+#if UNCHECKED_BITSTREAM_READER
-+ "strb %[mlps_tables], [%[state]] \n\t"
-+ "rbit %[state], %[low] \n\t"
-+ "ldrh %[tmp1], [%[ptr]], #2 \n\t"
-+#else
-+ "ldr %[tmp1], [%[c], %[end_off]] \n\t"
-+ "strb %[mlps_tables], [%[state]] \n\t"
-+ "rbit %[state], %[low] \n\t"
-+ "cmp %[tmp1], %[ptr] \n\t"
-+#if CONFIG_THUMB
-+ "it cs \n\t"
-+ "ldrhcs %[tmp1], [%[ptr]], #2 \n\t"
-+#else
-+ "ldrcsh %[tmp1], [%[ptr]], #2 \n\t"
-+#endif
-+#endif
-+ "clz %[state], %[state] \n\t"
-+ "movw %[mlps_tables], #0xffff \n\t"
-+ "sub %[state], %[state], #16 \n\t"
-+ "str %[tmp2], [%[c], %[range_off]] \n\t"
-+ "rev %[tmp1], %[tmp1] \n\t"
-+ "str %[ptr], [%[c], %[ptr_off]] \n\t"
-+ "lsr %[tmp1], %[tmp1], #15 \n\t"
-+ "sub %[tmp1], %[tmp1], %[mlps_tables] \n\t"
-+#if CONFIG_THUMB
-+ "lsl %[tmp1], %[tmp1], %[state] \n\t"
-+ "add %[low], %[low], %[tmp1] \n\t"
-+#else
-+ "add %[low], %[low], %[tmp1], lsl %[state] \n\t"
-+#endif
-+ "str %[low], [%[c], %[low_off]] \n\t"
-+ "b 2f \n\t"
-+ "1: \n\t"
-+ "strb %[mlps_tables], [%[state]] \n\t"
-+ "lsl %[tmp1], %[tmp1], %[tmp2] \n\t"
-+ "str %[low], [%[c], %[low_off]] \n\t"
-+ "str %[tmp1], [%[c], %[range_off]] \n\t"
-+ "2: \n\t"
-+ : // Outputs
-+ [state]"+r"(state),
-+ [mlps_tables]"+r"(mlps_tables),
-+ [bit]"=&r"(bit),
-+ [ptr]"=&r"(ptr),
-+ [low]"=&r"(low),
-+ [tmp1]"=&r"(tmp1),
-+ [tmp2]"=&r"(tmp2)
-+ : // Inputs
-+ [c]"r"(c),
-+ [low_off]"J"(offsetof(CABACContext, low)),
-+ [range_off]"J"(offsetof(CABACContext, range)),
-+ [ptr_off]"J"(offsetof(CABACContext, bytestream)),
-+ [end_off]"J"(offsetof(CABACContext, bytestream_end)),
-+ [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
-+ : // Clobbers
-+ "cc", "memory"
-+ );
-+ return bit;
-+}
-
-- __asm__ volatile(
-- "ldrb %[bit] , [%[state]] \n\t"
-- "add %[r_b] , %[tables] , %[lps_off] \n\t"
-- "mov %[tmp] , %[range] \n\t"
-- "and %[range] , %[range] , #0xC0 \n\t"
-- "add %[r_b] , %[r_b] , %[bit] \n\t"
-- "ldrb %[range] , [%[r_b], %[range], lsl #1] \n\t"
-- "add %[r_b] , %[tables] , %[norm_off] \n\t"
-- "sub %[r_c] , %[tmp] , %[range] \n\t"
-- "lsl %[tmp] , %[r_c] , #17 \n\t"
-- "cmp %[tmp] , %[low] \n\t"
-- "it gt \n\t"
-- "movgt %[range] , %[r_c] \n\t"
-- "itt cc \n\t"
-- "mvncc %[bit] , %[bit] \n\t"
-- "subcc %[low] , %[low] , %[tmp] \n\t"
-- "add %[r_c] , %[tables] , %[mlps_off] \n\t"
-- "ldrb %[tmp] , [%[r_b], %[range]] \n\t"
-- "ldrb %[r_b] , [%[r_c], %[bit]] \n\t"
-- "lsl %[low] , %[low] , %[tmp] \n\t"
-- "lsl %[range] , %[range] , %[tmp] \n\t"
-- "uxth %[r_c] , %[low] \n\t"
-- "strb %[r_b] , [%[state]] \n\t"
-- "tst %[r_c] , %[r_c] \n\t"
-- "bne 2f \n\t"
-- "ldr %[r_c] , [%[c], %[byte]] \n\t"
-+#define get_cabac_bypass get_cabac_bypass_arm
-+static inline int get_cabac_bypass_arm(CABACContext * const c)
-+{
-+ uint32_t low = c->low, range, ptr, tmp;
-+ int rv;
-+ __asm volatile (
-+ "ldr %[range] , [%[c], %[range_off]] \n\t"
-+ "mov %[rv] , #0 \n\t"
-+ "ldr %[ptr] , [%[c], %[ptr_off]] \n\t"
-+ "lsl %[low] , #1 \n\t"
-+#if !UNCHECKED_BITSTREAM_READER
-+ "ldr %[tmp] , [%[c], %[end_off]] \n\t"
-+#endif
-+ "cmp %[low] , %[range], lsl #17 \n\t"
-+ "itt cs \n\t"
-+ "subcs %[low] , %[low], %[range], lsl #17 \n\t"
-+ "movcs %[rv] , #1 \n\t"
- #if UNCHECKED_BITSTREAM_READER
-- "ldrh %[tmp] , [%[r_c]] \n\t"
-- "add %[r_c] , %[r_c] , #2 \n\t"
-- "str %[r_c] , [%[c], %[byte]] \n\t"
-+ "ldrh %[tmp] , [%[ptr]], #2 \n\t"
-+#else
-+ "cmp %[tmp] , %[ptr] \n\t"
-+#if CONFIG_THUMB
-+ "it cs \n\t"
-+ "ldrhcs %[tmp] , [%[ptr]], #2 \n\t"
- #else
-- "ldr %[r_b] , [%[c], %[end]] \n\t"
-- "ldrh %[tmp] , [%[r_c]] \n\t"
-- "cmp %[r_c] , %[r_b] \n\t"
-- "itt lt \n\t"
-- "addlt %[r_c] , %[r_c] , #2 \n\t"
-- "strlt %[r_c] , [%[c], %[byte]] \n\t"
-+ "ldrcsh %[tmp] , [%[ptr]], #2 \n\t"
-+#endif
- #endif
-- "sub %[r_c] , %[low] , #1 \n\t"
-- "add %[r_b] , %[tables] , %[norm_off] \n\t"
-- "eor %[r_c] , %[low] , %[r_c] \n\t"
-- "rev %[tmp] , %[tmp] \n\t"
-- "lsr %[r_c] , %[r_c] , #15 \n\t"
-- "lsr %[tmp] , %[tmp] , #15 \n\t"
-- "ldrb %[r_c] , [%[r_b], %[r_c]] \n\t"
-- "movw %[r_b] , #0xFFFF \n\t"
-- "sub %[tmp] , %[tmp] , %[r_b] \n\t"
-- "rsb %[r_c] , %[r_c] , #7 \n\t"
-- "lsl %[tmp] , %[tmp] , %[r_c] \n\t"
-- "add %[low] , %[low] , %[tmp] \n\t"
-- "2: \n\t"
-- : [bit]"=&r"(bit),
-- [low]"+&r"(c->low),
-- [range]"+&r"(c->range),
-- [r_b]"=&r"(reg_b),
-- [r_c]"=&r"(reg_c),
-- [tmp]"=&r"(tmp)
-- : [c]"r"(c),
-- [state]"r"(state),
-- [tables]"r"(ff_h264_cabac_tables),
-- [byte]"M"(offsetof(CABACContext, bytestream)),
-- [end]"M"(offsetof(CABACContext, bytestream_end)),
-- [norm_off]"I"(H264_NORM_SHIFT_OFFSET),
-- [lps_off]"I"(H264_LPS_RANGE_OFFSET),
-- [mlps_off]"I"(H264_MLPS_STATE_OFFSET + 128)
-- : "memory", "cc"
-- );
-+ "lsls %[range] , %[low], #16 \n\t"
-+ "bne 1f \n\t"
-
-- return bit & 1;
-+ "str %[ptr] , [%[c], %[ptr_off]] \n\t"
-+ "rev %[tmp] , %[tmp] \n\t"
-+ "add %[low] , %[low], %[tmp], lsr #15 \n\t"
-+ "movw %[tmp] , 0xFFFF \n\t"
-+ "sub %[low] , %[tmp] \n\t"
-+ "1: \n\t"
-+ "str %[low] , [%[c], %[low_off]] \n\t"
-+ : // Outputs
-+ [rv]"=&r"(rv),
-+ [low]"+r"(low),
-+ [range]"=&r"(range),
-+ [ptr]"=&r"(ptr),
-+ [tmp]"=&r"(tmp)
-+ : // Inputs
-+ [c]"r"(c),
-+ [low_off]"J"(offsetof(CABACContext, low)),
-+ [range_off]"J"(offsetof(CABACContext, range)),
-+ [ptr_off]"J"(offsetof(CABACContext, bytestream)),
-+ [end_off]"J"(offsetof(CABACContext, bytestream_end))
-+ : // Clobbers
-+ "memory", "cc"
-+ );
-+ return rv;
- }
-+
-+
-+#define get_cabac_bypass_sign get_cabac_bypass_sign_arm
-+static inline int get_cabac_bypass_sign_arm(CABACContext * const c, int rv)
-+{
-+ uint32_t low = c->low, range, ptr, tmp;
-+ __asm volatile (
-+ "ldr %[range] , [%[c], %[range_off]] \n\t"
-+ "ldr %[ptr] , [%[c], %[ptr_off]] \n\t"
-+ "lsl %[low] , #1 \n\t"
-+#if !UNCHECKED_BITSTREAM_READER
-+ "ldr %[tmp] , [%[c], %[end_off]] \n\t"
-+#endif
-+ "cmp %[low] , %[range], lsl #17 \n\t"
-+ "it cs \n\t"
-+ "subcs %[low] , %[low], %[range], lsl #17 \n\t"
-+ "it cc \n\t"
-+ "rsbcc %[rv] , %[rv], #0 \n\t"
-+#if UNCHECKED_BITSTREAM_READER
-+ "ldrh %[tmp] , [%[ptr]], #2 \n\t"
-+#else
-+ "cmp %[tmp] , %[ptr] \n\t"
-+#if CONFIG_THUMB
-+ "it cs \n\t"
-+ "ldrhcs %[tmp] , [%[ptr]], #2 \n\t"
-+#else
-+ "ldrcsh %[tmp] , [%[ptr]], #2 \n\t"
-+#endif
-+#endif
-+ "lsls %[range] , %[low], #16 \n\t"
-+ "bne 1f \n\t"
-+
-+ "str %[ptr] , [%[c], %[ptr_off]] \n\t"
-+ "rev %[tmp] , %[tmp] \n\t"
-+ "add %[low] , %[low], %[tmp], lsr #15 \n\t"
-+ "movw %[tmp] , 0xFFFF \n\t"
-+ "sub %[low] , %[tmp] \n\t"
-+ "1: \n\t"
-+ "str %[low] , [%[c], %[low_off]] \n\t"
-+ : // Outputs
-+ [rv]"+r"(rv),
-+ [low]"+r"(low),
-+ [range]"=&r"(range),
-+ [ptr]"=&r"(ptr),
-+ [tmp]"=&r"(tmp)
-+ : // Inputs
-+ [c]"r"(c),
-+ [low_off]"J"(offsetof(CABACContext, low)),
-+ [range_off]"J"(offsetof(CABACContext, range)),
-+ [ptr_off]"J"(offsetof(CABACContext, bytestream)),
-+ [end_off]"J"(offsetof(CABACContext, bytestream_end))
-+ : // Clobbers
-+ "memory", "cc"
-+ );
-+ return rv;
-+}
-+
- #endif /* HAVE_ARMV6T2_INLINE */
-
- #endif /* AVCODEC_ARM_CABAC_H */
-diff --git a/libavcodec/arm/rpi_hevc_cabac.h b/libavcodec/arm/rpi_hevc_cabac.h
-new file mode 100644
-index 0000000000..c7df9f1e5a
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevc_cabac.h
-@@ -0,0 +1,605 @@
-+/*
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#ifndef AVCODEC_ARM_HEVC_CABAC_H
-+#define AVCODEC_ARM_HEVC_CABAC_H
-+
-+#include "config.h"
-+#if HAVE_ARMV6T2_INLINE
-+
-+#define hevc_mem_bits32 hevc_mem_bits32_arm
-+static inline uint32_t hevc_mem_bits32_arm(const void * p, const unsigned int bits)
-+{
-+ unsigned int n;
-+ __asm__ (
-+ "rev %[n], %[x] \n\t"
-+ : [n]"=r"(n)
-+ : [x]"r"(*(const uint32_t *)((const uint8_t *)p + (bits >> 3)))
-+ :
-+ );
-+ return n << (bits & 7);
-+}
-+
-+
-+// ---------------------------------------------------------------------------
-+//
-+// Helper fns - little bits of code where ARM has an instraction that the
-+// compiler doesn't know about / use
-+
-+#define trans_scale_sat trans_scale_sat_arm
-+static inline int trans_scale_sat_arm(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
-+{
-+ int rv;
-+ int t = ((level * (int)(scale * scale_m)) >> shift) + 1;
-+
-+ __asm__ (
-+ "ssat %[rv], #16, %[t], ASR #1 \n\t"
-+ : [rv]"=r"(rv)
-+ : [t]"r"(t)
-+ :
-+ );
-+ return rv;
-+}
-+
-+#define update_rice update_rice_arm
-+static inline void update_rice_arm(uint8_t * const stat_coeff,
-+ const unsigned int last_coeff_abs_level_remaining,
-+ const unsigned int c_rice_param)
-+{
-+ int t = last_coeff_abs_level_remaining << 1;
-+ __asm__ (
-+ "lsrs %[t], %[t], %[shift] \n\t"
-+
-+ "it eq \n\t"
-+ "subeq %[stat], %[stat], #1 \n\t"
-+ "cmp %[t], #6 \n\t"
-+ "adc %[stat], %[stat], #0 \n\t"
-+ "usat %[stat], #8, %[stat] \n\t"
-+ : [stat]"+r"(*stat_coeff),
-+ [t]"+r"(t)
-+ : [shift]"r"(c_rice_param)
-+ : "cc"
-+ );
-+}
-+
-+// ---------------------------------------------------------------------------
-+//
-+// CABAC get loops
-+//
-+// Where the loop is simple enough we can normally do 10-30% better than the
-+// compiler
-+
-+// Get the residual greater than 1 bits
-+
-+#define get_cabac_greater1_bits get_cabac_greater1_bits_arm
-+static inline unsigned int get_cabac_greater1_bits_arm(CABACContext * const c, const unsigned int n,
-+ uint8_t * const state0)
-+{
-+ unsigned int i, reg_b, st, tmp, bit, rv;
-+ __asm__ (
-+ "mov %[i] , #0 \n\t"
-+ "mov %[rv] , #0 \n\t"
-+ "1: \n\t"
-+ "add %[i] , %[i] , #1 \n\t"
-+ "cmp %[rv] , #0 \n\t"
-+ "ite eq \n\t"
-+ "usateq %[st] , #2 , %[i] \n\t"
-+ "movne %[st] , #0 \n\t"
-+ "sub %[r_b] , %[mlps_tables], %[lps_off] \n\t"
-+ "and %[tmp] , %[range] , #0xC0 \n\t"
-+
-+ "ldrb %[bit] , [%[state0], %[st]] \n\t"
-+ "add %[r_b] , %[r_b] , %[bit] \n\t"
-+ "ldrb %[tmp] , [%[r_b], %[tmp], lsl #1] \n\t"
-+ "sub %[range] , %[range] , %[tmp] \n\t"
-+
-+ "cmp %[low] , %[range], lsl #17 \n\t"
-+ "ittt ge \n\t"
-+ "subge %[low] , %[low] , %[range], lsl #17 \n\t"
-+ "movge %[range] , %[tmp] \n\t"
-+ "mvnge %[bit] , %[bit] \n\t"
-+
-+ "clz %[tmp] , %[range] \n\t"
-+ "sub %[tmp] , #23 \n\t"
-+ "ldrb %[r_b] , [%[mlps_tables], %[bit]] \n\t"
-+ "and %[bit] , %[bit] , #1 \n\t"
-+ "strb %[r_b] , [%[state0], %[st]] \n\t"
-+ "lsl %[low] , %[low] , %[tmp] \n\t"
-+ "orr %[rv] , %[bit] , %[rv], lsl #1 \n\t"
-+ "lsl %[range] , %[range] , %[tmp] \n\t"
-+
-+// There is a small speed gain from combining both conditions, using a single
-+// branch and then working out what that meant later
-+ "lsls %[tmp] , %[low] , #16 \n\t"
-+ "it ne \n\t"
-+ "cmpne %[n] , %[i] \n\t"
-+ "bne 1b \n\t"
-+
-+// If reload is not required then we must have run out of flags to decode
-+ "tst %[tmp] , %[tmp] \n\t"
-+ "bne 2f \n\t"
-+
-+// Do reload
-+ "ldrh %[tmp] , [%[bptr]] , #2 \n\t"
-+ "rbit %[bit] , %[low] \n\t"
-+ "movw %[r_b] , #0xFFFF \n\t"
-+ "clz %[bit] , %[bit] \n\t"
-+ "rev %[tmp] , %[tmp] \n\t"
-+ "sub %[bit] , %[bit] , #16 \n\t"
-+ "cmp %[n] , %[i] \n\t"
-+ "rsb %[tmp] , %[r_b] , %[tmp], lsr #15 \n\t"
-+
-+#if CONFIG_THUMB
-+ "lsl %[tmp] , %[tmp] , %[bit] \n\t"
-+ "add %[low] , %[low] , %[tmp] \n\t"
-+#else
-+ "add %[low] , %[low] , %[tmp], lsl %[bit] \n\t"
-+#endif
-+
-+ "bne 1b \n\t"
-+ "2: \n\t"
-+ : [bit]"=&r"(bit),
-+ [low]"+r"(c->low),
-+ [range]"+r"(c->range),
-+ [r_b]"=&r"(reg_b),
-+ [bptr]"+r"(c->bytestream),
-+ [i]"=&r"(i),
-+ [tmp]"=&r"(tmp),
-+ [st]"=&r"(st),
-+ [rv]"=&r"(rv)
-+ : [state0]"r"(state0),
-+ [n]"r"(n),
-+ [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
-+ [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
-+ : "memory", "cc"
-+ );
-+ return rv;
-+}
-+
-+
-+// n must be > 0 on entry
-+#define get_cabac_sig_coeff_flag_idxs get_cabac_sig_coeff_flag_idxs_arm
-+static inline uint8_t * get_cabac_sig_coeff_flag_idxs_arm(CABACContext * const c, uint8_t * const state0,
-+ unsigned int n,
-+ const uint8_t const * ctx_map,
-+ uint8_t * p)
-+{
-+ unsigned int reg_b, tmp, st, bit;
-+ __asm__ (
-+// Get bin from map
-+#if CONFIG_THUMB
-+ "add %[ctx_map] , %[n] \n\t"
-+ "ldrb %[st] , [%[ctx_map]] \n\t"
-+#else
-+ "ldrb %[st] , [%[ctx_map], %[n]]! \n\t"
-+#endif
-+ "1: \n\t"
-+
-+// Load state & ranges
-+ "ldrb %[bit] , [%[state0], %[st]] \n\t"
-+ "and %[tmp] , %[range] , #0xC0 \n\t"
-+ "sub %[r_b] , %[mlps_tables], %[lps_off] \n\t"
-+ "add %[r_b] , %[r_b] , %[tmp], lsl #1 \n\t"
-+ "ldrb %[tmp] , [%[r_b], %[bit]] \n\t"
-+ "sub %[range] , %[range] , %[tmp] \n\t"
-+
-+ "cmp %[low] , %[range], lsl #17 \n\t"
-+ "ittt ge \n\t"
-+ "mvnge %[bit] , %[bit] \n\t"
-+ "subge %[low] , %[low] , %[range], lsl #17 \n\t"
-+ "movge %[range] , %[tmp] \n\t"
-+
-+// Renorm
-+ "clz %[tmp] , %[range] \n\t"
-+ "ldrb %[r_b] , [%[mlps_tables], %[bit]] \n\t"
-+ "sub %[tmp] , #23 \n\t"
-+ "strb %[r_b] , [%[state0], %[st]] \n\t"
-+ "tst %[bit] , #1 \n\t"
-+ "ldrb %[st] , [%[ctx_map], #-1]! \n\t"
-+ "lsl %[low] , %[low] , %[tmp] \n\t"
-+// GCC asm seems to need strbne written differently for thumb and arm
-+#if CONFIG_THUMB
-+ "it ne \n\t"
-+ "strbne %[n] , [%[idx]] , #1 \n\t"
-+#else
-+ "strneb %[n] , [%[idx]] , #1 \n\t"
-+#endif
-+
-+// There is a small speed gain from combining both conditions, using a single
-+// branch and then working out what that meant later
-+ "subs %[n] , %[n] , #1 \n\t"
-+ "lsl %[range] , %[range] , %[tmp] \n\t"
-+#if CONFIG_THUMB
-+ "itt ne \n\t"
-+ "lslsne %[tmp] , %[low] , #16 \n\t"
-+#else
-+ "lslnes %[tmp] , %[low] , #16 \n\t"
-+#endif
-+ "bne 1b \n\t"
-+
-+// If we have bits left then n must be 0 so give up now
-+ "lsls %[tmp] , %[low] , #16 \n\t"
-+ "bne 2f \n\t"
-+
-+// Do reload
-+ "ldrh %[tmp] , [%[bptr]] , #2 \n\t"
-+ "rbit %[bit] , %[low] \n\t"
-+ "movw %[r_b] , #0xFFFF \n\t"
-+ "clz %[bit] , %[bit] \n\t"
-+ "cmp %[n] , #0 \n\t"
-+ "rev %[tmp] , %[tmp] \n\t"
-+ "sub %[bit] , %[bit] , #16 \n\t"
-+ "rsb %[tmp] , %[r_b] , %[tmp], lsr #15 \n\t"
-+
-+#if CONFIG_THUMB
-+ "lsl %[tmp] , %[tmp] , %[bit] \n\t"
-+ "add %[low] , %[low] , %[tmp] \n\t"
-+#else
-+ "add %[low] , %[low] , %[tmp], lsl %[bit] \n\t"
-+#endif
-+
-+// Check to see if we still have more to do
-+ "bne 1b \n\t"
-+ "2: \n\t"
-+ : [bit]"=&r"(bit),
-+ [low]"+r"(c->low),
-+ [range]"+r"(c->range),
-+ [r_b]"=&r"(reg_b),
-+ [bptr]"+r"(c->bytestream),
-+ [idx]"+r"(p),
-+ [n]"+r"(n),
-+ [tmp]"=&r"(tmp),
-+ [st]"=&r"(st),
-+ [ctx_map]"+r"(ctx_map)
-+ : [state0]"r"(state0),
-+ [mlps_tables]"r"(ff_h264_cabac_tables + H264_MLPS_STATE_OFFSET + 128),
-+ [lps_off]"I"((H264_MLPS_STATE_OFFSET + 128) - H264_LPS_RANGE_OFFSET)
-+ : "memory", "cc"
-+ );
-+
-+ return p;
-+}
-+
-+// ---------------------------------------------------------------------------
-+//
-+// CABAC_BY22 functions
-+
-+
-+#define get_cabac_by22_start get_cabac_by22_start_arm
-+static inline void get_cabac_by22_start_arm(CABACContext * const c)
-+{
-+ const uint8_t *ptr = c->bytestream;
-+ register uint32_t low __asm__("r1"), range __asm__("r2");
-+ uint32_t m, range8, bits;
-+#if !USE_BY22_DIV
-+ uintptr_t inv;
-+#endif
-+
-+ av_assert2(offsetof (CABACContext, low) == 0);
-+ av_assert2(offsetof (CABACContext, range) == 4);
-+ av_assert2(offsetof (CABACContext, by22.range) == offsetof (CABACContext, by22.bits) + 2);
-+ __asm__ volatile (
-+ "ldmia %[c], {%[low], %[range]} \n\t"
-+ : // Outputs
-+ [low]"=r"(low),
-+ [range]"=r"(range)
-+ : // Inputs
-+ [c]"r"(c)
-+ : // Clobbers
-+ );
-+#if !USE_BY22_DIV
-+ inv = (uintptr_t)cabac_by22_inv_range;
-+#endif
-+ __asm__ volatile (
-+ "ldr %[m], [%[ptr]], #-("AV_STRINGIFY(CABAC_BITS)"/8) \n\t"
-+#if !USE_BY22_DIV
-+ "uxtb %[range8], %[range] \n\t"
-+#endif
-+ "rbit %[bits], %[low] \n\t"
-+ "lsl %[low], %[low], #22 - "AV_STRINGIFY(CABAC_BITS)" \n\t"
-+ "clz %[bits], %[bits] \n\t"
-+ "str %[ptr], [%[c], %[ptr_off]] \n\t"
-+ "rev %[m], %[m] \n\t"
-+ "rsb %[ptr], %[bits], #9 + "AV_STRINGIFY(CABAC_BITS)" \n\t"
-+ "eor %[m], %[m], #0x80000000 \n\t"
-+#if !USE_BY22_DIV
-+ "ldr %[inv], [%[inv], %[range8], lsl #2] \n\t"
-+ "pkhbt %[range], %[bits], %[range], lsl #16 \n\t"
-+ "str %[range], [%[c], %[bits_off]] \n\t"
-+#else
-+ "strh %[bits], [%[c], %[bits_off]] \n\t"
-+#endif
-+#if CONFIG_THUMB
-+ "lsr %[m], %[ptr] \n\t"
-+ "eor %[range], %[low], %[m] \n\t"
-+#else
-+ "eor %[range], %[low], %[m], lsr %[ptr] \n\t"
-+#endif
-+ : // Outputs
-+ [ptr]"+&r"(ptr),
-+ [low]"+&r"(low),
-+ [range]"+&r"(range),
-+#if !USE_BY22_DIV
-+ [inv]"+&r"(inv),
-+#endif
-+ [m]"=&r"(m),
-+ [range8]"=&r"(range8),
-+ [bits]"=&r"(bits)
-+ : // Inputs
-+ [c]"r"(c),
-+ [bits_off]"J"(offsetof (CABACContext, by22.bits)),
-+ [ptr_off]"J"(offsetof (CABACContext, bytestream))
-+ : // Clobbers
-+ "memory"
-+ );
-+ c->low = range;
-+#if !USE_BY22_DIV
-+ c->range = inv;
-+#endif
-+}
-+
-+#define get_cabac_by22_peek get_cabac_by22_peek_arm
-+static inline uint32_t get_cabac_by22_peek_arm(const CABACContext *const c)
-+{
-+ uint32_t rv = c->low &~ 1, tmp;
-+ __asm__ (
-+ "cmp %[inv] , #0 \n\t"
-+ "it ne \n\t"
-+ "umullne %[tmp] , %[rv] , %[inv], %[rv] \n\t"
-+ : // Outputs
-+ [rv]"+r"(rv),
-+ [tmp]"=r"(tmp)
-+ : // Inputs
-+ [inv]"r"(c->range)
-+ : // Clobbers
-+ "cc"
-+ );
-+ return rv << 1;
-+}
-+
-+#define get_cabac_by22_flush get_cabac_by22_flush_arm
-+static inline void get_cabac_by22_flush_arm(CABACContext *const c, const unsigned int n, uint32_t val)
-+{
-+ uint32_t bits, ptr, tmp1, tmp2;
-+ __asm__ volatile (
-+ "ldrh %[bits], [%[cc], %[bits_off]] \n\t"
-+ "ldr %[ptr], [%[cc], %[ptr_off]] \n\t"
-+ "rsb %[tmp1], %[n], #32 \n\t"
-+ "add %[bits], %[bits], %[n] \n\t"
-+ "ldrh %[tmp2], [%[cc], %[range_off]] \n\t"
-+ "lsr %[tmp1], %[val], %[tmp1] \n\t"
-+ "ldr %[val], [%[cc], %[low_off]] \n\t"
-+#if CONFIG_THUMB
-+ "add %[ptr], %[ptr], %[bits], lsr #3 \n\t"
-+ "ldr %[ptr], [%[ptr]] \n\t"
-+#else
-+ "ldr %[ptr], [%[ptr], %[bits], lsr #3] \n\t"
-+#endif
-+ "mul %[tmp1], %[tmp2], %[tmp1] \n\t"
-+ "and %[tmp2], %[bits], #7 \n\t"
-+ "strh %[bits], [%[cc], %[bits_off]] \n\t"
-+ "rev %[ptr], %[ptr] \n\t"
-+ "lsl %[tmp1], %[tmp1], #23 \n\t"
-+#if CONFIG_THUMB
-+ "lsl %[val], %[n] \n\t"
-+ "sub %[val], %[tmp1] \n\t"
-+#else
-+ "rsb %[val], %[tmp1], %[val], lsl %[n] \n\t"
-+#endif
-+ "lsl %[ptr], %[ptr], %[tmp2] \n\t"
-+ "orr %[val], %[val], %[ptr], lsr #9 \n\t"
-+ "str %[val], [%[cc], %[low_off]] \n\t"
-+ : // Outputs
-+ [val]"+r"(val),
-+ [bits]"=&r"(bits),
-+ [ptr]"=&r"(ptr),
-+ [tmp1]"=&r"(tmp1),
-+ [tmp2]"=&r"(tmp2)
-+ : // Inputs
-+ [cc]"r"(c),
-+ [n]"r"(n),
-+ [bits_off]"J"(offsetof(CABACContext, by22.bits)),
-+ [ptr_off]"J"(offsetof(CABACContext, bytestream)),
-+ [range_off]"J"(offsetof(CABACContext, by22.range)),
-+ [low_off]"J"(offsetof(CABACContext, low))
-+ : // Clobbers
-+ "memory"
-+ );
-+}
-+
-+#define coeff_abs_level_remaining_decode_bypass coeff_abs_level_remaining_decode_bypass_arm
-+static inline int coeff_abs_level_remaining_decode_bypass_arm(CABACContext *const c, unsigned int rice_param)
-+{
-+ uint32_t last_coeff_abs_level_remaining;
-+ uint32_t prefix, n1, range, n2, ptr, tmp1, tmp2;
-+ __asm__ volatile (
-+ "ldr %[remain], [%[cc], %[low_off]] \n\t"
-+ "ldr %[prefix], [%[cc], %[range_off]] \n\t"
-+ "bic %[remain], %[remain], #1 \n\t"
-+ "ldrh %[tmp2], [%[cc], %[by22_bits_off]] \n\t"
-+ "ldr %[ptr], [%[cc], %[ptr_off]] \n\t"
-+ "cmp %[prefix], #0 \n\t"
-+ "it ne \n\t"
-+ "umullne %[prefix], %[remain], %[prefix], %[remain] \n\t"
-+ "ldrh %[range], [%[cc], %[by22_range_off]] \n\t"
-+ "lsl %[remain], %[remain], #1 \n\t"
-+ "mvn %[prefix], %[remain] \n\t"
-+ "clz %[prefix], %[prefix] \n\t"
-+ "rsbs %[n1], %[prefix], #2 \n\t"
-+ "bcc 1f \n\t"
-+ "adc %[n1], %[rice], %[prefix] \n\t"
-+ "add %[tmp2], %[tmp2], %[n1] \n\t"
-+ "rsb %[n2], %[n1], #32 \n\t"
-+ "and %[tmp1], %[tmp2], #7 \n\t"
-+ "strh %[tmp2], [%[cc], %[by22_bits_off]] \n\t"
-+ "lsr %[tmp2], %[tmp2], #3 \n\t"
-+ "lsr %[n2], %[remain], %[n2] \n\t"
-+ "mul %[n2], %[range], %[n2] \n\t"
-+ "ldr %[range], [%[cc], %[low_off]] \n\t"
-+ "ldr %[ptr], [%[ptr], %[tmp2]] \n\t"
-+ "rsb %[tmp2], %[rice], #31 \n\t"
-+ "lsl %[remain], %[remain], %[prefix] \n\t"
-+ "lsl %[n2], %[n2], #23 \n\t"
-+#if CONFIG_THUMB
-+ "lsl %[range], %[n1] \n\t"
-+ "sub %[range], %[n2] \n\t"
-+#else
-+ "rsb %[range], %[n2], %[range], lsl %[n1] \n\t"
-+#endif
-+ "rev %[ptr], %[ptr] \n\t"
-+ "lsl %[n2], %[prefix], %[rice] \n\t"
-+#if CONFIG_THUMB
-+ "lsr %[remain], %[tmp2] \n\t"
-+ "add %[remain], %[n2] \n\t"
-+#else
-+ "add %[remain], %[n2], %[remain], lsr %[tmp2] \n\t"
-+#endif
-+ "b 3f \n\t"
-+ "1: \n\t"
-+ "add %[n2], %[rice], %[prefix], lsl #1 \n\t"
-+ "cmp %[n2], %[peek_bits_plus_2] \n\t"
-+ "bhi 2f \n\t"
-+ "sub %[n1], %[n2], #2 \n\t"
-+ "add %[tmp2], %[tmp2], %[n1] \n\t"
-+ "rsb %[n2], %[n1], #32 \n\t"
-+ "strh %[tmp2], [%[cc], %[by22_bits_off]] \n\t"
-+ "lsr %[tmp1], %[tmp2], #3 \n\t"
-+ "lsr %[n2], %[remain], %[n2] \n\t"
-+ "mul %[n2], %[range], %[n2] \n\t"
-+ "rsb %[range], %[rice], #34 \n\t"
-+ "ldr %[ptr], [%[ptr], %[tmp1]] \n\t"
-+ "and %[tmp1], %[tmp2], #7 \n\t"
-+ "lsl %[remain], %[remain], %[prefix] \n\t"
-+ "ldr %[tmp2], [%[cc], %[low_off]] \n\t"
-+ "rsb %[prefix], %[prefix], %[range] \n\t"
-+ "orr %[remain], %[remain], #0x80000000 \n\t"
-+ "rev %[ptr], %[ptr] \n\t"
-+ "lsl %[n2], %[n2], #23 \n\t"
-+ "mov %[range], #2 \n\t"
-+#if CONFIG_THUMB
-+ "lsl %[tmp2], %[n1] \n\t"
-+ "sub %[tmp2], %[n2] \n\t"
-+#else
-+ "rsb %[tmp2], %[n2], %[tmp2], lsl %[n1] \n\t"
-+#endif
-+ "lsl %[ptr], %[ptr], %[tmp1] \n\t"
-+ "lsl %[rice], %[range], %[rice] \n\t"
-+ "orr %[range], %[tmp2], %[ptr], lsr #9 \n\t"
-+#if CONFIG_THUMB
-+ "lsr %[remain], %[prefix] \n\t"
-+ "add %[remain], %[rice] \n\t"
-+#else
-+ "add %[remain], %[rice], %[remain], lsr %[prefix] \n\t"
-+#endif
-+ "b 4f \n\t"
-+ "2: \n\t"
-+ "add %[n1], %[tmp2], %[prefix] \n\t"
-+#if CONFIG_THUMB
-+ "add %[tmp2], %[ptr], %[n1], lsr #3 \n\t"
-+ "ldr %[tmp2], [%[tmp2]] \n\t"
-+#else
-+ "ldr %[tmp2], [%[ptr], %[n1], lsr #3] \n\t"
-+#endif
-+ "rsb %[tmp1], %[prefix], #32 \n\t"
-+ "push {%[rice]} \n\t"
-+ "and %[rice], %[n1], #7 \n\t"
-+ "lsr %[tmp1], %[remain], %[tmp1] \n\t"
-+ "ldr %[ptr], [%[cc], %[low_off]] \n\t"
-+ "mul %[remain], %[range], %[tmp1] \n\t"
-+ "rev %[tmp2], %[tmp2] \n\t"
-+ "rsb %[n2], %[prefix], %[n2] \n\t"
-+ "ldr %[tmp1], [%[cc], %[range_off]] \n\t"
-+ "lsl %[rice], %[tmp2], %[rice] \n\t"
-+ "sub %[tmp2], %[n2], #2 \n\t"
-+ "lsl %[remain], %[remain], #23 \n\t"
-+#if CONFIG_THUMB
-+ "lsl %[ptr], %[prefix] \n\t"
-+ "rsb %[remain], %[ptr] \n\t"
-+#else
-+ "rsb %[remain], %[remain], %[ptr], lsl %[prefix] \n\t"
-+#endif
-+ "orr %[remain], %[remain], %[rice], lsr #9 \n\t"
-+ "add %[prefix], %[n1], %[tmp2] \n\t"
-+ "bic %[n1], %[remain], #1 \n\t"
-+ "ldr %[ptr], [%[cc], %[ptr_off]] \n\t"
-+ "cmp %[tmp1], #0 \n\t"
-+ "rsb %[rice], %[tmp2], #32 \n\t"
-+ "it ne \n\t"
-+ "umullne %[tmp1], %[n1], %[tmp1], %[n1] \n\t"
-+ "and %[tmp1], %[prefix], #7 \n\t"
-+#if CONFIG_THUMB
-+ "add %[ptr], %[ptr], %[prefix], lsr #3 \n\t"
-+ "ldr %[ptr], [%[ptr]] \n\t"
-+#else
-+ "ldr %[ptr], [%[ptr], %[prefix], lsr #3] \n\t"
-+#endif
-+ "lsl %[n1], %[n1], #1 \n\t"
-+ "lsr %[rice], %[n1], %[rice] \n\t"
-+ "rsb %[n2], %[n2], #34 \n\t"
-+ "mul %[range], %[range], %[rice] \n\t"
-+ "pop {%[rice]} \n\t"
-+ "rev %[ptr], %[ptr] \n\t"
-+ "orr %[n1], %[n1], #0x80000000 \n\t"
-+ "strh %[prefix], [%[cc], %[by22_bits_off]] \n\t"
-+ "mov %[prefix], #2 \n\t"
-+ "lsl %[range], %[range], #23 \n\t"
-+#if CONFIG_THUMB
-+ "lsl %[remain], %[tmp2] \n\t"
-+ "rsb %[range], %[remain] \n\t"
-+#else
-+ "rsb %[range], %[range], %[remain], lsl %[tmp2] \n\t"
-+#endif
-+ "lsl %[remain], %[prefix], %[rice] \n\t"
-+#if CONFIG_THUMB
-+ "lsr %[n1], %[n2] \n\t"
-+ "add %[remain], %[n1] \n\t"
-+#else
-+ "add %[remain], %[remain], %[n1], lsr %[n2] \n\t"
-+#endif
-+ "3: \n\t"
-+ "lsl %[ptr], %[ptr], %[tmp1] \n\t"
-+ "orr %[range], %[range], %[ptr], lsr #9 \n\t"
-+ "4: \n\t"
-+ "str %[range], [%[cc], %[low_off]] \n\t"
-+ : // Outputs
-+ [remain]"=&r"(last_coeff_abs_level_remaining),
-+ [rice]"+r"(rice_param),
-+ [prefix]"=&r"(prefix),
-+ [n1]"=&r"(n1),
-+ [range]"=&r"(range),
-+ [n2]"=&r"(n2),
-+ [ptr]"=&r"(ptr),
-+ [tmp1]"=&r"(tmp1),
-+ [tmp2]"=&r"(tmp2)
-+ : // Inputs
-+ [cc]"r"(c),
-+ [peek_bits_plus_2]"I"(CABAC_BY22_PEEK_BITS + 2),
-+ [low_off]"J"(offsetof(CABACContext, low)),
-+ [range_off]"J"(offsetof(CABACContext, range)),
-+ [by22_bits_off]"J"(offsetof(CABACContext, by22.bits)),
-+ [by22_range_off]"J"(offsetof(CABACContext, by22.range)),
-+ [ptr_off]"J"(offsetof(CABACContext, bytestream))
-+ : // Clobbers
-+ "cc", "memory"
-+ );
-+ return last_coeff_abs_level_remaining;
-+}
-+
-+#endif /* HAVE_ARMV6T2_INLINE */
-+
-+#endif /* AVCODEC_ARM_HEVC_CABAC_H */
-diff --git a/libavcodec/arm/rpi_hevc_idct_fn_neon.S b/libavcodec/arm/rpi_hevc_idct_fn_neon.S
-new file mode 100644
-index 0000000000..0211e447a8
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevc_idct_fn_neon.S
-@@ -0,0 +1,161 @@
-+@ Included multiple times from hevc_idct_neon.S
-+@ Macros defined there
-+
-+#define DC_SHIFT (15 - BIT_DEPTH)
-+#define DC_ADD (1 | (1 << (14 - BIT_DEPTH)))
-+#define TRN_SHIFT (20 - BIT_DEPTH)
-+
-+function JOIN(ff_hevc_rpi_idct_4x4_dc_neon_, BIT_DEPTH), export=1
-+ ldrsh r1, [r0]
-+ add r1, #DC_ADD
-+ asr r1, #DC_SHIFT
-+ vdup.16 q0, r1
-+ vdup.16 q1, r1
-+ vst1.16 {q0, q1}, [r0]
-+ bx lr
-+endfunc
-+
-+function JOIN(ff_hevc_rpi_idct_8x8_dc_neon_, BIT_DEPTH), export=1
-+ ldrsh r1, [r0]
-+ add r2, r0, #32
-+ mov r3, #64
-+ add r1, #DC_ADD
-+ asr r1, #DC_SHIFT
-+ vdup.16 q8, r1
-+ vdup.16 q9, r1
-+ vst1.16 {q8, q9}, [r0], r3
-+ vst1.16 {q8, q9}, [r2], r3
-+ vst1.16 {q8, q9}, [r0]
-+ vst1.16 {q8, q9}, [r2]
-+ bx lr
-+endfunc
-+
-+function JOIN(ff_hevc_rpi_idct_16x16_dc_neon_, BIT_DEPTH), export=1
-+ ldrsh r1, [r0]
-+ add r2, r0, #32
-+ mov r3, #64
-+ add r1, #DC_ADD
-+ mov ip, #16*16
-+ asr r1, #DC_SHIFT
-+ vdup.16 q8, r1
-+ vdup.16 q9, r1
-+1: vst1.16 {q8, q9}, [r0], r3
-+ subs ip, ip, #32
-+ vst1.16 {q8, q9}, [r2], r3
-+ bhi 1b
-+ bx lr
-+endfunc
-+
-+function JOIN(ff_hevc_rpi_idct_32x32_dc_neon_, BIT_DEPTH), export=1
-+ ldrsh r1, [r0]
-+ add r2, r0, #32
-+ mov r3, #64
-+ add r1, #DC_ADD
-+ mov ip, #32*32
-+ asr r1, #DC_SHIFT
-+ vdup.16 q8, r1
-+ vdup.16 q9, r1
-+1: vst1.16 {q8, q9}, [r0], r3
-+ subs ip, ip, #32
-+ vst1.16 {q8, q9}, [r2], r3
-+ bhi 1b
-+ bx lr
-+endfunc
-+
-+
-+function JOIN(ff_hevc_rpi_transform_4x4_neon_, BIT_DEPTH), export=1
-+ vldr.i32 s0, =0x00240053 // 36 and 83
-+ vld1.16 {q14, q15}, [r0 :256] // coeffs
-+
-+ tr4_shift #7
-+
-+ vzip.16 d28, d29
-+ vzip.16 d30, d31
-+ vzip.32 q14, q15
-+
-+ tr4_shift #TRN_SHIFT
-+
-+ vst4.16 {q14, q15}, [r0 :256]
-+ bx lr
-+
-+ .ltorg
-+endfunc
-+
-+
-+
-+function JOIN(ff_hevc_rpi_transform_luma_4x4_neon_, BIT_DEPTH), export=1
-+ vmov.i32 d0, #0x4a // 74
-+ vld1.16 {q14, q15}, [r0 :256] // coeffs
-+ vmov.i32 d1, #0x1d // 29
-+ vmov.i32 d2, #0x37 // 55
-+
-+ tr4_luma_shift #7
-+
-+ vzip.16 d28, d29
-+ vzip.16 d30, d31
-+ vzip.32 q14, q15
-+
-+ tr4_luma_shift #TRN_SHIFT
-+
-+ vst4.16 {q14, q15}, [r0 :256]
-+ bx lr
-+endfunc
-+
-+function JOIN(ff_hevc_rpi_transform_8x8_neon_, BIT_DEPTH), export=1
-+ add r2, r0, #16
-+ adr r3, tr4f
-+ vpush {d8-d15}
-+ vld1.16 {d0, d1}, [r3]
-+ mov r3, #32
-+
-+ tr8_vert d16, d17, d18, d19, d24, d25, d26, d27, q8, q9, \
-+ "sub r0, r0, #128-8", \
-+ "sub r2, r2, #128-8", \
-+ "cmp r1, #4"
-+ ble 2f
-+
-+ tr8_vert d20, d21, d22, d23, d28, d29, d30, d31, q10, q11, \
-+ "sub r0, r0, #128+8", \
-+ "sub r2, r2, #128+8+16-32", \
-+ "mov r3, #64"
-+
-+ vzip.16 d16, d17
-+ vzip.16 d18, d19
-+
-+ vzip.16 d20, d21
-+ vzip.16 d22, d23
-+ vzip.16 d28, d29
-+ vzip.16 d30, d31
-+ vzip.32 q10, q11
-+ vzip.32 q14, q15
-+1:
-+ vzip.16 d24, d25
-+ vzip.16 d26, d27
-+ vzip.32 q8, q9
-+ vzip.32 q12, q13
-+
-+ tr8_horiz d16, d17, d18, d19, d20, d21, d22, d23, q8, q9, TRN_SHIFT
-+ tr8_horiz d24, d25, d26, d27, d28, d29, d30, d31, q12, q13, TRN_SHIFT
-+
-+ vpop {d8-d15}
-+ bx lr
-+
-+2: vmov.i64 q10, #0
-+ sub r0, r0, #8
-+ vmov.i64 q11, #0
-+ sub r2, r2, #8+16-32
-+ vmov.i64 q14, #0
-+ mov r3, #64
-+ vmov.i64 q15, #0
-+
-+ vzip.16 d16, d17
-+ vzip.16 d18, d19
-+
-+ b 1b
-+
-+endfunc
-+
-+#undef DC_SHIFT
-+#undef DC_ADD
-+#undef TRN_SHIFT
-+
-diff --git a/libavcodec/arm/rpi_hevc_misc_neon.S b/libavcodec/arm/rpi_hevc_misc_neon.S
-new file mode 100644
-index 0000000000..200eac416e
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevc_misc_neon.S
-@@ -0,0 +1,238 @@
-+#include "libavutil/arm/asm.S"
-+#include "neon.S"
-+
-+@ rpi_zap_coeff_vals_neon(
-+@ uint16_t * buf, [r0]
-+@ unsigned int log_n_m2) [r1]
-+
-+function rpi_zap_coeff_vals_neon, export=1
-+ mov ip, #1
-+ vmov.i64 q0, #0
-+ teq r1, #0
-+ vmov.i64 q1, #0
-+ beq 2f
-+
-+ lsl ip, r1 @ 2, 4 or 8
-+ add r2, r0, #32
-+ lsl ip, r1 @ 4, 16 or 64 = number of 32-byte blocks to zero
-+ mov r3, #64
-+1: vst1.8 {q0,q1}, [r0:256], r3
-+ subs ip, #2
-+ vst1.8 {q0,q1}, [r2:256], r3
-+ bne 1b
-+ bx lr
-+
-+2: vst1.8 {q0,q1}, [r0:256]
-+ bx lr
-+endfunc
-+
-+@ PIC jump tables are more expensive than absolute for A32 code
-+.set jent_pic, CONFIG_PIC || CONFIG_THUMB
-+
-+@ Jump table entry - if in neon mode the bottom bit must be set
-+@ ? There is probably a real asm instruction to do this but I haven't found it
-+.macro jent lab
-+.if jent_pic
-+T .short ((0 + \lab) - (0 + 98b)) / 2
-+A .short (0 + \lab) - (4 + 98b)
-+.else
-+T .word 1 + \lab
-+A .word \lab
-+.endif
-+.endm
-+
-+.set expected_next, 0
-+
-+.macro cpy_compound val, p1, p2, drop_thru=0
-+.if \p1 + \p2 != \val
-+.error "Bad addition! \p1 + \p2 != \val"
-+.endif
-+.if expected_next != 0 && expected_next != \val
-+.error "Drop thru failure"
-+.endif
-+\val\():
-+ push {r0-r3}
-+ bl 100\p1\()b
-+ pop {r0-r3}
-+ add r0, #\p1
-+ add r2, #\p1
-+.if \drop_thru == 0
-+ b \p2\()b
-+.set expected_next, 0
-+.else
-+.set expected_next, \p2
-+.endif
-+.endm
-+
-+@ ff_hevc_rpi_cpy_blks8x4_neon(
-+@ dst [r0]
-+@ dst_stride [r1]
-+@ src [r2]
-+@ src_stride [r3]
-+@ width [sp, #0] (bytes)
-+@ height) [sp, #4]
-+@
-+@ Power of 2 widths are directly coded, all others are done in stripes
-+@ We expect the vast majority of calls to be power of 2
-+@
-+@ Currently has min width of 8, but we could make that 4 without issue
-+@ Min height is 4
-+
-+function ff_hevc_rpi_cpy_blks8x4_neon, export=1
-+ ldr r12, [sp, #0]
-+ push {r11, lr}
-+.if jent_pic
-+A adr lr, 98f - 2
-+.else
-+A adr lr, 98f - 4
-+.endif
-+ lsr r12, #3
-+ ldr r11, [sp, #(8 + 4)]
-+.if jent_pic
-+A lsl r12, #1
-+A ldrsh lr, [lr, r12]
-+A add pc, lr
-+T tbh [pc, r12, lsl #1]
-+.else
-+ @ A32 only, Thumb is always PIC
-+ ldr pc, [lr, r12, lsl #2]
-+.endif
-+
-+98:
-+T .short 0 @ unused
-+ jent 8f
-+ jent 16f
-+ jent 24f
-+ jent 32f
-+ jent 40f
-+ jent 48f
-+ jent 56f
-+ jent 64f
-+ jent 72f
-+ jent 80f
-+ jent 88f
-+ jent 96f
-+ jent 104f
-+ jent 112f
-+ jent 120f
-+ jent 128f
-+
-+1008:
-+ push {r11, lr}
-+8:
-+ add lr, r2, r3
-+ lsl r3, #1
-+ add r12, r0, r1
-+ lsl r1, #1
-+1:
-+ vld1.32 {d0 }, [r2], r3
-+ vld1.32 {d1 }, [lr], r3
-+ vld1.32 {d2 }, [r2], r3
-+ vld1.32 {d3 }, [lr], r3
-+ subs r11, #4
-+ vst1.32 {d0 }, [r0], r1
-+ vst1.32 {d1 }, [r12], r1
-+ vst1.32 {d2 }, [r0], r1
-+ vst1.32 {d3 }, [r12], r1
-+ bgt 1b
-+ pop {r11, pc}
-+
-+10016:
-+ push {r11, lr}
-+16:
-+ add lr, r2, r3
-+ lsl r3, #1
-+ add r12, r0, r1
-+ lsl r1, #1
-+1:
-+ vld1.32 {q0 }, [r2], r3
-+ vld1.32 {q1 }, [lr], r3
-+ vld1.32 {q2 }, [r2], r3
-+ vld1.32 {q3 }, [lr], r3
-+ subs r11, #4
-+ vst1.32 {q0 }, [r0], r1
-+ vst1.32 {q1 }, [r12], r1
-+ vst1.32 {q2 }, [r0], r1
-+ vst1.32 {q3 }, [r12], r1
-+ bgt 1b
-+ pop {r11, pc}
-+
-+10032:
-+ push {r11, lr}
-+32:
-+ add lr, r2, r3
-+ lsl r3, #1
-+ add r12, r0, r1
-+ lsl r1, #1
-+1:
-+ vld1.32 {q8, q9 }, [r2], r3
-+ vld1.32 {q10, q11}, [lr], r3
-+ vld1.32 {q12, q13}, [r2], r3
-+ vld1.32 {q14, q15}, [lr], r3
-+ subs r11, #4
-+ vst1.32 {q8, q9 }, [r0], r1
-+ vst1.32 {q10, q11}, [r12], r1
-+ vst1.32 {q12, q13}, [r0], r1
-+ vst1.32 {q14, q15}, [r12], r1
-+ bgt 1b
-+ pop {r11, pc}
-+
-+10064:
-+ push {r11, lr}
-+64:
-+ add lr, r2, #32
-+ add r12, r0, #32
-+1:
-+ vld1.32 {q8, q9 }, [r2], r3
-+ vld1.32 {q10, q11}, [lr], r3
-+ vld1.32 {q12, q13}, [r2], r3
-+ vld1.32 {q14, q15}, [lr], r3
-+ subs r11, #2
-+ vst1.32 {q8, q9 }, [r0], r1
-+ vst1.32 {q10, q11}, [r12], r1
-+ vst1.32 {q12, q13}, [r0], r1
-+ vst1.32 {q14, q15}, [r12], r1
-+ bgt 1b
-+ pop {r11, pc}
-+
-+128:
-+ push {r4, r5}
-+ @ We could do this with fewer registers if we jump around but I
-+ @ have a primitive urge to load sequentially
-+ mov r4, #64
-+ add lr, r2, #32
-+ add r12, r0, #32
-+ sub r3, r4
-+ sub r1, r4
-+1:
-+ vld1.32 {q8, q9 }, [r2], r4
-+ vld1.32 {q10, q11}, [lr], r4
-+ vld1.32 {q12, q13}, [r2], r3
-+ vld1.32 {q14, q15}, [lr], r3
-+ subs r11, #1
-+ vst1.32 {q8, q9 }, [r0], r4
-+ vst1.32 {q10, q11}, [r12], r4
-+ vst1.32 {q12, q13}, [r0], r1
-+ vst1.32 {q14, q15}, [r12], r1
-+ bgt 1b
-+ pop {r4, r5, r11, pc}
-+
-+@ Use drop_thru where we can
-+cpy_compound 104, 64, 40, 1
-+cpy_compound 40, 32, 8
-+
-+cpy_compound 112, 64, 48, 1
-+cpy_compound 48, 32, 16
-+
-+cpy_compound 120, 64, 56, 1
-+cpy_compound 56, 32, 24, 1
-+cpy_compound 24, 16, 8
-+
-+cpy_compound 72, 64, 8
-+cpy_compound 80, 64, 16
-+cpy_compound 88, 64, 24
-+cpy_compound 96, 64, 32
-+
-+
-+endfunc
-+
-diff --git a/libavcodec/arm/rpi_hevc_misc_neon.h b/libavcodec/arm/rpi_hevc_misc_neon.h
-new file mode 100644
-index 0000000000..9d21f6a882
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevc_misc_neon.h
-@@ -0,0 +1,438 @@
-+/*
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#ifndef AVCODEC_ARM_RPI_HEVC_MISC_H
-+#define AVCODEC_ARM_RPI_HEVC_MISC_H
-+
-+#include "config.h"
-+#if HAVE_NEON_INLINE && !CONFIG_THUMB
-+
-+static av_noinline void ff_hevc_rpi_copy_vert_v2h_neon(uint8_t *dst, const uint8_t *src,
-+ int pixel_shift, int height,
-+ ptrdiff_t stride_src)
-+{
-+ const uint8_t *src2 = src + stride_src;
-+ stride_src <<= 1;
-+ switch (pixel_shift)
-+ {
-+ case 2:
-+ __asm__ volatile (
-+ "vld1.32 {d0[0]}, [%[src]], %[stride_src] \n\t"
-+ "vld1.32 {d0[1]}, [%[src2]], %[stride_src] \n\t"
-+ "vld1.32 {d1[0]}, [%[src]], %[stride_src] \n\t"
-+ "subs %[height], #4 \n\t"
-+ "vld1.32 {d1[1]}, [%[src2]], %[stride_src] \n\t"
-+ "beq 2f \n\t"
-+ "1: \n\t"
-+ "vld1.32 {d2[0]}, [%[src]], %[stride_src] \n\t"
-+ "vld1.32 {d2[1]}, [%[src2]], %[stride_src] \n\t"
-+ "vld1.32 {d3[0]}, [%[src]], %[stride_src] \n\t"
-+ "vld1.32 {d3[1]}, [%[src2]], %[stride_src] \n\t"
-+ "subs %[height], #4 \n\t"
-+ "vst1.32 {q0}, [%[dst]]! \n\t"
-+ "beq 3f \n\t"
-+ "vld1.32 {d0[0]}, [%[src]], %[stride_src] \n\t"
-+ "vld1.32 {d0[1]}, [%[src2]], %[stride_src] \n\t"
-+ "vld1.32 {d1[0]}, [%[src]], %[stride_src] \n\t"
-+ "vld1.32 {d1[1]}, [%[src2]], %[stride_src] \n\t"
-+ "subs %[height], #4 \n\t"
-+ "vst1.32 {q1}, [%[dst]]! \n\t"
-+ "bne 1b \n\t"
-+ "2: \n\t"
-+ "vst1.32 {q0}, [%[dst]] \n\t"
-+ "b 4f \n\t"
-+ "3: \n\t"
-+ "vst1.32 {q1}, [%[dst]] \n\t"
-+ "4: \n\t"
-+ : // Outputs
-+ [src]"+r"(src),
-+ [src2]"+r"(src2),
-+ [dst]"+r"(dst),
-+ [height]"+r"(height)
-+ : // Inputs
-+ [stride_src]"r"(stride_src)
-+ : // Clobbers
-+ "cc", "memory"
-+ );
-+ break;
-+ case 1:
-+ __asm__ volatile (
-+ "vld1.16 {d0[0]}, [%[src]], %[stride_src] \n\t"
-+ "vld1.16 {d1[0]}, [%[src2]], %[stride_src] \n\t"
-+ "vld1.16 {d0[1]}, [%[src]], %[stride_src] \n\t"
-+ "subs %[height], #4 \n\t"
-+ "vld1.16 {d1[1]}, [%[src2]], %[stride_src] \n\t"
-+ "beq 2f \n\t"
-+ "1: \n\t"
-+ "vld1.16 {d2[0]}, [%[src]], %[stride_src] \n\t"
-+ "vld1.16 {d3[0]}, [%[src2]], %[stride_src] \n\t"
-+ "vld1.16 {d2[1]}, [%[src]], %[stride_src] \n\t"
-+ "vld1.16 {d3[1]}, [%[src2]], %[stride_src] \n\t"
-+ "vzip.16 d0, d1 \n\t"
-+ "subs %[height], #4 \n\t"
-+ "vst1.16 {d0}, [%[dst]]! \n\t"
-+ "beq 3f \n\t"
-+ "vld1.16 {d0[0]}, [%[src]], %[stride_src] \n\t"
-+ "vld1.16 {d1[0]}, [%[src2]], %[stride_src] \n\t"
-+ "vld1.16 {d0[1]}, [%[src]], %[stride_src] \n\t"
-+ "vld1.16 {d1[1]}, [%[src2]], %[stride_src] \n\t"
-+ "vzip.16 d2, d3 \n\t"
-+ "subs %[height], #4 \n\t"
-+ "vst1.16 {d2}, [%[dst]]! \n\t"
-+ "bne 1b \n\t"
-+ "2: \n\t"
-+ "vzip.16 d0, d1 \n\t"
-+ "vst1.16 {d0}, [%[dst]] \n\t"
-+ "b 4f \n\t"
-+ "3: \n\t"
-+ "vzip.16 d2, d3 \n\t"
-+ "vst1.16 {d2}, [%[dst]] \n\t"
-+ "4: \n\t"
-+ : // Outputs
-+ [src]"+r"(src),
-+ [src2]"+r"(src2),
-+ [dst]"+r"(dst),
-+ [height]"+r"(height)
-+ : // Inputs
-+ [stride_src]"r"(stride_src)
-+ : // Clobbers
-+ "cc", "memory"
-+ );
-+ break;
-+ default:
-+ __asm__ volatile (
-+ "vld1.8 {d0[0]}, [%[src]], %[stride_src] \n\t"
-+ "vld1.8 {d1[0]}, [%[src2]], %[stride_src] \n\t"
-+ "vld1.8 {d0[1]}, [%[src]], %[stride_src] \n\t"
-+ "vld1.8 {d1[1]}, [%[src2]], %[stride_src] \n\t"
-+ "vld1.8 {d0[2]}, [%[src]], %[stride_src] \n\t"
-+ "vld1.8 {d1[2]}, [%[src2]], %[stride_src] \n\t"
-+ "vld1.8 {d0[3]}, [%[src]], %[stride_src] \n\t"
-+ "subs %[height], #8 \n\t"
-+ "vld1.8 {d1[3]}, [%[src2]], %[stride_src] \n\t"
-+ "beq 2f \n\t"
-+ "1: \n\t"
-+ "vld1.8 {d2[0]}, [%[src]], %[stride_src] \n\t"
-+ "vld1.8 {d3[0]}, [%[src2]], %[stride_src] \n\t"
-+ "vld1.8 {d2[1]}, [%[src]], %[stride_src] \n\t"
-+ "vld1.8 {d3[1]}, [%[src2]], %[stride_src] \n\t"
-+ "vld1.8 {d2[2]}, [%[src]], %[stride_src] \n\t"
-+ "vld1.8 {d3[2]}, [%[src2]], %[stride_src] \n\t"
-+ "vld1.8 {d2[3]}, [%[src]], %[stride_src] \n\t"
-+ "vld1.8 {d3[3]}, [%[src2]], %[stride_src] \n\t"
-+ "vzip.8 d0, d1 \n\t"
-+ "subs %[height], #8 \n\t"
-+ "vst1.8 {d0}, [%[dst]]! \n\t"
-+ "beq 3f \n\t"
-+ "vld1.8 {d0[0]}, [%[src]], %[stride_src] \n\t"
-+ "vld1.8 {d1[0]}, [%[src2]], %[stride_src] \n\t"
-+ "vld1.8 {d0[1]}, [%[src]], %[stride_src] \n\t"
-+ "vld1.8 {d1[1]}, [%[src2]], %[stride_src] \n\t"
-+ "vld1.8 {d0[2]}, [%[src]], %[stride_src] \n\t"
-+ "vld1.8 {d1[2]}, [%[src2]], %[stride_src] \n\t"
-+ "vld1.8 {d0[3]}, [%[src]], %[stride_src] \n\t"
-+ "vld1.8 {d1[3]}, [%[src2]], %[stride_src] \n\t"
-+ "vzip.8 d2, d3 \n\t"
-+ "subs %[height], #8 \n\t"
-+ "vst1.8 {d2}, [%[dst]]! \n\t"
-+ "bne 1b \n\t"
-+ "2: \n\t"
-+ "vzip.8 d0, d1 \n\t"
-+ "vst1.8 {d0}, [%[dst]] \n\t"
-+ "b 4f \n\t"
-+ "3: \n\t"
-+ "vzip.8 d2, d3 \n\t"
-+ "vst1.8 {d2}, [%[dst]] \n\t"
-+ "4: \n\t"
-+ : // Outputs
-+ [src]"+r"(src),
-+ [src2]"+r"(src2),
-+ [dst]"+r"(dst),
-+ [height]"+r"(height)
-+ : // Inputs
-+ [stride_src]"r"(stride_src)
-+ : // Clobbers
-+ "cc", "memory"
-+ );
-+ break;
-+ }
-+}
-+
-+static av_noinline void ff_hevc_rpi_copy_vert_h2v_neon(uint8_t *dst, const uint8_t *src,
-+ int pixel_shift, int height,
-+ ptrdiff_t stride_dst)
-+{
-+ uint8_t *dst2 = dst + stride_dst;
-+ stride_dst <<= 1;
-+ switch (pixel_shift)
-+ {
-+ case 2:
-+ __asm__ volatile (
-+ "subs %[height], #4 \n\t"
-+ "vld1.32 {q0}, [%[src]]! \n\t"
-+ "beq 2f \n\t"
-+ "1: \n\t"
-+ "vld1.32 {q1}, [%[src]]! \n\t"
-+ "vst1.32 {d0[0]}, [%[dst]], %[stride_dst] \n\t"
-+ "vst1.32 {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
-+ "vst1.32 {d1[0]}, [%[dst]], %[stride_dst] \n\t"
-+ "subs %[height], #4 \n\t"
-+ "vst1.32 {d1[1]}, [%[dst2]], %[stride_dst] \n\t"
-+ "beq 3f \n\t"
-+ "vld1.32 {q0}, [%[src]]! \n\t"
-+ "vst1.32 {d2[0]}, [%[dst]], %[stride_dst] \n\t"
-+ "vst1.32 {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
-+ "vst1.32 {d3[0]}, [%[dst]], %[stride_dst] \n\t"
-+ "subs %[height], #4 \n\t"
-+ "vst1.32 {d3[1]}, [%[dst2]], %[stride_dst] \n\t"
-+ "bne 1b \n\t"
-+ "2: \n\t"
-+ "vst1.32 {d0[0]}, [%[dst]], %[stride_dst] \n\t"
-+ "vst1.32 {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
-+ "vst1.32 {d1[0]}, [%[dst]] \n\t"
-+ "vst1.32 {d1[1]}, [%[dst2]] \n\t"
-+ "b 4f \n\t"
-+ "3: \n\t"
-+ "vst1.32 {d2[0]}, [%[dst]], %[stride_dst] \n\t"
-+ "vst1.32 {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
-+ "vst1.32 {d3[0]}, [%[dst]] \n\t"
-+ "vst1.32 {d3[1]}, [%[dst2]] \n\t"
-+ "4: \n\t"
-+ : // Outputs
-+ [dst]"+r"(dst),
-+ [dst2]"+r"(dst2),
-+ [src]"+r"(src),
-+ [height]"+r"(height)
-+ : // Inputs
-+ [stride_dst]"r"(stride_dst)
-+ : // Clobbers
-+ "cc", "memory"
-+ );
-+ break;
-+ case 1:
-+ __asm__ volatile (
-+ "subs %[height], #4 \n\t"
-+ "vld1.16 {d0}, [%[src]]! \n\t"
-+ "beq 2f \n\t"
-+ "1: \n\t"
-+ "vld1.16 {d2}, [%[src]]! \n\t"
-+ "vst1.16 {d0[0]}, [%[dst]], %[stride_dst] \n\t"
-+ "vst1.16 {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
-+ "vst1.16 {d0[2]}, [%[dst]], %[stride_dst] \n\t"
-+ "subs %[height], #4 \n\t"
-+ "vst1.16 {d0[3]}, [%[dst2]], %[stride_dst] \n\t"
-+ "beq 3f \n\t"
-+ "vld1.16 {d0}, [%[src]]! \n\t"
-+ "vst1.16 {d2[0]}, [%[dst]], %[stride_dst] \n\t"
-+ "vst1.16 {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
-+ "vst1.16 {d2[2]}, [%[dst]], %[stride_dst] \n\t"
-+ "subs %[height], #4 \n\t"
-+ "vst1.16 {d2[3]}, [%[dst2]], %[stride_dst] \n\t"
-+ "bne 1b \n\t"
-+ "2: \n\t"
-+ "vst1.16 {d0[0]}, [%[dst]], %[stride_dst] \n\t"
-+ "vst1.16 {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
-+ "vst1.16 {d0[2]}, [%[dst]] \n\t"
-+ "vst1.16 {d0[3]}, [%[dst2]] \n\t"
-+ "b 4f \n\t"
-+ "3: \n\t"
-+ "vst1.16 {d2[0]}, [%[dst]], %[stride_dst] \n\t"
-+ "vst1.16 {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
-+ "vst1.16 {d2[2]}, [%[dst]] \n\t"
-+ "vst1.16 {d2[3]}, [%[dst2]] \n\t"
-+ "4: \n\t"
-+ : // Outputs
-+ [dst]"+r"(dst),
-+ [dst2]"+r"(dst2),
-+ [src]"+r"(src),
-+ [height]"+r"(height)
-+ : // Inputs
-+ [stride_dst]"r"(stride_dst)
-+ : // Clobbers
-+ "cc", "memory"
-+ );
-+ break;
-+ default:
-+ __asm__ volatile (
-+ "subs %[height], #8 \n\t"
-+ "vld1.8 {d0}, [%[src]]! \n\t"
-+ "beq 2f \n\t"
-+ "1: \n\t"
-+ "vld1.8 {d2}, [%[src]]! \n\t"
-+ "vst1.8 {d0[0]}, [%[dst]], %[stride_dst] \n\t"
-+ "vst1.8 {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
-+ "vst1.8 {d0[2]}, [%[dst]], %[stride_dst] \n\t"
-+ "vst1.8 {d0[3]}, [%[dst2]], %[stride_dst] \n\t"
-+ "vst1.8 {d0[4]}, [%[dst]], %[stride_dst] \n\t"
-+ "vst1.8 {d0[5]}, [%[dst2]], %[stride_dst] \n\t"
-+ "vst1.8 {d0[6]}, [%[dst]], %[stride_dst] \n\t"
-+ "subs %[height], #8 \n\t"
-+ "vst1.8 {d0[7]}, [%[dst2]], %[stride_dst] \n\t"
-+ "beq 3f \n\t"
-+ "vld1.8 {d0}, [%[src]]! \n\t"
-+ "vst1.8 {d2[0]}, [%[dst]], %[stride_dst] \n\t"
-+ "vst1.8 {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
-+ "vst1.8 {d2[2]}, [%[dst]], %[stride_dst] \n\t"
-+ "vst1.8 {d2[3]}, [%[dst2]], %[stride_dst] \n\t"
-+ "vst1.8 {d2[4]}, [%[dst]], %[stride_dst] \n\t"
-+ "vst1.8 {d2[5]}, [%[dst2]], %[stride_dst] \n\t"
-+ "vst1.8 {d2[6]}, [%[dst]], %[stride_dst] \n\t"
-+ "subs %[height], #8 \n\t"
-+ "vst1.8 {d2[7]}, [%[dst2]], %[stride_dst] \n\t"
-+ "bne 1b \n\t"
-+ "2: \n\t"
-+ "vst1.8 {d0[0]}, [%[dst]], %[stride_dst] \n\t"
-+ "vst1.8 {d0[1]}, [%[dst2]], %[stride_dst] \n\t"
-+ "vst1.8 {d0[2]}, [%[dst]], %[stride_dst] \n\t"
-+ "vst1.8 {d0[3]}, [%[dst2]], %[stride_dst] \n\t"
-+ "vst1.8 {d0[4]}, [%[dst]], %[stride_dst] \n\t"
-+ "vst1.8 {d0[5]}, [%[dst2]], %[stride_dst] \n\t"
-+ "vst1.8 {d0[6]}, [%[dst]] \n\t"
-+ "vst1.8 {d0[7]}, [%[dst2]] \n\t"
-+ "b 4f \n\t"
-+ "3: \n\t"
-+ "vst1.8 {d2[0]}, [%[dst]], %[stride_dst] \n\t"
-+ "vst1.8 {d2[1]}, [%[dst2]], %[stride_dst] \n\t"
-+ "vst1.8 {d2[2]}, [%[dst]], %[stride_dst] \n\t"
-+ "vst1.8 {d2[3]}, [%[dst2]], %[stride_dst] \n\t"
-+ "vst1.8 {d2[4]}, [%[dst]], %[stride_dst] \n\t"
-+ "vst1.8 {d2[5]}, [%[dst2]], %[stride_dst] \n\t"
-+ "vst1.8 {d2[6]}, [%[dst]] \n\t"
-+ "vst1.8 {d2[7]}, [%[dst2]] \n\t"
-+ "4: \n\t"
-+ : // Outputs
-+ [dst]"+r"(dst),
-+ [dst2]"+r"(dst2),
-+ [src]"+r"(src),
-+ [height]"+r"(height)
-+ : // Inputs
-+ [stride_dst]"r"(stride_dst)
-+ : // Clobbers
-+ "cc", "memory"
-+ );
-+ break;
-+ }
-+}
-+
-+static av_noinline void ff_hevc_rpi_copy_vert_v2v_neon(uint8_t *dst, const uint8_t *src,
-+ int pixel_shift, int height,
-+ ptrdiff_t stride_dst, ptrdiff_t stride_src)
-+{
-+ int x, y;
-+ switch (pixel_shift)
-+ {
-+ case 2:
-+ __asm__ volatile (
-+ "ldr %[x], [%[src]], %[stride_src] \n\t"
-+ "ldr %[y], [%[src]], %[stride_src] \n\t"
-+ "str %[x], [%[dst]], %[stride_dst] \n\t"
-+ "sub %[height], #2 \n\t"
-+ "1: \n\t"
-+ "ldr %[x], [%[src]], %[stride_src] \n\t"
-+ "str %[y], [%[dst]], %[stride_dst] \n\t"
-+ "ldr %[y], [%[src]], %[stride_src] \n\t"
-+ "subs %[height], #2 \n\t"
-+ "str %[x], [%[dst]], %[stride_dst] \n\t"
-+ "bne 1b \n\t"
-+ "str %[y], [%[dst]] \n\t"
-+ : // Outputs
-+ [x]"=&r"(x),
-+ [y]"=&r"(y),
-+ [src]"+r"(src),
-+ [dst]"+r"(dst),
-+ [height]"+r"(height)
-+ : // Inputs
-+ [stride_src]"r"(stride_src),
-+ [stride_dst]"r"(stride_dst)
-+ : // Clobbers
-+ "cc", "memory"
-+ );
-+ break;
-+ case 1:
-+ __asm__ volatile (
-+ "ldrh %[x], [%[src]], %[stride_src] \n\t"
-+ "ldrh %[y], [%[src]], %[stride_src] \n\t"
-+ "strh %[x], [%[dst]], %[stride_dst] \n\t"
-+ "sub %[height], #2 \n\t"
-+ "1: \n\t"
-+ "ldrh %[x], [%[src]], %[stride_src] \n\t"
-+ "strh %[y], [%[dst]], %[stride_dst] \n\t"
-+ "ldrh %[y], [%[src]], %[stride_src] \n\t"
-+ "subs %[height], #2 \n\t"
-+ "strh %[x], [%[dst]], %[stride_dst] \n\t"
-+ "bne 1b \n\t"
-+ "strh %[y], [%[dst]] \n\t"
-+ : // Outputs
-+ [x]"=&r"(x),
-+ [y]"=&r"(y),
-+ [src]"+r"(src),
-+ [dst]"+r"(dst),
-+ [height]"+r"(height)
-+ : // Inputs
-+ [stride_src]"r"(stride_src),
-+ [stride_dst]"r"(stride_dst)
-+ : // Clobbers
-+ "cc", "memory"
-+ );
-+ break;
-+ default:
-+ __asm__ volatile (
-+ "ldrb %[x], [%[src]], %[stride_src] \n\t"
-+ "ldrb %[y], [%[src]], %[stride_src] \n\t"
-+ "strb %[x], [%[dst]], %[stride_dst] \n\t"
-+ "sub %[height], #2 \n\t"
-+ "1: \n\t"
-+ "ldrb %[x], [%[src]], %[stride_src] \n\t"
-+ "strb %[y], [%[dst]], %[stride_dst] \n\t"
-+ "ldrb %[y], [%[src]], %[stride_src] \n\t"
-+ "subs %[height], #2 \n\t"
-+ "strb %[x], [%[dst]], %[stride_dst] \n\t"
-+ "bne 1b \n\t"
-+ "strb %[y], [%[dst]] \n\t"
-+ : // Outputs
-+ [x]"=&r"(x),
-+ [y]"=&r"(y),
-+ [src]"+r"(src),
-+ [dst]"+r"(dst),
-+ [height]"+r"(height)
-+ : // Inputs
-+ [stride_src]"r"(stride_src),
-+ [stride_dst]"r"(stride_dst)
-+ : // Clobbers
-+ "cc", "memory"
-+ );
-+ break;
-+ }
-+}
-+
-+#define ff_hevc_rpi_copy_vert ff_hevc_rpi_copy_vert_neon
-+static inline void ff_hevc_rpi_copy_vert_neon(uint8_t *dst, const uint8_t *src,
-+ int pixel_shift, int height,
-+ ptrdiff_t stride_dst, ptrdiff_t stride_src)
-+{
-+ if (stride_dst == 1 << pixel_shift)
-+ ff_hevc_rpi_copy_vert_v2h_neon(dst, src, pixel_shift, height, stride_src);
-+ else if (stride_src == 1 << pixel_shift)
-+ ff_hevc_rpi_copy_vert_h2v_neon(dst, src, pixel_shift, height, stride_dst);
-+ else
-+ ff_hevc_rpi_copy_vert_v2v_neon(dst, src, pixel_shift, height, stride_dst, stride_src);
-+}
-+
-+#endif /* HAVE_NEON_INLINE && !CONFIG_THUMB */
-+
-+#endif /* AVCODEC_ARM_RPI_HEVC_MISC_H */
-diff --git a/libavcodec/arm/rpi_hevc_mv_arm.h b/libavcodec/arm/rpi_hevc_mv_arm.h
-new file mode 100644
-index 0000000000..c73de55a48
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevc_mv_arm.h
-@@ -0,0 +1,64 @@
-+#ifndef AVCODEC_ARM_RPI_HEVC_MV_H
-+#define AVCODEC_ARM_RPI_HEVC_MV_H
-+
-+#if HAVE_ARMV6T2_INLINE
-+static inline MvXY mvxy_add_arm(const MvXY a, const MvXY b)
-+{
-+ MvXY r;
-+ __asm__ (
-+ "sadd16 %[r], %[a], %[b] \n\t"
-+ : [r]"=r"(r)
-+ : [a]"r"(a),
-+ [b]"r"(b)
-+ :
-+ );
-+ return r;
-+}
-+#define mvxy_add mvxy_add_arm
-+#endif
-+
-+#if HAVE_ARMV6T2_INLINE
-+#if (defined(__ARM_ARCH_EXT_IDIV__) || defined (__ARM_FEATURE_IDIV))
-+static inline int32_t mv_scale_xy_arm(int32_t xy, int td, int tb)
-+{
-+ int t;
-+ __asm__ (
-+ "ssat %[td], #8, %[td] \n\t"
-+ "ssat %[tb], #8, %[tb] \n\t"
-+ "eor %[t], %[td], %[td], asr #31 \n\t"
-+ "adds %[t], %[t], %[td], lsr #31 \n\t"
-+ "asr %[t], #1 \n\t"
-+ "add %[t], #0x4000 \n\t"
-+ "it ne \n\t"
-+ "sdivne %[t], %[t], %[td] \n\t"
-+ "mov %[td], #32 \n\t"
-+ "smlabb %[td], %[t], %[tb], %[td] \n\t"
-+ "ssat %[td], #13, %[td], asr #6 \n\t"
-+ "mov %[tb], #127 \n\t"
-+ "smlatb %[t], %[xy], %[td], %[tb] \n\t"
-+ "smlabb %[tb], %[xy], %[td], %[tb] \n\t"
-+// This takes the sign of x & y for rounding at the "wrong" point
-+// (i.e. after adding 127) but for the range of values (-1,-127)
-+// where it does the wrong thing you get the right answer (0) anyway
-+ "add %[t], %[t], %[t], lsr #31 \n\t"
-+ "add %[xy], %[tb], %[tb], lsr #31 \n\t"
-+ "ssat %[t], #16, %[t], asr #8 \n\t"
-+ "ssat %[xy], #16, %[xy], asr #8 \n\t"
-+ "pkhbt %[xy], %[xy], %[t], lsl #16 \n\t"
-+ :
-+ [t]"=&r"(t),
-+ [xy]"+r"(xy),
-+ [td]"+r"(td),
-+ [tb]"+r"(tb)
-+ :
-+ :
-+ "cc"
-+ );
-+ return xy;
-+}
-+#define mv_scale_xy mv_scale_xy_arm
-+#endif
-+#endif
-+
-+#endif // AVCODEC_ARM_RPI_HEVC_MV_H
-+
-diff --git a/libavcodec/arm/rpi_hevcdsp_arm.h b/libavcodec/arm/rpi_hevcdsp_arm.h
-new file mode 100644
-index 0000000000..62b9326532
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevcdsp_arm.h
-@@ -0,0 +1,26 @@
-+/*
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#ifndef AVCODEC_ARM_HEVCDSP_ARM_H
-+#define AVCODEC_ARM_HEVCDSP_ARM_H
-+
-+#include "libavcodec/rpi_hevcdsp.h"
-+
-+void ff_hevcdsp_rpi_init_neon(HEVCDSPContext *c, const int bit_depth);
-+
-+#endif /* AVCODEC_ARM_HEVCDSP_ARM_H */
-diff --git a/libavcodec/arm/rpi_hevcdsp_deblock_neon.S b/libavcodec/arm/rpi_hevcdsp_deblock_neon.S
-new file mode 100644
-index 0000000000..18a76a4112
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevcdsp_deblock_neon.S
-@@ -0,0 +1,1633 @@
-+/*
-+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+
-+#include "libavutil/arm/asm.S"
-+#include "neon.S"
-+
-+.macro hevc_loop_filter_uv_body1 P1a, P0a, Q0a, Q1a, I1, I2, I3, I4, I5, I6, I7, I8
-+ vsubl.u8 q0, \Q0a, \P0a
-+ vsubl.u8 q1, \P1a, \Q1a
-+ vdup.16 d4, r2
-+ \I1
-+ vshl.i16 q0, #2
-+ \I2
-+ vadd.i16 q0, q1
-+ \I3
-+ vmovl.u8 q2, d4
-+ \I4
-+ vneg.s16 q1, q2
-+ \I5
-+ vrshr.s16 q0, #3
-+ \I6
-+ \I7
-+ \I8
-+ vmin.s16 q0, q2
-+ vmovl.u8 q2, \Q0a
-+ vmax.s16 q0, q1
-+ vaddw.u8 q1, q0, \P0a
-+ vsub.i16 q0, q2, q0
-+ vqmovun.s16 \P0a, q1
-+ vqmovun.s16 \Q0a, q0
-+.endm
-+
-+
-+.macro hevc_loop_filter_uv_body2 P1a, P1b, P0a, P0b, Q0a, Q0b, Q1a, Q1b, I1, I2, I3, I4, I5, I6, I7
-+ vsubl.u8 q0, \Q0a, \P0a @ q0a - p0a
-+ lsr r12, r2, #16
-+ vsubl.u8 q1, \Q0b, \P0b @ q0b - p0b
-+ vsubl.u8 q2, \P1a, \Q1a @ p1a - q1a
-+ vsubl.u8 q3, \P1b, \Q1b @ p1b - q1b
-+ vshl.i16 q0, #2 @ (q0a - p0a) * 4
-+ vshl.i16 q1, #2 @ (q0b - p0b) * 4
-+ vadd.i16 q0, q2 @ ((q0a - p0a) * 4) + p1a - q1a
-+ vadd.i16 q1, q3 @ ((q0b - p0b) * 4) + p1b - q1b
-+ vdup.16 d4, r2 @ tc0a, tc0b
-+ vdup.16 d6, r12 @ tc1a, tc1b
-+ vrshr.s16 q0, #3 @ (((q0a - p0a) * 4) + p1a - q1a + 4) >> 3
-+ \I1
-+ vrshr.s16 q1, #3 @ (((q0b - p0b) * 4) + p1b - q1b + 4) >> 3
-+ \I2
-+ vmovl.u8 q2, d4 @ tc0a, tc0b
-+ \I3
-+ vmovl.u8 q3, d6 @ tc1a, tc1b
-+ \I4
-+ vmin.s16 q0, q2
-+ \I5
-+ vneg.s16 q2, q2 @ -tc0a, -tc0b
-+ \I6
-+ vmin.s16 q1, q3
-+ \I7
-+ vneg.s16 q3, q3 @ -tc1a, -tc1b
-+ vmax.s16 q0, q2 @ delta0a
-+ vmovl.u8 q2, \Q0a
-+ vmax.s16 q1, q3 @ delta0b
-+ vaddw.u8 q3, q0, \P0a @ p0a + delta0a
-+ vsub.i16 q0, q2, q0 @ q0a - delta0a
-+ vmovl.u8 q2, \Q0b
-+ vsub.i16 q2, q1 @ q0b - delta0b
-+ vaddw.u8 q1, \P0b @ p0b + delta0b
-+ vqmovun.s16 \Q0a, q0
-+ vqmovun.s16 \P0a, q3
-+ vqmovun.s16 \Q0b, q2
-+ vqmovun.s16 \P0b, q1
-+.endm
-+
-+
-+@ Preserves r12
-+@ Clobbers r2
-+@ P0a et al all contain UVUVUVUV
-+@ r2 (tc4) contains
-+@ [0..7] tc U a
-+@ [8..15] tc V a
-+
-+.macro hevc_loop_filter_uv_body1_16 P1a, P0a, Q0a, Q1a, bit_depth, I1, I2, I3, I4, I5, I6, I7, I8
-+ vsub.i16 q0, \Q0a, \P0a
-+ vsub.i16 q1, \P1a, \Q1a
-+ vdup.16 d4, r2
-+ \I1
-+ vshl.i16 q0, #2
-+ \I2
-+ vadd.i16 q0, q1
-+ \I3
-+ vshll.u8 q2, d4, #\bit_depth - 8
-+ \I4
-+ vneg.s16 q1, q2
-+ \I5
-+ vrshr.s16 q0, #3
-+ \I6
-+ \I7
-+ \I8
-+ vmin.s16 q0, q2
-+ vmov.i16 q2, #0
-+ vmax.s16 q0, q1
-+ vadd.i16 \P0a, q0
-+ vsub.i16 \Q0a, q0
-+ vmov.i16 q1, #(1 << \bit_depth) - 1
-+ vmax.s16 \P0a, q2
-+ vmax.s16 \Q0a, q2
-+ vmin.s16 \P0a, q1
-+ vmin.s16 \Q0a, q1
-+.endm
-+
-+@ Clobbers r2, r12
-+@ P0a et al all contain UVUVUVUV
-+@ r2 (tc4) contains
-+@ [0..7] tc U a
-+@ [8..15] tc V a
-+@ [16..23] tc U b
-+@ [24..31] tc V b
-+
-+.macro hevc_loop_filter_uv_body2_16 P1a, P1b, P0a, P0b, Q0a, Q0b, Q1a, Q1b, bit_depth, I1, I2, I3, I4, I5, I6, I7
-+ vsub.i16 q0, \Q0a, \P0a @ q0a - p0a
-+ lsr r12, r2, #16
-+ vsub.i16 q1, \Q0b, \P0b @ q0b - p0b
-+ vsub.i16 q2, \P1a, \Q1a @ p1a - q1a
-+ vsub.i16 q3, \P1b, \Q1b @ p1b - q1b
-+ vshl.i16 q0, #2 @ (q0a - p0a) * 4
-+ vshl.i16 q1, #2 @ (q0b - p0b) * 4
-+ vadd.i16 q0, q2 @ ((q0a - p0a) * 4) + p1a - q1a
-+ vadd.i16 q1, q3 @ ((q0b - p0b) * 4) + p1b - q1b
-+ vdup.16 d4, r2 @ tc0a, tc0b
-+ vdup.16 d6, r12 @ tc1a, tc1b
-+ vrshr.s16 q0, #3 @ (((q0a - p0a) * 4) + p1a - q1a + 4) >> 3
-+ \I1
-+ vrshr.s16 q1, #3 @ (((q0b - p0b) * 4) + p1b - q1b + 4) >> 3
-+ \I2
-+ vshll.u8 q2, d4, #\bit_depth - 8 @ tc0a, tc0b
-+ \I3
-+ vshll.u8 q3, d6, #\bit_depth - 8 @ tc1a, tc1b
-+ \I4
-+ vmin.s16 q0, q2
-+ \I5
-+ vneg.s16 q2, q2 @ -tc0a, -tc0b
-+ \I6
-+ vmin.s16 q1, q3
-+ \I7
-+ vneg.s16 q3, q3 @ -tc1a, -tc1b
-+ vmax.s16 q0, q2 @ delta0a
-+ vadd.i16 \P0a, q0 @ p0a + delta0a
-+ vsub.i16 \Q0a, q0 @ q0a - delta0a
-+ vmax.s16 q1, q3 @ delta0b
-+ vadd.i16 \P0b, q1 @ p0b + delta0b
-+ vsub.i16 \Q0b, q1 @ q0b - delta0b
-+ vmov.i16 q2, #0
-+ vmov.i16 q3, #(1 << \bit_depth) - 1
-+ vmax.s16 \P0a, q2
-+ vmax.s16 \Q0a, q2
-+ vmax.s16 \P0b, q2
-+ vmax.s16 \Q0b, q2
-+ vmin.s16 \P0a, q3
-+ vmin.s16 \Q0a, q3
-+ vmin.s16 \P0b, q3
-+ vmin.s16 \Q0b, q3
-+.endm
-+
-+
-+
-+@ uint8_t *_no_p, [sp+0]
-+@ uint8_t *_no_q) [sp+4]
-+
-+.macro hevc_loop_filter_luma_start
-+ ldr r12, [r3]
-+ ldr r3, [r3, #4]
-+ orrs r3, r12, r3, lsl #16
-+ it eq
-+ bxeq lr
-+ push {r4-r10,lr} @ 32 bytes
-+ ldrd r4, r5, [sp, #32] @ &_no_p
-+ ldrb r4, [r4]
-+ ldrb r5, [r5]
-+ movs r10, r4
-+ it ne
-+ movne r10, #1
-+ cmp r5, #0
-+ it ne
-+ orrne r10, #2
-+.endm
-+
-+@ Input:
-+@ r2 beta (raw: needs shift for bitdepth > 8)
-+@ r3[ 0:15] tc[0] (raw: needs shift for bitdepth > 8)
-+@ r3[16:31] tc[1] (raw: needs shift for bitdepth > 8)
-+@
-+@ Input & output
-+@ 8-bit: d16-d23 (Q3,Q2,Q1,Q0,P0,P1,P2,P3)
-+@ 16-bit: q8-q15
-+@
-+@ r1 -r1
-+@ r10 b1->C, b0->N (r10 junk)
-+@
-+@ Junks:
-+@ r5, r6, r7, r8, r9
-+
-+.macro m_filter_luma bit_depth, Q11, Q15
-+.if \bit_depth == 8
-+ vmovl.u8 q14, d22 @ q2,7 q2,6 ... q2,0 = TQ2' ... Q2' TQ2 ... Q2
-+ vmovl.u8 q13, d21 @ q1,7 q1,6 ... q1,0 = TQ1' ... Q1' TQ1 ... Q1
-+ vmovl.u8 q12, d20 @ q0,7 q0,6 ... q0,0 = TQ0' ... Q0' TQ0 ... Q0
-+ vmovl.u8 \Q11, d19 @ p0,7 p0,6 ... p0,0 = TP0' ... P0' TP0 ... P0
-+ vmovl.u8 q10, d18 @ p1,7 p1,6 ... p1,0 = TP1' ... P1' TP1 ... P1
-+ vmovl.u8 q9, d17 @ p2,7 p2,6 ... p2,0 = TP2' ... P2' TP2 ... P2
-+.endif
-+ vadd.i16 q0, q9, \Q11 @ P2 + P0
-+.if \bit_depth > 8
-+ lsl r3, r3, #(\bit_depth - 8)
-+.endif
-+ vadd.i16 q1, q14, q12 @ Q2 + Q0
-+.if \bit_depth > 8
-+ lsl r2, r2, #(\bit_depth - 8)
-+.endif
-+ vsub.i16 q0, q10 @ P2 - P1 + P0
-+ lsr r5, r3, #16
-+ vsub.i16 q1, q13 @ Q2 - Q1 + Q0
-+.if \bit_depth == 8
-+ vmovl.u8 q8, d16 @ p3,7 p3,6 ... p3,0 = TP3' ... P3' TP3 ... P3
-+ vmovl.u8 \Q15, d23 @ q3,7 q3,6 ... q3,0 = TQ3' ... Q3' TQ3 ... Q3
-+.endif
-+ vabd.s16 q0, q10 @ dp0 = abs(P2 - 2 * P1 + P0)
-+ vabd.s16 q1, q13 @ dq0 = abs(Q2 - 2 * Q1 + Q0)
-+ vmov.i64 q2, #0xffffffff0000
-+ vbic q0, q2 @ only dp0(') and dp3(')
-+ vbic q1, q2 @ only dq0(') and dq3(')
-+ vsra.u64 q0, #16
-+ vsra.u64 q1, #16
-+ vdup.16 q3, r2 @ beta
-+ vdup.16 d14, r3 @ tC[0]
-+ vdup.16 d15, r5 @ tC[1]
-+ vabd.s16 q4, q8, \Q11 @ abs(TP3'-TP0' ... P3'-P0' TP3-TP0 ... P3-P0)
-+ vmovn.i32 d0, q0 @ dp3' dp0' dp3 dp0
-+ vmovn.i32 d1, q1 @ dq3' dq0' dq3 dq0
-+ vadd.i16 d5, d0, d1 @ d3'=dp3'+dq3' d0'=dp0'+dq0' d3=dp3+dq3 d0=dp0+dq0
-+ vabd.s16 q5, \Q11, q12 @ abs(TP0'-TQ0' ... P0'-Q0' TP0-TQ0 ... P0-Q0)
-+ vaba.s16 q4, \Q15, q12 @ +abs(TQ3'-TQ0' ... Q3'-Q0' TQ3-TQ0 ... Q3-Q0)
-+ vpadd.i16 d2, d5, d5 @ dontcare dontcare d0'+d3' d0+d3
-+ vshl.s16 q6, q7, #2 @ tC[] * 4
-+ vrhadd.s16 q6, q7 @ tc25 = (tc[] * 5 + 1) >> 1
-+ vcgt.s16 d2, d6, d2 @ if (d0 + d3 < beta)
-+ vmov r7, s4 @ (d2) r7 = mask of blocks to apply filtering (16b/block)
-+ vshr.s16 q1, q3, #3 @ beta_3 = beta >> 3
-+ cmp r7, #0
-+ beq .Lbypasswrite
-+
-+ vcgt.s16 q5, q6, q5 @ if < tc25
-+ vcgt.s16 q4, q1, q4 @ if (abs({T}P[0-3]{'}-{T}P[0-3]{'})+abs({T}Q[0-3]{'}-{T}Q[0-3]{'}) < beta_3)
-+ vand q4, q5
-+ vbic d8, d4
-+ vbic d9, d4
-+ vshr.s16 q3, #2 @ beta_2 = beta >> 2
-+ vsra.u64 q4, #16
-+ vshl.s16 d5, #1 @ d3'<<1 d0'<<1 d3<<1 d0<<1
-+ vshl.i16 q7, #1 @ tc2 = tC[] << 1
-+ vcgt.s16 d6, d5 @ if (d3'<<1 < beta_2) etc
-+ vmovn.i32 d8, q4 @ beta_3 && tc25 tests, prime block in ms half
-+ vand d6, d8 @ && beta_2 tests, prime in ms half
-+ vpadd.i16 d0, d1 @ dq0'+dq3' dq0+dq3 dp0'+dp3' dp0+dp3
-+ vneg.s16 q6, q7 @ -tc2
-+ vmovn.i32 d8, q3
-+ vshrn.i32 d6, q3, #16
-+ vand d6, d8
-+ vmov r5, r6, d0 @ r5 = dp0'+dp3' dp0+dp3 r6 = dq0'+dq3' dq0+dq3
-+ vmov r8, s12 @ (d6) r8 = mask of strong filtering blocks (16b/block)
-+ vadd.i16 q0, \Q11, q12 @ p0 + q0
-+ ands r9, r7, r8
-+ beq 1f
-+
-+ vadd.i16 q2, q0, q10 @ p1 + p0 + q0
-+ vadd.i16 q3, q0, q13 @ p0 + q0 + q1
-+ lsr r3, r9, #16
-+ vadd.i16 q1, q2, q9 @ p2 + p1 + p0 + q0 (new P1 before clipping)
-+ vadd.i16 q4, q3, q14 @ p0 + q0 + q1 + q2 (new Q1 before clipping)
-+ vadd.i16 q0, q8, q9 @ p3 + p2
-+ vadd.i16 q5, \Q15, q14 @ q2 + q3
-+ vadd.i16 q2, q1 @ p2 + 2 * p1 + 2 * p0 + 2 * q0
-+ vadd.i16 q3, q4 @ 2 * p0 + 2 * q0 + 2 * q1 + q2
-+ vshl.i16 q0, #1 @ 2 * p3 + 2 * p2
-+ vshl.i16 q5, #1 @ 2 * q2 + 2 * q3
-+ vadd.i16 q0, q1 @ 2 * p3 + 3 * p2 + p1 + p0 + q0 (new P2 before clipping)
-+ vadd.i16 q5, q4 @ p0 + q0 + q1 + 3 * q2 + 2 * q3 (new Q2 before clipping)
-+ vadd.i16 q2, q13 @ p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 (new P0 before clipping)
-+ vadd.i16 q3, q10 @ p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 (new Q0 before clipping)
-+ vrshr.s16 q0, #3 @ scale, with rounding
-+ vrshr.s16 q5, #3
-+ vrshr.s16 q1, #2
-+ vrshr.s16 q4, #2
-+ vrshr.s16 q2, #3
-+ vrshr.s16 q3, #3
-+ vsub.i16 q0, q9 @ find difference
-+ vsub.i16 q5, q14
-+ vsub.i16 q1, q10
-+ vsub.i16 q4, q13
-+ vsub.i16 q2, \Q11
-+ vsub.i16 q3, q12
-+ vmax.s16 q0, q6 @ clip difference to -tc2 .. tc2
-+ vmax.s16 q5, q6
-+ vmax.s16 q1, q6
-+ vmax.s16 q4, q6
-+ vmax.s16 q2, q6
-+ vmax.s16 q3, q6
-+ vdup.16 d12, r9 @ expand mask, reuse q6 due to register pressure
-+ vdup.16 d13, r3
-+ vmin.s16 q0, q7
-+ vmin.s16 q5, q7
-+ vmin.s16 q1, q7
-+ vmin.s16 q4, q7
-+ vmin.s16 q2, q7
-+ vmin.s16 q3, q7
-+ vadd.i16 q0, q9 @ apply difference
-+ vadd.i16 q5, q14
-+ vadd.i16 q1, q10
-+ vadd.i16 q4, q13
-+ vadd.i16 q2, \Q11
-+ vadd.i16 q3, q12
-+ vbit q9, q0, q6 @ apply filtered values according to mask
-+ vbit q14, q5, q6
-+ vbit q10, q1, q6
-+ vbit q13, q4, q6
-+ vbit \Q11, q2, q6
-+ vbit q12, q3, q6
-+ vneg.s16 q6, q7 @ restore -tc2
-+
-+1:
-+ bics r9, r7, r8
-+ beq 2f
-+
-+ vsub.i16 q0, q12, \Q11 @ q0 - p0
-+ vsub.i16 q1, q13, q10 @ q1 - p1
-+ lsr r3, r9, #16
-+ vshl.i16 q2, q0, #3
-+ lsr r7, r5, #16
-+ vadd.i16 q3, q0, q2 @ 9 * (q0 - p0)
-+ lsr r8, r6, #16
-+ vshl.i16 q2, q1, #1
-+ vadd.i16 q4, q1, q2 @ 3 * (q1 - p1)
-+ vshr.s16 q6, #1 @ -tc = -tc2 >> 1
-+ vsub.i16 q5, q3, q4
-+ vrhadd.s16 q1, q9, \Q11 @ (p2 + p0 + 1) >> 1
-+ vrhadd.s16 q3, q14, q12 @ (q2 + q0 + 1) >> 1
-+ vrshr.s16 q5, #4 @ delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4
-+ vsub.i16 q1, q10 @ ((p2 + p0 + 1) >> 1) - p1
-+ vsub.i16 q3, q13 @ ((q2 + q0 + 1) >> 1) - q1
-+ vmax.s16 q6, q5 @
-+ vshr.s16 q4, q7, #1 @ tc = tc2 >> 1
-+ vdup.16 q0, r2 @ beta
-+ vmin.s16 q6, q4 @ delta0 clamped to [-tc, tc]
-+ vshr.s16 q4, #1 @ tc_2 = tc >> 1
-+ vhadd.s16 q1, q6 @ (((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1
-+ vhsub.s16 q3, q6 @ (((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1
-+ vshr.s16 q2, q0, #1 @ beta >> 1
-+ vadd.i16 q2, q0 @ beta + (beta >> 1)
-+ vneg.s16 q0, q4 @ -tc_2
-+ vabs.s16 q5, q5 @ abs(original delta0)
-+ vshr.s16 q2, #3 @ (beta + (beta >> 1)) >> 3
-+ vmax.s16 q1, q0
-+ vmax.s16 q3, q0
-+ vshl.s16 q0, q7, #2 @ 8 * tc
-+ vadd.i16 q7, q0 @ 10 * tc
-+ vdup.16 d0, r9
-+ vdup.16 d1, r3 @ q0 = mask of blocks to apply filtering
-+ vmin.s16 q1, q4 @ deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2)
-+ vmin.s16 q3, q4 @ deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 + delta0) >> 1, -tc_2, tc_2)
-+ vdup.16 d8, r5 @ dp0 + dp3
-+ vdup.16 d9, r7 @ dp0' + dp3'
-+ vcgt.s16 q7, q5 @ if ((10 * tc) > abs(delta0))
-+ vdup.16 d10, r6 @ dq0 + dq3
-+ vdup.16 d11, r8 @ dq0' + dq3'
-+ vand q7, q0 @ AND block and line masks
-+ vcgt.s16 q4, q2, q4 @ if (((beta + (beta >> 1)) >> 3) > dp0 + dp3), i.e. if (nd_p > 1)
-+ vadd.i16 q0, q1, q10 @ p1 + deltap1
-+ vcgt.s16 q5, q2, q5 @ if (((beta + (beta >> 1)) >> 3) > dq0 + dq3), i.e. if (nd_q > 1)
-+ vadd.i16 q3, q3, q13 @ q1 + deltaq1
-+ vadd.i16 q1, \Q11, q6 @ p0 + delta0
-+ vsub.i16 q2, q12, q6 @ q0 - delta0
-+ vand q4, q7 @ AND nd_p test with block/line masks
-+ vand q5, q7 @ AND nd_q test with block/line masks
-+ vbit q10, q0, q4
-+ vbit \Q11, q1, q7
-+ vbit q12, q2, q7
-+ vbit q13, q3, q5
-+
-+2:
-+.if \bit_depth == 8
-+ vmovn.i16 d16, q8
-+ vmovn.i16 d23, \Q15
-+ neg r1, r1
-+ vqmovun.s16 d17, q9
-+ vqmovun.s16 d18, q10
-+ vqmovun.s16 d19, \Q11
-+ lsls r10, #31
-+ vqmovun.s16 d20, q12
-+ vqmovun.s16 d21, q13
-+ vqmovun.s16 d22, q14
-+.else
-+ vmov.i16 q0, #0
-+ vmov.i16 q1, #(1 << \bit_depth - 1)
-+ @ q8 & q15 should be unaltered and so don't require clipping
-+ neg r1, r1
-+ vmax.s16 q9, q0
-+ vmax.s16 q10, q0
-+ vmax.s16 q11, q0
-+ vmax.s16 q12, q0
-+ vmax.s16 q13, q0
-+ vmax.s16 q14, q0
-+ lsls r10, #31
-+ vmin.s16 q9, q1
-+ vmin.s16 q10, q1
-+ vmin.s16 q11, q1
-+ vmin.s16 q12, q1
-+ vmin.s16 q13, q1
-+ vmin.s16 q14, q1
-+.endif
-+ bx lr
-+.endm
-+
-+function hevc_loop_filter_luma_body
-+ m_filter_luma 8, q15, q11
-+endfunc
-+
-+@ void ff_hevc_rpi_v_loop_filter_luma_neon_8(
-+@ uint8_t *_pix, [r0]
-+@ ptrdiff_t _stride, [r1]
-+@ int _beta, [r2]
-+@ int *_tc, [r3]
-+@ uint8_t *_no_p, [sp+0]
-+@ uint8_t *_no_q) [sp+4]
-+
-+function ff_hevc_rpi_v_loop_filter_luma_neon_8, export=1
-+ hevc_loop_filter_luma_start
-+
-+ sub r4, r0, #4
-+ b .Lv_loop_luma_common
-+endfunc
-+
-+@ void ff_hevc_rpi_v_loop_filter2_luma_neon(
-+@ uint8_t * pix_r, [r0]
-+@ ptrdiff_t _stride, [r1]
-+@ int _beta, [r2]
-+@ int tc2, [r3]
-+@ int no_f, [sp+0]
-+@ uint8_t * pix_l) [sp+4]
-+
-+function ff_hevc_rpi_v_loop_filter_luma2_neon_8, export=1
-+ cmp r3, #0
-+ it eq
-+ bxeq lr
-+ push {r4-r10,lr} @ 32 bytes
-+ ldr r4, [sp, #36]
-+ ldr r10, [sp, #32]
-+
-+.Lv_loop_luma_common:
-+ vpush {d8-d15}
-+
-+ @ It's slightly faster to do unlaned loads and transpose in the
-+ @ 8-bit case, even though it needs more instructions, because
-+ @ VLD4.8 is a really slow way to read from memory.
-+ vld1.32 {d16[0]}, [r4:32], r1
-+ vld1.32 {d20[0]}, [r0:32], r1
-+ vld1.32 {d16[1]}, [r4:32], r1
-+ vld1.32 {d20[1]}, [r0:32], r1
-+ vld1.32 {d17[0]}, [r4:32], r1
-+ vld1.32 {d21[0]}, [r0:32], r1
-+ vld1.32 {d17[1]}, [r4:32], r1
-+ vld1.32 {d21[1]}, [r0:32], r1
-+ vld1.32 {d18[0]}, [r4:32], r1
-+ vld1.32 {d22[0]}, [r0:32], r1
-+ vld1.32 {d18[1]}, [r4:32], r1
-+ vld1.32 {d22[1]}, [r0:32], r1
-+ vld1.32 {d19[0]}, [r4:32], r1
-+ vld1.32 {d23[0]}, [r0:32], r1
-+ vld1.32 {d19[1]}, [r4:32]
-+ vld1.32 {d23[1]}, [r0:32]
-+ vuzp.16 q8, q9
-+ vuzp.16 q10, q11
-+ vuzp.8 q8, q9
-+ vuzp.8 q10, q11
-+ vswp d17, d18
-+ vswp d21, d22
-+
-+ bl hevc_loop_filter_luma_body
-+
-+ add r6, r4, r1
-+ add r2, r0, r1
-+ lsl r1, #1
-+
-+ vpop {d8-d15}
-+
-+ @ no_p[1]
-+ bmi 1f
-+ vst4.8 {d16[7],d17[7],d18[7],d19[7]}, [r4:32], r1
-+ vst4.8 {d16[6],d17[6],d18[6],d19[6]}, [r6:32], r1
-+ vst4.8 {d16[5],d17[5],d18[5],d19[5]}, [r4:32], r1
-+ vst4.8 {d16[4],d17[4],d18[4],d19[4]}, [r6:32], r1
-+
-+ vst4.8 {d16[3],d17[3],d18[3],d19[3]}, [r4:32], r1
-+ vst4.8 {d16[2],d17[2],d18[2],d19[2]}, [r6:32], r1
-+ vst4.8 {d16[1],d17[1],d18[1],d19[1]}, [r4:32], r1
-+ vst4.8 {d16[0],d17[0],d18[0],d19[0]}, [r6:32]
-+1:
-+ @ no_q[1]
-+ bcs 1f
-+ vst4.8 {d20[7],d21[7],d22[7],d23[7]}, [r0:32], r1
-+ vst4.8 {d20[6],d21[6],d22[6],d23[6]}, [r2:32], r1
-+ vst4.8 {d20[5],d21[5],d22[5],d23[5]}, [r0:32], r1
-+ vst4.8 {d20[4],d21[4],d22[4],d23[4]}, [r2:32], r1
-+
-+ vst4.8 {d20[3],d21[3],d22[3],d23[3]}, [r0:32], r1
-+ vst4.8 {d20[2],d21[2],d22[2],d23[2]}, [r2:32], r1
-+ vst4.8 {d20[1],d21[1],d22[1],d23[1]}, [r0:32], r1
-+ vst4.8 {d20[0],d21[0],d22[0],d23[0]}, [r2:32]
-+1:
-+ pop {r4-r10,pc}
-+
-+.Lbypasswrite:
-+ vpop {d8-d15}
-+ pop {r4-r10,pc}
-+endfunc
-+
-+.macro m_filter_v_luma_16 bit_depth
-+ vpush {d8-d15}
-+
-+ @ Uses slightly fewer instructions to do laned loads than unlaned
-+ @ and transpose. This also means that we can use the same code for
-+ @ both split & unsplit deblock
-+ vld4.16 {d16[0], d18[0], d20[0], d22[0]}, [r4], r1
-+ vld4.16 {d24[0], d26[0], d28[0], d30[0]}, [r0], r1
-+
-+ vld4.16 {d16[1], d18[1], d20[1], d22[1]}, [r4], r1
-+ vld4.16 {d24[1], d26[1], d28[1], d30[1]}, [r0], r1
-+
-+ vld4.16 {d16[2], d18[2], d20[2], d22[2]}, [r4], r1
-+ vld4.16 {d24[2], d26[2], d28[2], d30[2]}, [r0], r1
-+
-+ vld4.16 {d16[3], d18[3], d20[3], d22[3]}, [r4], r1
-+ vld4.16 {d24[3], d26[3], d28[3], d30[3]}, [r0], r1
-+
-+ vld4.16 {d17[0], d19[0], d21[0], d23[0]}, [r4], r1
-+ vld4.16 {d25[0], d27[0], d29[0], d31[0]}, [r0], r1
-+
-+ vld4.16 {d17[1], d19[1], d21[1], d23[1]}, [r4], r1
-+ vld4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1
-+
-+ vld4.16 {d17[2], d19[2], d21[2], d23[2]}, [r4], r1
-+ vld4.16 {d25[2], d27[2], d29[2], d31[2]}, [r0], r1
-+
-+ vld4.16 {d17[3], d19[3], d21[3], d23[3]}, [r4]
-+ vld4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0]
-+
-+ bl hevc_loop_filter_luma_body_\bit_depth
-+
-+ add r6, r4, r1
-+ add r2, r0, r1
-+ lsl r1, #1
-+
-+ vpop {d8-d15}
-+
-+ @ p[1]
-+ bmi 1f
-+ vst4.16 {d17[3], d19[3], d21[3], d23[3]}, [r4], r1
-+ vst4.16 {d17[2], d19[2], d21[2], d23[2]}, [r6], r1
-+ vst4.16 {d17[1], d19[1], d21[1], d23[1]}, [r4], r1
-+ vst4.16 {d17[0], d19[0], d21[0], d23[0]}, [r6], r1
-+ vst4.16 {d16[3], d18[3], d20[3], d22[3]}, [r4], r1
-+ vst4.16 {d16[2], d18[2], d20[2], d22[2]}, [r6], r1
-+ vst4.16 {d16[1], d18[1], d20[1], d22[1]}, [r4], r1
-+ vst4.16 {d16[0], d18[0], d20[0], d22[0]}, [r6]
-+1:
-+ @ q[1]
-+ bcs 1f
-+ vst4.16 {d25[3], d27[3], d29[3], d31[3]}, [r0], r1
-+ vst4.16 {d25[2], d27[2], d29[2], d31[2]}, [r2], r1
-+ vst4.16 {d25[1], d27[1], d29[1], d31[1]}, [r0], r1
-+ vst4.16 {d25[0], d27[0], d29[0], d31[0]}, [r2], r1
-+ vst4.16 {d24[3], d26[3], d28[3], d30[3]}, [r0], r1
-+ vst4.16 {d24[2], d26[2], d28[2], d30[2]}, [r2], r1
-+ vst4.16 {d24[1], d26[1], d28[1], d30[1]}, [r0], r1
-+ vst4.16 {d24[0], d26[0], d28[0], d30[0]}, [r2]
-+1:
-+ pop {r4-r10,pc}
-+.endm
-+
-+
-+
-+
-+@ void (*hevc_h_loop_filter_luma)(uint8_t *pix, [r0]
-+@ ptrdiff_t stride, [r1]
-+@ int beta, [r2]
-+@ int32_t *tc, [r3]
-+@ uint8_t *no_p, sp[0]
-+@ uint8_t *no_q); sp[4]
-+@
-+@ Src should always be on 8 byte boundry & all in the same slice
-+
-+function ff_hevc_rpi_h_loop_filter_luma_neon_8, export=1
-+ hevc_loop_filter_luma_start
-+ b .Lh_loop_filter_luma_common_8
-+endfunc
-+
-+function ff_hevc_rpi_h_loop_filter_luma2_neon_8, export=1
-+ cmp r3, #0
-+ it eq
-+ bxeq lr
-+ push {r4-r10,lr} @ 32 bytes
-+ ldr r10, [sp, #32]
-+
-+.Lh_loop_filter_luma_common_8:
-+ sub r4, r0, r1, lsl #2
-+ add r0, r4, r1
-+ lsl r1, #1
-+ vpush {d8-d15}
-+
-+ vld1.8 {d16}, [r4], r1
-+ vld1.8 {d17}, [r0], r1
-+ vld1.8 {d18}, [r4], r1
-+ vld1.8 {d19}, [r0], r1
-+ vld1.8 {d20}, [r4], r1
-+ vld1.8 {d21}, [r0], r1
-+ vld1.8 {d22}, [r4]
-+ vld1.8 {d23}, [r0]
-+
-+ bl hevc_loop_filter_luma_body
-+
-+ add r0, r0, r1, lsl #1
-+ add r2, r4, r1, lsl #1
-+ add r6, r4, r1, asr #1
-+ vpop {d8-d15}
-+
-+ @ P2-P0
-+ bcs 1f
-+ vst1.8 {d22}, [r4], r1
-+ vst1.8 {d21}, [r6]
-+ vst1.8 {d20}, [r4]
-+1:
-+ @ Q0-Q2
-+ bmi 1f
-+ vst1.8 {d19}, [r0], r1
-+ vst1.8 {d18}, [r2]
-+ vst1.8 {d17}, [r0]
-+1:
-+ pop {r4-r10,pc}
-+endfunc
-+
-+
-+.macro m_filter_h_luma_16 bit_depth
-+ sub r4, r0, r1, lsl #2
-+ add r0, r4, r1
-+ lsl r1, #1
-+ vpush {d8-d15}
-+
-+ vld1.16 { q8}, [r4], r1
-+ vld1.16 { q9}, [r0], r1
-+ vld1.16 {q10}, [r4], r1
-+ vld1.16 {q11}, [r0], r1
-+ vld1.16 {q12}, [r4], r1
-+ vld1.16 {q13}, [r0], r1
-+ vld1.16 {q14}, [r4]
-+ vld1.16 {q15}, [r0]
-+
-+ bl hevc_loop_filter_luma_body_\bit_depth
-+
-+ add r0, r0, r1, lsl #1
-+ add r2, r4, r1, lsl #1
-+ add r6, r4, r1, asr #1
-+ vpop {d8-d15}
-+
-+ @ P2-P0
-+ bcs 1f
-+ vst1.16 {q14}, [r4], r1
-+ vst1.16 {q13}, [r6]
-+ vst1.16 {q12}, [r4]
-+1:
-+ bmi 1f
-+ vst1.16 {q11}, [r0], r1
-+ vst1.16 {q10}, [r2]
-+ vst1.16 { q9}, [r0]
-+1:
-+ pop {r4-r10,pc}
-+.endm
-+
-+
-+@ void ff_hevc_rpi_h_loop_filter_uv_neon(uint8_t * src_r, // r0
-+@ unsigned int stride, // r1
-+@ uint32_t tc4, // r2
-+@ unsigned int no_f); // r3
-+@
-+@ no_f
-+@ 0 tl P0
-+@ 1 tr P1
-+@ 2 bl Q0
-+@ 3 br Q1
-+@
-+@ Probably not worth having the P/Qa only special case in this direction
-+@ Given layout we won't save any memory reads or avoid any cache dirtying
-+@ We would save a bit of computation but I expect the partials to be less
-+@ common in the H direction than V due to how we arrange deblock.
-+
-+function ff_hevc_rpi_h_loop_filter_uv_neon_8, export=1
-+ sub r12, r0, r1
-+ cmp r2, #0
-+ it eq
-+ bxeq lr
-+ vld1.8 {d26,d27}, [r0]
-+ lsl r1, #1
-+ sub r0, r1
-+ vld1.8 {d18,d19}, [r12], r1
-+ vld1.8 {d16,d17}, [r0], r1
-+ vld1.8 {d28,d29}, [r12]
-+
-+ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d26, d27, d28, d29, \
-+ "sub r12, r0, r1, asr #1"
-+
-+ lsls r3, #29 @ b2 -> N, b3 -> C
-+ it pl
-+ vstrpl d26, [r0, #0]
-+ it cc
-+ vstrcc d27, [r0, #8]
-+ lsls r3, #2 @ b0 -> N, b1 -> C
-+ it pl
-+ vstrpl d18, [r12, #0]
-+ it cc
-+ vstrcc d19, [r12, #8]
-+ bx lr
-+
-+endfunc
-+
-+
-+@ void ff_hevc_rpi_h_loop_filter_uv_neon_10(uint8_t * src_r, // r0
-+@ unsigned int stride, // r1
-+@ uint32_t tc4, // r2
-+@ unsigned int no_f); // r3
-+@
-+@ no-F = b0:no_p[0], b1:no_p[1], b2:no_q[0], b3:no_q[1]
-+@
-+@ Macro here actual function near bottom
-+
-+.macro m_filter_h_uv_16 bit_depth
-+ sub r12, r0, r1
-+ cmp r2, #0
-+ it eq
-+ bxeq lr
-+ vld1.16 {q12, q13}, [r0]
-+ lsl r1, #1
-+ sub r0, r1
-+ vld1.16 {q10, q11}, [r12], r1
-+ vld1.16 {q8, q9 }, [r0], r1
-+ vld1.16 {q14, q15}, [r12]
-+
-+ hevc_loop_filter_uv_body2_16 q8, q9, q10, q11, q12, q13, q14, q15, \bit_depth, \
-+ "sub r12, r0, r1, asr #1", \
-+ "cmp r3, #0"
-+
-+ bne 1f
-+ vst1.16 {q10, q11}, [r12]
-+ vst1.16 {q12, q13}, [r0]
-+ bx lr
-+
-+ @ At least one no_f bit is set
-+ @ Which means we need to break this apart in an ugly fashion
-+1:
-+ lsls r3, #29 @ b2 -> N, b3 -> C
-+ itt pl
-+ vstrpl d24, [r0, #0]
-+ vstrpl d25, [r0, #8]
-+ itt cc
-+ vstrcc d26, [r0, #16]
-+ vstrcc d27, [r0, #24]
-+ lsls r3, #2 @ b0 -> N, b1 -> C
-+ itt pl
-+ vstrpl d20, [r12, #0]
-+ vstrpl d21, [r12, #8]
-+ itt cc
-+ vstrcc d22, [r12, #16]
-+ vstrcc d23, [r12, #24]
-+ bx lr
-+.endm
-+
-+
-+@ void ff_hevc_rpi_v_loop_filter_uv2_neon(uint8_t * src_r, // r0
-+@ unsigned int stride, // r1
-+@ uint32_t tc4, // r2
-+@ uint8_t * src_l, // r3
-+@ unsigned int no_f); // sp[0]
-+@
-+@ no_f:
-+@ 0 tl P0
-+@ 1 tr Q0
-+@ 2 bl P1
-+@ 3 br Q1
-+
-+function ff_hevc_rpi_v_loop_filter_uv2_neon_8, export=1
-+ cmp r2, #0
-+ it eq
-+ bxeq lr
-+ push {lr}
-+ vld2.16 {d16[0], d18[0]}, [r3], r1
-+ vld2.16 {d20[0], d22[0]}, [r0], r1
-+
-+ cmp r2, #0x10000
-+ vld2.16 {d16[1], d18[1]}, [r3], r1
-+ vld2.16 {d20[1], d22[1]}, [r0], r1
-+
-+ vld2.16 {d16[2], d18[2]}, [r3], r1
-+ vld2.16 {d20[2], d22[2]}, [r0], r1
-+
-+ vld2.16 {d16[3], d18[3]}, [r3], r1
-+ vld2.16 {d20[3], d22[3]}, [r0], r1
-+ blo 10f
-+
-+ vld2.16 {d17[0], d19[0]}, [r3], r1
-+ vld2.16 {d21[0], d23[0]}, [r0], r1
-+
-+ sub ip, r0, r3
-+ vld2.16 {d17[1], d19[1]}, [r3], r1
-+ vld2.16 {d21[1], d23[1]}, [r0], r1
-+
-+ cmp ip, #4
-+ vld2.16 {d17[2], d19[2]}, [r3], r1
-+ vld2.16 {d21[2], d23[2]}, [r0], r1
-+
-+ vld2.16 {d17[3], d19[3]}, [r3]
-+ vld2.16 {d21[3], d23[3]}, [r0]
-+
-+ hevc_loop_filter_uv_body2 d16, d17, d18, d19, d20, d21, d22, d23 \
-+ "ldr lr, [sp, #4]", \
-+ "neg r1, r1", \
-+ "it eq; cmpeq lr, #0", \
-+ "add r3, #2", \
-+ "add ip, r3, r1", \
-+ "add r2, r0, r1", \
-+ "lsl r1, #1"
-+
-+ bne 1f
-+
-+@ Much/most of the time r0 == r3 + 4 and no_f == 0
-+@ so it is worth having this special case
-+ vst2.16 {d19[3], d21[3]}, [r3], r1 @ P0b, Q0b
-+ vst2.16 {d19[2], d21[2]}, [ip], r1
-+ vst2.16 {d19[1], d21[1]}, [r3], r1
-+ vst2.16 {d19[0], d21[0]}, [ip], r1
-+ vst2.16 {d18[3], d20[3]}, [r3], r1 @ P0a, Q0a
-+ vst2.16 {d18[2], d20[2]}, [ip], r1
-+ vst2.16 {d18[1], d20[1]}, [r3]
-+ vst2.16 {d18[0], d20[0]}, [ip]
-+ pop {pc}
-+
-+@ Either split or partial
-+1:
-+ lsls lr, #29 @ b3 (Q0b) -> C, b2 (P0b) -> N & b31, b1 (Q0a) -> b30, b0 (P0a) -> b29
-+ ittt cs
-+ addcs r0, r0, r1, lsl #1
-+ addcs r2, r2, r1, lsl #1
-+ bcs 1f
-+ @ Q0b
-+ vst1.16 {d21[3]}, [r0], r1
-+ vst1.16 {d21[2]}, [r2], r1
-+ vst1.16 {d21[1]}, [r0], r1
-+ vst1.16 {d21[0]}, [r2], r1
-+1:
-+ ittt mi
-+ addmi r3, r3, r1, lsl #1
-+ addmi ip, ip, r1, lsl #1
-+ bmi 1f
-+ @ P0b
-+ vst1.16 {d19[3]}, [r3], r1
-+ vst1.16 {d19[2]}, [ip], r1
-+ vst1.16 {d19[1]}, [r3], r1
-+ vst1.16 {d19[0]}, [ip], r1
-+1:
-+ lsls lr, #2 @ b30 (Q0a) -> C, b29 (P0a) -> N & b31
-+ bcs 1f
-+ @ Q0a
-+ vst1.16 {d20[3]}, [r0], r1
-+ vst1.16 {d20[2]}, [r2], r1
-+ vst1.16 {d20[1]}, [r0]
-+ vst1.16 {d20[0]}, [r2]
-+1:
-+ it mi
-+ popmi {pc}
-+ @ P0a
-+ vst1.16 {d18[3]}, [r3], r1
-+ vst1.16 {d18[2]}, [ip], r1
-+ vst1.16 {d18[1]}, [r3]
-+ vst1.16 {d18[0]}, [ip]
-+ pop {pc}
-+
-+@ Single lump (rather than double)
-+10:
-+ @ As we have post inced r0/r3 in the load the easiest thing to do is
-+ @ to subtract and write forwards, rather than backwards (as above)
-+ @ b0 (P0a) -> N, b1 (Q0a) -> C
-+
-+ hevc_loop_filter_uv_body1 d16, d18, d20, d22 \
-+ "ldr lr, [sp, #4]", \
-+ "add r3, #2", \
-+ "sub r0, r0, r1, lsl #2", \
-+ "sub r3, r3, r1, lsl #2", \
-+ "lsls lr, #31", \
-+ "add r2, r0, r1", \
-+ "add ip, r3, r1", \
-+ "lsl r1, #1"
-+
-+ bcs 3f
-+ @ Q0a
-+ vst1.16 {d20[0]}, [r0], r1
-+ vst1.16 {d20[1]}, [r2], r1
-+ vst1.16 {d20[2]}, [r0]
-+ vst1.16 {d20[3]}, [r2]
-+3:
-+ it mi
-+ popmi {pc}
-+ @ P0a
-+ vst1.16 {d18[0]}, [r3], r1
-+ vst1.16 {d18[1]}, [ip], r1
-+ vst1.16 {d18[2]}, [r3]
-+ vst1.16 {d18[3]}, [ip]
-+ pop {pc}
-+
-+endfunc
-+
-+
-+@ void ff_hevc_rpi_v_loop_filter_uv2_neon(uint8_t * src_r, // r0
-+@ unsigned int stride, // r1
-+@ uint32_t tc4, // r2
-+@ uint8_t * src_l, // r3
-+@ unsigned int no_f); // sp[0]
-+@
-+
-+@ no_f
-+@ 0 tl P0a
-+@ 1 tr Q0a
-+@ 2 bl P0b
-+@ 3 br Q0b
-+
-+@ P1: q8, q12
-+@ P0: q9, q13
-+@ Q0: q10, q14
-+@ Q1: q11, q15
-+
-+.macro m_filter_v_uv2_16 bit_depth
-+ cmp r2, #0
-+ it eq
-+ bxeq lr
-+ push {lr}
-+ vld2.32 {d16[0], d18[0]}, [r3], r1
-+ vld2.32 {d20[0], d22[0]}, [r0], r1
-+
-+ cmp r2, #0x10000
-+ vld2.32 {d16[1], d18[1]}, [r3], r1
-+ vld2.32 {d20[1], d22[1]}, [r0], r1
-+
-+ vld2.32 {d17[0], d19[0]}, [r3], r1
-+ vld2.32 {d21[0], d23[0]}, [r0], r1
-+
-+ vld2.32 {d17[1], d19[1]}, [r3], r1
-+ vld2.32 {d21[1], d23[1]}, [r0], r1
-+ blo 10f
-+
-+ vld2.32 {d24[0], d26[0]}, [r3], r1
-+ vld2.32 {d28[0], d30[0]}, [r0], r1
-+
-+ sub ip, r0, r3
-+ vld2.32 {d24[1], d26[1]}, [r3], r1
-+ vld2.32 {d28[1], d30[1]}, [r0], r1
-+
-+ cmp ip, #8
-+ vld2.32 {d25[0], d27[0]}, [r3], r1
-+ vld2.32 {d29[0], d31[0]}, [r0], r1
-+
-+ vld2.32 {d25[1], d27[1]}, [r3]
-+ vld2.32 {d29[1], d31[1]}, [r0]
-+
-+ hevc_loop_filter_uv_body2_16 q8, q12, q9, q13, q10, q14, q11, q15, \bit_depth, \
-+ "ldr lr, [sp, #4]", \
-+ "neg r1, r1", \
-+ "it eq; cmpeq lr, #0", \
-+ "add r3, #4", \
-+ "add ip, r3, r1", \
-+ "add r2, r0, r1", \
-+ "lsl r1, #1"
-+
-+ bne 1f
-+
-+@ Much/most of the time r0 == r3 + 8 and no_f == 0
-+@ so it is worth having this special case
-+ vst2.32 {d27[1], d29[1]}, [r3], r1 @ P0b, Q0b
-+ vst2.32 {d27[0], d29[0]}, [ip], r1
-+ vst2.32 {d26[1], d28[1]}, [r3], r1
-+ vst2.32 {d26[0], d28[0]}, [ip], r1
-+ vst2.32 {d19[1], d21[1]}, [r3], r1 @ P0a, Q0a
-+ vst2.32 {d19[0], d21[0]}, [ip], r1
-+ vst2.32 {d18[1], d20[1]}, [r3]
-+ vst2.32 {d18[0], d20[0]}, [ip]
-+ pop {pc}
-+
-+@ Either split or partial
-+1:
-+ lsls lr, #29 @ b3 (Q0b) -> C, b2 (P0b) -> N & b31, b1 (Q0a) -> b30, b0 (P0a) -> b29
-+ ittt cs
-+ addcs r0, r0, r1, lsl #1
-+ addcs r2, r2, r1, lsl #1
-+ bcs 1f
-+ @ Q0b
-+ vst1.32 {d29[1]}, [r0], r1
-+ vst1.32 {d29[0]}, [r2], r1
-+ vst1.32 {d28[1]}, [r0], r1
-+ vst1.32 {d28[0]}, [r2], r1
-+1:
-+ ittt mi
-+ addmi r3, r3, r1, lsl #1
-+ addmi ip, ip, r1, lsl #1
-+ bmi 1f
-+ @ P0b
-+ vst1.32 {d27[1]}, [r3], r1
-+ vst1.32 {d27[0]}, [ip], r1
-+ vst1.32 {d26[1]}, [r3], r1
-+ vst1.32 {d26[0]}, [ip], r1
-+1:
-+ lsls lr, #2 @ b30 (Q0a) -> C, b29 (P0a) -> N & b31
-+ bcs 1f
-+ @ Q0a
-+ vst1.32 {d21[1]}, [r0], r1
-+ vst1.32 {d21[0]}, [r2], r1
-+ vst1.32 {d20[1]}, [r0]
-+ vst1.32 {d20[0]}, [r2]
-+1:
-+ it mi
-+ popmi {pc}
-+ @ P0a
-+ vst1.32 {d19[1]}, [r3], r1
-+ vst1.32 {d19[0]}, [ip], r1
-+ vst1.32 {d18[1]}, [r3]
-+ vst1.32 {d18[0]}, [ip]
-+ pop {pc}
-+
-+@ Single lump (rather than double)
-+10:
-+ @ As we have post inced r0/r3 in the load the easiest thing to do is
-+ @ to subtract and write forwards, rather than backwards (as above)
-+ @ b0 (P0a) -> N, b1 (Q0a) -> C
-+
-+ hevc_loop_filter_uv_body1_16 q8, q9, q10, q11, \bit_depth, \
-+ "ldr lr, [sp, #4]", \
-+ "add r3, #4", \
-+ "sub r0, r0, r1, lsl #2", \
-+ "sub r3, r3, r1, lsl #2", \
-+ "lsls lr, #31", \
-+ "add r2, r0, r1", \
-+ "add ip, r3, r1", \
-+ "lsl r1, #1"
-+
-+ bcs 3f
-+ @ Q0a
-+ vst1.32 {d20[0]}, [r0], r1
-+ vst1.32 {d20[1]}, [r2], r1
-+ vst1.32 {d21[0]}, [r0]
-+ vst1.32 {d21[1]}, [r2]
-+3:
-+ it mi
-+ popmi {pc}
-+ @ P0a
-+ vst1.32 {d18[0]}, [r3], r1
-+ vst1.32 {d18[1]}, [ip], r1
-+ vst1.32 {d19[0]}, [r3]
-+ vst1.32 {d19[1]}, [ip]
-+ pop {pc}
-+.endm
-+
-+
-+@ The NEON version is faster under ideal circumstances (i.e. everything in L1)
-+@ But in real world testing it is ~20% slower, presumably due to code size
-+
-+#if 0 // NEON version
-+
-+/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh,
-+ * const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
-+ * int in_inc0, int in_inc1)
-+ */
-+function ff_hevc_rpi_deblocking_boundary_strengths_neon, export=1
-+ mov ip, sp
-+ push {a1-a3,v1-v8,lr}
-+ ldm ip, {v1-v6}
-+ cmp a1, #2
-+ bls 2f
-+ vpush {d8-d13}
-+ sub v5, v5, #10
-+ sub v6, v6, #10
-+1:
-+ vld2.32 {d0[0], d2[0]}, [a3]!
-+ vld2.32 {d4[0], d6[0]}, [a4]!
-+ vmov.u8 q12, #0
-+ ldrb a2, [a3], #1
-+ ldrb ip, [a4], #1
-+ ldrb v8, [a3], #1
-+ ldrb lr, [a4], #1
-+ add a2, v1, a2, lsl #2
-+ vld1.8 {d24[0]}, [a3], v5
-+ add ip, v3, ip, lsl #2
-+ vld1.8 {d25[0]}, [a4], v6
-+ add v8, v2, v8, lsl #2
-+ vld1.32 {d16[0]}, [a2]
-+ add lr, v4, lr, lsl #2
-+ vld1.32 {d20[0]}, [ip]
-+ vld1.32 {d18[0]}, [v8]
-+ vld1.32 {d22[0]}, [lr]
-+
-+ vld2.32 {d0[1], d2[1]}, [a3]!
-+ vld2.32 {d4[1], d6[1]}, [a4]!
-+ ldrb a2, [a3], #1
-+ vmov.u16 d12, #1
-+ ldrb ip, [a4], #1
-+ vmov.u16 d13, #2
-+ ldrb v8, [a3], #1
-+ vmov.u16 d27, #4
-+ ldrb lr, [a4], #1
-+ add a2, v1, a2, lsl #2
-+ vld1.8 {d24[2]}, [a3], v5
-+ add ip, v3, ip, lsl #2
-+ vld1.8 {d25[2]}, [a4], v6
-+ add v8, v2, v8, lsl #2
-+ vld1.32 {d16[1]}, [a2]
-+ add lr, v4, lr, lsl #2
-+ vld1.32 {d20[1]}, [ip]
-+ vld1.32 {d18[1]}, [v8]
-+ vld1.32 {d22[1]}, [lr]
-+
-+ vld2.32 {d1[0], d3[0]}, [a3]!
-+ vld2.32 {d5[0], d7[0]}, [a4]!
-+ ldrb a2, [a3], #1
-+ ldrb ip, [a4], #1
-+ ldrb lr, [a4], #1
-+ ldrb v8, [a3], #1
-+ add a2, v1, a2, lsl #2
-+ vld1.8 {d24[4]}, [a3], v5
-+ add ip, v3, ip, lsl #2
-+ vld1.8 {d25[4]}, [a4], v6
-+ add v8, v2, v8, lsl #2
-+ vld1.32 {d17[0]}, [a2]
-+ add lr, v4, lr, lsl #2
-+ vld1.32 {d21[0]}, [ip]
-+ vld1.32 {d19[0]}, [v8]
-+ vld1.32 {d23[0]}, [lr]
-+
-+ vld2.32 {d1[1], d3[1]}, [a3]!
-+ vld2.32 {d5[1], d7[1]}, [a4]!
-+ ldrb a2, [a3], #1
-+ ldrb ip, [a4], #1
-+ ldrb v8, [a3], #1
-+ ldrb lr, [a4], #1
-+ add a2, v1, a2, lsl #2
-+ vld1.8 {d24[6]}, [a3], v5
-+ add ip, v3, ip, lsl #2
-+ vld1.8 {d25[6]}, [a4], v6
-+ add v8, v2, v8, lsl #2
-+ vld1.32 {d17[1]}, [a2]
-+ add lr, v4, lr, lsl #2
-+ vld1.32 {d21[1]}, [ip]
-+ vld1.32 {d19[1]}, [v8]
-+ vld1.32 {d23[1]}, [lr]
-+
-+ @ So now we have:
-+ @ q0.32[i] = curr[i].mv[0]
-+ @ q1.32[i] = curr[i].mv[1]
-+ @ q2.32[i] = neigh[i].mv[0]
-+ @ q3.32[i] = neigh[i].mv[1]
-+ @ q8.32[i] = curr_rpl0[curr[i].ref_idx[0]]
-+ @ q9.32[i] = curr_rpl1[curr[i].ref_idx[1]]
-+ @ q10.32[i] = neigh_rpl0[neigh[i].ref_idx[0]]
-+ @ q11.32[i] = neigh_rpl1[neigh[i].ref_idx[1]]
-+ @ d24.16[i] = curr[i].pred_flag
-+ @ d25.16[i] = neigh[i].pred_flag
-+
-+ vtst.16 d28, d24, d12
-+ vtst.16 d29, d24, d13
-+ vadd.i16 d8, d24, d12
-+ vadd.i16 d9, d25, d12
-+ vtst.16 d30, d25, d12
-+ vtst.16 d31, d25, d13
-+ veor d26, d8, d9
-+ ldr lr, [sp, 6*8 + 1*4]
-+ vmovl.s16 q4, d28
-+ vmovl.s16 q5, d29
-+ teq lr, #1
-+ vmovl.s16 q14, d30
-+ it ne
-+ lslne v1, lr, #1
-+ vmovl.s16 q15, d31
-+ it ne
-+ rsbne v2, v1, #32
-+ vbif q0, q1, q4
-+ vbif q2, q3, q14
-+ vbif q1, q0, q5
-+ vbif q3, q2, q15
-+ vabd.s16 q12, q0, q2
-+ vabd.s16 q2, q1
-+ vabd.s16 q0, q3
-+ vabd.s16 q1, q3
-+ vbif q8, q9, q4
-+ vbif q10, q11, q14
-+ vbif q9, q8, q5
-+ vbif q11, q10, q15
-+ vclt.u16 d6, d24, d27
-+ vclt.u16 d8, d2, d27
-+ vclt.u16 d7, d25, d27
-+ vclt.u16 d9, d3, d27
-+ vclt.u16 d2, d0, d27
-+ vclt.u16 d0, d4, d27
-+ vclt.u16 d3, d1, d27
-+ vclt.u16 d1, d5, d27
-+ vceq.i32 q12, q10, q8
-+ vceq.i32 q10, q9
-+ vceq.i32 q8, q11
-+ vceq.i32 q9, q11
-+ vshrn.i32 d6, q3, #8
-+ vshrn.i32 d7, q4, #8
-+ vshrn.i32 d8, q1, #8
-+ vshrn.i32 d9, q0, #8
-+ vmovn.i32 d4, q12
-+ vmovn.i32 d2, q10
-+ vmovn.i32 d3, q8
-+ vmovn.i32 d5, q9
-+ vand q2, q3
-+ vrev16.8 q3, q3
-+ vand q2, q3
-+ vand q1, q4
-+ vrev16.8 q4, q4
-+ vand q1, q4
-+ vand d4, d5
-+ vand d2, d3
-+ vbic d0, d12, d4
-+ vshr.u16 d26, #2
-+ vbic d0, d2
-+ vmov.i16 d1, #0x5555
-+ vorr d0, d26
-+ bne 10f
-+
-+ @ Merge results into result word, no duplicates
-+ vmov a2, s0
-+ vmov v8, s1
-+ vmov.u16 ip, d0[1]
-+ vmov.u16 lr, d0[3]
-+ lsl a2, #30
-+ lsl v8, #30
-+ lsl ip, #30
-+ lsl lr, #30
-+ orr a2, ip, a2, lsr #2
-+ orr v8, lr, v8, lsr #2
-+ orr a2, v8, a2, lsr #4
-+ subs a1, #4
-+ orr v7, a2, v7, lsr #8
-+ bhi 1b
-+
-+ mov a1, #32
-+ ldr a3, [sp, #6*8]
-+ vpop {d8-d13}
-+ sub a1, a1, a3, lsl #1
-+ mov a1, v7, lsr a1
-+ pop {a2-a4,v1-v8,pc}
-+10:
-+ @ Merge results into result word, with duplicates
-+ vmul.i16 d0, d1
-+ vmov a2, s0
-+ vmov v8, s1
-+ vmov.u16 ip, d0[1]
-+ vmov.u16 lr, d0[3]
-+ lsl a2, v2
-+ subs a1, #4
-+ lsl v8, v2
-+ lsl ip, v2
-+ lsl lr, v2
-+ ldr v2, [sp, #6*8 + 12*4 + 1*4]
-+T lsr a2, v1
-+T orr a2, ip, a2
-+A orr a2, ip, a2, lsr v1
-+ lsl ip, v1, #1
-+T lsr v8, v1
-+T orr v8, lr, v8
-+A orr v8, lr, v8, lsr v1
-+ lsl lr, v1, #2
-+T lsr a2, ip
-+T orr a2, v8, a2
-+A orr a2, v8, a2, lsr ip
-+ ldr v1, [sp, #6*8 + 12*4]
-+T lsr v7, lr
-+T orr v7, a2, v7
-+A orr v7, a2, v7, lsr lr
-+ bhi 1b
-+
-+ mov a1, #32
-+ ldrd a3, a4, [sp, #6*8]
-+ vpop {d8-d13}
-+ mls a1, a3, a4, a1
-+ mls a1, a3, a4, a1
-+ mov a1, v7, lsr a1
-+ pop {a2-a4,v1-v8,pc}
-+
-+
-+2:
-+ sub v5, v5, #10
-+ sub v6, v6, #10
-+ vmov.u8 d16, #0
-+ blo 3f
-+ vld2.32 {d0[0], d1[0]}, [a3]!
-+ vld2.32 {d2[0], d3[0]}, [a4]!
-+ ldrb a2, [a3], #1
-+ ldrb ip, [a4], #1
-+ ldrb lr, [a4], #1
-+ ldrb v8, [a3], #1
-+ add a2, v1, a2, lsl #2
-+ vld1.8 {d16[0]}, [a3], v5
-+ add ip, v3, ip, lsl #2
-+ vld1.8 {d16[4]}, [a4], v6
-+ add v8, v2, v8, lsl #2
-+ vld1.32 {d4[0]}, [a2]
-+ add lr, v4, lr, lsl #2
-+ vld1.32 {d5[0]}, [ip]
-+ vld1.32 {d6[0]}, [v8]
-+ vld1.32 {d7[0]}, [lr]
-+
-+3:
-+ vld2.32 {d0[1], d1[1]}, [a3]!
-+ vld2.32 {d2[1], d3[1]}, [a4]!
-+ ldrb a2, [a3], #1
-+ vmov.u16 d17, #1
-+ ldrb ip, [a4], #1
-+ vmov.u16 d18, #2
-+ ldrb v8, [a3], #1
-+ vmov.u16 d19, #4
-+ ldrb lr, [a4], #1
-+ add a2, v1, a2, lsl #2
-+ vld1.8 {d16[2]}, [a3], v5
-+ add ip, v3, ip, lsl #2
-+ vld1.8 {d16[6]}, [a4], v6
-+ add v8, v2, v8, lsl #2
-+ vld1.32 {d4[1]}, [a2]
-+ add lr, v4, lr, lsl #2
-+ vld1.32 {d5[1]}, [ip]
-+ vld1.32 {d6[1]}, [v8]
-+ vld1.32 {d7[1]}, [lr]
-+
-+ @ So now we have:
-+ @ d0.32[i] = curr[i].mv[0]
-+ @ d1.32[i] = curr[i].mv[1]
-+ @ d2.32[i] = neigh[i].mv[0]
-+ @ d3.32[i] = neigh[i].mv[1]
-+ @ d4.32[i] = curr_rpl0[curr[i].ref_idx[0]]
-+ @ d5.32[i] = neigh_rpl0[neigh[i].ref_idx[0]]
-+ @ d6.32[i] = curr_rpl1[curr[i].ref_idx[1]]
-+ @ d7.32[i] = neigh_rpl1[neigh[i].ref_idx[1]]
-+ @ d16.16[i] = curr[i].pred_flag
-+ @ d16.16[2+i] = neigh[i].pred_flag
-+
-+ vtst.16 d20, d16, d17
-+ vtst.16 d22, d16, d18
-+ vadd.i16 d30, d16, d17
-+ vswp d2, d3
-+ ldr lr, [sp, #1*4]
-+ vmovl.s16 q10, d20
-+ teq lr, #1
-+ vmovl.s16 q11, d22
-+ it ne
-+ lslne v1, lr, #1
-+ vbif d0, d1, d20
-+ vbif d4, d6, d20
-+ vbif d3, d2, d21
-+ vbif d5, d7, d21
-+ vbif d1, d0, d22
-+ vbif d6, d4, d22
-+ vbif d2, d3, d23
-+ vbif d7, d5, d23
-+ vshr.u16 d30, #2
-+ vabd.s16 d24, d0, d3
-+ vabd.s16 d25, d1, d2
-+ vabd.s16 q0, q0, q1
-+ vceq.i32 d2, d4, d5
-+ vceq.i32 d20, d5, d6
-+ vceq.i32 d21, d4, d7
-+ vceq.i32 d3, d6, d7
-+ vclt.u16 d6, d24, d19
-+ vclt.u16 d7, d25, d19
-+ vclt.u16 d22, d1, d19
-+ vclt.u16 d23, d0, d19
-+ vshrn.i32 d6, q3, #8
-+ vmovn.i32 d2, q1
-+ vshrn.i32 d7, q11, #8
-+ vmovn.i32 d3, q10
-+ vand q0, q3, q1
-+ it ne
-+ rsbne v2, v1, #32
-+ vrev16.8 q3, q3
-+ vand q0, q3
-+ vsra.u64 d30, #32
-+ vshr.u64 q1, q0, #32
-+ vand q0, q1
-+ vbic d0, d17, d0
-+ vand d30, d30, d17
-+ vbic d0, d1
-+ vmov.i16 d1, #0x5555
-+ vorr d0, d30
-+ bne 10f
-+
-+ @ Construct result word, no duplicates
-+ cmp a1, #2
-+ vmov.u16 a1, d0[1]
-+ vmov.u16 a2, d0[0]
-+ it eq
-+ orreq a1, a2, a1, lsl #2
-+ pop {a2-a4,v1-v8,pc}
-+10:
-+ @ Construct result word, with duplicates
-+ cmp a1, #2
-+ vmul.i16 d0, d1
-+ vmov a2, s0
-+ vmov.u16 a1, d0[1]
-+ lsl a2, #16
-+ pkhbt a1, a1, a1, lsl #16
-+ lsr a2, v2
-+ lsr a1, v2
-+T itt eq
-+T lsleq a1, v1
-+T orreq a1, a2, a1
-+A orreq a1, a2, a1, lsl v1
-+ pop {a2-a4,v1-v8,pc}
-+endfunc
-+
-+
-+
-+#else // non-NEON version
-+
-+
-+/* uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh,
-+ * const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
-+ * int in_inc0, in_inc1)
-+ */
-+function ff_hevc_rpi_deblocking_boundary_strengths_neon, export=1
-+ add ip, sp, #4*4
-+ push {a2-a4,v1-v8,lr}
-+ mov v6, #32
-+1: ldmdb ip, {v1-v4}
-+ ldrsb v5, [a3, #8] @ curr->ref_idx
-+ ldrsb v8, [a3, #9]
-+ ldrsb ip, [a4, #8] @ neigh->ref_idx
-+ ldrsb lr, [a4, #9]
-+ ldr v1, [v1, v5, lsl #2]
-+ ldrb v5, [a3, #10] @ curr->pred_flag
-+ ldr v2, [v2, v8, lsl #2]
-+ ldrb v8, [a4, #10] @ neigh->pred_flag
-+ ldr v3, [v3, ip, lsl #2]
-+ ldr v4, [v4, lr, lsl #2]
-+ teq v5, #3
-+ beq 20f
-+ teq v8, #3
-+ beq 90f
-+
-+ tst v5, #1
-+ itee ne
-+ ldrne v5, [a3, #0] @ curr->mv[0]
-+ moveq v1, v2
-+ ldreq v5, [a3, #4] @ curr->mv[1]
-+ tst v8, #1
-+ itee ne
-+ ldrne v8, [a4, #0] @ neigh->mv[0]
-+ moveq v3, v4
-+ ldreq v8, [a4, #4] @ neigh->mv[1]
-+ teq v1, v3
-+ bne 10f
-+ ldr lr, =0xFFFCFFFC
-+ ssub16 ip, v8, v5
-+ ssub16 v5, v5, v8
-+ sel v5, v5, ip
-+ ands v5, v5, lr
-+ @ drop through
-+10: it ne
-+ movne v5, #1<<30
-+11:
-+ sub v6, v6, #2
-+T mov v7, v7, lsr #2
-+ subs a2, a2, #1
-+A orr v7, v5, v7, lsr #2
-+T orr v7, v5, v7
-+ bhi 11b
-+
-+ ldrd v3, v4, [sp, #16*4]
-+ ldr a2, [sp]
-+ add ip, sp, #16*4
-+ subs a1, a1, #1
-+ add a3, a3, v3
-+ add a4, a4, v4
-+ bhi 1b
-+ mov a1, v7, lsr v6
-+ pop {a2-a4,v1-v8,pc}
-+
-+20: teq v8, #3
-+ bne 10b
-+
-+ teq v1, v3
-+ it eq
-+ teqeq v2, v4
-+ bne 40f
-+ teq v1, v2
-+ bne 30f
-+
-+ ldrd v1, v2, [a3] @ curr->mv
-+ ldrd v3, v4, [a4] @ neigh->mv
-+ ldr lr, =0xFFFCFFFC
-+ ssub16 ip, v3, v1
-+ ssub16 v5, v1, v3
-+ sel v5, v5, ip
-+ ands v5, v5, lr
-+ bne 25f
-+ ssub16 ip, v4, v2
-+ ssub16 v5, v2, v4
-+ sel v5, v5, ip
-+ ands v5, v5, lr
-+ beq 11b
-+ @ drop through
-+25: ssub16 ip, v4, v1
-+ ssub16 v5, v1, v4
-+ sel v5, v5, ip
-+ ands v5, v5, lr
-+ bne 10b
-+ ssub16 ip, v3, v2
-+ ssub16 v5, v2, v3
-+ sel v5, v5, ip
-+ ands v5, v5, lr
-+ b 10b
-+
-+30: ldrd v1, v2, [a3] @ curr->mv
-+ ldrd v3, v4, [a4] @ neigh->mv
-+ ldr lr, =0xFFFCFFFC
-+ ssub16 ip, v3, v1
-+ ssub16 v5, v1, v3
-+ sel v5, v5, ip
-+ ands v5, v5, lr
-+ bne 10b
-+ ssub16 ip, v4, v2
-+ ssub16 v5, v2, v4
-+ sel v5, v5, ip
-+ ands v5, v5, lr
-+ b 10b
-+
-+40: teq v1, v4
-+ ite eq
-+ teqeq v2, v3
-+ bne 10b
-+
-+ ldrd v1, v2, [a3] @ curr->mv
-+ ldrd v3, v4, [a4] @ neigh->mv
-+ ldr lr, =0xFFFCFFFC
-+ b 25b
-+
-+90:
-+ mov v5, #1<<30
-+ b 11b
-+endfunc
-+
-+
-+#endif
-+
-+
-+@ =============================================================================
-+@
-+@ 10 bit
-+
-+function hevc_loop_filter_luma_body_10
-+ m_filter_luma 10, q11, q15
-+endfunc
-+
-+function ff_hevc_rpi_h_loop_filter_luma_neon_10, export=1
-+ hevc_loop_filter_luma_start
-+ b .Lh_loop_luma_common_10
-+endfunc
-+
-+function ff_hevc_rpi_h_loop_filter_luma2_neon_10, export=1
-+ cmp r3, #0
-+ it eq
-+ bxeq lr
-+ push {r4-r10,lr} @ 32 bytes
-+ ldr r10, [sp, #32]
-+.Lh_loop_luma_common_10:
-+ m_filter_h_luma_16 10
-+endfunc
-+
-+function ff_hevc_rpi_v_loop_filter_luma_neon_10, export=1
-+ hevc_loop_filter_luma_start
-+ sub r4, r0, #8
-+ b .Lv_loop_luma_common_10
-+endfunc
-+
-+function ff_hevc_rpi_v_loop_filter_luma2_neon_10, export=1
-+ cmp r3, #0
-+ it eq
-+ bxeq lr
-+ push {r4-r10,lr} @ 32 bytes
-+ ldr r4, [sp, #36]
-+ ldr r10, [sp, #32]
-+
-+.Lv_loop_luma_common_10:
-+ m_filter_v_luma_16 10
-+endfunc
-+
-+function ff_hevc_rpi_h_loop_filter_uv_neon_10, export=1
-+ m_filter_h_uv_16 10
-+endfunc
-+
-+function ff_hevc_rpi_v_loop_filter_uv2_neon_10, export=1
-+ m_filter_v_uv2_16 10
-+endfunc
-+
-diff --git a/libavcodec/arm/rpi_hevcdsp_idct_neon.S b/libavcodec/arm/rpi_hevcdsp_idct_neon.S
-new file mode 100644
-index 0000000000..db10da16d3
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevcdsp_idct_neon.S
-@@ -0,0 +1,183 @@
-+/*
-+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "libavutil/arm/asm.S"
-+#include "neon.S"
-+
-+/* uses registers q8 - q13 for temp values */
-+.macro tr4_luma_shift shift
-+ vaddl.s16 q8, d28, d30 // c0 = src0 + src2
-+ vaddl.s16 q9, d30, d31 // c1 = src2 + src3
-+ vsubl.s16 q10, d28, d31 // c2 = src0 - src3
-+ vaddl.s16 q11, d28, d31 // src0 + src3
-+
-+ vmul.i32 q12, q8, d1[0] // 29 * c0
-+ vmul.i32 q13, q10, d2[0] // 55 * c2
-+ vmul.i32 q8, q8, d2[0] // 55 * c0
-+ vmull.s16 q14, d29, d0[0] // c3 = 74 * src1
-+
-+ vsubw.s16 q11, q11, d30 // src0 - src2 + src3
-+ vmla.i32 q12, q9, d2[0] // 29 * c0 + 55 * c1
-+ vmls.i32 q13, q9, d1[0] // 55 * c2 - 29 * c1
-+ vmla.i32 q8, q10, d1[0] // 55 * c0 + 29 * c2
-+
-+ vmul.i32 q11, q11, d0[0] // dst2 = 74 * (src0 - src2 + src3)
-+ vadd.i32 q12, q12, q14 // dst0 = 29 * c0 + 55 * c1 + c3
-+ vadd.i32 q13, q13, q14 // dst1 = 55 * c2 - 29 * c1 + c3
-+ vsub.i32 q8, q8, q14 // dst3 = 55 * c0 + 29 * c2 - c3
-+
-+ vqrshrn.s32 d28, q12, \shift
-+ vqrshrn.s32 d29, q13, \shift
-+ vqrshrn.s32 d30, q11, \shift
-+ vqrshrn.s32 d31, q8, \shift
-+.endm
-+
-+/*
-+ * tr4_shift shift
-+ *
-+ * One pass of a 4-point even/odd butterfly using the multipliers 64, 83, 36.
-+ *
-+ * In:    d28, d29, d30, d31 = src0, src1, src2, src3 (signed 16-bit lanes)
-+ *        d0[0] = 83, d0[1] = 36 (16-bit scalar lanes, per the comments below)
-+ * Out:   d28 = e0 + o0, d29 = e1 + o1, d30 = e1 - o1, d31 = e0 - o0,
-+ *        each narrowed by vqrshrn (rounding, saturating right shift).
-+ * shift: pasted verbatim as the vqrshrn operand, so it presumably has to
-+ *        carry its own '#' - confirm against the callers.
-+ * Temps: q8 - q11; q14 and q15 (the inputs) are overwritten as well.
-+ */
-+.macro tr4_shift shift
-+ vmull.s16 q9, d29, d0[0] // 83 * src1
-+ vmull.s16 q8, d29, d0[1] // 36 * src1
-+ vshll.s16 q14, d28, #6 // 64 * src0
-+ vshll.s16 q10, d30, #6 // 64 * src2
-+ vmlal.s16 q9, d31, d0[1] // 83 * src1 + 36 * src3 o0
-+ vmlsl.s16 q8, d31, d0[0] // 36 * src1 - 83 * src3 o1
-+ vadd.s32 q11, q14, q10 // 64 * (src0 + src2) e0
-+ vsub.s32 q10, q14, q10 // 64 * (src0 - src2) e1
-+ vadd.s32 q14, q11, q9 // e0 + o0
-+ vadd.s32 q15, q10, q8 // e1 + o1
-+ vsub.s32 q8, q10, q8 // e1 - o1
-+ vsub.s32 q9, q11, q9 // e0 - o0
-+
-+ // Narrow the four 32-bit results back into the input registers.
-+ vqrshrn.s32 d28, q14, \shift
-+ vqrshrn.s32 d29, q15, \shift
-+ vqrshrn.s32 d30, q8, \shift
-+ vqrshrn.s32 d31, q9, \shift
-+.endm
-+
-+/*
-+ * tr8_process: 8-point transform butterfly shared by tr8_vert and tr8_horiz.
-+ *
-+ * d0..d7  D registers holding src0..src7 on entry; on exit argument N holds
-+ *         dst[N], narrowed by vqrshrn (rounding, saturating right shift).
-+ * tmp0    Q scratch register (aliasing restriction noted on the argument)
-+ * tmp1    Q scratch register (aliasing restriction noted on the argument)
-+ * shift   bare integer; the macro supplies the '#' itself
-+ * I1..I3  optional instructions dropped in after the first three multiplies
-+ *         (tr8_horiz leaves them empty)
-+ *
-+ * The literal d0[..] / d1[..] operands are the real registers d0 and d1, not
-+ * the macro arguments of the same name. Per the comments below they hold
-+ * d0[0] = 83, d0[1] = 36, d1[0] = 75, d1[1] = 89, d1[2] = 18, d1[3] = 50.
-+ * q1 - q7 are overwritten as accumulators, so the arguments presumably have
-+ * to live outside q0 - q7 - confirm against the callers.
-+ */
-+.macro tr8_process d0, d1, d2, d3, d4, d5, d6, d7, \
-+ tmp0, /* Q reg which doesn't alias with d4, d6 or d7 */ \
-+ tmp1, /* Q reg which doesn't alias with d7 or d0 */ \
-+ shift, I1, I2, I3
-+
-+ // Odd part: q4..q7 accumulate o_8[0..3] from src1, src3, src5, src7.
-+ vmull.s16 q4, \d1, d1[1] // 89 * src1
-+ \I1
-+ vmull.s16 q5, \d1, d1[0] // 75 * src1
-+ \I2
-+ vmull.s16 q6, \d1, d1[3] // 50 * src1
-+ \I3
-+ vmull.s16 q7, \d1, d1[2] // 18 * src1
-+ vmlal.s16 q4, \d3, d1[0] // 75 * src3
-+ vmlsl.s16 q5, \d3, d1[2] //-18 * src3
-+ vmlsl.s16 q6, \d3, d1[1] //-89 * src3
-+ vmlsl.s16 q7, \d3, d1[3] //-50 * src3
-+
-+ // tr4
-+ vmull.s16 q1, \d2, d0[0] // 83 * src(1*2)
-+ vmull.s16 q2, \d2, d0[1] // 36 * src(1*2)
-+
-+ vmlal.s16 q4, \d5, d1[3] // 50 * src5
-+ vmlsl.s16 q5, \d5, d1[1] //-89 * src5
-+ vmlal.s16 q6, \d5, d1[2] // 18 * src5
-+ vmlal.s16 q7, \d5, d1[0] // 75 * src5
-+
-+ vshll.s16 q3, \d0, #6 // 64 * src(0*2)
-+ vshll.s16 \tmp0, \d4, #6 // 64 * src(2*2)
-+ vmlal.s16 q1, \d6, d0[1] // 83 * src(1*2) + 36 * src(3*2) o0
-+ vmlsl.s16 q2, \d6, d0[0] // 36 * src(1*2) - 83 * src(3*2) o1
-+ vadd.i32 \tmp1, q3, \tmp0 // 64 * (src(0*2) + src(2*2)) e0
-+ vsub.i32 \tmp0, q3, \tmp0 // 64 * (src(0*2) - src(2*2)) e1
-+
-+ vmlal.s16 q4, \d7, d1[2] // 18 * src7
-+ vmlsl.s16 q5, \d7, d1[3] //-50 * src7
-+ vmlal.s16 q6, \d7, d1[0] // 75 * src7
-+ vmlsl.s16 q7, \d7, d1[1] //-89 * src7
-+
-+ vsub.i32 q3, \tmp1, q1 // e0 - o0
-+ vadd.i32 \tmp1, \tmp1, q1 // e0 + o0
-+ vadd.i32 q1, \tmp0, q2 // e1 + o1
-+ vsub.i32 q2, \tmp0, q2 // e1 - o1
-+
-+ // Combine the even (e_8) and odd (o_8) halves into the eight outputs.
-+ vadd.i32 \tmp0, \tmp1, q4 // e_8[0] + o_8[0], dst[0]
-+ vsub.i32 q4, \tmp1, q4 // e_8[0] - o_8[0], dst[7]
-+ vsub.i32 \tmp1, q3, q7 // e_8[3] - o_8[3], dst[4]
-+ vadd.i32 q7, q3, q7 // e_8[3] + o_8[3], dst[3]
-+ vadd.i32 q3, q1, q5 // e_8[1] + o_8[1], dst[1]
-+ vsub.i32 q5, q1, q5 // e_8[1] - o_8[1], dst[6]
-+ vsub.i32 q1, q2, q6 // e_8[2] - o_8[2], dst[5]
-+ vadd.i32 q6, q2, q6 // e_8[2] + o_8[2], dst[2]
-+ // Narrow each 32-bit result into the argument register with the same index.
-+ vqrshrn.s32 \d0, \tmp0, #\shift
-+ vqrshrn.s32 \d4, \tmp1, #\shift
-+ vqrshrn.s32 \d1, q3, #\shift
-+ vqrshrn.s32 \d5, q1, #\shift
-+ vqrshrn.s32 \d2, q6, #\shift
-+ vqrshrn.s32 \d6, q5, #\shift
-+ vqrshrn.s32 \d3, q7, #\shift
-+ vqrshrn.s32 \d7, q4, #\shift
-+.endm
-+
-+/*
-+ * tr8_vert: load eight D registers of coefficients and run tr8_process on
-+ * them with a fixed shift of 7.
-+ *
-+ * Even-numbered arguments (d0, d2, d4, d6) are loaded through r0 and
-+ * odd-numbered ones through r2. Every load carries a 64-bit alignment hint
-+ * and post-increments its pointer by r3.
-+ * NOTE(review): presumably r2 = r0 + one row and r3 = two rows, set up by the
-+ * caller in rpi_hevc_idct_fn_neon.S - confirm there.
-+ *
-+ * q01, q23 are handed to tr8_process as its tmp0 / tmp1 scratch registers.
-+ * I1..I3 are forwarded (quoted) for interleaving inside tr8_process.
-+ */
-+.macro tr8_vert d0, d1, d2, d3, d4, d5, d6, d7, q01, q23, I1, I2, I3
-+ vld1.16 {\d0}, [r0 :64], r3
-+ vld1.16 {\d1}, [r2 :64], r3
-+ vld1.16 {\d2}, [r0 :64], r3
-+ vld1.16 {\d3}, [r2 :64], r3
-+ vld1.16 {\d4}, [r0 :64], r3
-+ vld1.16 {\d5}, [r2 :64], r3
-+ vld1.16 {\d6}, [r0 :64], r3
-+ vld1.16 {\d7}, [r2 :64], r3
-+
-+ tr8_process \
-+ \d0, \d1, \d2, \d3, \d4, \d5, \d6, \d7, \
-+ \q01, \q23, 7, "\I1", "\I2", "\I3"
-+.endm
-+
-+/*
-+ * tr8_horiz: run tr8_process on registers that are already loaded, then
-+ * interleave and store the results.
-+ *
-+ * tr8_process is invoked with the caller's shift and no interleaved
-+ * instructions. Afterwards argument N is vzip'ed with argument N + 4, the
-+ * first four registers are stored with vst4.16 through r0 and the last four
-+ * through r2; both stores carry a 128-bit alignment hint and post-increment
-+ * by r3. The {first-last} register lists mean d0..d3 and d4..d7 must each be
-+ * four consecutive D registers.
-+ * NOTE(review): the vzip + vst4 interleave looks like the transpose back to
-+ * row order - confirm against the caller.
-+ */
-+.macro tr8_horiz d0, d1, d2, d3, d4, d5, d6, d7, q01, q23, shift
-+ tr8_process \
-+ \d0, \d1, \d2, \d3, \d4, \d5, \d6, \d7, \
-+ \q01, \q23, \shift
-+
-+ vzip.16 \d0, \d4
-+ vzip.16 \d1, \d5
-+ vzip.16 \d2, \d6
-+ vzip.16 \d3, \d7
-+ vst4.16 {\d0-\d3}, [r0 :128], r3
-+ vst4.16 {\d4-\d7}, [r2 :128], r3
-+.endm
-+
-+#define BIT_DEPTH 8
-+#include "rpi_hevc_idct_fn_neon.S"
-+
-+.text
-+
-+// Transform multipliers, two signed 16-bit values per .word (low halfword =
-+// lower lane on a little-endian target). Lane names for tr4f / tr8f follow
-+// their use in tr4_shift and tr8_process; the load itself is in the included
-+// rpi_hevc_idct_fn_neon.S.
-+.align 4
-+tr4f:
-+.word 0x00240053 // d0[1] = 36, d0[0] = 83
-+.word 0x00000000
-+tr8f:
-+.word 0x0059004b // d1[1] = 89, d1[0] = 75
-+.word 0x00320012 // d1[3] = 50, d1[2] = 18
-+tr16:
-+// NOTE(review): no user of tr16 is visible here. If the four words are loaded
-+// as one register pair the lanes are presumably d2[0..3] then d3[0..3] -
-+// confirm against the 16x16 code before relying on the names below.
-+.word 0x005a0057 // 90, 87 (presumably d2[1], d2[0])
-+.word 0x00500046 // 80, 70 (presumably d2[3], d2[2])
-+.word 0x0039002b // 57, 43 (presumably d3[1], d3[0])
-+.word 0x00190009 // 25, 9 (presumably d3[3], d3[2])
-+
-+#undef BIT_DEPTH
-+#define BIT_DEPTH 10
-+#include "rpi_hevc_idct_fn_neon.S"
-+
-diff --git a/libavcodec/arm/rpi_hevcdsp_init_arm.c b/libavcodec/arm/rpi_hevcdsp_init_arm.c
-new file mode 100644
-index 0000000000..109fa98c29
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevcdsp_init_arm.c
-@@ -0,0 +1,32 @@
-+/*
-+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "libavutil/attributes.h"
-+#include "libavutil/arm/cpu.h"
-+#include "libavcodec/rpi_hevcdsp.h"
-+#include "rpi_hevcdsp_arm.h"
-+
-+/**
-+ * Install the ARM-specific HEVC DSP routines into @p c.
-+ *
-+ * Queries the CPU flags once and, only when NEON is reported, delegates to
-+ * ff_hevcdsp_rpi_init_neon(). Without NEON the context is left untouched.
-+ *
-+ * @param c         DSP context whose function pointers may be overridden
-+ * @param bit_depth sample bit depth, forwarded unchanged
-+ */
-+av_cold void ff_hevcdsp_rpi_init_arm(HEVCDSPContext *c, const int bit_depth)
-+{
-+    const int flags = av_get_cpu_flags();
-+
-+    if (!have_neon(flags))
-+        return;
-+
-+    ff_hevcdsp_rpi_init_neon(c, bit_depth);
-+}
-diff --git a/libavcodec/arm/rpi_hevcdsp_init_neon.c b/libavcodec/arm/rpi_hevcdsp_init_neon.c
-new file mode 100644
-index 0000000000..9294ab8010
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevcdsp_init_neon.c
-@@ -0,0 +1,467 @@
-+/*
-+ * Copyright (c) 2014 Seppo Tomperi <seppo.tomperi@vtt.fi>
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "config.h"
-+#include "libavutil/attributes.h"
-+#include "libavutil/arm/cpu.h"
-+#include "libavcodec/rpi_hevcdsp.h"
-+#include "rpi_hevcdsp_arm.h"
-+#include "libavcodec/avcodec.h"
-+#include "libavcodec/bit_depth_template.c"
-+
-+// NEON inter pred fns for qpel & epel (non-sand) exist in the git repo but
-+// have been removed from head as we never use them.
-+
-+void ff_hevc_rpi_v_loop_filter_luma_neon_8(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
-+void ff_hevc_rpi_h_loop_filter_luma_neon_8(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
-+
-+void ff_hevc_rpi_v_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
-+void ff_hevc_rpi_h_loop_filter_luma_neon_10(uint8_t *_pix, ptrdiff_t _stride, int _beta, int *_tc, uint8_t *_no_p, uint8_t *_no_q);
-+
-+void ff_hevc_rpi_h_loop_filter_luma2_neon_8(uint8_t * _pix_r,
-+ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f);
-+void ff_hevc_rpi_v_loop_filter_luma2_neon_8(uint8_t * _pix_r,
-+ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f,
-+ uint8_t * _pix_l);
-+void ff_hevc_rpi_h_loop_filter_uv_neon_8(uint8_t * src, unsigned int stride, uint32_t tc4,
-+ unsigned int no_f);
-+void ff_hevc_rpi_v_loop_filter_uv2_neon_8(uint8_t * src_r, unsigned int stride, uint32_t tc4,
-+ uint8_t * src_l,
-+ unsigned int no_f);
-+
-+void ff_hevc_rpi_h_loop_filter_luma2_neon_10(uint8_t * _pix_r,
-+ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f);
-+void ff_hevc_rpi_v_loop_filter_luma2_neon_10(uint8_t * _pix_r,
-+ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f,
-+ uint8_t * _pix_l);
-+void ff_hevc_rpi_h_loop_filter_uv_neon_10(uint8_t * src, unsigned int stride, uint32_t tc4,
-+ unsigned int no_f);
-+void ff_hevc_rpi_v_loop_filter_uv2_neon_10(uint8_t * src_r, unsigned int stride, uint32_t tc4,
-+ uint8_t * src_l,
-+ unsigned int no_f);
-+
-+void ff_hevc_rpi_transform_4x4_neon_8(int16_t *coeffs, int col_limit);
-+void ff_hevc_rpi_transform_8x8_neon_8(int16_t *coeffs, int col_limit);
-+void ff_hevc_rpi_idct_4x4_dc_neon_8(int16_t *coeffs);
-+void ff_hevc_rpi_idct_8x8_dc_neon_8(int16_t *coeffs);
-+void ff_hevc_rpi_idct_16x16_dc_neon_8(int16_t *coeffs);
-+void ff_hevc_rpi_idct_32x32_dc_neon_8(int16_t *coeffs);
-+void ff_hevc_rpi_transform_luma_4x4_neon_8(int16_t *coeffs);
-+
-+void ff_hevc_rpi_transform_4x4_neon_10(int16_t *coeffs, int col_limit);
-+void ff_hevc_rpi_transform_8x8_neon_10(int16_t *coeffs, int col_limit);
-+void ff_hevc_rpi_idct_4x4_dc_neon_10(int16_t *coeffs);
-+void ff_hevc_rpi_idct_8x8_dc_neon_10(int16_t *coeffs);
-+void ff_hevc_rpi_idct_16x16_dc_neon_10(int16_t *coeffs);
-+void ff_hevc_rpi_idct_32x32_dc_neon_10(int16_t *coeffs);
-+void ff_hevc_rpi_transform_luma_4x4_neon_10(int16_t *coeffs);
-+
-+void ff_hevc_rpi_add_residual_4x4_neon_8(uint8_t *_dst, int16_t *coeffs,
-+ ptrdiff_t stride);
-+void ff_hevc_rpi_add_residual_8x8_neon_8(uint8_t *_dst, int16_t *coeffs,
-+ ptrdiff_t stride);
-+void ff_hevc_rpi_add_residual_16x16_neon_8(uint8_t *_dst, int16_t *coeffs,
-+ ptrdiff_t stride);
-+void ff_hevc_rpi_add_residual_32x32_neon_8(uint8_t *_dst, int16_t *coeffs,
-+ ptrdiff_t stride);
-+
-+void ff_hevc_rpi_add_residual_4x4_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc);
-+void ff_hevc_rpi_add_residual_8x8_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc);
-+void ff_hevc_rpi_add_residual_16x16_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc);
-+void ff_hevc_rpi_add_residual_32x32_dc_neon_8(uint8_t *_dst, ptrdiff_t stride, int dc);
-+
-+
-+void ff_hevc_rpi_add_residual_4x4_neon_10(uint8_t *_dst, int16_t *coeffs,
-+ ptrdiff_t stride);
-+void ff_hevc_rpi_add_residual_8x8_neon_10(uint8_t *_dst, int16_t *coeffs,
-+ ptrdiff_t stride);
-+void ff_hevc_rpi_add_residual_16x16_neon_10(uint8_t *_dst, int16_t *coeffs,
-+ ptrdiff_t stride);
-+void ff_hevc_rpi_add_residual_32x32_neon_10(uint8_t *_dst, int16_t *coeffs,
-+ ptrdiff_t stride);
-+
-+void ff_hevc_rpi_add_residual_4x4_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc);
-+void ff_hevc_rpi_add_residual_8x8_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc);
-+void ff_hevc_rpi_add_residual_16x16_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc);
-+void ff_hevc_rpi_add_residual_32x32_dc_neon_10(uint8_t *_dst, ptrdiff_t stride, int dc);
-+
-+
-+void ff_hevc_rpi_add_residual_4x4_u_neon_8(uint8_t *_dst, const int16_t * residual,
-+ ptrdiff_t stride, int dc_v);
-+void ff_hevc_rpi_add_residual_8x8_u_neon_8(uint8_t *_dst, const int16_t * residual,
-+ ptrdiff_t stride, int dc_v);
-+void ff_hevc_rpi_add_residual_16x16_u_neon_8(uint8_t *_dst, const int16_t * residual,
-+ ptrdiff_t stride, int dc_v);
-+void ff_hevc_rpi_add_residual_4x4_v_neon_8(uint8_t *_dst, const int16_t * residual,
-+ ptrdiff_t stride, int dc_u);
-+void ff_hevc_rpi_add_residual_8x8_v_neon_8(uint8_t *_dst, const int16_t * residual,
-+ ptrdiff_t stride, int dc_u);
-+void ff_hevc_rpi_add_residual_16x16_v_neon_8(uint8_t *_dst, const int16_t * residual,
-+ ptrdiff_t stride, int dc_u);
-+void ff_hevc_rpi_add_residual_4x4_c_neon_8(uint8_t *_dst, const int16_t * residual,
-+ ptrdiff_t stride);
-+void ff_hevc_rpi_add_residual_8x8_c_neon_8(uint8_t *_dst, const int16_t * residual,
-+ ptrdiff_t stride);
-+void ff_hevc_rpi_add_residual_16x16_c_neon_8(uint8_t *_dst, const int16_t * residual,
-+ ptrdiff_t stride);
-+void ff_hevc_rpi_add_residual_4x4_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
-+void ff_hevc_rpi_add_residual_8x8_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
-+void ff_hevc_rpi_add_residual_16x16_dc_c_neon_8(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
-+
-+
-+void ff_hevc_rpi_add_residual_4x4_u_neon_10(uint8_t *_dst, const int16_t * residual,
-+ ptrdiff_t stride, int dc_v);
-+void ff_hevc_rpi_add_residual_8x8_u_neon_10(uint8_t *_dst, const int16_t * residual,
-+ ptrdiff_t stride, int dc_v);
-+void ff_hevc_rpi_add_residual_16x16_u_neon_10(uint8_t *_dst, const int16_t * residual,
-+ ptrdiff_t stride, int dc_v);
-+void ff_hevc_rpi_add_residual_4x4_v_neon_10(uint8_t *_dst, const int16_t * residual,
-+ ptrdiff_t stride, int dc_u);
-+void ff_hevc_rpi_add_residual_8x8_v_neon_10(uint8_t *_dst, const int16_t * residual,
-+ ptrdiff_t stride, int dc_u);
-+void ff_hevc_rpi_add_residual_16x16_v_neon_10(uint8_t *_dst, const int16_t * residual,
-+ ptrdiff_t stride, int dc_u);
-+void ff_hevc_rpi_add_residual_4x4_c_neon_10(uint8_t *_dst, const int16_t * residual,
-+ ptrdiff_t stride);
-+void ff_hevc_rpi_add_residual_8x8_c_neon_10(uint8_t *_dst, const int16_t * residual,
-+ ptrdiff_t stride);
-+void ff_hevc_rpi_add_residual_16x16_c_neon_10(uint8_t *_dst, const int16_t * residual,
-+ ptrdiff_t stride);
-+void ff_hevc_rpi_add_residual_4x4_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
-+void ff_hevc_rpi_add_residual_8x8_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
-+void ff_hevc_rpi_add_residual_16x16_dc_c_neon_10(uint8_t *_dst, ptrdiff_t stride, int32_t dc);
-+
-+void ff_hevc_rpi_sao_edge_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
-+void ff_hevc_rpi_sao_edge_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
-+void ff_hevc_rpi_sao_edge_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
-+void ff_hevc_rpi_sao_edge_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
-+
-+void ff_hevc_rpi_sao_edge_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
-+void ff_hevc_rpi_sao_edge_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
-+void ff_hevc_rpi_sao_edge_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
-+void ff_hevc_rpi_sao_edge_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *_sao_offset_val, int eo, int width, int height);
-+
-+void ff_hevc_rpi_sao_edge_c_8_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
-+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
-+ int eo, int width, int height);
-+void ff_hevc_rpi_sao_edge_c_16_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
-+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
-+ int eo, int width, int height);
-+void ff_hevc_rpi_sao_edge_c_32_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
-+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
-+ int eo, int width, int height);
-+
-+void ff_hevc_rpi_sao_edge_c_8_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
-+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
-+ int eo, int width, int height);
-+void ff_hevc_rpi_sao_edge_c_16_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
-+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
-+ int eo, int width, int height);
-+void ff_hevc_rpi_sao_edge_c_32_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
-+ const int16_t *_sao_offset_val_u, const int16_t *_sao_offset_val_v,
-+ int eo, int width, int height);
-+
-+void ff_hevc_rpi_sao_band_c_8_neon_8(uint8_t *_dst, const uint8_t *_src,
-+ ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+ const int16_t *sao_offset_val_u, int sao_left_class_u,
-+ const int16_t *sao_offset_val_v, int sao_left_class_v,
-+ int width, int height);
-+void ff_hevc_rpi_sao_band_c_16_neon_8(uint8_t *_dst, const uint8_t *_src,
-+ ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+ const int16_t *sao_offset_val_u, int sao_left_class_u,
-+ const int16_t *sao_offset_val_v, int sao_left_class_v,
-+ int width, int height);
-+void ff_hevc_rpi_sao_band_c_32_neon_8(uint8_t *_dst, const uint8_t *_src,
-+ ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+ const int16_t *sao_offset_val_u, int sao_left_class_u,
-+ const int16_t *sao_offset_val_v, int sao_left_class_v,
-+ int width, int height);
-+
-+void ff_hevc_rpi_sao_band_c_8_neon_10(uint8_t *_dst, const uint8_t *_src,
-+ ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+ const int16_t *sao_offset_val_u, int sao_left_class_u,
-+ const int16_t *sao_offset_val_v, int sao_left_class_v,
-+ int width, int height);
-+void ff_hevc_rpi_sao_band_c_16_neon_10(uint8_t *_dst, const uint8_t *_src,
-+ ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+ const int16_t *sao_offset_val_u, int sao_left_class_u,
-+ const int16_t *sao_offset_val_v, int sao_left_class_v,
-+ int width, int height);
-+void ff_hevc_rpi_sao_band_c_32_neon_10(uint8_t *_dst, const uint8_t *_src,
-+ ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+ const int16_t *sao_offset_val_u, int sao_left_class_u,
-+ const int16_t *sao_offset_val_v, int sao_left_class_v,
-+ int width, int height);
-+
-+void ff_hevc_rpi_sao_band_8_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+ int16_t *sao_offset_val, int sao_left_class, int width, int height);
-+void ff_hevc_rpi_sao_band_16_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+ int16_t *sao_offset_val, int sao_left_class, int width, int height);
-+void ff_hevc_rpi_sao_band_32_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+ int16_t *sao_offset_val, int sao_left_class, int width, int height);
-+void ff_hevc_rpi_sao_band_64_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+ int16_t *sao_offset_val, int sao_left_class, int width, int height);
-+
-+void ff_hevc_rpi_sao_band_8_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+ int16_t *sao_offset_val, int sao_left_class, int width, int height);
-+void ff_hevc_rpi_sao_band_16_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+ int16_t *sao_offset_val, int sao_left_class, int width, int height);
-+void ff_hevc_rpi_sao_band_32_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+ int16_t *sao_offset_val, int sao_left_class, int width, int height);
-+void ff_hevc_rpi_sao_band_64_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+ int16_t *sao_offset_val, int sao_left_class, int width, int height);
-+
-+
-+uint32_t ff_hevc_rpi_deblocking_boundary_strengths_neon(int pus, int dup, const struct HEVCRpiMvField *curr, const struct HEVCRpiMvField *neigh,
-+ const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
-+ int in_inc0, int in_inc1);
-+void ff_hevc_rpi_cpy_blks8x4_neon(uint8_t *dst, unsigned int stride_dst, const uint8_t *src, unsigned stride_src, unsigned int width, unsigned int height);
-+
-+
-+/*
-+ * 48-wide SAO edge filter (8-bit), composed from the 32-wide and 16-wide
-+ * NEON kernels. The second call starts 32 bytes further into both buffers.
-+ * The width argument is not consulted; the fixed widths 32 and 16 are passed on.
-+ */
-+static void ff_hevc_rpi_sao_edge_48_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst,
-+                                           int16_t *_sao_offset_val, int eo, int width, int height)
-+{
-+    uint8_t *const dst_tail = _dst + 32;
-+    uint8_t *const src_tail = _src + 32;
-+
-+    ff_hevc_rpi_sao_edge_32_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height);
-+    ff_hevc_rpi_sao_edge_16_neon_8(dst_tail, src_tail, stride_dst, _sao_offset_val, eo, 16, height);
-+}
-+/*
-+ * 48-wide SAO edge filter (10-bit), composed from the 32-wide and 16-wide
-+ * NEON kernels. The second call starts 64 bytes further into both buffers
-+ * (presumably 32 samples of two bytes each).
-+ * The width argument is not consulted; the fixed widths 32 and 16 are passed on.
-+ */
-+static void ff_hevc_rpi_sao_edge_48_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst,
-+                                            int16_t *_sao_offset_val, int eo, int width, int height)
-+{
-+    uint8_t *const dst_tail = _dst + 64;
-+    uint8_t *const src_tail = _src + 64;
-+
-+    ff_hevc_rpi_sao_edge_32_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 32, height);
-+    ff_hevc_rpi_sao_edge_16_neon_10(dst_tail, src_tail, stride_dst, _sao_offset_val, eo, 16, height);
-+}
-+
-+/*
-+ * 48-wide SAO band filter (8-bit), composed from the 32-wide and 16-wide
-+ * NEON kernels. The second call starts 32 bytes further into both buffers.
-+ * The width argument is not consulted; the fixed widths 32 and 16 are passed on.
-+ */
-+static void ff_hevc_rpi_sao_band_48_neon_8(uint8_t *_dst, uint8_t *_src,
-+                                           ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+                                           int16_t *sao_offset_val, int sao_left_class,
-+                                           int width, int height)
-+{
-+    uint8_t *const dst_tail = _dst + 32;
-+    uint8_t *const src_tail = _src + 32;
-+
-+    ff_hevc_rpi_sao_band_32_neon_8(_dst, _src, stride_dst, stride_src,
-+                                   sao_offset_val, sao_left_class, 32, height);
-+    ff_hevc_rpi_sao_band_16_neon_8(dst_tail, src_tail, stride_dst, stride_src,
-+                                   sao_offset_val, sao_left_class, 16, height);
-+}
-+/*
-+ * 48-wide SAO band filter (10-bit), composed from the 32-wide and 16-wide
-+ * NEON kernels. The second call starts 64 bytes further into both buffers
-+ * (presumably 32 samples of two bytes each).
-+ * The width argument is not consulted; the fixed widths 32 and 16 are passed on.
-+ */
-+static void ff_hevc_rpi_sao_band_48_neon_10(uint8_t *_dst, uint8_t *_src,
-+                                            ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+                                            int16_t *sao_offset_val, int sao_left_class,
-+                                            int width, int height)
-+{
-+    uint8_t *const dst_tail = _dst + 64;
-+    uint8_t *const src_tail = _src + 64;
-+
-+    ff_hevc_rpi_sao_band_32_neon_10(_dst, _src, stride_dst, stride_src,
-+                                    sao_offset_val, sao_left_class, 32, height);
-+    ff_hevc_rpi_sao_band_16_neon_10(dst_tail, src_tail, stride_dst, stride_src,
-+                                    sao_offset_val, sao_left_class, 16, height);
-+}
-+
-+#if SAO_FILTER_N == 6
-+/*
-+ * 24-wide SAO edge filter (8-bit), composed from the 16-wide and 8-wide
-+ * NEON kernels. The second call starts 16 bytes further into both buffers.
-+ * The width argument is not consulted; the fixed widths 16 and 8 are passed on.
-+ */
-+static void ff_hevc_rpi_sao_edge_24_neon_8(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst,
-+                                           int16_t *_sao_offset_val, int eo, int width, int height)
-+{
-+    uint8_t *const dst_tail = _dst + 16;
-+    uint8_t *const src_tail = _src + 16;
-+
-+    ff_hevc_rpi_sao_edge_16_neon_8(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height);
-+    ff_hevc_rpi_sao_edge_8_neon_8(dst_tail, src_tail, stride_dst, _sao_offset_val, eo, 8, height);
-+}
-+/*
-+ * 24-wide SAO edge filter (10-bit), composed from the 16-wide and 8-wide
-+ * NEON kernels. The second call starts 32 bytes further into both buffers
-+ * (presumably 16 samples of two bytes each).
-+ * The width argument is not consulted; the fixed widths 16 and 8 are passed on.
-+ */
-+static void ff_hevc_rpi_sao_edge_24_neon_10(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst,
-+                                            int16_t *_sao_offset_val, int eo, int width, int height)
-+{
-+    uint8_t *const dst_tail = _dst + 32;
-+    uint8_t *const src_tail = _src + 32;
-+
-+    ff_hevc_rpi_sao_edge_16_neon_10(_dst, _src, stride_dst, _sao_offset_val, eo, 16, height);
-+    ff_hevc_rpi_sao_edge_8_neon_10(dst_tail, src_tail, stride_dst, _sao_offset_val, eo, 8, height);
-+}
-+
-+/*
-+ * 24-wide SAO band filter (8-bit), composed from the 16-wide and 8-wide
-+ * NEON kernels. The second call starts 16 bytes further into both buffers.
-+ * The width argument is not consulted; the fixed widths 16 and 8 are passed on.
-+ */
-+static void ff_hevc_rpi_sao_band_24_neon_8(uint8_t *_dst, uint8_t *_src,
-+                                           ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+                                           int16_t *sao_offset_val, int sao_left_class,
-+                                           int width, int height)
-+{
-+    uint8_t *const dst_tail = _dst + 16;
-+    uint8_t *const src_tail = _src + 16;
-+
-+    ff_hevc_rpi_sao_band_16_neon_8(_dst, _src, stride_dst, stride_src,
-+                                   sao_offset_val, sao_left_class, 16, height);
-+    ff_hevc_rpi_sao_band_8_neon_8(dst_tail, src_tail, stride_dst, stride_src,
-+                                  sao_offset_val, sao_left_class, 8, height);
-+}
-+/*
-+ * 24-wide SAO band filter (10-bit), composed from the 16-wide and 8-wide
-+ * NEON kernels. The second call starts 32 bytes further into both buffers
-+ * (presumably 16 samples of two bytes each).
-+ * The width argument is not consulted; the fixed widths 16 and 8 are passed on.
-+ */
-+static void ff_hevc_rpi_sao_band_24_neon_10(uint8_t *_dst, uint8_t *_src,
-+                                            ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+                                            int16_t *sao_offset_val, int sao_left_class,
-+                                            int width, int height)
-+{
-+    uint8_t *const dst_tail = _dst + 32;
-+    uint8_t *const src_tail = _src + 32;
-+
-+    ff_hevc_rpi_sao_band_16_neon_10(_dst, _src, stride_dst, stride_src,
-+                                    sao_offset_val, sao_left_class, 16, height);
-+    ff_hevc_rpi_sao_band_8_neon_10(dst_tail, src_tail, stride_dst, stride_src,
-+                                   sao_offset_val, sao_left_class, 8, height);
-+}
-+
-+/*
-+ * 24-wide chroma ("_c") SAO edge filter (8-bit), composed from the 16-wide
-+ * and 8-wide NEON kernels. The second call starts 32 bytes further into both
-+ * buffers (presumably 16 interleaved U/V pairs of one byte per component).
-+ * The width argument is not consulted; the fixed widths 16 and 8 are passed on.
-+ */
-+static void ff_hevc_rpi_sao_edge_c_24_neon_8(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
-+                                             const int16_t *_sao_offset_val_u,
-+                                             const int16_t *_sao_offset_val_v,
-+                                             int eo, int width, int height)
-+{
-+    uint8_t *const dst_tail = _dst + 32;
-+    const uint8_t *const src_tail = _src + 32;
-+
-+    ff_hevc_rpi_sao_edge_c_16_neon_8(_dst, _src, stride_dst,
-+                                     _sao_offset_val_u, _sao_offset_val_v, eo, 16, height);
-+    ff_hevc_rpi_sao_edge_c_8_neon_8(dst_tail, src_tail, stride_dst,
-+                                    _sao_offset_val_u, _sao_offset_val_v, eo, 8, height);
-+}
-+/*
-+ * 24-wide chroma ("_c") SAO edge filter (10-bit), composed from the 16-wide
-+ * and 8-wide NEON kernels. The second call starts 64 bytes further into both
-+ * buffers (presumably 16 interleaved U/V pairs of two bytes per component).
-+ * The width argument is not consulted; the fixed widths 16 and 8 are passed on.
-+ */
-+static void ff_hevc_rpi_sao_edge_c_24_neon_10(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
-+                                              const int16_t *_sao_offset_val_u,
-+                                              const int16_t *_sao_offset_val_v,
-+                                              int eo, int width, int height)
-+{
-+    uint8_t *const dst_tail = _dst + 64;
-+    const uint8_t *const src_tail = _src + 64;
-+
-+    ff_hevc_rpi_sao_edge_c_16_neon_10(_dst, _src, stride_dst,
-+                                      _sao_offset_val_u, _sao_offset_val_v, eo, 16, height);
-+    ff_hevc_rpi_sao_edge_c_8_neon_10(dst_tail, src_tail, stride_dst,
-+                                     _sao_offset_val_u, _sao_offset_val_v, eo, 8, height);
-+}
-+
-+/*
-+ * 24-wide chroma ("_c") SAO band filter (8-bit), composed from the 16-wide
-+ * and 8-wide NEON kernels. The second call starts 32 bytes further into both
-+ * buffers (presumably 16 interleaved U/V pairs of one byte per component).
-+ * The width argument is not consulted; the fixed widths 16 and 8 are passed on.
-+ */
-+static void ff_hevc_rpi_sao_band_c_24_neon_8(uint8_t *_dst, const uint8_t *_src,
-+                                             ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+                                             const int16_t *sao_offset_val_u, int sao_left_class_u,
-+                                             const int16_t *sao_offset_val_v, int sao_left_class_v,
-+                                             int width, int height)
-+{
-+    uint8_t *const dst_tail = _dst + 32;
-+    const uint8_t *const src_tail = _src + 32;
-+
-+    ff_hevc_rpi_sao_band_c_16_neon_8(_dst, _src, stride_dst, stride_src,
-+                                     sao_offset_val_u, sao_left_class_u,
-+                                     sao_offset_val_v, sao_left_class_v, 16, height);
-+    ff_hevc_rpi_sao_band_c_8_neon_8(dst_tail, src_tail, stride_dst, stride_src,
-+                                    sao_offset_val_u, sao_left_class_u,
-+                                    sao_offset_val_v, sao_left_class_v, 8, height);
-+}
-+/*
-+ * 24-wide chroma ("_c") SAO band filter (10-bit), composed from the 16-wide
-+ * and 8-wide NEON kernels. The second call starts 64 bytes further into both
-+ * buffers (presumably 16 interleaved U/V pairs of two bytes per component).
-+ * The width argument is not consulted; the fixed widths 16 and 8 are passed on.
-+ */
-+static void ff_hevc_rpi_sao_band_c_24_neon_10(uint8_t *_dst, const uint8_t *_src,
-+                                              ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+                                              const int16_t *sao_offset_val_u, int sao_left_class_u,
-+                                              const int16_t *sao_offset_val_v, int sao_left_class_v,
-+                                              int width, int height)
-+{
-+    uint8_t *const dst_tail = _dst + 64;
-+    const uint8_t *const src_tail = _src + 64;
-+
-+    ff_hevc_rpi_sao_band_c_16_neon_10(_dst, _src, stride_dst, stride_src,
-+                                      sao_offset_val_u, sao_left_class_u,
-+                                      sao_offset_val_v, sao_left_class_v, 16, height);
-+    ff_hevc_rpi_sao_band_c_8_neon_10(dst_tail, src_tail, stride_dst, stride_src,
-+                                     sao_offset_val_u, sao_left_class_u,
-+                                     sao_offset_val_v, sao_left_class_v, 8, height);
-+}
-+#endif
-+
-+
-+
-+#if RPI_HEVC_SAO_BUF_STRIDE != 160
-+#error SAO edge src stride not 160 - value used in .S
-+#endif
-+
-+/**
-+ * Point the HEVC DSP context at the NEON implementations.
-+ *
-+ * Only bit depths 8 and 10 have a branch. For any other value neither branch
-+ * runs and just the two bit-depth-independent pointers at the end
-+ * (hevc_deblocking_boundary_strengths, cpy_blk) are assigned.
-+ *
-+ * Slots that this function does not assign keep whatever value the caller
-+ * put there: idct[2..3], add_residual_{u,v,c,dc_c}[3], and
-+ * sao_{band,edge}_filter_c[3..4].
-+ *
-+ * @param c         context whose function pointers are overwritten
-+ * @param bit_depth sample bit depth selecting the _8 or _10 symbol set
-+ */
-+av_cold void ff_hevcdsp_rpi_init_neon(HEVCDSPContext *c, const int bit_depth)
-+{
-+ if (bit_depth == 8) {
-+ // The "_c" luma loop-filter slots share the routine of the plain slots.
-+ c->hevc_v_loop_filter_luma = ff_hevc_rpi_v_loop_filter_luma_neon_8;
-+ c->hevc_v_loop_filter_luma_c = ff_hevc_rpi_v_loop_filter_luma_neon_8;
-+ c->hevc_h_loop_filter_luma = ff_hevc_rpi_h_loop_filter_luma_neon_8;
-+ c->hevc_h_loop_filter_luma_c = ff_hevc_rpi_h_loop_filter_luma_neon_8;
-+ c->hevc_h_loop_filter_luma2 = ff_hevc_rpi_h_loop_filter_luma2_neon_8;
-+ c->hevc_v_loop_filter_luma2 = ff_hevc_rpi_v_loop_filter_luma2_neon_8;
-+ c->hevc_h_loop_filter_uv = ff_hevc_rpi_h_loop_filter_uv_neon_8;
-+ c->hevc_v_loop_filter_uv2 = ff_hevc_rpi_v_loop_filter_uv2_neon_8;
-+ // Full transforms: only the 4x4 and 8x8 slots are filled here.
-+ c->idct[0] = ff_hevc_rpi_transform_4x4_neon_8;
-+ c->idct[1] = ff_hevc_rpi_transform_8x8_neon_8;
-+ c->idct_dc[0] = ff_hevc_rpi_idct_4x4_dc_neon_8;
-+ c->idct_dc[1] = ff_hevc_rpi_idct_8x8_dc_neon_8;
-+ c->idct_dc[2] = ff_hevc_rpi_idct_16x16_dc_neon_8;
-+ c->idct_dc[3] = ff_hevc_rpi_idct_32x32_dc_neon_8;
-+ c->add_residual[0] = ff_hevc_rpi_add_residual_4x4_neon_8;
-+ c->add_residual[1] = ff_hevc_rpi_add_residual_8x8_neon_8;
-+ c->add_residual[2] = ff_hevc_rpi_add_residual_16x16_neon_8;
-+ c->add_residual[3] = ff_hevc_rpi_add_residual_32x32_neon_8;
-+ c->add_residual_dc[0] = ff_hevc_rpi_add_residual_4x4_dc_neon_8;
-+ c->add_residual_dc[1] = ff_hevc_rpi_add_residual_8x8_dc_neon_8;
-+ c->add_residual_dc[2] = ff_hevc_rpi_add_residual_16x16_dc_neon_8;
-+ c->add_residual_dc[3] = ff_hevc_rpi_add_residual_32x32_dc_neon_8;
-+ // The u / v / c / dc_c residual tables stop at 16x16 (index 2).
-+ c->add_residual_u[0] = ff_hevc_rpi_add_residual_4x4_u_neon_8;
-+ c->add_residual_u[1] = ff_hevc_rpi_add_residual_8x8_u_neon_8;
-+ c->add_residual_u[2] = ff_hevc_rpi_add_residual_16x16_u_neon_8;
-+ c->add_residual_v[0] = ff_hevc_rpi_add_residual_4x4_v_neon_8;
-+ c->add_residual_v[1] = ff_hevc_rpi_add_residual_8x8_v_neon_8;
-+ c->add_residual_v[2] = ff_hevc_rpi_add_residual_16x16_v_neon_8;
-+ c->add_residual_c[0] = ff_hevc_rpi_add_residual_4x4_c_neon_8;
-+ c->add_residual_c[1] = ff_hevc_rpi_add_residual_8x8_c_neon_8;
-+ c->add_residual_c[2] = ff_hevc_rpi_add_residual_16x16_c_neon_8;
-+ c->add_residual_dc_c[0] = ff_hevc_rpi_add_residual_4x4_dc_c_neon_8;
-+ c->add_residual_dc_c[1] = ff_hevc_rpi_add_residual_8x8_dc_c_neon_8;
-+ c->add_residual_dc_c[2] = ff_hevc_rpi_add_residual_16x16_dc_c_neon_8;
-+ c->transform_4x4_luma = ff_hevc_rpi_transform_luma_4x4_neon_8;
-+ // SAO slots 0..4 are widths 8, 16, 32, 48, 64; the 48-wide entries are
-+ // the static C wrappers in this file (32-wide + 16-wide kernel).
-+ c->sao_band_filter[0] = ff_hevc_rpi_sao_band_8_neon_8;
-+ c->sao_band_filter[1] = ff_hevc_rpi_sao_band_16_neon_8;
-+ c->sao_band_filter[2] = ff_hevc_rpi_sao_band_32_neon_8;
-+ c->sao_band_filter[3] = ff_hevc_rpi_sao_band_48_neon_8;
-+ c->sao_band_filter[4] = ff_hevc_rpi_sao_band_64_neon_8;
-+ c->sao_edge_filter[0] = ff_hevc_rpi_sao_edge_8_neon_8;
-+ c->sao_edge_filter[1] = ff_hevc_rpi_sao_edge_16_neon_8;
-+ c->sao_edge_filter[2] = ff_hevc_rpi_sao_edge_32_neon_8;
-+ c->sao_edge_filter[3] = ff_hevc_rpi_sao_edge_48_neon_8;
-+ c->sao_edge_filter[4] = ff_hevc_rpi_sao_edge_64_neon_8;
-+ // Optional sixth slot: the 24-wide wrappers (16-wide + 8-wide kernel).
-+#if SAO_FILTER_N == 6
-+ c->sao_band_filter[5] = ff_hevc_rpi_sao_band_24_neon_8;
-+ c->sao_edge_filter[5] = ff_hevc_rpi_sao_edge_24_neon_8;
-+#endif
-+ // Chroma SAO: slots 0..2 (and 5 when enabled); 3 and 4 are not assigned.
-+ c->sao_band_filter_c[0] = ff_hevc_rpi_sao_band_c_8_neon_8;
-+ c->sao_band_filter_c[1] = ff_hevc_rpi_sao_band_c_16_neon_8;
-+ c->sao_band_filter_c[2] = ff_hevc_rpi_sao_band_c_32_neon_8;
-+
-+ c->sao_edge_filter_c[0] = ff_hevc_rpi_sao_edge_c_8_neon_8;
-+ c->sao_edge_filter_c[1] = ff_hevc_rpi_sao_edge_c_16_neon_8;
-+ c->sao_edge_filter_c[2] = ff_hevc_rpi_sao_edge_c_32_neon_8;
-+
-+#if SAO_FILTER_N == 6
-+ c->sao_band_filter_c[5] = ff_hevc_rpi_sao_band_c_24_neon_8;
-+ c->sao_edge_filter_c[5] = ff_hevc_rpi_sao_edge_c_24_neon_8;
-+#endif
-+ }
-+ else if (bit_depth == 10) {
-+ // Same slots as the 8-bit branch, filled with the _10 symbols.
-+ c->hevc_v_loop_filter_luma = ff_hevc_rpi_v_loop_filter_luma_neon_10;
-+ c->hevc_v_loop_filter_luma_c = ff_hevc_rpi_v_loop_filter_luma_neon_10;
-+ c->hevc_h_loop_filter_luma = ff_hevc_rpi_h_loop_filter_luma_neon_10;
-+ c->hevc_h_loop_filter_luma_c = ff_hevc_rpi_h_loop_filter_luma_neon_10;
-+ c->hevc_h_loop_filter_luma2 = ff_hevc_rpi_h_loop_filter_luma2_neon_10;
-+ c->hevc_v_loop_filter_luma2 = ff_hevc_rpi_v_loop_filter_luma2_neon_10;
-+ c->hevc_h_loop_filter_uv = ff_hevc_rpi_h_loop_filter_uv_neon_10;
-+ c->hevc_v_loop_filter_uv2 = ff_hevc_rpi_v_loop_filter_uv2_neon_10;
-+ c->idct[0] = ff_hevc_rpi_transform_4x4_neon_10;
-+ c->idct[1] = ff_hevc_rpi_transform_8x8_neon_10;
-+ c->idct_dc[0] = ff_hevc_rpi_idct_4x4_dc_neon_10;
-+ c->idct_dc[1] = ff_hevc_rpi_idct_8x8_dc_neon_10;
-+ c->idct_dc[2] = ff_hevc_rpi_idct_16x16_dc_neon_10;
-+ c->idct_dc[3] = ff_hevc_rpi_idct_32x32_dc_neon_10;
-+ c->add_residual[0] = ff_hevc_rpi_add_residual_4x4_neon_10;
-+ c->add_residual[1] = ff_hevc_rpi_add_residual_8x8_neon_10;
-+ c->add_residual[2] = ff_hevc_rpi_add_residual_16x16_neon_10;
-+ c->add_residual[3] = ff_hevc_rpi_add_residual_32x32_neon_10;
-+ c->add_residual_dc[0] = ff_hevc_rpi_add_residual_4x4_dc_neon_10;
-+ c->add_residual_dc[1] = ff_hevc_rpi_add_residual_8x8_dc_neon_10;
-+ c->add_residual_dc[2] = ff_hevc_rpi_add_residual_16x16_dc_neon_10;
-+ c->add_residual_dc[3] = ff_hevc_rpi_add_residual_32x32_dc_neon_10;
-+ c->add_residual_u[0] = ff_hevc_rpi_add_residual_4x4_u_neon_10;
-+ c->add_residual_u[1] = ff_hevc_rpi_add_residual_8x8_u_neon_10;
-+ c->add_residual_u[2] = ff_hevc_rpi_add_residual_16x16_u_neon_10;
-+ c->add_residual_v[0] = ff_hevc_rpi_add_residual_4x4_v_neon_10;
-+ c->add_residual_v[1] = ff_hevc_rpi_add_residual_8x8_v_neon_10;
-+ c->add_residual_v[2] = ff_hevc_rpi_add_residual_16x16_v_neon_10;
-+ c->add_residual_c[0] = ff_hevc_rpi_add_residual_4x4_c_neon_10;
-+ c->add_residual_c[1] = ff_hevc_rpi_add_residual_8x8_c_neon_10;
-+ c->add_residual_c[2] = ff_hevc_rpi_add_residual_16x16_c_neon_10;
-+ c->add_residual_dc_c[0] = ff_hevc_rpi_add_residual_4x4_dc_c_neon_10;
-+ c->add_residual_dc_c[1] = ff_hevc_rpi_add_residual_8x8_dc_c_neon_10;
-+ c->add_residual_dc_c[2] = ff_hevc_rpi_add_residual_16x16_dc_c_neon_10;
-+ c->transform_4x4_luma = ff_hevc_rpi_transform_luma_4x4_neon_10;
-+ c->sao_band_filter[0] = ff_hevc_rpi_sao_band_8_neon_10;
-+ c->sao_band_filter[1] = ff_hevc_rpi_sao_band_16_neon_10;
-+ c->sao_band_filter[2] = ff_hevc_rpi_sao_band_32_neon_10;
-+ c->sao_band_filter[3] = ff_hevc_rpi_sao_band_48_neon_10;
-+ c->sao_band_filter[4] = ff_hevc_rpi_sao_band_64_neon_10;
-+
-+ c->sao_edge_filter[0] = ff_hevc_rpi_sao_edge_8_neon_10;
-+ c->sao_edge_filter[1] = ff_hevc_rpi_sao_edge_16_neon_10;
-+ c->sao_edge_filter[2] = ff_hevc_rpi_sao_edge_32_neon_10;
-+ c->sao_edge_filter[3] = ff_hevc_rpi_sao_edge_48_neon_10;
-+ c->sao_edge_filter[4] = ff_hevc_rpi_sao_edge_64_neon_10;
-+#if SAO_FILTER_N == 6
-+ c->sao_band_filter[5] = ff_hevc_rpi_sao_band_24_neon_10;
-+ c->sao_edge_filter[5] = ff_hevc_rpi_sao_edge_24_neon_10;
-+#endif
-+ c->sao_band_filter_c[0] = ff_hevc_rpi_sao_band_c_8_neon_10;
-+ c->sao_band_filter_c[1] = ff_hevc_rpi_sao_band_c_16_neon_10;
-+ c->sao_band_filter_c[2] = ff_hevc_rpi_sao_band_c_32_neon_10;
-+
-+ c->sao_edge_filter_c[0] = ff_hevc_rpi_sao_edge_c_8_neon_10;
-+ c->sao_edge_filter_c[1] = ff_hevc_rpi_sao_edge_c_16_neon_10;
-+ c->sao_edge_filter_c[2] = ff_hevc_rpi_sao_edge_c_32_neon_10;
-+
-+#if SAO_FILTER_N == 6
-+ c->sao_band_filter_c[5] = ff_hevc_rpi_sao_band_c_24_neon_10;
-+ c->sao_edge_filter_c[5] = ff_hevc_rpi_sao_edge_c_24_neon_10;
-+#endif
-+ }
-+
-+ // Layout checks on HEVCRpiMvField: mv at byte 0, ref_idx at 8, pred_flag at
-+ // 10. NOTE(review): presumably these offsets are hard-coded in the NEON
-+ // boundary-strength routine installed just below - confirm in the .S file.
-+ // As plain assert() calls they are compiled out when NDEBUG is defined.
-+ assert(offsetof(HEVCRpiMvField, mv) == 0);
-+ assert(offsetof(HEVCRpiMvField, ref_idx) == 8);
-+ assert(offsetof(HEVCRpiMvField, pred_flag) == 10);
-+ c->hevc_deblocking_boundary_strengths = ff_hevc_rpi_deblocking_boundary_strengths_neon;
-+ c->cpy_blk = ff_hevc_rpi_cpy_blks8x4_neon;
-+}
-diff --git a/libavcodec/arm/rpi_hevcdsp_res16_neon.S b/libavcodec/arm/rpi_hevcdsp_res16_neon.S
-new file mode 100644
-index 0000000000..f831e55a6d
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevcdsp_res16_neon.S
-@@ -0,0 +1,591 @@
-+#include "libavutil/arm/asm.S"
-+#include "neon.S"
-+
-+ .arch_extension mp @ enable PLDW
-+
-+#define BIT_DEPTH 10
-+
-+.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX
-+ vmax.s16 \Q0, \Q_MIN
-+ vmax.s16 \Q1, \Q_MIN
-+ vmax.s16 \Q2, \Q_MIN
-+ vmax.s16 \Q3, \Q_MIN
-+ vmin.s16 \Q0, \Q_MAX
-+ vmin.s16 \Q1, \Q_MAX
-+ vmin.s16 \Q2, \Q_MAX
-+ vmin.s16 \Q3, \Q_MAX
-+.endm
-+
-+@ add_residual4x4(
-+@ uint16_t *_dst, [r0]
-+@ int16_t *res, [r1]
-+@ ptrdiff_t stride) [r2]
-+
-+function JOIN(ff_hevc_rpi_add_residual_4x4_neon_, BIT_DEPTH), export=1
-+ add ip, r0, r2
-+ vld1.16 {q10, q11}, [r1]
-+ lsl r2, #1
-+ vld1.16 {d0}, [r0 :64], r2
-+ vld1.16 {d1}, [ip :64], r2
-+ vld1.16 {d2}, [r0 :64]
-+ vld1.16 {d3}, [ip :64]
-+ sub r0, r2
-+ vqadd.s16 q0, q10
-+ sub ip, r2
-+ vqadd.s16 q1, q11
-+ vmov.i16 q8, #0
-+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1
-+ vmax.s16 q0, q0, q8
-+ vmax.s16 q1, q1, q8
-+ vmin.s16 q0, q0, q9
-+ vmin.s16 q1, q1, q9
-+ vst1.16 {d0}, [r0 :64], r2
-+ vst1.16 {d1}, [ip :64], r2
-+ vst1.16 {d2}, [r0 :64]
-+ vst1.16 {d3}, [ip :64]
-+ bx lr
-+
-+endfunc
-+
-+@ add_residual4x4_dc(
-+@ uint16_t *_dst, [r0]
-+@ ptrdiff_t stride, [r1]
-+@ int dc) [r2]
-+
-+function JOIN(ff_hevc_rpi_add_residual_4x4_dc_neon_, BIT_DEPTH), export=1
-+ add ip, r0, r1
-+ vdup.16 q15, r2
-+ lsl r1, #1
-+ vld1.16 {d0}, [r0 :64], r1
-+ vld1.16 {d1}, [ip :64], r1
-+ vld1.16 {d2}, [r0 :64]
-+ vld1.16 {d3}, [ip :64]
-+ sub r0, r1
-+ vqadd.s16 q0, q15
-+ sub ip, r1
-+ vqadd.s16 q1, q15
-+ vmov.i16 q8, #0
-+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1
-+ vmax.s16 q0, q0, q8
-+ vmax.s16 q1, q1, q8
-+ vmin.s16 q0, q0, q9
-+ vmin.s16 q1, q1, q9
-+ vst1.16 {d0}, [r0 :64], r1
-+ vst1.16 {d1}, [ip :64], r1
-+ vst1.16 {d2}, [r0 :64]
-+ vst1.16 {d3}, [ip :64]
-+ bx lr
-+
-+endfunc
-+
-+
-+@ add_residual8x8(
-+@ uint16_t *_dst, [r0]
-+@ int16_t *res, [r1]
-+@ ptrdiff_t stride) [r2]
-+
-+function JOIN(ff_hevc_rpi_add_residual_8x8_neon_, BIT_DEPTH), export=1
-+ mov r3, #8
-+ vmov.i64 q8, #0
-+ add ip, r0, r2
-+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1
-+ lsl r2, #1
-+1:
-+ vldm r1!, {q10-q13}
-+ vld1.16 {q0}, [r0 :128], r2
-+ vld1.16 {q1}, [ip :128], r2
-+ vld1.16 {q2}, [r0 :128]
-+ vld1.16 {q3}, [ip :128]
-+ sub r0, r2
-+ vqadd.s16 q0, q10
-+ sub ip, r2
-+ vqadd.s16 q1, q11
-+ subs r3, #4
-+ vqadd.s16 q2, q12
-+ vqadd.s16 q3, q13
-+ clip16_4 q0, q1, q2, q3, q8, q9
-+ vst1.16 {q0}, [r0 :128], r2
-+ vst1.16 {q1}, [ip :128], r2
-+ vst1.16 {q2}, [r0 :128], r2
-+ vst1.16 {q3}, [ip :128], r2
-+ bne 1b
-+ bx lr
-+
-+endfunc
-+
-+@ add_residual4x4_dc_c(
-+@ uint16_t *_dst, [r0]
-+@ ptrdiff_t stride, [r1]
-+@ int dc_uv) [r2]
-+
-+function JOIN(ff_hevc_rpi_add_residual_4x4_dc_c_neon_, BIT_DEPTH), export=1
-+ mov r3, #4
-+ vdup.32 q15, r2
-+ b 9f
-+endfunc
-+
-+@ add_residual8x8_dc(
-+@ uint16_t *_dst, [r0]
-+@ ptrdiff_t stride, [r1]
-+@ int dc) [r2]
-+
-+function JOIN(ff_hevc_rpi_add_residual_8x8_dc_neon_, BIT_DEPTH), export=1
-+ vdup.16 q15, r2
-+ mov r3, #8
-+9:
-+ vmov.i16 q8, #0
-+ add ip, r0, r1
-+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1
-+ lsl r1, #1
-+1:
-+ vld1.16 {q0}, [r0 :128], r1
-+ vld1.16 {q1}, [ip :128], r1
-+ vld1.16 {q2}, [r0 :128]
-+ vld1.16 {q3}, [ip :128]
-+ sub r0, r1
-+ vqadd.s16 q0, q15
-+ sub ip, r1
-+ vqadd.s16 q1, q15
-+ subs r3, #4
-+ vqadd.s16 q2, q15
-+ vqadd.s16 q3, q15
-+ clip16_4 q0, q1, q2, q3, q8, q9
-+ vst1.16 {q0}, [r0 :128], r1
-+ vst1.16 {q1}, [ip :128], r1
-+ vst1.16 {q2}, [r0 :128], r1
-+ vst1.16 {q3}, [ip :128], r1
-+ bne 1b
-+ bx lr
-+
-+endfunc
-+
-+@ add_residual16x16(
-+@ uint16_t *_dst, [r0]
-+@ int16_t *res, [r1]
-+@ ptrdiff_t stride) [r2]
-+
-+function JOIN(ff_hevc_rpi_add_residual_16x16_neon_, BIT_DEPTH), export=1
-+ add ip, r0, r2
-+ vmov.i16 q8, #0
-+ lsl r2, #1
-+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1
-+ mov r3, #16
-+1:
-+ vldm r1!, {q10-q13}
-+ @ For RPI Sand we could guarantee :256 but not for general
-+ @ non-RPI allocation. :128 is as good as we can claim
-+ vld1.16 {q0, q1}, [r0 :128]
-+ subs r3, #2
-+ vld1.16 {q2, q3}, [ip :128]
-+ vqadd.s16 q0, q10
-+ vqadd.s16 q1, q11
-+ vqadd.s16 q2, q12
-+ vqadd.s16 q3, q13
-+ clip16_4 q0, q1, q2, q3, q8, q9
-+ vst1.16 {q0, q1}, [r0 :128], r2
-+ vst1.16 {q2, q3}, [ip :128], r2
-+ bne 1b
-+ bx lr
-+endfunc
-+
-+@ add_residual8x8_dc_c(
-+@ uint16_t *_dst, [r0]
-+@ ptrdiff_t stride, [r1]
-+@ int dc_uv) [r2]
-+
-+function JOIN(ff_hevc_rpi_add_residual_8x8_dc_c_neon_, BIT_DEPTH), export=1
-+ mov r3, #8
-+ vdup.32 q15, r2
-+ b 9f
-+endfunc
-+
-+@ add_residual16x16_dc(
-+@ uint16_t *_dst, [r0]
-+@ ptrdiff_t stride, [r1]
-+@ int dc) [r2]
-+
-+function JOIN(ff_hevc_rpi_add_residual_16x16_dc_neon_, BIT_DEPTH), export=1
-+ vdup.i16 q15, r2
-+ mov r3, #16
-+9:
-+ vmov.i16 q8, #0
-+ add ip, r0, r1
-+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1
-+ lsl r1, #1
-+1:
-+ @ For RPI Sand we could guarantee :256 but not for general
-+ @ non-RPI allocation. :128 is as good as we can claim
-+ vld1.16 {q0, q1}, [r0 :128]
-+ subs r3, #2
-+ vqadd.s16 q0, q15
-+ vqadd.s16 q1, q15
-+ vld1.16 {q2, q3}, [ip :128]
-+ vqadd.s16 q2, q15
-+ vqadd.s16 q3, q15
-+ clip16_4 q0, q1, q2, q3, q8, q9
-+ vst1.16 {q0, q1}, [r0 :128], r1
-+ vst1.16 {q2, q3}, [ip :128], r1
-+ bne 1b
-+ bx lr
-+
-+endfunc
-+
-+
-+@ add_residual32x32(
-+@ uint16_t *_dst, [r0]
-+@ int16_t *res, [r1]
-+@ ptrdiff_t stride) [r2]
-+
-+function JOIN(ff_hevc_rpi_add_residual_32x32_neon_, BIT_DEPTH), export=1
-+ push {lr}
-+ mov r3, #32
-+ vmov.i16 q8, #0
-+ add lr, r0, r2
-+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1
-+ add ip, r0, #32
-+1:
-+ vldm r1!, {q10-q13}
-+ vldm r0, {q0-q3}
-+ vqadd.s16 q0, q10
-+ pldw [lr]
-+ vqadd.s16 q1, q11
-+ add lr, r2
-+ vqadd.s16 q2, q12
-+ subs r3, #1
-+ vqadd.s16 q3, q13
-+ clip16_4 q0, q1, q2, q3, q8, q9
-+ vst1.16 {q0-q1}, [r0], r2
-+ vst1.16 {q2-q3}, [ip], r2
-+ bne 1b
-+ pop {pc}
-+
-+endfunc
-+
-+@ add_residual16x16_dc_c(
-+@ uint16_t *_dst, [r0]
-+@ ptrdiff_t stride, [r1]
-+@ int dc_uv) [r2]
-+
-+function JOIN(ff_hevc_rpi_add_residual_16x16_dc_c_neon_, BIT_DEPTH), export=1
-+ mov r3, #16
-+ vdup.32 q15, r2
-+ b 9f
-+endfunc
-+
-+@ add_residual32x32_dc(
-+@ uint16_t *_dst, [r0]
-+@ ptrdiff_t stride, [r1]
-+@ int dc) [r2]
-+
-+function JOIN(ff_hevc_rpi_add_residual_32x32_dc_neon_, BIT_DEPTH), export=1
-+ vdup.16 q15, r2
-+ mov r3, #32
-+9:
-+ vmov.i16 q8, #0
-+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1
-+ add ip, r0, #32
-+1:
-+ vldm r0, {q0-q3}
-+ vqadd.s16 q0, q15
-+ subs r3, #1
-+ vqadd.s16 q1, q15
-+ vqadd.s16 q2, q15
-+ vqadd.s16 q3, q15
-+ clip16_4 q0, q1, q2, q3, q8, q9
-+ vst1.16 {q0-q1}, [r0], r1
-+ vst1.16 {q2-q3}, [ip], r1
-+ bne 1b
-+ bx lr
-+
-+endfunc
-+
-+@ ============================================================================
-+@ U add
-+
-+@ add_residual4x4_u(
-+@ uint16_t *_dst, [r0]
-+@ const int16_t *res, [r1]
-+@ ptrdiff_t stride, [r2]
-+@ int dc) [r3]
-+
-+function JOIN(ff_hevc_rpi_add_residual_4x4_u_neon_, BIT_DEPTH), export=1
-+ vdup.16 q15, r3
-+ add ip, r0, r2
-+ vld1.16 {q10, q11}, [r1 :256]
-+ lsl r2, #1
-+ vld2.16 {d0, d2}, [r0 :128], r2
-+ vld2.16 {d1, d3}, [ip :128], r2
-+ vld2.16 {d4, d6}, [r0 :128]
-+ vld2.16 {d5, d7}, [ip :128]
-+ sub r0, r2
-+ vmov.i16 q8, #0
-+ sub ip, r2
-+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1
-+
-+ vqadd.s16 q0, q10
-+ vqadd.s16 q1, q15
-+ vqadd.s16 q2, q11
-+ vqadd.s16 q3, q15
-+ clip16_4 q0, q1, q2, q3, q8, q9
-+
-+ vst2.16 {d0, d2}, [r0 :128], r2
-+ vst2.16 {d1, d3}, [ip :128], r2
-+ vst2.16 {d4, d6}, [r0 :128]
-+ vst2.16 {d5, d7}, [ip :128]
-+ bx lr
-+endfunc
-+
-+@ add_residual8x8_u(
-+@ uint16_t *_dst, [r0]
-+@ const int16_t *res, [r1]
-+@ ptrdiff_t stride, [r2]
-+@ int dc) [r3]
-+
-+function JOIN(ff_hevc_rpi_add_residual_8x8_u_neon_, BIT_DEPTH), export=1
-+ vdup.16 q15, r3
-+ mov r3, #8
-+ vmov.i16 q8, #0
-+ add ip, r0, r2
-+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1
-+ lsl r2, #1
-+1:
-+ vld2.16 {q0, q1}, [r0 :256]
-+ subs r3, #2
-+ vld2.16 {q2, q3}, [ip :256]
-+ vld1.16 {q10, q11}, [r1 :256]!
-+ vqadd.s16 q0, q10
-+ vqadd.s16 q1, q15
-+ vqadd.s16 q2, q11
-+ vqadd.s16 q3, q15
-+ clip16_4 q0, q1, q2, q3, q8, q9
-+ vst2.16 {q0, q1}, [r0 :256], r2
-+ vst2.16 {q2, q3}, [ip :256], r2
-+ bne 1b
-+ bx lr
-+endfunc
-+
-+@ add_residual16x16_u(
-+@ uint16_t *_dst, [r0]
-+@ const int16_t *res, [r1]
-+@ ptrdiff_t stride, [r2]
-+@ int dc) [r3]
-+
-+function JOIN(ff_hevc_rpi_add_residual_16x16_u_neon_, BIT_DEPTH), export=1
-+ push {lr}
-+ vdup.16 q15, r3
-+ mov r3, #16
-+ vmov.i16 q8, #0
-+ add lr, r0, r2
-+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1
-+ add ip, r0, #32
-+1:
-+ vld2.16 {q0, q1}, [r0 :256]
-+ vld2.16 {q2, q3}, [ip :256]
-+ vld1.16 {q10, q11}, [r1 :256]!
-+ vqadd.s16 q0, q10
-+ pldw [lr]
-+ vqadd.s16 q1, q15
-+ add lr, r2
-+ vqadd.s16 q2, q11
-+ subs r3, #1
-+ vqadd.s16 q3, q15
-+ clip16_4 q0, q1, q2, q3, q8, q9
-+ vst2.16 {q0, q1}, [r0 :256], r2
-+ vst2.16 {q2, q3}, [ip :256], r2
-+ bne 1b
-+ pop {pc}
-+endfunc
-+
-+@ ============================================================================
-+@ V add
-+
-+@ add_residual4x4_v(
-+@ uint16_t *_dst, [r0]
-+@ const int16_t *res, [r1]
-+@ ptrdiff_t stride, [r2]
-+@ int dc) [r3]
-+
-+function JOIN(ff_hevc_rpi_add_residual_4x4_v_neon_, BIT_DEPTH), export=1
-+ vdup.16 q15, r3
-+ add ip, r0, r2
-+ vld1.16 {q10, q11}, [r1 :256]
-+ lsl r2, #1
-+ vld2.16 {d0, d2}, [r0 :128], r2
-+ vld2.16 {d1, d3}, [ip :128], r2
-+ vld2.16 {d4, d6}, [r0 :128]
-+ vld2.16 {d5, d7}, [ip :128]
-+ sub r0, r2
-+ vmov.i16 q8, #0
-+ sub ip, r2
-+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1
-+
-+ vqadd.s16 q0, q15
-+ vqadd.s16 q1, q10
-+ vqadd.s16 q2, q15
-+ vqadd.s16 q3, q11
-+ clip16_4 q0, q1, q2, q3, q8, q9
-+
-+ vst2.16 {d0, d2}, [r0 :128], r2
-+ vst2.16 {d1, d3}, [ip :128], r2
-+ vst2.16 {d4, d6}, [r0 :128]
-+ vst2.16 {d5, d7}, [ip :128]
-+ bx lr
-+endfunc
-+
-+@ add_residual8x8_v(
-+@ uint16_t *_dst, [r0]
-+@ const int16_t *res, [r1]
-+@ ptrdiff_t stride, [r2]
-+@ int dc) [r3]
-+
-+function JOIN(ff_hevc_rpi_add_residual_8x8_v_neon_, BIT_DEPTH), export=1
-+ vdup.16 q15, r3
-+ mov r3, #8
-+ vmov.i16 q8, #0
-+ add ip, r0, r2
-+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1
-+ lsl r2, #1
-+1:
-+ vld2.16 {q0, q1}, [r0 :256]
-+ subs r3, #2
-+ vld2.16 {q2, q3}, [ip :256]
-+ vld1.16 {q10, q11}, [r1 :256]!
-+ vqadd.s16 q0, q15
-+ vqadd.s16 q1, q10
-+ vqadd.s16 q2, q15
-+ vqadd.s16 q3, q11
-+ clip16_4 q0, q1, q2, q3, q8, q9
-+ vst2.16 {q0, q1}, [r0 :256], r2
-+ vst2.16 {q2, q3}, [ip :256], r2
-+ bne 1b
-+ bx lr
-+endfunc
-+
-+@ add_residual16x16_v(
-+@ uint16_t *_dst, [r0]
-+@ const int16_t *res, [r1]
-+@ ptrdiff_t stride, [r2]
-+@ int dc) [r3]
-+
-+function JOIN(ff_hevc_rpi_add_residual_16x16_v_neon_, BIT_DEPTH), export=1
-+ push {lr}
-+ vdup.16 q15, r3
-+ mov r3, #16
-+ vmov.i16 q8, #0
-+ add lr, r0, r2
-+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1
-+ add ip, r0, #32
-+1:
-+ vld2.16 {q0, q1}, [r0 :256]
-+ vld2.16 {q2, q3}, [ip :256]
-+ vld1.16 {q10, q11}, [r1 :256]!
-+ vqadd.s16 q0, q15
-+ pldw [lr]
-+ vqadd.s16 q1, q10
-+ add lr, r2
-+ vqadd.s16 q2, q15
-+ subs r3, #1
-+ vqadd.s16 q3, q11
-+ clip16_4 q0, q1, q2, q3, q8, q9
-+ vst2.16 {q0, q1}, [r0 :256], r2
-+ vst2.16 {q2, q3}, [ip :256], r2
-+ bne 1b
-+ pop {pc}
-+endfunc
-+
-+@ ============================================================================
-+@ U & V add
-+
-+@ add_residual4x4_c(
-+@ uint16_t *_dst, [r0]
-+@ const int16_t *res, [r1]
-+@ ptrdiff_t stride) [r2]
-+
-+function JOIN(ff_hevc_rpi_add_residual_4x4_c_neon_, BIT_DEPTH), export=1
-+ vmov.i16 q8, #0
-+ add ip, r0, r2
-+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1
-+ lsl r2, #1
-+ vldm r1, {q10-q13}
-+ vld2.16 {d0, d2}, [r0 :128], r2
-+ vld2.16 {d1, d3}, [ip :128], r2
-+ vld2.16 {d4, d6}, [r0 :128]
-+ vld2.16 {d5, d7}, [ip :128]
-+
-+ sub r0, r2
-+ vqadd.s16 q0, q10
-+ sub ip, r2
-+ vqadd.s16 q1, q12
-+ vqadd.s16 q2, q11
-+ vqadd.s16 q3, q13
-+ clip16_4 q0, q1, q2, q3, q8, q9
-+
-+ vst2.16 {d0, d2}, [r0 :128], r2
-+ vst2.16 {d1, d3}, [ip :128], r2
-+ vst2.16 {d4, d6}, [r0 :128]
-+ vst2.16 {d5, d7}, [ip :128]
-+ bx lr
-+endfunc
-+
-+@ add_residual8x8_c(
-+@ uint16_t *_dst, [r0]
-+@ const int16_t *res, [r1]
-+@ ptrdiff_t stride) [r2]
-+
-+function JOIN(ff_hevc_rpi_add_residual_8x8_c_neon_, BIT_DEPTH), export=1
-+ push {lr}
-+ add ip, r0, r2
-+ lsl r2, #1
-+ vmov.i16 q8, #0
-+ add r3, r1, #(8*8*2) @ Offset to V
-+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1
-+ mov lr, #8
-+1:
-+ vld1.16 {q10, q11}, [r1 :256]!
-+ subs lr, #2
-+ vld2.16 {q0, q1}, [r0 :256]
-+ vld2.16 {q2, q3}, [ip :256]
-+ vld1.16 {q12, q13}, [r3 :256]!
-+ vqadd.s16 q0, q10
-+ vqadd.s16 q1, q12
-+ vqadd.s16 q2, q11
-+ vqadd.s16 q3, q13
-+ clip16_4 q0, q1, q2, q3, q8, q9
-+ vst2.16 {q0, q1}, [r0 :256], r2
-+ vst2.16 {q2, q3}, [ip :256], r2
-+ bne 1b
-+ pop {pc}
-+endfunc
-+
-+@ add_residual16x16_c(
-+@ uint16_t *_dst, [r0]
-+@ const int16_t *res, [r1]
-+@ ptrdiff_t stride) [r2]
-+
-+function JOIN(ff_hevc_rpi_add_residual_16x16_c_neon_, BIT_DEPTH), export=1
-+ push {r4, lr}
-+ vmov.i16 q8, #0
-+ add r3, r1, #(16*16*2) @ Offset to V
-+ vmov.i16 q9, #(1 << BIT_DEPTH) - 1
-+ add ip, r0, #32
-+ add r4, r0, r2
-+ mov lr, #16
-+1:
-+ vld2.16 {q0, q1}, [r0 :256]
-+ vld2.16 {q2, q3}, [ip :256]
-+ vld1.16 {q10, q11}, [r1 :256]!
-+ vld1.16 {q12, q13}, [r3 :256]!
-+ vqadd.s16 q0, q10
-+ pldw [r4]
-+ vqadd.s16 q1, q12
-+ add r4, r2
-+ vqadd.s16 q2, q11
-+ subs lr, #1
-+ vqadd.s16 q3, q13
-+ clip16_4 q0, q1, q2, q3, q8, q9
-+ vst2.16 {q0, q1}, [r0 :256], r2
-+ vst2.16 {q2, q3}, [ip :256], r2
-+ bne 1b
-+ pop {r4,pc}
-+endfunc
-+
-diff --git a/libavcodec/arm/rpi_hevcdsp_res8_neon.S b/libavcodec/arm/rpi_hevcdsp_res8_neon.S
-new file mode 100644
-index 0000000000..ea3b3faf6f
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevcdsp_res8_neon.S
-@@ -0,0 +1,712 @@
-+#include "libavutil/arm/asm.S"
-+#include "neon.S"
-+
-+ .arch_extension mp @ enable PLDW
-+
-+@ General notes:
-+@
-+@ Residual is generally only guaranteed to be clipped to 16 bits.
-+@ This means that we do need to do vmovl, vqadd, vqmovun
-+@ rather than vaddw, vqmovun (if we were clipped to 15 then we could get away
-+@ with this).
-+@
-+@ There is an exception for the DC case because its transform is guaranteed
-+@ to be small enough that overflow cannot occur during the first add.
-+
-+@ ============================================================================
-+@ Y add
-+
-+function ff_hevc_rpi_add_residual_4x4_neon_8, export=1
-+ add ip, r0, r2
-+ vld1.16 {q0, q1}, [r1]
-+ lsl r2, #1
-+ vld1.32 d4[0], [r0], r2
-+ rsb r3, r2, #0
-+ vld1.32 d4[1], [ip], r2
-+ vld1.32 d5[0], [r0], r3
-+ vld1.32 d5[1], [ip], r3
-+ vmovl.u8 q8, d4
-+ vmovl.u8 q9, d5
-+ vqadd.s16 q0, q8
-+ vqadd.s16 q1, q9
-+ vqmovun.s16 d0, q0
-+ vqmovun.s16 d1, q1
-+ vst1.32 d0[0], [r0], r2
-+ vst1.32 d0[1], [ip], r2
-+ vst1.32 d1[0], [r0]
-+ vst1.32 d1[1], [ip]
-+ bx lr
-+endfunc
-+
-+function ff_hevc_rpi_add_residual_8x8_neon_8, export=1
-+ push {r4, lr}
-+ vld1.16 {q0, q1}, [r1]!
-+ add ip, r0, r2
-+ vld1.8 {d6}, [r0]
-+ add r4, r0, r2, lsl #1
-+ vld1.8 {d7}, [ip]
-+ add lr, ip, r2, lsl #1
-+ lsl r2, #1
-+ mov r3, #8-2
-+ vmovl.u8 q2, d6
-+ vmovl.u8 q3, d7
-+ vqadd.s16 q2, q0
-+ vqadd.s16 q3, q1
-+1:
-+ vld1.16 {q0, q1}, [r1]!
-+ subs r3, #2
-+ vqmovun.s16 d4, q2
-+ vqmovun.s16 d5, q3
-+ vld1.8 {d6}, [r4], r2
-+ vld1.8 {d7}, [lr], r2
-+ vst1.8 {d4}, [r0], r2
-+ vst1.8 {d5}, [ip], r2
-+ vmovl.u8 q2, d6
-+ pldw [r4]
-+ vmovl.u8 q3, d7
-+ vqadd.s16 q2, q0
-+ vqadd.s16 q3, q1
-+ bne 1b
-+
-+ vqmovun.s16 d4, q2
-+ vqmovun.s16 d5, q3
-+ vst1.8 {d4}, [r0]
-+ vst1.8 {d5}, [ip]
-+ pop {r4, pc}
-+endfunc
-+
-+function ff_hevc_rpi_add_residual_16x16_neon_8, export=1
-+ vld1.16 {q0, q1}, [r1]!
-+ add ip, r0, r2
-+ vld1.8 {q3}, [r0]
-+ mov r3, #16-1
-+ vmovl.u8 q2, d6
-+ vmovl.u8 q3, d7
-+ vqadd.s16 q2, q0
-+ vqadd.s16 q3, q1
-+1:
-+ vld1.16 {q0, q1}, [r1]!
-+ subs r3, #1
-+ vqmovun.s16 d4, q2
-+ vqmovun.s16 d5, q3
-+ vld1.8 {q3}, [ip], r2
-+ vst1.8 {q2}, [r0], r2
-+ vmovl.u8 q2, d6
-+ pldw [ip]
-+ vmovl.u8 q3, d7
-+ vqadd.s16 q2, q0
-+ vqadd.s16 q3, q1
-+ bne 1b
-+
-+ vqmovun.s16 d4, q2
-+ vqmovun.s16 d5, q3
-+ vst1.8 {q2}, [r0]
-+ bx lr
-+endfunc
-+
-+function ff_hevc_rpi_add_residual_32x32_neon_8, export=1
-+ vldm r1!, {q0-q3}
-+ vld1.8 {q8, q9}, [r0]
-+ add ip, r0, r2
-+ vmovl.u8 q10, d16
-+ mov r3, #32-1
-+ vmovl.u8 q11, d17
-+ vmovl.u8 q12, d18
-+ vmovl.u8 q13, d19
-+ vqadd.s16 q10, q0
-+ vqadd.s16 q11, q1
-+ vqadd.s16 q12, q2
-+ vqadd.s16 q13, q3
-+1:
-+ vldm r1!, {q0-q3}
-+ vqmovun.s16 d20, q10
-+ vqmovun.s16 d21, q11
-+ vqmovun.s16 d22, q12
-+ vqmovun.s16 d23, q13
-+ vld1.8 {q8, q9}, [ip], r2
-+ subs r3, #1
-+ vst1.8 {q10, q11}, [r0], r2
-+ vmovl.u8 q10, d16
-+ pldw [ip]
-+ vmovl.u8 q11, d17
-+ vmovl.u8 q12, d18
-+ vmovl.u8 q13, d19
-+ vqadd.s16 q10, q0
-+ vqadd.s16 q11, q1
-+ vqadd.s16 q12, q2
-+ vqadd.s16 q13, q3
-+ bne 1b
-+
-+ vqmovun.s16 d20, q10
-+ vqmovun.s16 d21, q11
-+ vqmovun.s16 d22, q12
-+ vqmovun.s16 d23, q13
-+ vst1.8 {q10, q11}, [r0]
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_add_residual_4x4_dc_neon_8(
-+@ uint8_t * dst, // [r0]
-+@ unsigned int stride, // [r1]
-+@ int dc) // [r2]
-+
-+function ff_hevc_rpi_add_residual_4x4_dc_neon_8, export=1
-+ add ip, r0, r1
-+ vdup.16 q15, r2
-+ lsl r1, #1
-+ vld1.32 d4[0], [r0], r1
-+ rsb r3, r1, #0
-+ vld1.32 d4[1], [ip], r1
-+ vld1.32 d5[0], [r0], r3
-+ vld1.32 d5[1], [ip], r3
-+ vaddw.u8 q0, q15, d4
-+ vaddw.u8 q1, q15, d5
-+ vqmovun.s16 d0, q0
-+ vqmovun.s16 d1, q1
-+ vst1.32 d0[0], [r0], r1
-+ vst1.32 d0[1], [ip], r1
-+ vst1.32 d1[0], [r0]
-+ vst1.32 d1[1], [ip]
-+ bx lr
-+endfunc
-+
-+@ ============================================================================
-+@ DC Y or C add
-+
-+@ ff_hevc_rpi_add_residual_4x4_dc_c_neon_8(
-+@ uint8_t * dst, // [r0]
-+@ unsigned int stride, // [r1]
-+@ int dc) // [r2]
-+
-+function ff_hevc_rpi_add_residual_4x4_dc_c_neon_8, export=1
-+ mov r3, #4-2
-+ vdup.32 q15, r2
-+ b 1f
-+endfunc
-+
-+@ ff_hevc_rpi_add_residual_8x8_dc_neon_8(
-+@ uint8_t * dst, // [r0]
-+@ unsigned int stride, // [r1]
-+@ int dc) // [r2]
-+
-+function ff_hevc_rpi_add_residual_8x8_dc_neon_8, export=1
-+ vdup.16 q15, r2
-+ mov r3, #8-2
-+1: vld1.8 d16, [r0]
-+ add ip, r0, r1
-+ push {r4, lr}
-+ vld1.8 d17, [ip]
-+ add r4, r0, r1, lsl #1
-+ vaddw.u8 q0, q15, d16
-+ lsl r1, #1
-+ vaddw.u8 q1, q15, d17
-+ add lr, ip, r1
-+1:
-+ vld1.8 {d16}, [r4], r1
-+ vld1.8 {d17}, [lr], r1
-+ subs r3, #2
-+ vqmovun.s16 d4, q0
-+ vqmovun.s16 d5, q1
-+ vaddw.u8 q0, q15, d16
-+ vaddw.u8 q1, q15, d17
-+ vst1.8 {d4}, [r0], r1
-+ vst1.8 {d5}, [ip], r1
-+ bne 1b
-+
-+ vqmovun.s16 d4, q0
-+ vqmovun.s16 d5, q1
-+ vst1.8 {d4}, [r0]
-+ vst1.8 {d5}, [ip]
-+ pop {r4, pc}
-+endfunc
-+
-+
-+@ ff_hevc_rpi_add_residual_8x8_dc_c_neon_8(
-+@ uint8_t * dst, // [r0]
-+@ unsigned int stride, // [r1]
-+@ int dc) // [r2]
-+
-+function ff_hevc_rpi_add_residual_8x8_dc_c_neon_8, export=1
-+ mov r3, #8-1
-+ vdup.32 q15, r2
-+ b 1f
-+endfunc
-+
-+@ ff_hevc_rpi_add_residual_16x16_dc_neon_8(
-+@ uint8_t * dst, // [r0]
-+@ unsigned int stride, // [r1]
-+@ int dc) // [r2]
-+
-+function ff_hevc_rpi_add_residual_16x16_dc_neon_8, export=1
-+ vdup.16 q15, r2
-+ mov r3, #16-1
-+1: vld1.8 {q8}, [r0]
-+ add ip, r0, r1
-+ vaddw.u8 q0, q15, d16
-+ vaddw.u8 q1, q15, d17
-+1:
-+ vld1.8 {q8}, [ip], r1
-+ subs r3, #1
-+ vqmovun.s16 d4, q0
-+ vqmovun.s16 d5, q1
-+ vaddw.u8 q0, q15, d16
-+ vaddw.u8 q1, q15, d17
-+ vst1.8 {q2}, [r0], r1
-+ bne 1b
-+
-+ vqmovun.s16 d4, q0
-+ vqmovun.s16 d5, q1
-+ vst1.8 {q2}, [r0]
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_add_residual_16x16_dc_c_neon_8(
-+@ uint8_t * dst, // [r0]
-+@ unsigned int stride, // [r1]
-+@ int dc) // [r2]
-+
-+function ff_hevc_rpi_add_residual_16x16_dc_c_neon_8, export=1
-+ mov r3, #16-1
-+ vdup.32 q15, r2
-+ b 1f
-+endfunc
-+
-+@ ff_hevc_rpi_add_residual_32x32_dc_neon_8(
-+@ uint8_t * dst, // [r0]
-+@ unsigned int stride, // [r1]
-+@ int dc) // [r2]
-+
-+function ff_hevc_rpi_add_residual_32x32_dc_neon_8, export=1
-+ vdup.16 q15, r2
-+ mov r3, #32-1
-+1: vld1.8 {q8, q9}, [r0]
-+ add ip, r0, r1
-+ vaddw.u8 q0, q15, d16
-+ vaddw.u8 q1, q15, d17
-+ vaddw.u8 q2, q15, d18
-+ vaddw.u8 q3, q15, d19
-+1:
-+ vqmovun.s16 d20, q0
-+ vqmovun.s16 d21, q1
-+ vqmovun.s16 d22, q2
-+ vqmovun.s16 d23, q3
-+ vld1.8 {q8, q9}, [ip], r1
-+ subs r3, #1
-+ vaddw.u8 q0, q15, d16
-+ vaddw.u8 q1, q15, d17
-+ vaddw.u8 q2, q15, d18
-+ vaddw.u8 q3, q15, d19
-+ vst1.8 {q10, q11}, [r0], r1
-+ bne 1b
-+
-+ vqmovun.s16 d20, q0
-+ vqmovun.s16 d21, q1
-+ vqmovun.s16 d22, q2
-+ vqmovun.s16 d23, q3
-+ vst1.8 {q10, q11}, [r0]
-+ bx lr
-+endfunc
-+
-+@ ============================================================================
-+@ U add
-+
-+@ add_residual4x4_c(
-+@ uint8_t *_dst, [r0]
-+@ const int16_t *res, [r1]
-+@ ptrdiff_t stride, [r2]
-+@ int dc_v) [r3]
-+
-+function ff_hevc_rpi_add_residual_4x4_u_neon_8, export=1
-+ add ip, r0, r2
-+ vld1.16 {q0, q1}, [r1]
-+ lsl r2, #1
-+ vld1.8 {d16}, [r0 :64], r2
-+ vld1.8 {d17}, [ip :64], r2
-+ vld1.8 {d18}, [r0 :64]
-+ sub r0, r2
-+ vld1.8 {d19}, [ip :64]
-+ sub ip, r2
-+ vdup.16 q2, r3
-+ vdup.16 q3, r3
-+ vmovl.u8 q10, d16
-+ vmovl.u8 q11, d17
-+ vmovl.u8 q12, d18
-+ vmovl.u8 q13, d19
-+ vzip.16 q0, q2
-+ vzip.16 q1, q3
-+ vqadd.s16 q0, q10
-+ vqadd.s16 q2, q11
-+ vqadd.s16 q1, q12
-+ vqadd.s16 q3, q13
-+ vqmovun.s16 d0, q0
-+ vqmovun.s16 d1, q2
-+ vqmovun.s16 d2, q1
-+ vqmovun.s16 d3, q3
-+ vst1.8 {d0}, [r0 :64], r2
-+ vst1.8 {d1}, [ip :64], r2
-+ vst1.8 {d2}, [r0 :64]
-+ vst1.8 {d3}, [ip :64]
-+ bx lr
-+endfunc
-+
-+@ add_residual8x8_c(
-+@ uint8_t *_dst, [r0]
-+@ const int16_t *res, [r1]
-+@ ptrdiff_t stride) [r2]
-+@ int dc_v) [r3]
-+
-+function ff_hevc_rpi_add_residual_8x8_u_neon_8, export=1
-+ vdup.16 q15, r3
-+ add ip, r0, r2
-+ push {r4, lr}
-+ vld2.8 {d16, d17}, [r0 :128]
-+ lsl r2, #1
-+ vld2.8 {d18, d19}, [ip :128]
-+ mov r3, #8-2
-+ vld1.16 {q0, q1}, [r1 :256]!
-+ add r4, r0, r2
-+ vmovl.u8 q10, d16
-+ add lr, ip, r2
-+ vmovl.u8 q11, d18
-+ vqadd.s16 q0, q10
-+ vaddw.u8 q2, q15, d17
-+ vqadd.s16 q1, q11
-+ vaddw.u8 q3, q15, d19
-+1:
-+ vqmovun.s16 d20, q0
-+ vqmovun.s16 d21, q2
-+ vld2.8 {d16, d17}, [r4 :128], r2
-+ subs r3, #2
-+ vqmovun.s16 d22, q1
-+ vqmovun.s16 d23, q3
-+ vst2.8 {d20, d21}, [r0 :128], r2
-+ vld2.8 {d18, d19}, [lr :128], r2
-+ vst2.8 {d22, d23}, [ip :128], r2
-+ vld1.16 {q0, q1}, [r1 :256]!
-+ vmovl.u8 q10, d16
-+ vmovl.u8 q11, d18
-+ vqadd.s16 q0, q10
-+ vaddw.u8 q2, q15, d17
-+ vqadd.s16 q1, q11
-+ vaddw.u8 q3, q15, d19
-+ bne 1b
-+
-+ vqmovun.s16 d20, q0
-+ vqmovun.s16 d21, q2
-+ vqmovun.s16 d22, q1
-+ vqmovun.s16 d23, q3
-+ vst2.8 {d20, d21}, [r0 :128]
-+ vst2.8 {d22, d23}, [ip :128]
-+ pop {r4, pc}
-+endfunc
-+
-+@ add_residual16x16_u(
-+@ uint8_t *_dst, [r0]
-+@ const int16_t *res, [r1]
-+@ ptrdiff_t stride) [r2]
-+@ int dc_v) [r3]
-+
-+function ff_hevc_rpi_add_residual_16x16_u_neon_8, export=1
-+ vdup.16 q15, r3
-+ add ip, r0, r2
-+ vld2.8 {q8, q9}, [r0 :256]
-+ mov r3, #16-1
-+ vld1.16 {q0, q1}, [r1 :256]!
-+ vmovl.u8 q11, d16
-+ vmovl.u8 q12, d17
-+ vqadd.s16 q0, q11
-+ vaddw.u8 q11, q15, d18
-+ vqadd.s16 q1, q12
-+ vaddw.u8 q12, q15, d19
-+1:
-+ vld2.8 {q8, q9}, [ip :256], r2
-+ subs r3, #1
-+ vqmovun.s16 d20, q0
-+ vqmovun.s16 d22, q11
-+ vqmovun.s16 d21, q1
-+ vqmovun.s16 d23, q12
-+ vld1.16 {q0, q1}, [r1 :256]!
-+ vst2.8 {q10, q11}, [r0 :256], r2
-+ vmovl.u8 q11, d16
-+ pldw [ip]
-+ vmovl.u8 q12, d17
-+ vqadd.s16 q0, q11
-+ vaddw.u8 q11, q15, d18
-+ vqadd.s16 q1, q12
-+ vaddw.u8 q12, q15, d19
-+ bne 1b
-+
-+ vqmovun.s16 d20, q0
-+ vqmovun.s16 d22, q11
-+ vqmovun.s16 d21, q1
-+ vqmovun.s16 d23, q12
-+ vst2.8 {q10, q11}, [r0 :256]
-+ bx lr
-+endfunc
-+
-+@ ============================================================================
-+@ V add
-+
-+@ add_residual4x4_v(
-+@ uint8_t *_dst, [r0]
-+@ const int16_t *res, [r1]
-+@ ptrdiff_t stride) [r2]
-+
-+function ff_hevc_rpi_add_residual_4x4_v_neon_8, export=1
-+ add ip, r0, r2
-+ vld1.16 {q2, q3}, [r1]
-+ lsl r2, #1
-+ vld1.8 {d16}, [r0 :64], r2
-+ vld1.8 {d17}, [ip :64], r2
-+ vld1.8 {d18}, [r0 :64]
-+ sub r0, r2
-+ vld1.8 {d19}, [ip :64]
-+ sub ip, r2
-+ vdup.16 q0, r3
-+ vdup.16 q1, r3
-+ vmovl.u8 q10, d16
-+ vmovl.u8 q11, d17
-+ vmovl.u8 q12, d18
-+ vmovl.u8 q13, d19
-+ vzip.16 q0, q2
-+ vzip.16 q1, q3
-+ vqadd.s16 q0, q10
-+ vqadd.s16 q2, q11
-+ vqadd.s16 q1, q12
-+ vqadd.s16 q3, q13
-+ vqmovun.s16 d0, q0
-+ vqmovun.s16 d1, q2
-+ vqmovun.s16 d2, q1
-+ vqmovun.s16 d3, q3
-+ vst1.8 {d0}, [r0 :64], r2
-+ vst1.8 {d1}, [ip :64], r2
-+ vst1.8 {d2}, [r0 :64]
-+ vst1.8 {d3}, [ip :64]
-+ bx lr
-+endfunc
-+
-+@ add_residual8x8_v(
-+@ uint8_t *_dst, [r0]
-+@ const int16_t *res, [r1]
-+@ ptrdiff_t stride) [r2]
-+
-+function ff_hevc_rpi_add_residual_8x8_v_neon_8, export=1
-+ vdup.16 q15, r3
-+ add ip, r0, r2
-+ push {r4, lr}
-+ vld2.8 {d16, d17}, [r0 :128]
-+ lsl r2, #1
-+ vld2.8 {d18, d19}, [ip :128]
-+ mov r3, #8-2
-+ vld1.16 {q0, q1}, [r1 :256]!
-+ add r4, r0, r2
-+ vmovl.u8 q10, d17
-+ add lr, ip, r2
-+ vmovl.u8 q11, d19
-+ vqadd.s16 q0, q10
-+ vaddw.u8 q2, q15, d16
-+ vqadd.s16 q1, q11
-+ vaddw.u8 q3, q15, d18
-+1:
-+ vqmovun.s16 d20, q2
-+ vqmovun.s16 d21, q0
-+ vld2.8 {d16, d17}, [r4 :128], r2
-+ subs r3, #2
-+ vqmovun.s16 d22, q3
-+ vqmovun.s16 d23, q1
-+ vst2.8 {d20, d21}, [r0 :128], r2
-+ vld2.8 {d18, d19}, [lr :128], r2
-+ vst2.8 {d22, d23}, [ip :128], r2
-+ vld1.16 {q0, q1}, [r1 :256]!
-+ vmovl.u8 q10, d17
-+ vmovl.u8 q11, d19
-+ vqadd.s16 q0, q10
-+ vaddw.u8 q2, q15, d16
-+ vqadd.s16 q1, q11
-+ vaddw.u8 q3, q15, d18
-+ bne 1b
-+
-+ vqmovun.s16 d20, q2
-+ vqmovun.s16 d21, q0
-+ vqmovun.s16 d22, q3
-+ vqmovun.s16 d23, q1
-+ vst2.8 {d20, d21}, [r0 :128]
-+ vst2.8 {d22, d23}, [ip :128]
-+ pop {r4, pc}
-+endfunc
-+
-+@ add_residual16x16_v(
-+@ uint8_t *_dst, [r0]
-+@ const int16_t *res, [r1]
-+@ ptrdiff_t stride) [r2]
-+
-+function ff_hevc_rpi_add_residual_16x16_v_neon_8, export=1
-+ vdup.16 q15, r3
-+ add ip, r0, r2
-+ vld2.8 {q8, q9}, [r0 :256]
-+ mov r3, #16-1
-+ vld1.16 {q0, q1}, [r1 :256]!
-+ vmovl.u8 q11, d18
-+ vmovl.u8 q12, d19
-+ vqadd.s16 q0, q11
-+ vaddw.u8 q11, q15, d16
-+ vqadd.s16 q1, q12
-+ vaddw.u8 q12, q15, d17
-+1:
-+ vld2.8 {q8, q9}, [ip :256], r2
-+ subs r3, #1
-+ vqmovun.s16 d20, q11
-+ vqmovun.s16 d22, q0
-+ vqmovun.s16 d21, q12
-+ vqmovun.s16 d23, q1
-+ vld1.16 {q0, q1}, [r1 :256]!
-+ vst2.8 {q10, q11}, [r0 :256], r2
-+ vmovl.u8 q11, d18
-+ pldw [ip]
-+ vmovl.u8 q12, d19
-+ vqadd.s16 q0, q11
-+ vaddw.u8 q11, q15, d16
-+ vqadd.s16 q1, q12
-+ vaddw.u8 q12, q15, d17
-+ bne 1b
-+
-+ vqmovun.s16 d20, q11
-+ vqmovun.s16 d22, q0
-+ vqmovun.s16 d21, q12
-+ vqmovun.s16 d23, q1
-+ vst2.8 {q10, q11}, [r0 :256]
-+ bx lr
-+endfunc
-+
-+@ ============================================================================
-+@ U & V add
-+
-+@ add_residual4x4_c(
-+@ uint8_t *_dst, [r0]
-+@ const int16_t *res, [r1]
-+@ ptrdiff_t stride) [r2]
-+
-+function ff_hevc_rpi_add_residual_4x4_c_neon_8, export=1
-+ add ip, r0, r2
-+ vld1.16 {q0, q1}, [r1]! @ all of U
-+ lsl r2, #1
-+ vld1.8 {d16}, [r0 :64], r2
-+ rsb r3, r2, #0
-+ vld1.8 {d17}, [ip :64], r2
-+ vld1.16 {q2, q3}, [r1] @ all of V
-+ vld1.8 {d18}, [r0 :64], r3
-+ vld1.8 {d19}, [ip :64], r3
-+ vmovl.u8 q10, d16
-+ vmovl.u8 q11, d17
-+ vmovl.u8 q12, d18
-+ vmovl.u8 q13, d19
-+ vzip.16 q0, q2
-+ vzip.16 q1, q3
-+ vqadd.s16 q0, q10
-+ vqadd.s16 q2, q11
-+ vqadd.s16 q1, q12
-+ vqadd.s16 q3, q13
-+ vqmovun.s16 d0, q0
-+ vqmovun.s16 d1, q2
-+ vqmovun.s16 d2, q1
-+ vqmovun.s16 d3, q3
-+ vst1.8 {d0}, [r0 :64], r2
-+ vst1.8 {d1}, [ip :64], r2
-+ vst1.8 {d2}, [r0 :64]
-+ vst1.8 {d3}, [ip :64]
-+ bx lr
-+endfunc
-+
-+@ add_residual8x8_c(
-+@ uint8_t *_dst, [r0]
-+@ const int16_t *res, [r1]
-+@ ptrdiff_t stride) [r2]
-+
-+function ff_hevc_rpi_add_residual_8x8_c_neon_8, export=1
-+ vld2.8 {d16, d17}, [r0 :128]
-+ add r3, r1, #(8*8*2) @ Offset to V
-+ vld1.16 {q0}, [r1 :128]!
-+ add ip, r0, r2
-+ vld1.16 {q1}, [r3 :128]!
-+ vmovl.u8 q10, d16
-+ push {lr}
-+ vmovl.u8 q8, d17
-+ mov lr, #8-1
-+ vqadd.s16 q10, q0
-+ vqadd.s16 q1, q8
-+1:
-+ vld2.8 {d16, d17}, [ip :128], r2
-+ subs lr, #1
-+ vld1.16 {q0}, [r1 :128]!
-+ vqmovun.s16 d20, q10
-+ vqmovun.s16 d21, q1
-+ vld1.16 {q1}, [r3 :128]!
-+ vst2.8 {d20, d21}, [r0 :128], r2
-+ vmovl.u8 q10, d16
-+ pldw [ip]
-+ vmovl.u8 q8, d17
-+ vqadd.s16 q10, q0
-+ vqadd.s16 q1, q8
-+ bne 1b
-+
-+ vqmovun.s16 d20, q10
-+ vqmovun.s16 d21, q1
-+ vst2.8 {d20, d21}, [r0 :128]
-+ pop {pc}
-+endfunc
-+
-+@ add_residual16x16_c(
-+@ uint8_t *_dst, [r0]
-+@ const int16_t *res, [r1]
-+@ ptrdiff_t stride) [r2]
-+
-+function ff_hevc_rpi_add_residual_16x16_c_neon_8, export=1
-+ vld2.8 {q8, q9}, [r0 :256]
-+ add r3, r1, #(16*16*2) @ Offset to V
-+ vld1.16 {q0, q1}, [r1 :256]!
-+ add ip, r0, r2
-+ vld1.16 {q2, q3}, [r3 :256]!
-+ vmovl.u8 q10, d16
-+ push {lr}
-+ vmovl.u8 q8, d17
-+ mov lr, #16-1
-+ vmovl.u8 q11, d18
-+ vmovl.u8 q9, d19
-+ vqadd.s16 q0, q10
-+ vqadd.s16 q1, q8
-+ vqadd.s16 q2, q11
-+ vqadd.s16 q3, q9
-+1:
-+ vld2.8 {q8, q9}, [ip :256], r2
-+ subs lr, #1
-+ vqmovun.s16 d20, q0
-+ vqmovun.s16 d22, q2
-+ vqmovun.s16 d21, q1
-+ vqmovun.s16 d23, q3
-+ vld1.16 {q0, q1}, [r1 :256]!
-+ vst2.8 {d20-d23}, [r0 :256], r2
-+ vld1.16 {q2, q3}, [r3 :256]!
-+ vmovl.u8 q10, d16
-+ pldw [ip]
-+ vmovl.u8 q8, d17
-+ vmovl.u8 q11, d18
-+ vmovl.u8 q9, d19
-+ vqadd.s16 q0, q10
-+ vqadd.s16 q1, q8
-+ vqadd.s16 q2, q11
-+ vqadd.s16 q3, q9
-+ bne 1b
-+
-+ vqmovun.s16 d20, q0
-+ vqmovun.s16 d22, q2
-+ vqmovun.s16 d21, q1
-+ vqmovun.s16 d23, q3
-+ vst2.8 {d20-d23}, [r0 :256]
-+ pop {pc}
-+endfunc
-+
-+@ 32x32 chroma never occurs so NIF
-+
-+@ ============================================================================
-diff --git a/libavcodec/arm/rpi_hevcdsp_sao_neon.S b/libavcodec/arm/rpi_hevcdsp_sao_neon.S
-new file mode 100644
-index 0000000000..b56e0f9644
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevcdsp_sao_neon.S
-@@ -0,0 +1,2245 @@
-+/*
-+ * Copyright (c) 2014 - 2015 Seppo Tomperi <seppo.tomperi@vtt.fi>
-+ * 2017 John Cox <jc@kynesim.co.uk> (for Raspberry Pi)
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "libavutil/arm/asm.S"
-+#include "neon.S"
-+
-+.set EDGE_SRC_STRIDE, 160
-+
-+@ PIC jump tables are fractionally more expensive than absolute in our code
-+.set jent_pic, CONFIG_PIC
-+
-+
-+.macro sao_band_64b_8 XLAT0, XLAT1, Q_K128, I1, I2, I3, I4
-+ vshr.u8 q12, q8, #3
-+ \I1
-+ vadd.i8 q8, \Q_K128
-+ \I2
-+ vshr.u8 q13, q9, #3
-+ \I3
-+ vadd.i8 q9, \Q_K128
-+ \I4
-+ vtbl.8 d24, \XLAT0, d24
-+ vtbl.8 d25, \XLAT0, d25
-+ vtbl.8 d26, \XLAT1, d26
-+ vtbl.8 d27, \XLAT1, d27
-+
-+ vqadd.s8 q8, q12
-+ vshr.u8 q12, q10, #3
-+ vadd.i8 q10, \Q_K128
-+ vqadd.s8 q9, q13
-+ vshr.u8 q13, q11, #3
-+ vadd.i8 q11, \Q_K128
-+
-+ vtbl.8 d24, \XLAT0, d24
-+ vtbl.8 d25, \XLAT0, d25
-+ vtbl.8 d26, \XLAT1, d26
-+ vtbl.8 d27, \XLAT1, d27
-+ vqadd.s8 q10, q12
-+ vsub.i8 q8, \Q_K128
-+ vqadd.s8 q11, q13
-+ vsub.i8 q9, \Q_K128
-+ vsub.i8 q10, \Q_K128
-+ vsub.i8 q11, \Q_K128
-+.endm
-+
-+.macro sao_band_16b_8 XLAT0, XLAT1, Q_K128, L1, L2, L3, L4, L5, S1, S2, S3, S4
-+ \L1
-+ \L2
-+ \L3
-+ \L4
-+ \L5
-+ vadd.i8 q12, q8, \Q_K128
-+ vshr.u8 q8, #3
-+ vtbl.8 d16, \XLAT0, d16
-+ vtbl.8 d17, \XLAT1, d17
-+ vqadd.s8 q12, q8
-+ bmi 2f
-+1: \L1
-+ \L2
-+ \L3
-+ \L4
-+ \L5
-+ vsub.i8 q13, q12, \Q_K128
-+ vadd.i8 q12, q8, \Q_K128
-+ vshr.u8 q8, #3
-+ \S1
-+ \S2
-+ \S3
-+ \S4
-+ vtbl.8 d16, \XLAT0, d16
-+ vtbl.8 d17, \XLAT1, d17
-+ vqadd.s8 q12, q8
-+ bpl 1b
-+2: vsub.i8 q13, q12, \Q_K128
-+ \S1
-+ \S2
-+ \S3
-+ \S4
-+.endm
-+
-+
-+.macro clip16_4 Q0, Q1, Q2, Q3, Q_MIN, Q_MAX
-+ vmax.s16 \Q0, \Q_MIN @ raise each signed 16-bit lane to at least Q_MIN
-+ vmax.s16 \Q1, \Q_MIN
-+ vmax.s16 \Q2, \Q_MIN
-+ vmax.s16 \Q3, \Q_MIN
-+ vmin.s16 \Q0, \Q_MAX @ then cap each signed 16-bit lane at Q_MAX
-+ vmin.s16 \Q1, \Q_MAX
-+ vmin.s16 \Q2, \Q_MAX
-+ vmin.s16 \Q3, \Q_MAX
-+.endm
-+
-+@ Clobbers q12, q13
-+.macro sao_band_64b_16 Q0, Q1, Q2, Q3, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth, I1, I2
-+ vshrn.i16 d24, \Q0, #(\bit_depth - 5)
-+ vshrn.i16 d25, \Q1, #(\bit_depth - 5)
-+ vshrn.i16 d26, \Q2, #(\bit_depth - 5)
-+ \I1
-+ vtbl.8 d24, \XLAT0, d24
-+ vshrn.i16 d27, \Q3, #(\bit_depth - 5)
-+ vtbl.8 d25, \XLAT1, d25
-+ \I2
-+ vtbl.8 d26, \XLAT0, d26
-+ vtbl.8 d27, \XLAT1, d27
-+ vaddw.s8 \Q0, d24
-+ vaddw.s8 \Q1, d25
-+ vaddw.s8 \Q2, d26
-+ vaddw.s8 \Q3, d27
-+ clip16_4 \Q0, \Q1, \Q2, \Q3, \Q_MIN, \Q_MAX
-+.endm
-+
-+@ Clobbers q10, q11, q12
-+.macro sao_band_32b_16 Q0, Q1, XLAT0, XLAT1, Q_MIN, Q_MAX, bit_depth, L1, L2, L3, L4, L5, S1, S2, S3, S4
-+ \L1
-+ \L2
-+ \L3
-+ \L4
-+ \L5
-+ vshrn.i16 d24, \Q0, #\bit_depth - 5
-+ vshrn.i16 d25, \Q1, #\bit_depth - 5
-+ vtbl.8 d24, \XLAT0, d24
-+ vtbl.8 d25, \XLAT1, d25
-+ vaddw.s8 q10, \Q0, d24
-+ vaddw.s8 q11, \Q1, d25
-+ bmi 2f
-+1: \L1
-+ \L2
-+ \L3
-+ \L4
-+ \L5
-+ vmax.s16 q10, \Q_MIN
-+ vmax.s16 q11, \Q_MIN
-+ vshrn.i16 d24, \Q0, #\bit_depth - 5
-+ vshrn.i16 d25, \Q1, #\bit_depth - 5
-+ vmin.s16 q10, \Q_MAX
-+ vmin.s16 q11, \Q_MAX
-+ \S1
-+ \S2
-+ \S3
-+ \S4
-+ vtbl.8 d24, \XLAT0, d24
-+ vtbl.8 d25, \XLAT1, d25
-+ vaddw.s8 q10, \Q0, d24
-+ vaddw.s8 q11, \Q1, d25
-+ bpl 1b
-+2: vmax.s16 q10, \Q_MIN
-+ vmax.s16 q11, \Q_MIN
-+ vmin.s16 q10, \Q_MAX
-+ vmin.s16 q11, \Q_MAX
-+ \S1
-+ \S2
-+ \S3
-+ \S4
-+.endm
-+
-+
-+@ Standard coding rules for sao_offset_abs limit it to 0-31 (Table 9-38)
-+@ so we are quite safe stuffing it into a byte array
-+@ There may be a subsequent shl by log2_sao_offset_scale_luma/chroma
-+@ (7.4.3.3.2 && 7-70) but we should still be safe to at least 12 bits of
-+@ precision
-+
-+@ This, somewhat nasty, bit of code builds the {d0-d3} translation
-+@ array via the stack
-+@ Given that sao_left_class > 28 can cause wrap we can't just poke
-+@ all 4 bytes in at once
-+@
-+@ It also loads other common regs
-+
-+@ Beware that the offset read here over-reads by 6 bytes so source must be sized appropriately
-+function band_load_y
-+ ldr ip, [sp, #16] @ &sao_offset_val[0]
-+ ldr r4, [sp, #20] @ sao_left_class
-+ vmov.i64 d4, #0
-+ vmov.i64 q0, #0
-+ pld [r1]
-+ vld2.8 {q8}, [ip]
-+ sub ip, sp, #8*5
-+ vmov.i64 q1, #0
-+ add r4, ip, r4
-+ vpush {d0-d4} @ Put zero array on stack
-+ vshr.u64 d16, d16, #8 @ 1st interesting val is [1]
-+ ldr ip, [ip, #8*5 + 28] @ height
-+ vst1.32 {d16[0]}, [r4]
-+ add r4, r1, r3
-+ vpop {d0-d4} @ Pop modified array
-+ sub ip, ip, #1
-+ vorr d0, d0, d4
-+ bx lr
-+endfunc
-+
-+@ Beware that offset reads here over-read by 6 bytes so source must be sized appropriately
-+function band_load_c
-+ ldr ip, [sp, #16] @ &sao_offset_val1[0]
-+ ldr r4, [sp, #20] @ sao_left_class1
-+ vmov.i64 d24, #0
-+ vmov.i64 q10, #0
-+ pld [r1]
-+ vld2.8 {q8}, [ip]
-+ sub ip, sp, #8*5
-+ vmov.i64 q11, #0
-+ add r4, ip, r4
-+ ldr ip, [sp, #24] @ &sao_offset_val2[0]
-+ vpush {d20-d24} @ Put zero array on stack
-+ vld2.8 {q9}, [ip]
-+ vshr.u64 d16, d16, #8 @ 1st interesting val is [1]
-+ ldr ip, [sp, #8*5 + 28] @ sao_left_class2
-+ vst1.32 {d16[0]}, [r4]
-+ add ip, sp, ip
-+ vshr.u64 d18, d18, #8 @ 1st interesting val is [1]
-+ vldmia sp, {d0-d3} @ Load modified array
-+ vldr d16, [sp, #8*4]
-+ add r4, r1, r3
-+ vstmia sp, {d20-d24} @ Put zero array on stack (again)
-+ vst1.32 {d18[0]}, [ip]
-+ vorr d0, d0, d16
-+ vldmia sp, {d4-d7} @ Load modified array
-+ vldr d18, [sp, #8*4]
-+ ldr ip, [sp, #8*5 + 36] @ height
-+ add sp, sp, #8*5
-+ vorr d4, d4, d18
-+ sub ip, ip, #1
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_sao_band_64_neon_8 (
-+@ uint8_t *_dst, [r0]
-+@ uint8_t *_src, [r1]
-+@ ptrdiff_t stride_dst, [r2]
-+@ ptrdiff_t stride_src, [r3]
-+@ int16_t *sao_offset_val, [sp, #0]
-+@ int sao_left_class, [sp, #4]
-+@ int width, [sp, #8]
-+@ int height) [sp, #12]
-+
-+function ff_hevc_rpi_sao_band_64_neon_8, export=1
-+ push {r4-r6, lr}
-+ vmov.u8 q15, #128
-+ bl band_load_y
-+
-+1: vldmia r1, {q8-q11}
-+ sao_band_64b_8 {d0-d3}, {d0-d3}, q15, \
-+ "pld [r4]", \
-+ "subs ip, #1", \
-+ "it ne; addne r4, r3", \
-+ "add r1, r3"
-+ vstmia r0, {q8-q11}
-+ add r0, r2
-+ bpl 1b
-+
-+ pop {r4-r6, pc}
-+endfunc
-+
-+@ ff_hevc_rpi_sao_band_32_neon_8 (
-+@ uint8_t *_dst, [r0]
-+@ uint8_t *_src, [r1]
-+@ ptrdiff_t stride_dst, [r2]
-+@ ptrdiff_t stride_src, [r3]
-+@ int16_t *sao_offset_val, [sp, #0]
-+@ int sao_left_class, [sp, #4]
-+@ int width, [sp, #8]
-+@ int height) [sp, #12]
-+
-+function ff_hevc_rpi_sao_band_32_neon_8, export=1
-+ push {r4-r6, lr}
-+ add r5, r0, r2
-+ add r6, r1, r3
-+ lsl r2, #1
-+ lsl r3, #1
-+ vmov.u8 q15, #128
-+ bl band_load_y
-+
-+1: vld1.8 { q8, q9 }, [r1, :128], r3
-+ subs ip, #2
-+ vld1.8 {q10, q11}, [r6, :128], r3
-+
-+ sao_band_64b_8 {d0-d3}, {d0-d3}, q15
-+
-+ vst1.8 { q8, q9 }, [r0, :128], r2
-+ vst1.8 {q10, q11}, [r5, :128], r2
-+ bpl 1b
-+
-+ pop {r4-r6, pc}
-+endfunc
-+
-+@ ff_hevc_rpi_sao_band_16_neon_8 (
-+@ uint8_t *_dst, [r0]
-+@ uint8_t *_src, [r1]
-+@ ptrdiff_t stride_dst, [r2]
-+@ ptrdiff_t stride_src, [r3]
-+@ int16_t *sao_offset_val, [sp, #0]
-+@ int sao_left_class, [sp, #4]
-+@ int width, [sp, #8]
-+@ int height) [sp, #12]
-+
-+function ff_hevc_rpi_sao_band_16_neon_8, export=1
-+ push {r4-r6, lr}
-+ add r5, r0, r2
-+ add r6, r1, r3
-+ lsl r2, #1
-+ lsl r3, #1
-+ vmov.u8 q15, #128
-+ bl band_load_y
-+
-+1: vld1.8 { q8}, [r1, :128], r3
-+ subs ip, #4
-+ vld1.8 { q9}, [r6, :128], r3
-+ vld1.8 {q10}, [r1, :128], r3
-+ vld1.8 {q11}, [r6, :128], r3
-+
-+ sao_band_64b_8 {d0-d3}, {d0-d3}, q15
-+
-+ vst1.8 { q8}, [r0, :128], r2
-+ vst1.8 { q9}, [r5, :128], r2
-+ vst1.8 {q10}, [r0, :128], r2
-+ vst1.8 {q11}, [r5, :128], r2
-+ bpl 1b
-+
-+ pop {r4-r6, pc}
-+endfunc
-+
-+@ ff_hevc_rpi_sao_band_8_neon_8 (
-+@ uint8_t *_dst, [r0]
-+@ uint8_t *_src, [r1]
-+@ ptrdiff_t stride_dst, [r2]
-+@ ptrdiff_t stride_src, [r3]
-+@ int16_t *sao_offset_val, [sp, #0]
-+@ int sao_left_class, [sp, #4]
-+@ int width, [sp, #8]
-+@ int height) [sp, #12]
-+
-+function ff_hevc_rpi_sao_band_8_neon_8, export=1
-+ ldr ip, [sp, #8] @ width
-+ push {r4-r6, lr}
-+ vmov.u8 q15, #128
-+ cmp ip, #8
-+ bl band_load_y
-+ add r5, r0, r2
-+ add r6, r1, r3
-+ lsl r2, #1
-+ lsl r3, #1
-+ blt 4f
-+
-+ sao_band_16b_8 {d0-d3}, {d0-d3}, q15, \
-+ "vld1.8 {d16}, [r1, :64], r3", \
-+ "subs ip, #2", \
-+ "vld1.8 {d17}, [r6, :64], r3", \
-+ "", \
-+ "", \
-+ "vst1.8 {d26}, [r0, :64], r2", \
-+ "vst1.8 {d27}, [r5, :64], r2"
-+ pop {r4-r6, pc}
-+4:
-+ sao_band_16b_8 {d0-d3}, {d0-d3}, q15, \
-+ "vld1.32 {d16[0]}, [r1, :32], r3", \
-+ "subs ip, #4", \
-+ "vld1.32 {d16[1]}, [r6, :32], r3", \
-+ "vld1.32 {d17[0]}, [r1, :32], r3", \
-+ "vld1.32 {d17[1]}, [r6, :32], r3", \
-+ "vst1.32 {d26[0]}, [r0, :32], r2", \
-+ "vst1.32 {d26[1]}, [r5, :32], r2", \
-+ "vst1.32 {d27[0]}, [r0, :32], r2", \
-+ "vst1.32 {d27[1]}, [r5, :32], r2"
-+ pop {r4-r6, pc}
-+endfunc
-+
-+@ ff_hevc_rpi_sao_band_c_32_neon_8(
-+@ uint8_t * dst [r0]
-+@ uint8_t * src [r1]
-+@ uint32_t dst_stride [r2]
-+@ uint32_t src_stride [r3]
-+@ const int16_t * table1 sp[0]
-+@ uint32_t offset1 sp[4]
-+@ const int16_t * table2 sp[8]
-+@ uint32_t offset2 sp[12]
-+@ int width sp[16]
-+@ int height sp[20]
-+
-+function ff_hevc_rpi_sao_band_c_32_neon_8, export=1
-+ push {r4-r6, lr}
-+ add r5, r0, #32
-+ add r6, r1, #32
-+ vmov.u8 q15, #128
-+ bl band_load_c
-+
-+1: vld2.8 { q8, q9 }, [r1, :128], r3
-+ subs ip, #1
-+ vld2.8 {q10, q11}, [r6, :128], r3
-+
-+ sao_band_64b_8 {d0-d3}, {d4-d7}, q15, \
-+ "pld [r4]", \
-+ "it ne; addne r4, r3"
-+
-+ vst2.8 { q8, q9 }, [r0, :128], r2
-+ vst2.8 {q10, q11}, [r5, :128], r2
-+ bpl 1b
-+
-+ pop {r4-r6, pc}
-+endfunc
-+
-+@ ff_hevc_rpi_sao_band_c_16_neon_8(
-+@ uint8_t * dst [r0]
-+@ uint8_t * src [r1]
-+@ uint32_t dst_stride [r2]
-+@ uint32_t src_stride [r3]
-+@ const int16_t * table1 sp[0]
-+@ uint32_t offset1 sp[4]
-+@ const int16_t * table2 sp[8]
-+@ uint32_t offset2 sp[12]
-+@ int width sp[16]
-+@ int height sp[20]
-+
-+function ff_hevc_rpi_sao_band_c_16_neon_8, export=1
-+ push {r4-r6, lr}
-+ add r5, r0, r2
-+ add r6, r1, r3
-+ lsl r2, #1
-+ lsl r3, #1
-+ vmov.u8 q15, #128
-+ bl band_load_c
-+
-+1: vld2.8 { q8, q9 }, [r1, :128], r3
-+ subs ip, #2
-+ vld2.8 {q10, q11}, [r6, :128], r3
-+
-+ sao_band_64b_8 {d0-d3}, {d4-d7}, q15
-+
-+ vst2.8 { q8, q9 }, [r0, :128], r2
-+ vst2.8 {q10, q11}, [r5, :128], r2
-+ bpl 1b
-+
-+ pop {r4-r6, pc}
-+endfunc
-+
-+@ ff_hevc_rpi_sao_band_c_8_neon_8(
-+@ uint8_t * dst [r0]
-+@ uint8_t * src [r1]
-+@ uint32_t dst_stride [r2]
-+@ uint32_t src_stride [r3]
-+@ const int16_t * table1 sp[0]
-+@ uint32_t offset1 sp[4]
-+@ const int16_t * table2 sp[8]
-+@ uint32_t offset2 sp[12]
-+@ int width sp[16]
-+@ int height sp[20]
-+
-+function ff_hevc_rpi_sao_band_c_8_neon_8, export=1
-+ ldr ip, [sp, #16] @ width
-+ push {r4-r6, lr}
-+ vmov.u8 q15, #128
-+ cmp ip, #8
-+ bl band_load_c
-+ blt 4f
-+
-+ sao_band_16b_8 {d0-d3}, {d4-d7}, q15, \
-+ "vld2.8 {d16-d17}, [r1, :128], r3", \
-+ "subs ip, #1", \
-+ "", \
-+ "", \
-+ "", \
-+ "vst2.8 {d26-d27}, [r0, :128], r2"
-+ pop {r4-r6, pc}
-+4:
-+ add r5, r0, r2
-+ add r6, r1, r3
-+ lsl r2, #1
-+ lsl r3, #1
-+ sao_band_16b_8 {d0-d3}, {d4-d7}, q15, \
-+ "vld1.8 {d16}, [r1, :64], r3", \
-+ "subs ip, #2", \
-+ "vld1.8 {d17}, [r6, :64], r3", \
-+ "vuzp.8 d16, d17", \
-+ "", \
-+ "vzip.8 d26, d27", \
-+ "vst1.8 {d26}, [r0, :64], r2", \
-+ "vst1.8 {d27}, [r5, :64], r2"
-+ pop {r4-r6, pc}
-+endfunc
-+
-+
-+@ ff_hevc_rpi_sao_band_64_neon_10 (
-+@ uint8_t *_dst, [r0]
-+@ uint8_t *_src, [r1]
-+@ ptrdiff_t stride_dst, [r2]
-+@ ptrdiff_t stride_src, [r3]
-+@ int16_t *sao_offset_val, [sp, #0]
-+@ int sao_left_class, [sp, #4]
-+@ int width, [sp, #8]
-+@ int height) [sp, #12]
-+
-+.macro band_64_16 bit_depth
-+ push {r4-r6, lr}
-+ vmov.i64 q2, #0
-+ vmov.i16 q3, #(1 << \bit_depth) - 1
-+ bl band_load_y
-+ vpush {q4-q7}
-+
-+1: vldm r1, {q4-q11}
-+ sao_band_64b_16 q4, q5, q6, q7, {d0-d3}, {d0-d3}, q2, q3, \bit_depth, \
-+ "subs ip, #1", \
-+ "add r1, r3"
-+ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d0-d3}, q2, q3, \bit_depth
-+ vstm r0, {q4-q11}
-+ add r0, r2
-+ bpl 1b
-+
-+ vpop {q4-q7}
-+ pop {r4-r6, pc}
-+.endm
-+
-+function ff_hevc_rpi_sao_band_64_neon_10, export=1
-+ band_64_16 10
-+endfunc
-+
-+@ ff_hevc_rpi_sao_band_32_neon_10 (
-+@ uint8_t *_dst, [r0]
-+@ uint8_t *_src, [r1]
-+@ ptrdiff_t stride_dst, [r2]
-+@ ptrdiff_t stride_src, [r3]
-+@ int16_t *sao_offset_val, [sp, #0]
-+@ int sao_left_class, [sp, #4]
-+@ int width, [sp, #8]
-+@ int height) [sp, #12]
-+
-+.macro band_32_16 bit_depth
-+ push {r4-r6, lr}
-+ vmov.i64 q2, #0
-+ vmov.i16 q3, #(1 << \bit_depth) - 1
-+ bl band_load_y
-+
-+1: vldm r1, {q8-q11}
-+ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d0-d3}, q2, q3, \bit_depth, \
-+ "subs ip, #1", \
-+ "add r1, r3"
-+ vstm r0, {q8-q11}
-+ add r0, r2
-+ bpl 1b
-+
-+ pop {r4-r6, pc}
-+.endm
-+
-+function ff_hevc_rpi_sao_band_32_neon_10, export=1
-+ band_32_16 10
-+endfunc
-+
-+@ ff_hevc_rpi_sao_band_16_neon_10 (
-+@ uint8_t *_dst, [r0]
-+@ uint8_t *_src, [r1]
-+@ ptrdiff_t stride_dst, [r2]
-+@ ptrdiff_t stride_src, [r3]
-+@ int16_t *sao_offset_val, [sp, #0]
-+@ int sao_left_class, [sp, #4]
-+@ int width, [sp, #8]
-+@ int height) [sp, #12]
-+
-+.macro band_16_16 bit_depth
-+ push {r4-r6, lr}
-+ add r5, r0, r2
-+ add r6, r1, r3
-+ lsl r2, #1
-+ lsl r3, #1
-+ vmov.i64 q14, #0
-+ vmov.i16 q15, #(1 << \bit_depth) - 1
-+ bl band_load_y
-+
-+1: vld1.16 { q8, q9 }, [r1, :128], r3
-+ subs r12, #2
-+ vld1.16 {q10, q11}, [r6, :128], r3
-+ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d0-d3}, q14, q15, \bit_depth
-+ vst1.16 { q8, q9 }, [r0, :128], r2
-+ vst1.16 {q10, q11}, [r5, :128], r2
-+ bpl 1b
-+
-+ pop {r4-r6, pc}
-+.endm
-+
-+function ff_hevc_rpi_sao_band_16_neon_10, export=1
-+ band_16_16 10
-+endfunc
-+
-+@ ff_hevc_rpi_sao_band_8_neon_10 (
-+@ uint8_t *_dst, [r0]
-+@ uint8_t *_src, [r1]
-+@ ptrdiff_t stride_dst, [r2]
-+@ ptrdiff_t stride_src, [r3]
-+@ int16_t *sao_offset_val, [sp, #0]
-+@ int sao_left_class, [sp, #4]
-+@ int width, [sp, #8]
-+@ int height) [sp, #12]
-+
-+.macro band_8_16 bit_depth
-+ ldr ip, [sp, #8] @ width
-+ push {r4-r6, lr}
-+ vmov.i64 q14, #0
-+ cmp ip, #8
-+ vmov.i16 q15, #(1 << \bit_depth) - 1
-+ bl band_load_y
-+ add r5, r0, r2
-+ add r6, r1, r3
-+ lsl r2, #1
-+ lsl r3, #1
-+ blt 4f
-+
-+ sao_band_32b_16 q8, q9, {d0-d3}, {d0-d3}, q14, q15, \bit_depth, \
-+ "vld1.16 {q8}, [r1, :128], r3", \
-+ "subs ip, #2", \
-+ "vld1.16 {q9}, [r6, :128], r3", \
-+ "", \
-+ "", \
-+ "vst1.16 {q10}, [r0, :128], r2", \
-+ "vst1.16 {q11}, [r5, :128], r2"
-+ pop {r4-r6, pc}
-+4:
-+ sao_band_32b_16 q8, q9, {d0-d3}, {d0-d3}, q14, q15, \bit_depth, \
-+ "vld1.16 {d16}, [r1, :64], r3", \
-+ "subs ip, #4", \
-+ "vld1.16 {d17}, [r6, :64], r3", \
-+ "vld1.16 {d18}, [r1, :64], r3", \
-+ "vld1.16 {d19}, [r6, :64], r3", \
-+ "vst1.16 {d20}, [r0, :64], r2", \
-+ "vst1.16 {d21}, [r5, :64], r2", \
-+ "vst1.16 {d22}, [r0, :64], r2", \
-+ "vst1.16 {d23}, [r5, :64], r2"
-+ pop {r4-r6, pc}
-+.endm
-+
-+function ff_hevc_rpi_sao_band_8_neon_10, export=1
-+ band_8_16 10
-+endfunc
-+
-+
-+@ ff_hevc_rpi_sao_band_c_32_neon_10(
-+@ uint8_t * dst [r0]
-+@ uint8_t * src [r1]
-+@ uint32_t dst_stride [r2]
-+@ uint32_t src_stride [r3]
-+@ const int16_t * table1 sp[0]
-+@ uint32_t offset1 sp[4]
-+@ const int16_t * table2 sp[8]
-+@ uint32_t offset2 sp[12]
-+@ int width sp[16]
-+@ int height sp[20]
-+
-+.macro band_c_32_16 bit_depth
-+ push {r4-r6, lr}
-+ add r5, r0, #32
-+ add r6, r1, #32
-+ sub r2, #64
-+ sub r3, #64
-+ vmov.i64 q14, #0
-+ vmov.i16 q15, #(1 << \bit_depth) - 1
-+ bl band_load_c
-+ mov lr, #64
-+ vpush {q4-q7}
-+
-+1: vld2.16 { q4, q5 }, [r1, :128], lr
-+ subs ip, #1
-+ vld2.16 { q6, q7 }, [r6, :128], lr
-+ vld2.16 { q8, q9 }, [r1, :128], r3
-+ vld2.16 {q10, q11}, [r6, :128], r3
-+
-+ sao_band_64b_16 q4, q5, q6, q7, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \
-+ "pld [r4]", \
-+ "it ne; addne r4, r3"
-+ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d4-d7}, q14, q15, \bit_depth
-+
-+ vst2.16 { q4, q5 }, [r0, :128], lr
-+ vst2.16 { q6, q7 }, [r5, :128], lr
-+ vst2.16 { q8, q9 }, [r0, :128], r2
-+ vst2.16 {q10, q11}, [r5, :128], r2
-+
-+ bpl 1b
-+
-+ vpop {q4-q7}
-+ pop {r4-r6, pc}
-+.endm
-+
-+function ff_hevc_rpi_sao_band_c_32_neon_10, export=1
-+ band_c_32_16 10
-+endfunc
-+
-+
-+@ ff_hevc_rpi_sao_band_c_16_neon_10(
-+@ uint8_t * dst [r0]
-+@ uint8_t * src [r1]
-+@ uint32_t dst_stride [r2]
-+@ uint32_t src_stride [r3]
-+@ const int16_t * table1 sp[0]
-+@ uint32_t offset1 sp[4]
-+@ const int16_t * table2 sp[8]
-+@ uint32_t offset2 sp[12]
-+@ int width sp[16]
-+@ int height sp[20]
-+
-+.macro band_c_16_16 bit_depth
-+ push {r4-r6, lr}
-+ add r5, r0, #32 @ dst pointer for the second 32 bytes of each row
-+ add r6, r1, #32 @ src pointer for the second 32 bytes of each row
-+ vmov.i64 q14, #0 @ lower clip bound
-+ vmov.i16 q15, #(1 << \bit_depth) - 1 @ upper clip bound
-+ bl band_load_c @ sets d0-d3 and d4-d7 (offset tables) and ip = height - 1
-+
-+1: vld2.16 { q8, q9 }, [r1, :128], r3
-+ subs ip, #1
-+ vld2.16 {q10, q11}, [r6, :128], r3
-+
-+ @ Only q8-q11 hold samples here: one pass is enough and q4-q7 (never loaded, stored or saved) stay untouched
-+ sao_band_64b_16 q8, q9, q10, q11, {d0-d3}, {d4-d7}, q14, q15, \bit_depth
-+
-+ vst2.16 { q8, q9 }, [r0, :128], r2
-+ vst2.16 {q10, q11}, [r5, :128], r2
-+
-+ bpl 1b
-+ pop {r4-r6, pc}
-+.endm
-+
-+function ff_hevc_rpi_sao_band_c_16_neon_10, export=1
-+ band_c_16_16 10
-+endfunc
-+
-+
-+@ ff_hevc_rpi_sao_band_c_8_neon_10(
-+@ uint8_t * dst [r0]
-+@ uint8_t * src [r1]
-+@ uint32_t dst_stride [r2]
-+@ uint32_t src_stride [r3]
-+@ const int16_t * table1 sp[0]
-+@ uint32_t offset1 sp[4]
-+@ const int16_t * table2 sp[8]
-+@ uint32_t offset2 sp[12]
-+@ int width sp[16]
-+@ int height sp[20]
-+
-+.macro band_c_8_16 bit_depth
-+ ldr ip, [sp, #16] @ width
-+ push {r4-r6, lr}
-+ vmov.i64 q14, #0
-+ cmp ip, #8
-+ vmov.i16 q15, #(1 << \bit_depth) - 1
-+ bl band_load_c
-+ blt 4f
-+
-+ sao_band_32b_16 q8, q9, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \
-+ "vld2.16 {q8,q9}, [r1, :128], r3", \
-+ "subs ip, #1", \
-+ "", \
-+ "", \
-+ "", \
-+ "vst2.16 {q10,q11}, [r0, :128], r2"
-+ pop {r4-r6, pc}
-+4:
-+ add r5, r0, r2
-+ add r6, r1, r3
-+ lsl r2, #1
-+ lsl r3, #1
-+ sao_band_32b_16 q8, q9, {d0-d3}, {d4-d7}, q14, q15, \bit_depth, \
-+ "vld2.16 {d16,d18}, [r1, :128], r3", \
-+ "subs ip, #2", \
-+ "vld2.16 {d17,d19}, [r6, :128], r3", \
-+ "", \
-+ "", \
-+ "vst2.16 {d20,d22}, [r0, :128], r2", \
-+ "vst2.16 {d21,d23}, [r5, :128], r2"
-+ pop {r4-r6, pc}
-+.endm
-+
-+function ff_hevc_rpi_sao_band_c_8_neon_10, export=1
-+ band_c_8_16 10
-+endfunc
-+
-+
-+@ =============================================================================
-+@ SAO EDGE
-+
-+@ r0 destination address
-+@ r2 stride to post-increment r0 with
-+@ [r5] translate values
-+@
-+@ a <- c <- b
-+@ a in q0 - q3
-+@ c in q4 - q7
-+@ b in q8 - q11
-+@
-+@ q12-15 used as temp
-+@
-+@ Can be used for both Y & C as we unzip/zip the deltas and
-+@ transform "u/v" separately via d26/d27. For Y d26=d27
-+
-+function edge_64b_body_8
-+
-+ vcgt.u8 q12, q4, q0 @ c > a -> -1 , otherwise 0
-+ vcgt.u8 q13, q5, q1
-+ vcgt.u8 q14, q6, q2
-+ vcgt.u8 q15, q7, q3
-+
-+ vcgt.u8 q0, q4 @ a > c -> -1 , otherwise 0
-+ vcgt.u8 q1, q5
-+ vcgt.u8 q2, q6
-+ vcgt.u8 q3, q7
-+
-+ vsub.s8 q0, q12 @ a = sign(c-a)
-+ vsub.s8 q1, q13
-+ vsub.s8 q2, q14
-+ vsub.s8 q3, q15
-+
-+ vcgt.u8 q12, q4, q8 @ c > b -> -1 , otherwise 0
-+ vcgt.u8 q13, q5, q9
-+ vcgt.u8 q14, q6, q10
-+ vcgt.u8 q15, q7, q11
-+
-+ vsub.s8 q0, q12
-+ vsub.s8 q1, q13
-+ vsub.s8 q2, q14
-+ vsub.s8 q3, q15
-+
-+ vcgt.u8 q12, q8, q4 @ c < b -> -1 , otherwise 0
-+ vcgt.u8 q13, q9, q5
-+ vcgt.u8 q14, q10, q6
-+ vcgt.u8 q15, q11, q7
-+
-+ vadd.s8 q0, q12 @ a = sign(c-a) + sign(c-b)
-+ vadd.s8 q1, q13
-+ vmov.u8 q12, #2
-+ vadd.s8 q2, q14
-+ vadd.s8 q3, q15
-+
-+ vadd.s8 q0, q12
-+ vadd.s8 q1, q12
-+
-+ vld1.8 {d26, d27}, [r5]
-+
-+ vadd.s8 q2, q12
-+ vuzp.8 q0, q1
-+ vmov.u8 q15, #128
-+ vadd.s8 q3, q12 @ a = 2 + sign(c-a) + sign(c-b)
-+
-+ vtbl.8 d0, {d26}, d0
-+ vadd.s8 q12, q4, q15 @ Add -128 so we can use saturating signed add
-+
-+ vtbl.8 d1, {d26}, d1
-+ vadd.s8 q14, q5, q15
-+
-+ vtbl.8 d2, {d27}, d2
-+ vuzp.8 q2, q3
-+
-+ vtbl.8 d3, {d27}, d3
-+
-+ vtbl.8 d4, {d26}, d4
-+ vzip.8 q0, q1
-+
-+ vtbl.8 d5, {d26}, d5
-+ vqadd.s8 q0, q12
-+ vqadd.s8 q1, q14
-+ vadd.s8 q12, q6, q15 @ Add -128 so we can use saturating signed add
-+
-+ vtbl.8 d6, {d27}, d6
-+ vtbl.8 d7, {d27}, d7
-+ vadd.s8 q14, q7, q15 @ Add -128 so we can use saturating signed add
-+ vzip.8 q2, q3
-+
-+ vsub.s8 q0, q15
-+ vqadd.s8 q2, q12
-+ vqadd.s8 q3, q14
-+ vsub.s8 q1, q15
-+ vsub.s8 q2, q15
-+ vsub.s8 q3, q15
-+
-+ bx lr
-+endfunc
-+
-+@ r0 destination address
-+@ r2 stride to post-increment r0 with
-+@ r4 upper clip value
-+@ [r5] translate values
-+@
-+@ a <- c <- b
-+@ a in q0 - q3
-+@ c in q4 - q7
-+@ b in q8 - q11
-+@
-+@ q12-15 used as temp
-+@
-+@ Can be used for both Y & C as we unzip/zip the deltas and
-+@ transform "u/v" separately via d26/d27. For Y d26=d27
-+
-+function edge_64b_body_16
-+
-+ vcgt.u16 q12, q4, q0 // c > a -> -1 , otherwise 0
-+ vcgt.u16 q13, q5, q1
-+ vcgt.u16 q14, q6, q2
-+ vcgt.u16 q15, q7, q3
-+
-+ vcgt.u16 q0, q0, q4 // a > c -> -1 , otherwise 0
-+ vcgt.u16 q1, q1, q5
-+ vcgt.u16 q2, q2, q6
-+ vcgt.u16 q3, q3, q7
-+
-+ vsub.s16 q0, q0, q12 // a = sign(c-a)
-+ vsub.s16 q1, q1, q13
-+ vsub.s16 q2, q2, q14
-+ vsub.s16 q3, q3, q15
-+
-+ vcgt.u16 q12, q4, q8 // c > b -> -1 , otherwise 0
-+ vcgt.u16 q13, q5, q9
-+ vcgt.u16 q14, q6, q10
-+ vcgt.u16 q15, q7, q11
-+
-+ vsub.s16 q0, q0, q12
-+ vsub.s16 q1, q1, q13
-+ vsub.s16 q2, q2, q14
-+ vsub.s16 q3, q3, q15
-+
-+ vcgt.u16 q12, q8, q4 // c < b -> -1 , otherwise 0
-+ vcgt.u16 q13, q9, q5
-+ vcgt.u16 q14, q10, q6
-+ vcgt.u16 q15, q11, q7
-+
-+ vadd.s16 q0, q0, q12 // a = sign(c-a) + sign(c-b)
-+ vadd.s16 q1, q1, q13
-+ vadd.s16 q2, q2, q14
-+ vadd.s16 q3, q3, q15
-+
-+ vmov.u8 q12, #2
-+
-+ vmovn.s16 d0, q0
-+ vmovn.s16 d1, q1
-+ vmovn.s16 d2, q2
-+ vmovn.s16 d3, q3
-+
-+ vldr d26, [r5]
-+
-+ vuzp.8 q0, q1
-+
-+ vldr d27, [r5, #8]
-+
-+ vadd.s8 q0, q0, q12
-+ vadd.s8 q1, q1, q12
-+
-+ vmov.i64 q12, #0
-+
-+ vtbl.8 d0, {d26}, d0
-+ vtbl.8 d1, {d26}, d1
-+ vtbl.8 d2, {d27}, d2
-+ vtbl.8 d3, {d27}, d3
-+
-+ vdup.i16 q13, r4
-+
-+ vzip.8 q0, q1
-+
-+ @ Avoid overwrite whilst widening
-+ vaddw.s8 q2, q6, d2
-+ vaddw.s8 q3, q7, d3
-+ vaddw.s8 q1, q5, d1
-+ vaddw.s8 q0, q4, d0
-+
-+ @ now clip
-+ clip16_4 q2, q3, q1, q0, q12, q13
-+
-+ bx lr
-+endfunc
-+
-+
-+@ a <- c <- b
-+@ a in q0
-+@ c in q1
-+@ b in q2
-+@ Temp q3, q9, q10
-+@
-+@ d16, d17 (q8) xlat U, V
-+@ q14.u8 #2
-+@ q15.u8 #128
-+
-+function edge_16b_body_8
-+ vcgt.u8 q9, q0, q1 @ a > c -> -1 , otherwise 0
-+ vadd.u8 q9, q14, q9
-+ vcgt.u8 q0, q1, q0 @ c > a -> -1 , otherwise 0
-+ vsub.u8 q9, q9, q0
-+ vcgt.u8 q0, q2, q1 @ c < b -> -1 , otherwise 0
-+ vadd.u8 q9, q9, q0
-+ vcgt.u8 q0, q1, q2 @ c > b -> -1 , otherwise 0
-+ vsub.u8 q0, q9, q0
-+
-+ vadd.s8 q3, q1, q15 @ Add -128 so we can use saturating signed add
-+
-+ vuzp.8 d0, d1
-+
-+ vtbl.8 d0, {d16}, d0
-+ vtbl.8 d1, {d17}, d1
-+
-+ vzip.8 d0, d1
-+ vqadd.s8 q0, q3
-+ vsub.s8 q0, q15
-+
-+ bx lr
-+endfunc
-+
-+@ a <- c <- b
-+@ a in q0
-+@ c in q1
-+@ b in q2
-+@ Temp q9
-+@
-+@ q12, #0
-+@ d16, d17 xlat U, V
-+@ q14.u8 #2
-+@ q15.u16 max
-+function edge_16b_body_16
-+ vcgt.u16 q9, q0, q1 @ a > c -> -1 , otherwise 0
-+ vadd.u16 q9, q14, q9
-+ vcgt.u16 q0, q1, q0 @ c > a -> -1 , otherwise 0
-+ vsub.u16 q9, q9, q0
-+ vcgt.u16 q0, q2, q1 @ c < b -> -1 , otherwise 0
-+ vadd.u16 q9, q9, q0
-+ vcgt.u16 q0, q1, q2 @ c > b -> -1 , otherwise 0
-+ vsub.u16 q0, q9, q0
-+
-+ vmovn.s16 d0, q0
-+ @ d1 will have random contents that we transform but
-+ @ that doesn't matter as we then discard them
-+ vuzp.8 d0, d1
-+
-+ vtbl.8 d0, {d16}, d0
-+ vtbl.8 d1, {d17}, d1
-+
-+ vzip.8 d0, d1
-+
-+ vaddw.s8 q0, q1, d0
-+
-+ @ now clip
-+ vmax.s16 q0, q12
-+ vmin.s16 q0, q15
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_sao_edge_[c_]xx_neon(
-+@ uint8_t *_dst, [r0]
-+@ const uint8_t *_src, [r1]
-+@ ptrdiff_t stride_dst, [r2]
-+@ const int16_t *_sao_offset_val_u, [r3]
-+@ const int16_t *_sao_offset_val_v, [sp, #0] // Chroma only
-+@ int eo, [sp, #sp_base + 0]
-+@ int width, [sp, #sp_base + 4]
-+@ int height) [sp, #sp_base + 8]
-+
-+@ Jumps via jump_tab with
-+@ uint8_t *_dst, [r0]
-+@ const uint8_t *_src, [r1]
-+@ ptrdiff_t stride_dst, [r2]
-+@ EDGE_SRC_STRIDE [r3]
-+@ (1 << \bit_depth) - 1 [r4]
-+@ * xlat_table [r5] // setup_64b only
-+@ int height [r12]
-+@
-+@ 0 [q12] // > 8 bit
-+@ 2 [q14]
-+@ 128 [q15] // = 8 bit
-+@ r4 [q15] // > 8 bit
-+
-+.macro edge_xxb_init, bit_depth, is_chroma, jump_tab, setup_64b = 0, setup_16b = 0, check_w4 = 0, do2 = 0, xjump = 0
-+
-+@ Build translate registers
-+@ As translate values can only be 0-4 we don't care about junk in the rest
-+@ of the register
-+.if \is_chroma
-+ ldr ip, [sp, #0]
-+ push {r4-r6, lr} @ 16 bytes
-+ vld1.8 {d16[2]}, [r3]
-+ add r3, r3, #2
-+ vld1.8 {d17[2]}, [ip]
-+ add ip, ip, #2
-+ vld1.8 {d16[0]}, [r3]
-+ add r3, r3, #2
-+ vld1.8 {d17[0]}, [ip]
-+ add ip, ip, #2
-+ vld1.8 {d16[1]}, [r3]
-+ add r3, r3, #2
-+ vld1.8 {d17[1]}, [ip]
-+ add ip, ip, #2
-+ vld1.8 {d16[3]}, [r3]
-+ add r3, r3, #2
-+ vld1.8 {d17[3]}, [ip]
-+ add ip, ip, #2
-+ vld1.8 {d16[4]}, [r3]
-+ vld1.8 {d17[4]}, [ip]
-+ movw r3, EDGE_SRC_STRIDE
-+.set sp_base, 20
-+.else
-+ add ip, r3, #4
-+ vld1.8 {d16[1]}, [r3]
-+ add r3, r3, #2
-+ vld1.8 {d17[0]}, [ip]
-+ add ip, ip, #2
-+ vld1.8 {d16[0]}, [r3]
-+ add r3, r3, #6
-+ vld1.8 {d17[1]}, [ip]
-+ vld1.8 {d16[2]}, [r3]
-+ movw r3, EDGE_SRC_STRIDE
-+ push {r4-r6, lr} @ 16 bytes
-+ vzip.8 d16, d17
-+ vmov d17, d16
-+.set sp_base, 16
-+.endif
-+
-+@ If setup_64b we need the xlat table on the stack
-+.if \setup_64b
-+ sub r5, sp, #16
-+.endif
-+
-+@ Get jump address
-+@ We have a special case for width 4 as the calling code doesn't detect it
-+@ If we may have w4 then we add a 2nd jump table after the 1st
-+.if \check_w4
-+ ldr r12, [sp, #sp_base + 4] @ width
-+ adr r6, \jump_tab
-+ ldr lr, [sp, #sp_base + 0] @ e0
-+ cmp r12, #8
-+ it lt
-+ addlt r6, #16
-+.else
-+ ldr lr, [sp, #sp_base + 0] @ e0
-+ adr r6, \jump_tab
-+.endif
-+
-+ ldr r12, [sp, #sp_base + 8] @ height
-+
-+.if \bit_depth > 8
-+ movw r4, (1 << \bit_depth) - 1
-+.endif
-+.if \setup_16b
-+.if \bit_depth > 8
-+ vmov.i64 q12, #0
-+ vdup.16 q15, r4
-+ vmov.u16 q14, #2
-+.else
-+ vmov.u8 q15, #128
-+ vmov.u8 q14, #2
-+.endif
-+.endif
-+
-+@ If setup_64b we need q4-q7 saved.
-+.if \setup_64b
-+ vpush {q4-q8} @ 80 bytes, q8 pushed first
-+.set sp_base, sp_base + 80
-+.endif
-+
-+ ldr r6, [r6, lr, lsl #2]
-+
-+@ For 16 bit width 64 (or chroma 32) we need to do this in 2 passes
-+.if \do2
-+ push {r0, r1, r6, r12}
-+.if jent_pic
-+ bl 98f
-+.else
-+ blx r6
-+.endif
-+ pop {r0, r1, r6, r12}
-+
-+ add r0, #64
-+ add r1, #64
-+.endif
-+
-+.if jent_pic
-+ bl 98f
-+.else
-+ blx r6
-+.endif
-+
-+@ Tidy up & return
-+.if \setup_64b
-+ vpop {q4-q8} @ spurious but harmless load of q8
-+.endif
-+ pop {r4-r6, pc}
-+
-+.if jent_pic && !\xjump
-+@ Magic label - used as 98b in jent macro
-+98:
-+ add pc, r6
-+.endif
-+.endm
-+
-+
-+.macro edge_16b_init, bit_depth, is_chroma, check_w4, jump_tab
-+ edge_xxb_init \bit_depth, \is_chroma, \jump_tab, check_w4=\check_w4, setup_16b=1
-+.endm
-+
-+.macro edge_64b_init, bit_depth, is_chroma, do2, jump_tab, xjump=0
-+ edge_xxb_init \bit_depth, \is_chroma, \jump_tab, do2=\do2, setup_64b=1, xjump=\xjump
-+.endm
-+
-+
-+.macro edge_64b_e0, body_fn, pb
-+ sub r1, #8
-+ mov r6, lr
-+1: vldm r1, {d7-d16}
-+ // load a
-+ vext.8 q0, q3, q4, #(16 - \pb)
-+ add r1, r3
-+ vext.8 q1, q4, q5, #(16 - \pb)
-+ subs r12, #1
-+ vext.8 q2, q5, q6, #(16 - \pb)
-+ vext.8 q3, q6, q7, #(16 - \pb)
-+ pld [r1]
-+ // load b
-+ vext.8 q11, q7, q8, #\pb @ Avoid overwrite
-+ pld [r1, #64]
-+ vext.8 q8, q4, q5, #\pb
-+ vext.8 q9, q5, q6, #\pb
-+ vext.8 q10, q6, q7, #\pb
-+ bl \body_fn
-+ vstm r0, {q0-q3}
-+ add r0, r0, r2
-+ bgt 1b
-+ bx r6
-+.endm
-+
-+.macro edge_32bx2_e0, body_fn, pb
-+ add r6, r1, r3
-+ push {r7,lr}
-+ sub r1, #8
-+ add r7, r0, r2
-+ lsl r2, #1
-+1: vldmia r1, {d7-d12}
-+ // load a
-+ vext.8 q0, q3, q4, #16 - \pb
-+ add r1, r1, r3, lsl #1
-+ vext.8 q1, q4, q5, #16 - \pb
-+ subs r12, #2
-+ // load b
-+ vext.8 q8, q4, q5, #\pb
-+ vext.8 q9, q5, q6, #\pb
-+ vldr d25, [r6, #-8]
-+ vldmia r6, {d12-d15}
-+ vldr d26, [r6, #32]
-+ // load a
-+ vext.8 q2, q12, q6, #16 - \pb
-+ add r6, r6, r3, lsl #1
-+ vext.8 q3, q6, q7, #16 - \pb
-+ // load b
-+ vext.8 q10, q6, q7, #\pb
-+ vext.8 q11, q7, q13, #\pb
-+ bl \body_fn
-+ vst1.8 {q0-q1}, [r0, :256], r2
-+ vst1.8 {q2-q3}, [r7, :256], r2
-+ bgt 1b
-+ pop {r7,pc}
-+.endm
-+
-+.macro edge_16b_e0, body_fn, pb
-+ sub r1, #8
-+ mov r6, lr
-+1: vldmia r1, {d1-d4}
-+ add r1, r3
-+ subs r12, #1
-+ vext.8 q0, q0, q1, #16 - \pb
-+ vext.8 q2, q1, q2, #\pb
-+
-+ bl \body_fn
-+ vst1.8 {q0}, [r0, :128], r2
-+ bgt 1b
-+ bx r6
-+.endm
-+
-+.macro edge_8bx2_e0, body_fn, pb
-+ add r6, r1, r3
-+ push {r7,lr}
-+ sub r1, #8
-+ add r7, r0, r2
-+ lsl r2, #1
-+1: vldmia r1, {d1-d2}
-+ vldmia r6, {d3-d4}
-+ vldr d6, [r1, #16]
-+ subs r12, #2
-+ vldr d7, [r6, #-8]
-+ add r1, r1, r3, lsl #1
-+ vext.8 d0, d1, d2, #8 - \pb
-+ add r6, r6, r3, lsl #1
-+ vext.8 d5, d3, d4, #\pb
-+ vext.8 d4, d2, d6, #\pb
-+ vext.8 d1, d7, d3, #8 - \pb
-+
-+ bl \body_fn
-+ vst1.8 {d0}, [r0, :64], r2
-+ vst1.8 {d1}, [r7, :64], r2
-+ bgt 1b
-+ pop {r7,pc}
-+.endm
-+
-+.macro edge_4bx4_e0, body_fn, pb
-+ add r6, r1, r3
-+ push {r7,lr}
-+ add r7, r0, r2
-+ lsl r2, #1
-+
-+ tst r1, #4
-+ bne 2f
-+1: // r1 (and assumed r6) are 64-bit aligned
-+ vldr d2, [r1]
-+ vldr d0, [r1, #-8]
-+ add r1, r1, r3, lsl #1
-+ vldr d20, [r6]
-+ subs r12, #4
-+ vldr d18, [r6, #-8]
-+ add r6, r6, r3, lsl #1
-+ vldr d3, [r1]
-+ vshr.u64 d4, d2, #\pb * 8
-+ vldr d1, [r1, #-8]
-+ add r1, r1, r3, lsl #1
-+ vldr d21, [r6]
-+ vext.8 d0, d0, d2, #8 - \pb
-+ vldr d19, [r6,#-8]
-+ add r6, r6, r3, lsl #1
-+ vshr.u64 d22, d20, #\pb * 8
-+ vext.8 d18, d18, d20, #8 - \pb
-+ vshr.u64 d5, d3, #\pb * 8
-+ vext.8 d1, d1, d3, #8 - \pb
-+ vshr.u64 d23, d21, #\pb * 8
-+ vext.8 d19, d19, d21, #8 - \pb
-+ vsli.64 q1, q10, #32
-+ vsli.64 q2, q11, #32
-+ vsli.64 q0, q9, #32
-+
-+ bl \body_fn
-+ vst1.32 {d0[0]}, [r0, :32], r2
-+ vst1.32 {d0[1]}, [r7, :32], r2
-+ vst1.32 {d1[0]}, [r0, :32], r2
-+ vst1.32 {d1[1]}, [r7, :32], r2
-+ bgt 1b
-+ pop {r7,pc}
-+
-+2: // r1 (and assumed r6) are 32-bit but not 64-bit aligned
-+ vldr d20, [r1, #-4]
-+ vldr d22, [r1, #4]
-+ add r1, r1, r3, lsl #1
-+ vldr d2, [r6, #-4]
-+ subs r12, #4
-+ vldr d4, [r6, #4]
-+ add r6, r6, r3, lsl #1
-+ vldr d21, [r1, #-4]
-+ vshl.i64 d18, d20, #\pb * 8
-+ vldr d23, [r1, #4]
-+ add r1, r1, r3, lsl #1
-+ vldr d3, [r6, #-4]
-+ vext.8 d22, d20, d22, #\pb
-+ vldr d5, [r6, #4]
-+ add r6, r6, r3, lsl #1
-+ vshl.i64 d0, d2, #\pb * 8
-+ vext.8 d4, d2, d4, #\pb
-+ vshl.i64 d19, d21, #\pb * 8
-+ vext.8 d23, d21, d23, #\pb
-+ vshl.i64 d1, d3, #\pb * 8
-+ vext.8 d5, d3, d5, #\pb
-+ vsri.64 q1, q10, #32
-+ vsri.64 q0, q9, #32
-+ vsri.64 q2, q11, #32
-+
-+ bl \body_fn
-+ vst1.32 {d0[0]}, [r0, :32], r2
-+ vst1.32 {d0[1]}, [r7, :32], r2
-+ vst1.32 {d1[0]}, [r0, :32], r2
-+ vst1.32 {d1[1]}, [r7, :32], r2
-+ bgt 2b
-+ pop {r7,pc}
-+.endm
-+
-+
-+.macro edge_64b_e1, body_fn
-+ sub r1, r3
-+ push {lr}
-+ add r6, r1, #32
-+ // load a
-+ vld1.8 {q0-q1}, [r1, :256], r3
-+ vld1.8 {q2-q3}, [r6, :256], r3
-+ // load c
-+ vld1.8 {q4-q5}, [r1, :256], r3
-+ vld1.8 {q6-q7}, [r6, :256], r3
-+1: // load b
-+ vld1.8 {q8-q9}, [r1, :256], r3
-+ subs r12, #1
-+ vld1.8 {q10-q11}, [r6, :256], r3
-+ bl \body_fn
-+ vstm r0, {q0-q3}
-+ // copy c to a
-+ vmov.64 q0, q4
-+ pld [r1, r3]
-+ vmov.64 q1, q5
-+ it le
-+ pople {lr}
-+ vmov.64 q2, q6
-+ it le
-+ bxle lr
-+ vmov.64 q3, q7
-+ add r0, r0, r2
-+ // copy b to c
-+ vmov.64 q4, q8
-+ vmov.64 q5, q9
-+ vmov.64 q6, q10
-+ vmov.64 q7, q11
-+ b 1b
-+.endm
-+
-+.macro edge_32bx2_e1, body_fn
-+ sub r6, r1, r3
-+ vld1.8 {q2-q3}, [r1, :256], r3
-+ vld1.8 {q0-q1}, [r6, :256]
-+ mov r6, lr
-+
-+1: @ Given the data duplication here we could obviously do better than
-+ @ using the generic body_fn but it almost certainly isn't worth it
-+ vld1.8 {q8-q9}, [r1, :256], r3
-+ subs r12, #2
-+ vmov q4, q2
-+ vmov q5, q3
-+ vld1.8 {q10-q11}, [r1, :256], r3
-+ vmov q6, q8
-+ vmov q7, q9
-+
-+ bl \body_fn
-+
-+ vst1.8 {q0-q1}, [r0, :256], r2
-+ // copy b to a
-+ vmov q0, q8
-+ vmov q1, q9
-+ vst1.8 {q2-q3}, [r0, :256], r2
-+ vmov q2, q10
-+ it le
-+ bxle r6
-+ vmov q3, q11
-+ b 1b
-+.endm
-+
-+.macro edge_16b_e1, body_fn
-+ sub r6, r1, r3
-+ // load c
-+ vld1.8 {q1}, [r1, :128], r3
-+ // load a
-+ vld1.8 {q0}, [r6, :128]
-+ mov r6, lr
-+1: // load b
-+ vld1.8 {q2}, [r1, :128], r3
-+ bl \body_fn
-+ vst1.8 {q0}, [r0, :128], r2
-+ subs r12, #1
-+ // copy c to a
-+ vmov.64 q0, q1
-+ it le
-+ bxle r6
-+ // copy b to c
-+ vmov.64 q1, q2
-+ b 1b
-+.endm
-+
-+.macro edge_8bx2_e1, body_fn
-+ sub r6, r1, r3
-+ lsl r3, #1
-+ push {r7, lr}
-+ vld1.8 {d1}, [r1, :64], r3
-+ vld1.8 {d0}, [r6, :64], r3
-+ add r7, r0, r2
-+ lsl r2, #1
-+1: @ Given the data duplication here we could obviously do better than
-+ @ using the generic body_fn but it almost certainly isn't worth it
-+ vld1.8 {d4}, [r6, :64], r3
-+ vmov d2, d1
-+ vld1.8 {d5}, [r1, :64], r3
-+ subs r12, #2
-+ vmov d3, d4
-+
-+ bl \body_fn
-+
-+ vst1.8 {d0}, [r0, :64], r2
-+ vst1.8 {d1}, [r7, :64], r2
-+
-+ // copy b to a
-+ vmov q0, q2
-+ bgt 1b
-+ pop {r7, pc}
-+.endm
-+
-+.macro edge_4bx4_e1, body_fn
-+ sub r6, r1, r3
-+ lsl r3, #1
-+ push {r7, lr}
-+ vld1.32 {d0[1]}, [r1, :32], r3
-+ add r7, r0, r2
-+ vld1.32 {d0[0]}, [r6, :32], r3
-+ lsl r2, #1
-+ vld1.32 {d4[1]}, [r1, :32], r3
-+ vld1.32 {d4[0]}, [r6, :32], r3
-+ vld1.32 {d5[1]}, [r1, :32], r3
-+ vld1.32 {d5[0]}, [r6, :32], r3
-+ vmov d1, d4
-+ vext.32 d2, d0, d4, #1
-+ subs r12, #4
-+ vmov d22, d5
-+ vext.32 d3, d4, d5, #1
-+ b 2f
-+
-+1: vst1.32 {d0[0]}, [r0, :32], r2
-+ vext.32 d2, d22, d4, #1
-+ vst1.32 {d0[1]}, [r7, :32], r2
-+ vmov d0, d22
-+ vst1.32 {d1[0]}, [r0, :32], r2
-+ vext.32 d3, d4, d5, #1
-+ vst1.32 {d1[1]}, [r7, :32], r2
-+ vmov d1, d4
-+ vmov d22, d5
-+2: @ Given the data duplication here we could probably do better than
-+ @ using the generic body_fn but it almost certainly isn't worth it
-+ bl \body_fn
-+ ble 3f
-+ vld1.32 {d4[0]}, [r6, :32], r3
-+ subs r12, #4
-+ vld1.32 {d4[1]}, [r1, :32], r3
-+ vld1.32 {d5[0]}, [r6, :32], r3
-+ vld1.32 {d5[1]}, [r1, :32], r3
-+ b 1b
-+
-+3: vst1.32 {d0[0]}, [r0, :32], r2
-+ vst1.32 {d0[1]}, [r7, :32], r2
-+ vst1.32 {d1[0]}, [r0, :32]
-+ vst1.32 {d1[1]}, [r7, :32]
-+ pop {r7, pc}
-+.endm
-+
-+.macro edge_64b_e2, body_fn, pb
-+ push {lr}
-+ sub r6, r1, r3
-+ // load c and a
-+ vld1.8 {q4-q5}, [r1, :128]
-+ vldr d25, [r6, #-8]
-+ vldmia r6, {d16-d23}
-+ vext.8 q0, q12, q8, #16 - \pb
-+ add r6, r1, #32
-+ vext.8 q1, q8, q9, #16 - \pb
-+ add r1, r1, r3
-+ vext.8 q2, q9, q10, #16 - \pb
-+ vld1.8 {q6-q7}, [r6, :128]
-+ sub r6, r1, r3
-+ vext.8 q3, q10, q11, #16 - \pb
-+
-+1: // load b
-+ vldmia r1, {d16-d24}
-+ vext.8 q8, q8, q9, #\pb
-+ pld [r1, r3]
-+ vext.8 q9, q9, q10, #\pb
-+ subs r12, #1
-+ vext.8 q10, q10, q11, #\pb
-+ vext.8 q11, q11, q12, #\pb
-+ bl \body_fn
-+ // next a is mostly available in c
-+ vldr d25, [r6, #-8]
-+ vstmia r0, {q0-q3}
-+ vext.8 q3, q6, q7, #16 - \pb
-+ it le
-+ pople {lr}
-+ vext.8 q2, q5, q6, #16 - \pb
-+ it le
-+ bxle lr
-+ vext.8 q1, q4, q5, #16 - \pb
-+ add r6, r6, r3
-+ vext.8 q0, q12, q4, #16 - \pb
-+ add r0, r0, r2
-+ // next c is mostly available in b
-+ vldr d8, [r1]
-+ vext.8 d9, d16, d17, #8 - \pb
-+ vext.8 q5, q8, q9, #16 - \pb
-+ add r1, r1, r3
-+ vext.8 q6, q9, q10, #16 - \pb
-+ pld [r6, #-8]
-+ vext.8 q7, q10, q11, #16 - \pb
-+ b 1b
-+.endm
-+
-+.macro edge_32bx2_e2, body_fn, pb
-+ sub r6, r1, r3
-+ push {r7, lr}
-+ add r7, r0, r2
-+ lsl r2, #1
-+ // load a and first 32b of c
-+ vld1.8 {q4-q5}, [r1, :256]
-+ vldr d25, [r6, #-8]
-+ vld1.8 {q13-q14}, [r6, :256]
-+ vldr d31, [r1, #-8]
-+ add r6, r6, r3, lsl #1
-+ vext.8 q0, q12, q13, #16 - \pb
-+ add r1, r1, r3, lsl #1
-+ vext.8 q1, q13, q14, #16 - \pb
-+ vext.8 q2, q15, q4, #16 - \pb
-+ vext.8 q3, q4, q5, #16 - \pb
-+1:
-+ // load second 32b of c and second 32b of b
-+ vldmia r6, {d12-d16}
-+ vldmia r1, {d20-d24}
-+ // first 32b of b is mostly available in second 32b of c
-+ vext.8 q9, q7, q8, #\pb
-+ subs r12, #2
-+ vext.8 q8, q6, q7, #\pb
-+ vext.8 q10, q10, q11, #\pb
-+ vext.8 q11, q11, q12, #\pb
-+
-+ bl \body_fn
-+
-+ vst1.8 {q0-q1}, [r0, :256], r2
-+ vst1.8 {q2-q3}, [r7, :256], r2
-+ ble 2f
-+
-+ vldr d25, [r6, #-8]
-+ add r6, r6, r3, lsl #1
-+ vldr d8, [r1]
-+ vext.8 d9, d20, d21, #8 - \pb
-+ vldr d31, [r1, #-8]
-+ add r1, r1, r3, lsl #1
-+ // first 32b of a is mostly available in second 32b of c
-+ vext.8 q1, q6, q7, #16 - \pb
-+ vext.8 q0, q12, q6, #16 - \pb
-+ // first 32b of c is mostly available in second 32b of b
-+ vext.8 q5, q10, q11, #16 - \pb
-+ // second 32b of a is mostly available in first 32b of c
-+ vext.8 q2, q15, q4, #16 - \pb
-+ vext.8 q3, q4, q5, #16 - \pb
-+ b 1b
-+
-+2: pop {r7, pc}
-+.endm
-+
-+.macro edge_16b_e2, body_fn, pb
-+ push {lr}
-+ sub r6, r1, r3
-+ vld1.8 {q1}, [r1, :128], r3
-+ vldr d19, [r6, #-8]
-+ vld1.8 {q10}, [r6, :128], r3
-+
-+1: vldmia r1, {d4-d6}
-+ vext.8 q0, q9, q10, #16 - \pb
-+ subs r12, #1
-+ vext.8 q2, q2, q3, #\pb
-+ bl \body_fn
-+ vst1.8 {q0}, [r0, :128], r2
-+ ble 2f
-+ vmov q10, q1
-+ vldr d2, [r1]
-+ add r1, r1, r3
-+ vldr d19, [r6, #-8]
-+ add r6, r6, r3
-+ vext.8 d3, d4, d5, #8 - \pb
-+ b 1b
-+
-+2: pop {pc}
-+.endm
-+
-+.macro edge_8bx2_e2, body_fn, pb
-+ sub r6, r1, r3
-+ push {r7, lr}
-+ add r7, r0, r2
-+ lsl r2, #1
-+ vldr d18, [r6, #-8]
-+ vldr d19, [r6]
-+ add r6, r6, r3, lsl #1
-+ vldr d20, [r1, #-8]
-+ vldr d2, [r1]
-+ add r1, r1, r3, lsl #1
-+ vldmia r6, {d3-d4}
-+ vld1.8 {d21-d22}, [r1, :128]
-+
-+1: vext.8 d0, d18, d19, #8 - \pb
-+ vext.8 d4, d3, d4, #\pb
-+ vext.8 d1, d20, d2, #8 - \pb
-+ subs r12, #2
-+ vext.8 d5, d21, d22, #\pb
-+
-+ bl \body_fn
-+
-+ vst1.8 {d0}, [r0, :64], r2
-+ vst1.8 {d1}, [r7, :64], r2
-+ ble 2f
-+
-+ vldr d18, [r6, #-8]
-+ add r6, r6, r3, lsl #1
-+ vldr d20, [r1, #-8]
-+ vmov d19, d3
-+ vldr d2, [r1]
-+ add r1, r1, r3, lsl #1
-+ vldmia r6, {d3-d4}
-+ vld1.8 {d21-d22}, [r1, :128]
-+ b 1b
-+
-+2: pop {r7, pc}
-+.endm
-+
-+.macro edge_4bx4_e2, body_fn, pb
-+ sub r6, r1, r3
-+ push {r7-r9, lr}
-+ add r8, r1, r3
-+ sub r6, r6, #\pb
-+ add r8, r8, #\pb
-+ add r7, r0, r2
-+ lsl r2, #1
-+
-+1: vld1.32 {d0[0]}, [r6], r3
-+ subs r12, #4
-+ vld1.32 {d2[0]}, [r1], r3
-+ vld1.32 {d4[0]}, [r8], r3
-+ vld1.32 {d0[1]}, [r6], r3
-+ vld1.32 {d2[1]}, [r1], r3
-+ vld1.32 {d4[1]}, [r8], r3
-+ vld1.32 {d1[0]}, [r6], r3
-+ vld1.32 {d3[0]}, [r1], r3
-+ vld1.32 {d5[0]}, [r8], r3
-+ vld1.32 {d1[1]}, [r6], r3
-+ vld1.32 {d3[1]}, [r1], r3
-+ vld1.32 {d5[1]}, [r8], r3
-+
-+ bl \body_fn
-+
-+ vst1.32 {d0[0]}, [r0, :32], r2
-+ vst1.32 {d0[1]}, [r7, :32], r2
-+ vst1.32 {d1[0]}, [r0, :32], r2
-+ vst1.32 {d1[1]}, [r7, :32], r2
-+ bgt 1b
-+
-+ pop {r7-r9,pc}
-+.endm
-+
-+.macro edge_64b_e3, body_fn, pb
-+ push {lr}
-+ sub r6, r1, r3
-+ // load c and a
-+ vld1.8 {q4-q5}, [r1, :128]
-+ vldmia r6, {d16-d24}
-+ vext.8 q0, q8, q9, #\pb
-+ add r6, r1, #32
-+ vext.8 q1, q9, q10, #\pb
-+ add r1, r1, r3
-+ vext.8 q2, q10, q11, #\pb
-+ vld1.8 {q6-q7}, [r6, :128]
-+ sub r6, r1, r3
-+ vext.8 q3, q11, q12, #\pb
-+
-+1: // load b
-+ vldr d17, [r1, #-8]
-+ vldmia r1, {d18-d25}
-+ vext.8 q8, q8, q9, #16 - \pb
-+ pld [r1, r3]
-+ vext.8 q9, q9, q10, #16 - \pb
-+ subs r12, #1
-+ vext.8 q10, q10, q11, #16 - \pb
-+ vext.8 q11, q11, q12, #16 - \pb
-+ bl \body_fn
-+ // next a is mostly available in c
-+ vldr d24, [r6, #64]
-+ vstmia r0, {q0-q3}
-+ vext.8 q0, q4, q5, #\pb
-+ it le
-+ pople {lr}
-+ vext.8 q1, q5, q6, #\pb
-+ it le
-+ bxle lr
-+ vext.8 q2, q6, q7, #\pb
-+ add r6, r6, r3
-+ vext.8 q3, q7, q12, #\pb
-+ add r0, r0, r2
-+ // next c is mostly available in b
-+ vext.8 d14, d22, d23, #\pb
-+ vldr d15, [r1, #56]
-+ vext.8 q4, q8, q9, #\pb
-+ add r1, r1, r3
-+ vext.8 q5, q9, q10, #\pb
-+ vext.8 q6, q10, q11, #\pb
-+ b 1b
-+.endm
-+
-+.macro edge_32bx2_e3, body_fn, pb
-+ sub r6, r1, r3
-+ push {r7, lr}
-+ add r7, r0, r2
-+ lsl r2, #1
-+ // load a and first 32b of c
-+ vldmia r1, {d8-d12}
-+ vldmia r6, {d24-d28}
-+ vext.8 q2, q4, q5, #\pb
-+ add r6, r6, r3, lsl #1
-+ vext.8 q3, q5, q6, #\pb
-+ add r1, r1, r3, lsl #1
-+ vext.8 q0, q12, q13, #\pb
-+ vext.8 q1, q13, q14, #\pb
-+1:
-+ // load second 32b of c and second 32b of b
-+ vldr d25, [r6, #-8]
-+ subs r12, #2
-+ vldmia r6, {d12-d15}
-+ vldr d27, [r1, #-8]
-+ vldmia r1, {d20-d23}
-+ // first 32b of b is mostly available in second 32b of c
-+ vext.8 q8, q12, q6, #16 - \pb
-+ vext.8 q9, q6, q7, #16 - \pb
-+ vext.8 q11, q10, q11, #16 - \pb
-+ vext.8 q10, q13, q10, #16 - \pb
-+
-+ bl \body_fn
-+
-+ vst1.8 {q0-q1}, [r0, :256], r2
-+ vst1.8 {q2-q3}, [r7, :256], r2
-+ ble 2f
-+
-+ vldr d24, [r6, #32]
-+ add r6, r6, r3, lsl #1
-+ vldr d11, [r1, #24]
-+ vext.8 d10, d22, d23, #\pb
-+ vldr d30, [r1, #32]
-+ add r1, r1, r3, lsl #1
-+ // first 32b of a is mostly available in second 32b of c
-+ vext.8 q0, q6, q7, #\pb
-+ vext.8 q1, q7, q12, #\pb
-+ // first 32b of c is mostly available in second 32b of b
-+ vext.8 q4, q10, q11, #\pb
-+ // second 32b of a is mostly available in first 32b of c
-+ vext.8 q3, q5, q15, #\pb
-+ vext.8 q2, q4, q5, #\pb
-+ b 1b
-+
-+2: pop {r7, pc}
-+.endm
-+
-+.macro edge_16b_e3, body_fn, pb
-+ push {lr}
-+ sub r6, r1, r3
-+ vld1.8 {q1}, [r1, :128], r3
-+ vldmia r6, {d18-d20}
-+ add r6, r6, r3
-+
-+1: vldr d5, [r1, #-8]
-+ vld1.8 {q3}, [r1, :128]
-+ subs r12, #1
-+ vext.8 q0, q9, q10, #\pb
-+ vext.8 q2, q2, q3, #16 - \pb
-+ bl \body_fn
-+ vst1.8 {q0}, [r0, :128], r2
-+ ble 2f
-+ vmov q9, q1
-+ vldr d3, [r1, #8]
-+ add r1, r1, r3
-+ vldr d20, [r6, #16]
-+ add r6, r6, r3
-+ vext.8 d2, d4, d5, #\pb
-+ b 1b
-+
-+2: pop {pc}
-+.endm
-+
-+.macro edge_8bx2_e3, body_fn, pb
-+ sub r6, r1, r3
-+ push {r7, lr}
-+ add r7, r0, r2
-+ lsl r2, #1
-+ vld1.8 {d18-d19}, [r6]
-+ add r6, r6, r3, lsl #1
-+ vldr d20, [r1, #8]
-+ vldr d2, [r1]
-+ add r1, r1, r3, lsl #1
-+ vldr d4, [r6, #-8]
-+ vldr d3, [r6]
-+ vldr d21, [r1, #-8]
-+ vldr d22, [r1]
-+
-+1: vext.8 d0, d18, d19, #\pb
-+ vext.8 d4, d4, d3, #8 - \pb
-+ vext.8 d1, d2, d20, #\pb
-+ subs r12, #2
-+ vext.8 d5, d21, d22, #8 - \pb
-+
-+ bl \body_fn
-+
-+ vst1.8 {d0}, [r0, :64], r2
-+ vst1.8 {d1}, [r7, :64], r2
-+ ble 2f
-+
-+ vldr d19, [r6, #8]
-+ add r6, r6, r3, lsl #1
-+ vldr d20, [r1, #8]
-+ vmov d18, d3
-+ vldr d2, [r1]
-+ add r1, r1, r3, lsl #1
-+ vldr d4, [r6, #-8]
-+ vldr d3, [r6]
-+ vldr d21, [r1, #-8]
-+ vldr d22, [r1]
-+ b 1b
-+
-+2: pop {r7, pc}
-+.endm
-+
-+.macro edge_4bx4_e3, body_fn, pb
-+ @ e3 is the same as e2 but with the X offset reversed
-+ edge_4bx4_e2 \body_fn, (-\pb)
-+.endm
-+
-+@ Jump table entry - if in thumb mode the bottom bit must be set
-+@ ? There is probably a real asm instruction to do this but I haven't found it
-+.macro jent lab
-+.if jent_pic
-+@ Could use .short here but due to A32 not supporting ldrh [lsl#1] it is
-+@ simpler and clearer in the code to stick with .word
-+T .word (0 + \lab) - (4 + 98b)
-+A .word (0 + \lab) - (8 + 98b)
-+.else
-+T .word 1 + \lab
-+A .word \lab
-+.endif
-+.endm
-+
-+.macro edge_64b_bodies, body_fn, pb
-+ jent 0f
-+ jent 10f
-+ jent 20f
-+ jent 30f
-+
-+0: edge_64b_e0 \body_fn, \pb
-+10: edge_64b_e1 \body_fn
-+20: edge_64b_e2 \body_fn, \pb
-+30: edge_64b_e3 \body_fn, \pb
-+.endm
-+
-+.macro edge_32bx2_bodies, body_fn, pb
-+ jent 0f
-+ jent 10f
-+ jent 20f
-+ jent 30f
-+
-+0: edge_32bx2_e0 \body_fn, \pb
-+10: edge_32bx2_e1 \body_fn
-+20: edge_32bx2_e2 \body_fn, \pb
-+30: edge_32bx2_e3 \body_fn, \pb
-+.endm
-+
-+.macro edge_16b_bodies, body_fn, pb
-+ jent 0f
-+ jent 10f
-+ jent 20f
-+ jent 30f
-+
-+0: edge_16b_e0 \body_fn, \pb
-+10: edge_16b_e1 \body_fn
-+20: edge_16b_e2 \body_fn, \pb
-+30: edge_16b_e3 \body_fn, \pb
-+.endm
-+
-+.macro edge_32bx2_16b_bodies, body_fn_64b, body_fn_16b, pb
-+ jent 0f
-+ jent 10f
-+ jent 20f
-+ jent 30f
-+ jent 5f
-+ jent 15f
-+ jent 25f
-+ jent 35f
-+
-+0: edge_32bx2_e0 \body_fn_64b, \pb
-+10: edge_32bx2_e1 \body_fn_64b
-+20: edge_32bx2_e2 \body_fn_64b, \pb
-+30: edge_32bx2_e3 \body_fn_64b, \pb
-+5: edge_16b_e0 \body_fn_16b, \pb
-+15: edge_16b_e1 \body_fn_16b
-+25: edge_16b_e2 \body_fn_16b, \pb
-+35: edge_16b_e3 \body_fn_16b, \pb
-+.endm
-+
-+.macro edge_16b_8bx2_bodies, body_fn, pb
-+ jent 0f
-+ jent 10f
-+ jent 20f
-+ jent 30f
-+ jent 5f
-+ jent 15f
-+ jent 25f
-+ jent 35f
-+
-+0: edge_16b_e0 \body_fn, \pb
-+10: edge_16b_e1 \body_fn
-+20: edge_16b_e2 \body_fn, \pb
-+30: edge_16b_e3 \body_fn, \pb
-+5: edge_8bx2_e0 \body_fn, \pb
-+15: edge_8bx2_e1 \body_fn
-+25: edge_8bx2_e2 \body_fn, \pb
-+35: edge_8bx2_e3 \body_fn, \pb
-+.endm
-+
-+.macro edge_8bx2_4bx4_bodies, body_fn, pb
-+ jent 0f
-+ jent 10f
-+ jent 20f
-+ jent 30f
-+ jent 5f
-+ jent 15f
-+ jent 25f
-+ jent 35f
-+
-+0: edge_8bx2_e0 \body_fn, \pb
-+10: edge_8bx2_e1 \body_fn
-+20: edge_8bx2_e2 \body_fn, \pb
-+30: edge_8bx2_e3 \body_fn, \pb
-+5: edge_4bx4_e0 \body_fn, \pb
-+15: edge_4bx4_e1 \body_fn
-+25: edge_4bx4_e2 \body_fn, \pb
-+35: edge_4bx4_e3 \body_fn, \pb
-+.endm
-+
-+@ void ff_hevc_rpi_sao_edge_8_neon_8(
-+@ uint8_t *_dst, [r0]
-+@ uint8_t *_src, [r1]
-+@ int stride_dst, [r2]
-+@ int16_t *_sao_offset_val, [r3]
-+@ int eo, [sp, #0]
-+@ int width, [sp, #4]
-+@ int height) [sp, #8]
-+
-+function ff_hevc_rpi_sao_edge_8_neon_8, export=1
-+ edge_16b_init 8, 0, 1, 99f
-+99:
-+ edge_8bx2_4bx4_bodies edge_16b_body_8, 1
-+endfunc
-+
-+@ void ff_hevc_rpi_sao_edge_16_neon_8(
-+@ uint8_t *_dst, [r0]
-+@ uint8_t *_src, [r1]
-+@ int stride_dst, [r2]
-+@ int16_t *_sao_offset_val, [r3]
-+@ int eo, [sp, #0]
-+@ int width, [sp, #4]
-+@ int height) [sp, #8]
-+
-+function ff_hevc_rpi_sao_edge_16_neon_8, export=1
-+ edge_16b_init 8, 0, 0, 99f
-+99:
-+ edge_16b_bodies edge_16b_body_8, 1
-+endfunc
-+
-+@ void ff_hevc_rpi_sao_edge_32_neon_8(
-+@ uint8_t *_dst, [r0]
-+@ uint8_t *_src, [r1]
-+@ int stride_dst, [r2]
-+@ int16_t *_sao_offset_val, [r3]
-+@ int eo, [sp, #0]
-+@ int width, [sp, #4]
-+@ int height) [sp, #8]
-+
-+function ff_hevc_rpi_sao_edge_32_neon_8, export=1
-+ edge_64b_init 8, 0, 0, 99f
-+99:
-+ edge_32bx2_bodies edge_64b_body_8, 1
-+endfunc
-+
-+@ void ff_hevc_rpi_sao_edge_64_neon_8(
-+@ uint8_t *_dst, [r0]
-+@ uint8_t *_src, [r1]
-+@ int stride_dst, [r2]
-+@ int16_t *_sao_offset_val, [r3]
-+@ int eo, [sp, #0]
-+@ int width, [sp, #4]
-+@ int height) [sp, #8]
-+
-+function ff_hevc_rpi_sao_edge_64_neon_8, export=1
-+ edge_64b_init 8, 0, 0, 99f
-+99:
-+ edge_64b_bodies edge_64b_body_8, 1
-+endfunc
-+
-+@ ff_hevc_rpi_sao_edge_c_8_neon_8(
-+@ uint8_t *_dst, [r0]
-+@ const uint8_t *_src, [r1]
-+@ ptrdiff_t stride_dst, [r2]
-+@ const int16_t *_sao_offset_val_u, [r3]
-+@ const int16_t *_sao_offset_val_v, [sp, #0]
-+@ int eo, [sp, #4]
-+@ int width, [sp, #8]
-+@ int height) [sp, #12]
-+
-+function ff_hevc_rpi_sao_edge_c_8_neon_8, export=1
-+ edge_16b_init 8, 1, 1, 99f
-+99:
-+ edge_16b_8bx2_bodies edge_16b_body_8, 2
-+endfunc
-+
-+@ ff_hevc_rpi_sao_edge_c_16_neon_8(
-+@ uint8_t *_dst, [r0]
-+@ const uint8_t *_src, [r1]
-+@ ptrdiff_t stride_dst, [r2]
-+@ const int16_t *_sao_offset_val_u, [r3]
-+@ const int16_t *_sao_offset_val_v, [sp, #0]
-+@ int eo, [sp, #4]
-+@ int width, [sp, #8]
-+@ int height) [sp, #12]
-+
-+function ff_hevc_rpi_sao_edge_c_16_neon_8, export=1
-+ edge_64b_init 8, 1, 0, 99f
-+99:
-+ edge_32bx2_bodies edge_64b_body_8, 2
-+endfunc
-+
-+@ ff_hevc_rpi_sao_edge_c_32_neon_8(
-+@ uint8_t *_dst, [r0]
-+@ const uint8_t *_src, [r1]
-+@ ptrdiff_t stride_dst, [r2]
-+@ const int16_t *_sao_offset_val_u, [r3]
-+@ const int16_t *_sao_offset_val_v, [sp, #0]
-+@ int eo, [sp, #4]
-+@ int width, [sp, #8]
-+@ int height) [sp, #12]
-+
-+function ff_hevc_rpi_sao_edge_c_32_neon_8, export=1
-+ edge_64b_init 8, 1, 0, 99f
-+99:
-+ edge_64b_bodies edge_64b_body_8, 2
-+endfunc
-+
-+@ void ff_hevc_rpi_sao_edge_8_neon_10(
-+@ uint8_t *_dst, [r0]
-+@ uint8_t *_src, [r1]
-+@ int stride_dst, [r2]
-+@ int16_t *_sao_offset_val, [r3]
-+@ int eo, [sp, #0]
-+@ int width, [sp, #4]
-+@ int height) [sp, #8]
-+
-+function ff_hevc_rpi_sao_edge_8_neon_10, export=1
-+ edge_16b_init 10, 0, 1, 99f
-+99:
-+ edge_16b_8bx2_bodies edge_16b_body_16, 2
-+endfunc
-+
-+@ void ff_hevc_rpi_sao_edge_16_neon_10(
-+@ uint8_t *_dst, [r0]
-+@ uint8_t *_src, [r1]
-+@ int stride_dst, [r2]
-+@ int16_t *_sao_offset_val, [r3]
-+@ int eo, [sp, #0]
-+@ int width, [sp, #4]
-+@ int height) [sp, #8]
-+
-+function ff_hevc_rpi_sao_edge_16_neon_10, export=1
-+ edge_64b_init 10, 0, 0, 99f
-+99:
-+ edge_32bx2_bodies edge_64b_body_16, 2
-+endfunc
-+
-+@ void ff_hevc_rpi_sao_edge_64_neon_10(
-+@ uint8_t *_dst, [r0]
-+@ uint8_t *_src, [r1]
-+@ int stride_dst, [r2]
-+@ int16_t *_sao_offset_val, [r3]
-+@ int eo, [sp, #0]
-+@ int width, [sp, #4]
-+@ int height) [sp, #8]
-+
-+@ We simply split the 64 case into 2 vertical stripes
-+@ and call the fns for w32
-+@
-+@ Calling code will always have src != dst so we don't have to worry
-+@ about edge effects
-+
-+function ff_hevc_rpi_sao_edge_64_neon_10, export=1
-+ edge_64b_init 10, 0, 1, 99f, xjump=1
-+endfunc
-+
-+@ void ff_hevc_rpi_sao_edge_32_neon_10(
-+@ uint8_t *_dst, [r0]
-+@ uint8_t *_src, [r1]
-+@ int stride_dst, [r2]
-+@ int16_t *_sao_offset_val, [r3]
-+@ int eo, [sp, #0]
-+@ int width, [sp, #4]
-+@ int height) [sp, #8]
-+
-+function ff_hevc_rpi_sao_edge_32_neon_10, export=1
-+ edge_64b_init 10, 0, 0, 99f
-+99:
-+ edge_64b_bodies edge_64b_body_16, 2
-+endfunc
-+
-+@ ff_hevc_rpi_sao_edge_c_8_neon_10(
-+@ uint8_t *_dst, [r0]
-+@ const uint8_t *_src, [r1]
-+@ ptrdiff_t stride_dst, [r2]
-+@ const int16_t *_sao_offset_val_u, [r3]
-+@ const int16_t *_sao_offset_val_v, [sp, #0]
-+@ int eo, [sp, #4]
-+@ int width, [sp, #8]
-+@ int height) [sp, #12]
-+
-+function ff_hevc_rpi_sao_edge_c_8_neon_10, export=1
-+ edge_xxb_init 10, 1, 99f, check_w4=1, setup_16b=1, setup_64b=1
-+99:
-+ edge_32bx2_16b_bodies edge_64b_body_16, edge_16b_body_16, 4
-+endfunc
-+
-+@ ff_hevc_rpi_sao_edge_c_32_neon_10(
-+@ uint8_t *_dst, [r0]
-+@ const uint8_t *_src, [r1]
-+@ ptrdiff_t stride_dst, [r2]
-+@ const int16_t *_sao_offset_val_u, [r3]
-+@ const int16_t *_sao_offset_val_v, [sp, #0]
-+@ int eo, [sp, #4]
-+@ int width, [sp, #8]
-+@ int height) [sp, #12]
-+
-+function ff_hevc_rpi_sao_edge_c_32_neon_10, export=1
-+ edge_64b_init 10, 1, 1, 99f, xjump=1
-+endfunc
-+
-+
-+@ ff_hevc_rpi_sao_edge_c_16_neon_10(
-+@ uint8_t *_dst, [r0]
-+@ const uint8_t *_src, [r1]
-+@ ptrdiff_t stride_dst, [r2]
-+@ const int16_t *_sao_offset_val_u, [r3]
-+@ const int16_t *_sao_offset_val_v, [sp, #0]
-+@ int eo, [sp, #4]
-+@ int width, [sp, #8]
-+@ int height) [sp, #12]
-+
-+function ff_hevc_rpi_sao_edge_c_16_neon_10, export=1
-+ edge_64b_init 10, 1, 0, 99f
-+99:
-+ edge_64b_bodies edge_64b_body_16, 4
-+endfunc
-+
-diff --git a/libavcodec/arm/rpi_hevcpred_arm.h b/libavcodec/arm/rpi_hevcpred_arm.h
-new file mode 100644
-index 0000000000..36a23a5bf9
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevcpred_arm.h
-@@ -0,0 +1,28 @@
-+/*
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#ifndef AVCODEC_ARM_HEVCPRED_ARM_H
-+#define AVCODEC_ARM_HEVCPRED_ARM_H
-+
-+#include "libavcodec/rpi_hevcpred.h"
-+
-+void ff_hevc_rpi_pred_init_arm(HEVCRpiPredContext * const c, const int bit_depth);
-+void ff_hevc_rpi_pred_init_neon(HEVCRpiPredContext * const c, const int bit_depth);
-+
-+#endif /* AVCODEC_ARM_HEVCPRED_ARM_H */
-+
-diff --git a/libavcodec/arm/rpi_hevcpred_init_arm.c b/libavcodec/arm/rpi_hevcpred_init_arm.c
-new file mode 100644
-index 0000000000..80724d4cf3
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevcpred_init_arm.c
-@@ -0,0 +1,35 @@
-+/*
-+ * Copyright (c) 2018 John Cox (for Raspberry Pi)
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "libavutil/attributes.h"
-+#include "libavutil/cpu.h"
-+#include "libavutil/arm/cpu.h"
-+
-+#include "libavcodec/rpi_hevcpred.h"
-+#include "rpi_hevcpred_arm.h"
-+
-+av_cold void ff_hevc_rpi_pred_init_arm(HEVCRpiPredContext * const c, const int bit_depth)
-+{
-+ int cpu_flags = av_get_cpu_flags();
-+
-+ if (have_neon(cpu_flags))
-+ ff_hevc_rpi_pred_init_neon(c, bit_depth);
-+}
-+
-diff --git a/libavcodec/arm/rpi_hevcpred_init_neon.c b/libavcodec/arm/rpi_hevcpred_init_neon.c
-new file mode 100644
-index 0000000000..21e7700174
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevcpred_init_neon.c
-@@ -0,0 +1,210 @@
-+/*
-+ * Copyright (c) 2018 John Cox (for Raspberry Pi)
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "rpi_hevcpred_arm.h"
-+
-+intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_8;
-+intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_8;
-+intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_16;
-+intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_16;
-+intra_filter_fn_t ff_hevc_rpi_intra_filter_16_neon_16;
-+intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_32;
-+intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_32;
-+intra_filter_fn_t ff_hevc_rpi_intra_filter_16_neon_32;
-+
-+void ff_hevc_rpi_pred_angular_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_angular_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_angular_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_angular_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_angular_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_angular_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_angular_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_angular_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_angular_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_angular_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_angular_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_angular_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_angular_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_angular_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+
-+void ff_hevc_rpi_pred_vertical_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_vertical_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_vertical_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_vertical_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_vertical_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_vertical_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_vertical_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_vertical_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_vertical_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_vertical_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_vertical_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_vertical_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_vertical_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_vertical_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+
-+void ff_hevc_rpi_pred_horizontal_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_horizontal_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_horizontal_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_horizontal_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_horizontal_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_horizontal_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_horizontal_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_horizontal_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_horizontal_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_horizontal_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_horizontal_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_horizontal_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_horizontal_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+void ff_hevc_rpi_pred_horizontal_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+
-+void ff_hevc_rpi_pred_planar_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_planar_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_planar_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_planar_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_planar_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_planar_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_planar_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_planar_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_planar_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_planar_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_planar_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_planar_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_planar_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_planar_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+
-+void ff_hevc_rpi_pred_dc_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_dc_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_dc_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_dc_32_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_dc_c_4_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_dc_c_8_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_dc_c_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_dc_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_dc_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_dc_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_dc_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_dc_c_4_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_dc_c_8_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+void ff_hevc_rpi_pred_dc_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+
-+void ff_hevc_rpi_pred_init_neon(HEVCRpiPredContext * const c, const int bit_depth)
-+{
-+ switch (bit_depth)
-+ {
-+ case 8:
-+ c->intra_filter[0] = ff_hevc_rpi_intra_filter_4_neon_8;
-+ c->intra_filter[1] = ff_hevc_rpi_intra_filter_8_neon_8;
-+ c->intra_filter_c[0] = ff_hevc_rpi_intra_filter_4_neon_16; // Equivalent to c_4_neon_8
-+ c->intra_filter_c[1] = ff_hevc_rpi_intra_filter_8_neon_16;
-+ c->intra_filter_c[2] = ff_hevc_rpi_intra_filter_16_neon_16;
-+
-+ c->pred_angular[0] = ff_hevc_rpi_pred_angular_4_neon_8;
-+ c->pred_angular[1] = ff_hevc_rpi_pred_angular_8_neon_8;
-+ c->pred_angular[2] = ff_hevc_rpi_pred_angular_16_neon_8;
-+ c->pred_angular[3] = ff_hevc_rpi_pred_angular_32_neon_8;
-+ c->pred_angular_c[0] = ff_hevc_rpi_pred_angular_c_4_neon_8;
-+ c->pred_angular_c[1] = ff_hevc_rpi_pred_angular_c_8_neon_8;
-+ c->pred_angular_c[2] = ff_hevc_rpi_pred_angular_c_16_neon_8;
-+
-+ c->pred_horizontal[0] = ff_hevc_rpi_pred_horizontal_4_neon_8;
-+ c->pred_horizontal[1] = ff_hevc_rpi_pred_horizontal_8_neon_8;
-+ c->pred_horizontal[2] = ff_hevc_rpi_pred_horizontal_16_neon_8;
-+ c->pred_horizontal[3] = ff_hevc_rpi_pred_horizontal_32_neon_8;
-+ c->pred_horizontal_c[0] = ff_hevc_rpi_pred_horizontal_c_4_neon_8;
-+ c->pred_horizontal_c[1] = ff_hevc_rpi_pred_horizontal_c_8_neon_8;
-+ c->pred_horizontal_c[2] = ff_hevc_rpi_pred_horizontal_c_16_neon_8;
-+
-+ c->pred_vertical[0] = ff_hevc_rpi_pred_vertical_4_neon_8;
-+ c->pred_vertical[1] = ff_hevc_rpi_pred_vertical_8_neon_8;
-+ c->pred_vertical[2] = ff_hevc_rpi_pred_vertical_16_neon_8;
-+ c->pred_vertical[3] = ff_hevc_rpi_pred_vertical_32_neon_8;
-+ c->pred_vertical_c[0] = ff_hevc_rpi_pred_vertical_c_4_neon_8;
-+ c->pred_vertical_c[1] = ff_hevc_rpi_pred_vertical_c_8_neon_8;
-+ c->pred_vertical_c[2] = ff_hevc_rpi_pred_vertical_c_16_neon_8;
-+
-+ c->pred_planar[0] = ff_hevc_rpi_pred_planar_4_neon_8;
-+ c->pred_planar[1] = ff_hevc_rpi_pred_planar_8_neon_8;
-+ c->pred_planar[2] = ff_hevc_rpi_pred_planar_16_neon_8;
-+ c->pred_planar[3] = ff_hevc_rpi_pred_planar_32_neon_8;
-+ c->pred_planar_c[0] = ff_hevc_rpi_pred_planar_c_4_neon_8;
-+ c->pred_planar_c[1] = ff_hevc_rpi_pred_planar_c_8_neon_8;
-+ c->pred_planar_c[2] = ff_hevc_rpi_pred_planar_c_16_neon_8;
-+
-+ c->pred_dc[0] = ff_hevc_rpi_pred_dc_4_neon_8;
-+ c->pred_dc[1] = ff_hevc_rpi_pred_dc_8_neon_8;
-+ c->pred_dc[2] = ff_hevc_rpi_pred_dc_16_neon_8;
-+ c->pred_dc[3] = ff_hevc_rpi_pred_dc_32_neon_8;
-+ c->pred_dc_c[0] = ff_hevc_rpi_pred_dc_c_4_neon_8;
-+ c->pred_dc_c[1] = ff_hevc_rpi_pred_dc_c_8_neon_8;
-+ c->pred_dc_c[2] = ff_hevc_rpi_pred_dc_c_16_neon_8;
-+ break;
-+ case 10:
-+ c->intra_filter[0] = ff_hevc_rpi_intra_filter_4_neon_16;
-+ c->intra_filter[1] = ff_hevc_rpi_intra_filter_8_neon_16;
-+ c->intra_filter[2] = ff_hevc_rpi_intra_filter_16_neon_16;
-+ c->intra_filter_c[0] = ff_hevc_rpi_intra_filter_4_neon_32;
-+ c->intra_filter_c[1] = ff_hevc_rpi_intra_filter_8_neon_32;
-+ c->intra_filter_c[2] = ff_hevc_rpi_intra_filter_16_neon_32;
-+
-+ c->pred_angular[0] = ff_hevc_rpi_pred_angular_4_neon_10;
-+ c->pred_angular[1] = ff_hevc_rpi_pred_angular_8_neon_10;
-+ c->pred_angular[2] = ff_hevc_rpi_pred_angular_16_neon_10;
-+ c->pred_angular[3] = ff_hevc_rpi_pred_angular_32_neon_10;
-+ c->pred_angular_c[0] = ff_hevc_rpi_pred_angular_c_4_neon_10;
-+ c->pred_angular_c[1] = ff_hevc_rpi_pred_angular_c_8_neon_10;
-+ c->pred_angular_c[2] = ff_hevc_rpi_pred_angular_c_16_neon_10;
-+
-+ c->pred_horizontal[0] = ff_hevc_rpi_pred_horizontal_4_neon_10;
-+ c->pred_horizontal[1] = ff_hevc_rpi_pred_horizontal_8_neon_10;
-+ c->pred_horizontal[2] = ff_hevc_rpi_pred_horizontal_16_neon_10;
-+ c->pred_horizontal[3] = ff_hevc_rpi_pred_horizontal_32_neon_10;
-+ c->pred_horizontal_c[0] = ff_hevc_rpi_pred_horizontal_c_4_neon_10;
-+ c->pred_horizontal_c[1] = ff_hevc_rpi_pred_horizontal_c_8_neon_10;
-+ c->pred_horizontal_c[2] = ff_hevc_rpi_pred_horizontal_c_16_neon_10;
-+
-+ c->pred_vertical[0] = ff_hevc_rpi_pred_vertical_4_neon_10;
-+ c->pred_vertical[1] = ff_hevc_rpi_pred_vertical_8_neon_10;
-+ c->pred_vertical[2] = ff_hevc_rpi_pred_vertical_16_neon_10;
-+ c->pred_vertical[3] = ff_hevc_rpi_pred_vertical_32_neon_10;
-+ c->pred_vertical_c[0] = ff_hevc_rpi_pred_vertical_c_4_neon_10;
-+ c->pred_vertical_c[1] = ff_hevc_rpi_pred_vertical_c_8_neon_10;
-+ c->pred_vertical_c[2] = ff_hevc_rpi_pred_vertical_c_16_neon_10;
-+
-+ c->pred_planar[0] = ff_hevc_rpi_pred_planar_4_neon_10;
-+ c->pred_planar[1] = ff_hevc_rpi_pred_planar_8_neon_10;
-+ c->pred_planar[2] = ff_hevc_rpi_pred_planar_16_neon_10;
-+ c->pred_planar[3] = ff_hevc_rpi_pred_planar_32_neon_10;
-+ c->pred_planar_c[0] = ff_hevc_rpi_pred_planar_c_4_neon_10;
-+ c->pred_planar_c[1] = ff_hevc_rpi_pred_planar_c_8_neon_10;
-+ c->pred_planar_c[2] = ff_hevc_rpi_pred_planar_c_16_neon_10;
-+
-+ c->pred_dc[0] = ff_hevc_rpi_pred_dc_4_neon_10;
-+ c->pred_dc[1] = ff_hevc_rpi_pred_dc_8_neon_10;
-+ c->pred_dc[2] = ff_hevc_rpi_pred_dc_16_neon_10;
-+ c->pred_dc[3] = ff_hevc_rpi_pred_dc_32_neon_10;
-+ c->pred_dc_c[0] = ff_hevc_rpi_pred_dc_c_4_neon_10;
-+ c->pred_dc_c[1] = ff_hevc_rpi_pred_dc_c_8_neon_10;
-+ c->pred_dc_c[2] = ff_hevc_rpi_pred_dc_c_16_neon_10;
-+ break;
-+ default:
-+ break;
-+ }
-+}
-+
-diff --git a/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S b/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S
-new file mode 100644
-index 0000000000..3dd9246a16
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevcpred_intra_angular_neon.S
-@@ -0,0 +1,2975 @@
-+/*
-+ * Copyright (c) 2018 John Cox <jc@kynesim.co.uk> (for Raspberry Pi)
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+/*
-+ * General angular pred
-+ *
-+ * Horizontal (10) & Vertical (26) cases have their own file
-+ * and are not dealt with properly here (luma filtering is missing)
-+ *
-+ * The inv_angle calculations are annoying - if it wasn't for the +128
-+ * rounding step then the result would simply be the loop counter :-(
-+ */
-+
-+
-+#include "libavutil/arm/asm.S"
-+#include "neon.S"
-+
-+.text
-+
-+@ Horizontal Patch functions
-+@ These need a transpose before store so exist as smaller patches
-+@ Patches can be called repeatedly without any intermediate setup
-+@ to generate a horizontal block
-+@
-+@ It is almost certainly the case that larger patch fns can be built
-+@ and they would be a little faster, but we would still need the small
-+@ fns and code size (or at least instruction cache size) is an issue
-+@ given how much code we already have here
-+
-+@ Generate 8x8 luma 8 patch
-+@
-+@ r3 Out stride
-+@ r4 Angle add
-+@ r7 Inv angle (_up only)
-+@
-+@ In/Out (updated)
-+@ r0 Out pointer - on exit point to start of next patch horizontally (i.e. r0 + patch width)
-+@ r2 Left ptr - updated
-+@ r10 Inv angle accumulator (_up only)
-+@ r12 32 - angle frac (_down) or angle frac (_up)
-+@ d0 Older reference samples
-+@ d1=r8+r9 Newer reference samples
-+@ d2 32 - angle frac
-+@ d3 Angle frac
-+@ q2 Partially computed next result (_up only)
-+@
-+@ Temps
-+@ r5 Loop counter
-+@ r6
-+@ r7 (_down only)
-+@ r11 (_up only)
-+@ q2, q8-q11
-+
-+patch_h_down_8x8_8:
-+ ldrd r8, r9, [r2] @ Left
-+ rsb r12, r6, #32
-+ vmov d0, r8, r9
-+ vdup.8 d3, r6
-+ lsr r8, #8
-+ vdup.8 d2, r12
-+ orr r8, r8, r9, lsl #24
-+ ldr r9, [r2, #5]!
-+ vmov d1, r8, r9
-+ // drop through...
-+patch_h_down_8x8_8_continue:
-+ mov r5, #8
-+1:
-+ subs r12, r4
-+ vmull.u8 q2, d0, d2
-+ it mi
-+ addmi r12, #32
-+ vmlal.u8 q2, d1, d3
-+ rsb r6, r12, #32
-+ vext.8 q8, q8, q9, #8
-+ itt mi
-+ lsrmi r7, r8, #8
-+ vmovmi d0, r8, r9
-+ vdup.8 d2, r12
-+ vext.8 q9, q9, q10, #8
-+ it mi
-+ orrmi r8, r7, r9, lsl #24
-+ vext.8 q10, q10, q11, #8
-+ it mi
-+ ldrmi r9, [r2, #1]!
-+ vmov d22, d23
-+ vrshrn.u16 d23, q2, #5
-+ it mi
-+ vmovmi d1, r8, r9
-+ subs r5, #1
-+ vdup.8 d3, r6
-+ bne 1b
-+ // drop through...
-+store_tran_8x8_8:
-+ vzip.8 d16, d17
-+ add r6, r0, r3
-+ vzip.8 d18, d19
-+ lsl r3, #1
-+ vzip.8 d20, d21
-+ add r5, r0, r3
-+ vzip.8 d22, d23
-+ vzip.16 q8, q9
-+ vzip.16 q10, q11
-+ vzip.32 q8, q10
-+ vzip.32 q9, q11
-+ vst1.8 {d16}, [r0]!
-+ vst1.8 {d17}, [r6], r3
-+ vst1.8 {d20}, [r5], r3
-+ vst1.8 {d21}, [r6], r3
-+ vst1.8 {d18}, [r5], r3
-+ vst1.8 {d19}, [r6], r3
-+ vst1.8 {d22}, [r5]
-+ asr r3, #1
-+ vst1.8 {d23}, [r6]
-+
-+ bx lr
-+
-+patch_h_up_8x8_8:
-+ ldrd r8, r9, [r2]
-+ rsb r6, r4, #32
-+ vmov d0, r8, r9
-+ vdup.8 d3, r4
-+ lsr r11, r8, #24
-+ vdup.8 d2, r6
-+ ldr r8, [r2, #-1]!
-+ orr r9, r11, r9, lsl #8
-+ vmov d1, r8, r9
-+ mov r12, r4
-+ vmull.u8 q2, d0, d2
-+ vmlal.u8 q2, d1, d3
-+patch_h_up_8x8_8_continue:
-+ mov r5, #8
-+1:
-+ add r12, r4
-+ mov r11, #0
-+ cmp r12, #33
-+ it cs
-+ addcs r10, r7
-+ vext.8 q8, q8, q9, #8
-+ itt cs
-+ subcs r12, #32
-+ tstcs r10, #1<<31
-+ rsb r6, r12, #32
-+ it eq
-+ asreq r11, r10, #8
-+ it cs
-+ vmovcs d0, r8, r9
-+ vdup.8 d2, r6
-+ it cs
-+ lsrcs r6, r8, #24
-+ vext.8 q9, q9, q10, #8
-+ itt cs
-+ orrcs r9, r6, r9, lsl #8
-+ ldrbcs r11, [r1, r11]
-+ vdup.8 d3, r12
-+ vext.8 q10, q10, q11, #8
-+ it hi
-+ ldrbhi r11, [r2, #-1]!
-+ vmov d22, d23
-+ vrshrn.u16 d23, q2, #5
-+ itt cs
-+ orrcs r8, r11, r8, lsl #8
-+ vmovcs d1, r8, r9
-+ vmull.u8 q2, d0, d2
-+ subs r5, #1
-+ vmlal.u8 q2, d1, d3
-+ bne 1b
-+
-+ b store_tran_8x8_8
-+
-+
-+.macro ADRT reg, val
-+@ adr in T32 has enough range but not in A32
-+A adrl \reg, \val
-+T adr \reg, \val
-+.endm
-+
-+@ ff_hevc_rpi_pred_angular_4_neon_8
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride [r3]
-+@ unsigned int mode [sp, #0] 2..34
-+
-+function ff_hevc_rpi_pred_angular_4_neon_8, export=1
-+ ldr r12, [sp]
-+ push {r4-r8, lr}
-+ ADRT r4, angle_2 - 2
-+ ADRT r7, inv_angle - 11*2
-+ add r7, r7, r12, lsl #1
-+ ldrsb r6, [r4, r12]
-+ cmp r12, #26
-+ ldrsb r4, [r4, r12]
-+ bge 26f
-+ cmp r12, #18
-+ bge 18f
-+ cmp r12, #10
-+ bge 10f
-+
-+@ Down of Horizontal - works down left
-+ ldr lr, [r2], #1 @ Top
-+ rsb r12, r6, #32
-+ vmov s0, lr
-+ vdup.8 d3, r6
-+ ldr lr, [r2], #1
-+ vdup.8 d2, r12
-+ vmov s2, lr
-+ subs r12, r4
-+ vmull.u8 q2, d0, d2
-+ it mi
-+ addmi r12, #32
-+ vmlal.u8 q2, d1, d3
-+ rsb r6, r12, #32
-+ itt mi
-+ vmovmi s0, lr
-+ ldrmi lr, [r2], #1
-+ vdup.8 d2, r12
-+ it mi
-+ vmovmi s2, lr
-+ vdup.8 d3, r6
-+ mov r5, #2
-+1:
-+ vrshrn.u16 d20, q2, #5
-+ subs r12, r4
-+ vmull.u8 q2, d0, d2
-+ it mi
-+ addmi r12, #32
-+ vmlal.u8 q2, d1, d3
-+ rsb r6, r12, #32
-+ vext.64 q8, q8, q9, #1
-+ it mi
-+ vmovmi s0, lr
-+ vext.64 q9, q9, q10, #1
-+ it mi
-+ ldrmi lr, [r2], #1
-+ vdup.8 d2, r12
-+ it mi
-+ vmovmi s2, lr
-+ subs r5, #1
-+ vdup.8 d3, r6
-+ bne 1b
-+
-+ vrshrn.u16 d20, q2, #5
-+ vmull.u8 q2, d0, d2
-+ add r12, r0, r3
-+ vmlal.u8 q2, d1, d3
-+ lsl r3, #1
-+ vext.64 q8, q8, q9, #1
-+ vext.64 q9, q9, q10, #1
-+ vrshrn.u16 d20, q2, #5
-+
-+98:
-+ vst4.8 {d17[0], d18[0], d19[0], d20[0]}, [r0], r3
-+ vst4.8 {d17[1], d18[1], d19[1], d20[1]}, [r12], r3
-+ vst4.8 {d17[2], d18[2], d19[2], d20[2]}, [r0]
-+ vst4.8 {d17[3], d18[3], d19[3], d20[3]}, [r12]
-+ pop {r4-r8, pc}
-+
-+@ Up of Horizontal - works down up
-+10:
-+ ldrh r7, [r7]
-+ rsb r12, r6, #32
-+ ldr lr, [r2] @ Left
-+ ldrb r2, [r2, #-1] @ Top-left
-+ vmov s0, lr
-+ vdup.8 d2, r12
-+ vdup.8 d3, r6
-+ orr lr, r2, lr, lsl #8
-+ vmov s2, lr
-+ sub r8, r7, #128
-+ mov r5, #3
-+2:
-+ vmull.u8 q2, d0, d2
-+ subs r12, r4
-+ vmlal.u8 q2, d1, d3
-+T it mi
-+ addmi r12, #32
-+T asr r6, r8, #8
-+T it mi
-+T ldrbmi r2, [r1, r6]
-+A ldrbmi r2, [r1, r8, asr #8]
-+ rsb r6, r12, #32
-+ vdup.8 d2, r12
-+ ittt mi
-+ vmovmi s0, lr
-+ orrmi lr, r2, lr, lsl #8
-+ vmovmi s2, lr
-+ vrshrn.u16 d20, q2, #5
-+ vdup.8 d3, r6
-+ it mi
-+ addmi r8, r7
-+ subs r5, #1
-+ vext.64 q8, q8, q9, #1
-+ vext.64 q9, q9, q10, #1
-+ bne 2b
-+
-+ vmull.u8 q2, d0, d2
-+ add r12, r0, r3
-+ vmlal.u8 q2, d1, d3
-+ lsl r3, #1
-+ vrshrn.u16 d20, q2, #5
-+ b 98b
-+
-+@ Left of vertical - works down left
-+18:
-+ ldrh r7, [r7]
-+ rsb r12, r6, #32
-+ ldr lr, [r1] @ Top
-+ ldrb r1, [r2, #-1] @ Top-left
-+ vmov s0, lr
-+ vdup.8 d2, r12
-+ vdup.8 d3, r6
-+ orr lr, r1, lr, lsl #8
-+ vmov s2, lr
-+ sub r8, r7, #128
-+ mov r5, #3
-+2:
-+ vmull.u8 q2, d0, d2
-+ subs r12, r4
-+ vmlal.u8 q2, d1, d3
-+T it mi
-+ addmi r12, #32
-+T asr r6, r8, #8
-+T it mi
-+T ldrbmi r1, [r2, r6]
-+A ldrbmi r1, [r2, r8, asr #8]
-+ rsb r6, r12, #32
-+ vdup.8 d2, r12
-+ ittt mi
-+ vmovmi s0, lr
-+ orrmi lr, r1, lr, lsl #8
-+ vmovmi s2, lr
-+ vrshrn.u16 d4, q2, #5
-+ vdup.8 d3, r6
-+ it mi
-+ addmi r8, r7
-+ subs r5, #1
-+ vst1.32 {d4[0]}, [r0], r3
-+ bne 2b
-+
-+ vmull.u8 q2, d0, d2
-+ vmlal.u8 q2, d1, d3
-+ vrshrn.u16 d4, q2, #5
-+ vst1.32 {d4[0]}, [r0]
-+
-+ pop {r4-r8, pc}
-+
-+@ Right of vertical - works along top - left unused
-+26:
-+ ldr lr, [r1], #1 @ Top
-+ rsb r12, r6, #32
-+ vmov s0, lr
-+ vdup.8 d3, r6
-+ ldr lr, [r1], #1
-+ vdup.8 d2, r12
-+ vmov s2, lr
-+ subs r12, r4
-+ vmull.u8 q2, d0, d2
-+ it mi
-+ addmi r12, #32
-+ vmlal.u8 q2, d1, d3
-+ rsb r6, r12, #32
-+ itt mi
-+ vmovmi s0, lr
-+ ldrmi lr, [r1], #1
-+ vdup.8 d2, r12
-+ it mi
-+ vmovmi s2, lr
-+ vdup.8 d3, r6
-+ mov r5, #2
-+1:
-+ vrshrn.u16 d6, q2, #5
-+ subs r12, r4
-+ vmull.u8 q2, d0, d2
-+ it mi
-+ addmi r12, #32
-+ vmlal.u8 q2, d1, d3
-+ rsb r6, r12, #32
-+ vst1.32 {d6[0]}, [r0], r3
-+ itt mi
-+ vmovmi s0, lr
-+ ldrmi lr, [r1], #1
-+ vdup.8 d2, r12
-+ it mi
-+ vmovmi s2, lr
-+ subs r5, #1
-+ vdup.8 d3, r6
-+ bne 1b
-+
-+ vrshrn.u16 d6, q2, #5
-+ vmull.u8 q2, d0, d2
-+ vmlal.u8 q2, d1, d3
-+ vst1.32 {d6[0]}, [r0], r3
-+ vrshrn.u16 d6, q2, #5
-+ vst1.32 {d6[0]}, [r0]
-+
-+ pop {r4-r8, pc}
-+
-+endfunc
-+
-+
-+
-+@ ff_hevc_rpi_pred_angular_8_neon_8
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride [r3]
-+@ unsigned int mode [sp, #0] 2..34
-+
-+function ff_hevc_rpi_pred_angular_8_neon_8, export=1
-+ ldr r12, [sp]
-+ push {r4-r11, lr}
-+ ADRT r4, angle_2 - 2
-+ ADRT r7, inv_angle - 11*2
-+ add r7, r7, r12, lsl #1
-+ ldrsb r6, [r4, r12]
-+ cmp r12, #26
-+ ldrsb r4, [r4, r12]
-+ bge 26f
-+ cmp r12, #18
-+ bge 18f
-+ cmp r12, #10
-+ bge 10f
-+
-+@ Down of Horizontal - works down left
-+ bl patch_h_down_8x8_8
-+ pop {r4-r11, pc}
-+
-+@ Up of Horizontal - works down up
-+10:
-+ ldrh r7, [r7]
-+ mov r10, #-128
-+ bl patch_h_up_8x8_8
-+ pop {r4-r11, pc}
-+
-+@ Left of vertical - works down left
-+18:
-+ ldrd r8, r9, [r1] @ Top
-+ rsb r12, r6, #32
-+ ldrb lr, [r2, #-1] @ Top-left
-+ ldrh r7, [r7]
-+ vmov d0, r8, r9
-+ lsl r9, r9, #8
-+ vdup.8 d2, r12
-+ orr r9, r9, r8, lsr #24
-+ orr r8, lr, r8, lsl #8
-+ vmov d1, r8, r9
-+ sub r1, r7, #128
-+ mov r5, #7
-+1:
-+ vdup.8 d3, r6
-+ vmull.u8 q2, d0, d2
-+ subs r12, r12, r4
-+ vmlal.u8 q2, d1, d3
-+ ittt mi
-+ addmi lr, r2, r1, asr #8
-+ addmi r12, r12, #32
-+ vmovmi d0, r8, r9
-+ rsb r6, r12, #32
-+ itt mi
-+ lslmi r9, r9, #8
-+ ldrbmi lr, [lr]
-+ vdup.8 d2, r12
-+ vrshrn.u16 d4, q2, #5
-+ itttt mi
-+ orrmi r9, r9, r8, lsr #24
-+ orrmi r8, lr, r8, lsl #8
-+ vmovmi d1, r8, r9
-+ addmi r1, r1, r7
-+ subs r5, r5, #1
-+ vst1.8 {d4}, [r0], r3
-+ bne 1b
-+
-+ vdup.8 d3, r6
-+ vmull.u8 q2, d0, d2
-+ vmlal.u8 q2, d1, d3
-+ vrshrn.u16 d4, q2, #5
-+ vst1.8 {d4}, [r0]
-+
-+ pop {r4-r11, pc}
-+
-+@ Right of vertical - works along top - left unused
-+26:
-+ ldrd r8, r9, [r1] @ Top
-+ rsb r12, r6, #32
-+ vmov d0, r8, r9
-+ vdup.8 d3, r6
-+ mov r5, #7
-+ lsr r8, #8
-+ vdup.8 d2, r12
-+ orr r8, r8, r9, lsl #24
-+ ldr r9, [r1, #5]!
-+ vmov d1, r8, r9
-+1:
-+ vmull.u8 q2, d0, d2
-+ subs r12, r4
-+ vmlal.u8 q2, d1, d3
-+ it mi
-+ addmi r12, #32
-+ rsb r6, r12, #32
-+ itt mi
-+ vmovmi d0, r8, r9
-+ lsrmi r8, #8
-+ vdup.8 d2, r12
-+ itt mi
-+ orrmi r8, r8, r9, lsl #24
-+ ldrmi r9, [r1, #1]!
-+ vrshrn.u16 d6, q2, #5
-+ it mi
-+ vmovmi d1, r8, r9
-+ vdup.8 d3, r6
-+ subs r5, #1
-+ vst1.8 {d6}, [r0], r3
-+ bne 1b
-+
-+ vmull.u8 q2, d0, d2
-+ vmlal.u8 q2, d1, d3
-+ vrshrn.u16 d6, q2, #5
-+ vst1.8 {d6}, [r0]
-+
-+ pop {r4-r11, pc}
-+
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_angular_16_neon_8
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride [r3]
-+@ unsigned int mode [sp, #0] 2..34
-+
-+function ff_hevc_rpi_pred_angular_16_neon_8, export=1
-+ ldr r12, [sp]
-+ push {r4-r11, lr}
-+ ADRT r4, angle_2 - 2
-+ ADRT r7, inv_angle - 11*2
-+ add r7, r7, r12, lsl #1
-+ ldrsb r6, [r4, r12]
-+ cmp r12, #26
-+ ldrsb r4, [r4, r12]
-+ bge 26f
-+ cmp r12, #18
-+ bge 18f
-+ cmp r12, #10
-+ bge 10f
-+
-+@ Down of Horizontal - works down left
-+ mov r1, r2 @ save r2 - r1 unused by patch_down
-+
-+ bl patch_h_down_8x8_8
-+ bl patch_h_down_8x8_8_continue
-+
-+ add r2, r1, #8 @ restore r2, but 8 rows further down left
-+ sub r0, #16
-+ mov r6, r4
-+ add r0, r0, r3, lsl #3
-+
-+ bl patch_h_down_8x8_8
-+ bl patch_h_down_8x8_8_continue
-+
-+ pop {r4-r11, pc}
-+
-+@ Up of Horizontal - works down up
-+10:
-+ ldrh r7, [r7]
-+ mov r10, #-128
-+
-+ push {r2}
-+ bl patch_h_up_8x8_8
-+ bl patch_h_up_8x8_8_continue
-+ pop {r2}
-+
-+ sub r0, #16
-+ mov r10, #-128
-+ add r2, #8
-+ add r0, r0, r3, lsl #3
-+ sub r10, r10, r7, lsl #3
-+
-+ bl patch_h_up_8x8_8
-+ bl patch_h_up_8x8_8_continue
-+
-+ pop {r4-r11, pc}
-+
-+@ Left of vertical - works down left
-+18:
-+ vld1.8 {q9}, [r1]
-+ sub r1, r2, #1
-+ rsb r12, r6, #32
-+ ldrh r7, [r7]
-+ vdup.8 d6, r6
-+ vext.8 q8, q9, q9, #15
-+ sub r8, r7, #128
-+ vld1.8 {d16[0]}, [r1]
-+ vdup.8 d7, r12
-+ mov r5, #15
-+1:
-+ vmull.u8 q0, d18, d7
-+ subs r12, r4
-+ vmlal.u8 q0, d16, d6
-+ it cc
-+ addcc r12, #32
-+ vmull.u8 q1, d19, d7
-+ it cc
-+ addcc r1, r2, r8, asr #8
-+ vmlal.u8 q1, d17, d6
-+ rsb r6, r12, #32
-+ vext.8 q10, q8, q8, #15
-+ sub r5, #1
-+ vld1.8 {d20[0]}, [r1]
-+ it cc
-+ addcc r8, r7
-+ vmov q11, q8
-+ teq r5, #0
-+ vrshrn.u16 d0, q0, #5
-+ vrshrn.u16 d1, q1, #5
-+ vdup.8 d6, r6
-+ vdup.8 d7, r12
-+ vst1.8 {q0}, [r0], r3
-+ bhi 1b
-+ beq 4f
-+2:
-+ vmull.u8 q0, d22, d7
-+ subs r12, r4
-+ vmlal.u8 q0, d20, d6
-+ it cc
-+ addcc r12, #32
-+ vmull.u8 q1, d23, d7
-+ it cc
-+ addcc r1, r2, r8, asr #8
-+ vmlal.u8 q1, d21, d6
-+ rsb r6, r12, #32
-+ vext.8 q8, q10, q10, #15
-+ sub r5, #1
-+ vld1.8 {d16[0]}, [r1]
-+ it cc
-+ addcc r8, r7
-+ vmov q9, q10
-+ teq r5, #0
-+ vrshrn.u16 d0, q0, #5
-+ vrshrn.u16 d1, q1, #5
-+ vdup.8 d6, r6
-+ vdup.8 d7, r12
-+ vst1.8 {q0}, [r0], r3
-+ bhi 2b
-+ bne 1b
-+ bcc 5f
-+3:
-+ vmull.u8 q0, d22, d7
-+ vmlal.u8 q0, d20, d6
-+ vmull.u8 q1, d23, d7
-+ vmlal.u8 q1, d21, d6
-+ vrshrn.u16 d0, q0, #5
-+ vrshrn.u16 d1, q1, #5
-+ vst1.8 {q0}, [r0]
-+
-+ pop {r4-r11, pc}
-+4:
-+ bcc 3b
-+5:
-+ vmull.u8 q0, d18, d7
-+ vmlal.u8 q0, d16, d6
-+ vmull.u8 q1, d19, d7
-+ vmlal.u8 q1, d17, d6
-+ vrshrn.u16 d0, q0, #5
-+ vrshrn.u16 d1, q1, #5
-+ vst1.8 {q0}, [r0]
-+
-+ pop {r4-r11, pc}
-+
-+@ Right of vertical - works along top - left unused
-+26:
-+ vld1.8 {q9}, [r1]!
-+ rsb r12, r6, #32
-+ vdup.8 d6, r6
-+ vdup.8 d7, r12
-+ vext.8 q8, q9, q9, #1
-+ vld1.8 {d17[7]}, [r1]!
-+ mov r5, #15
-+1:
-+ vmull.u8 q0, d16, d6
-+ subs r12, r4
-+ vmlal.u8 q0, d18, d7
-+ it cc
-+ addcc r12, #32
-+ vmull.u8 q1, d17, d6
-+ rsb r6, r12, #32
-+ vmlal.u8 q1, d19, d7
-+ sub r5, #1
-+ vext.8 q10, q8, q8, #1
-+ teq r5, #0
-+ vld1.8 {d21[7]}, [r1]
-+ it cc
-+ addcc r1, #1
-+ vmov q11, q8
-+ vrshrn.u16 d0, q0, #5
-+ vrshrn.u16 d1, q1, #5
-+ vdup.8 d6, r6
-+ vdup.8 d7, r12
-+ vst1.8 {q0}, [r0], r3
-+ bhi 1b
-+ beq 4f
-+2:
-+ vmull.u8 q0, d20, d6
-+ subs r12, r4
-+ vmlal.u8 q0, d22, d7
-+ it cc
-+ addcc r12, #32
-+ vmull.u8 q1, d21, d6
-+ rsb r6, r12, #32
-+ vmlal.u8 q1, d23, d7
-+ sub r5, #1
-+ vext.8 q8, q10, q10, #1
-+ teq r5, #0
-+ vld1.8 {d17[7]}, [r1]
-+ it cc
-+ addcc r1, #1
-+ vmov q9, q10
-+ vrshrn.u16 d0, q0, #5
-+ vrshrn.u16 d1, q1, #5
-+ vdup.8 d6, r6
-+ vdup.8 d7, r12
-+ vst1.8 {q0}, [r0], r3
-+ bhi 2b
-+ bne 1b
-+ bcc 5f
-+3:
-+ vmull.u8 q0, d20, d6
-+ vmlal.u8 q0, d22, d7
-+ vmull.u8 q1, d21, d6
-+ vmlal.u8 q1, d23, d7
-+ vrshrn.u16 d0, q0, #5
-+ vrshrn.u16 d1, q1, #5
-+ vst1.8 {q0}, [r0]
-+
-+ pop {r4-r11, pc}
-+4:
-+ bcc 3b
-+5:
-+ vmull.u8 q0, d16, d6
-+ vmlal.u8 q0, d18, d7
-+ vmull.u8 q1, d17, d6
-+ vmlal.u8 q1, d19, d7
-+ vrshrn.u16 d0, q0, #5
-+ vrshrn.u16 d1, q1, #5
-+ vst1.8 {q0}, [r0]
-+
-+ pop {r4-r11, pc}
-+
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_angular_32_neon_8
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride [r3]
-+@ unsigned int mode [sp, #0] 2..34
-+
-+function ff_hevc_rpi_pred_angular_32_neon_8, export=1
-+ ldr r12, [sp]
-+ push {r4-r11, lr}
-+ ADRT r4, angle_2 - 2
-+ ADRT r7, inv_angle - 11*2
-+ add r7, r7, r12, lsl #1
-+ ldrsb r6, [r4, r12]
-+ cmp r12, #26
-+ ldrsb r4, [r4, r12]
-+ bge 26f
-+ cmp r12, #18
-+ bge 18f
-+ cmp r12, #10
-+ bge 10f
-+
-+@ Down of Horizontal - works down left
-+ mov r10, #4
-+ mov r1, r2
-+1:
-+ bl patch_h_down_8x8_8
-+ bl patch_h_down_8x8_8_continue
-+ bl patch_h_down_8x8_8_continue
-+ bl patch_h_down_8x8_8_continue
-+
-+ add r2, r1, #8 @ restore r2, but 8 rows further down left
-+ add r1, r1, #8
-+ mov r6, r4
-+ sub r0, #32
-+ subs r10, #1
-+ add r0, r0, r3, lsl #3
-+ bne 1b
-+
-+ pop {r4-r11, pc}
-+
-+@ Up of Horizontal - works down up
-+10:
-+ ldrh r7, [r7]
-+ mov r10, #-128
-+ vmov.i8 d6, #1<<2
-+1:
-+ push {r2,r10}
-+ bl patch_h_up_8x8_8
-+ bl patch_h_up_8x8_8_continue
-+ bl patch_h_up_8x8_8_continue
-+ bl patch_h_up_8x8_8_continue
-+ pop {r2,r10}
-+
-+ vmov r8, s12
-+ sub r0, #32
-+ add r2, #8
-+ add r0, r0, r3, lsl #3
-+ sub r10, r10, r7, lsl #3
-+ vshr.u8 d6, #1
-+ teq r8, #0
-+ bne 1b
-+
-+ pop {r4-r11, pc}
-+
-+@ Left of vertical - works down left
-+18:
-+ vld1.8 {q0-q1}, [r1]
-+ sub r9, r2, #1
-+ rsb r12, r6, #32
-+ ldrh r7, [r7]
-+ mov r8, #-128
-+ vdup.8 d18, r6
-+ vdup.8 d19, r12
-+ mov r5, #32
-+1:
-+ vld1.8 {d17[7]}, [r9]
-+ add r8, r7
-+ vmov q2, q0
-+ vmov q3, q1
-+ add r9, r2, r8, asr #8
-+ vext.8 q1, q0, q1, #15
-+ vext.8 q0, q8, q0, #15
-+2:
-+ vmull.u8 q10, d4, d19
-+ subs r12, r4
-+ vmlal.u8 q10, d0, d18
-+ it cc
-+ addcc r12, #32
-+ vmull.u8 q11, d5, d19
-+ rsb r6, r12, #32
-+ vmlal.u8 q11, d1, d18
-+ sub r5, #1
-+ vmull.u8 q12, d6, d19
-+ teq r5, #0
-+ vmlal.u8 q12, d2, d18
-+ vmull.u8 q13, d7, d19
-+ vmlal.u8 q13, d3, d18
-+ vdup.8 d18, r6
-+ vdup.8 d19, r12
-+ vrshrn.u16 d20, q10, #5
-+ vrshrn.u16 d21, q11, #5
-+ vrshrn.u16 d22, q12, #5
-+ vrshrn.u16 d23, q13, #5
-+ vst1.8 {q10-q11}, [r0], r3
-+ bhi 2b
-+ bne 1b
-+
-+ pop {r4-r11, pc}
-+
-+@ Right of vertical - works along top - left unused
-+26:
-+ add r5, r1, #32
-+ vld1.8 {q0-q1}, [r1]!
-+ rsb r12, r6, #32
-+ vld1.8 {d16[0]}, [r5]
-+ mov r5, #32
-+ vdup.8 d18, r6
-+ vdup.8 d19, r12
-+1:
-+ vmov q2, q0
-+ add r1, #1
-+ vmov q3, q1
-+ vext.8 q0, q0, q1, #1
-+ vext.8 q1, q1, q8, #1
-+2:
-+ vmull.u8 q10, d0, d18
-+ subs r12, r4
-+ vmlal.u8 q10, d4, d19
-+ it cc
-+ addcc r12, #32
-+ vmull.u8 q11, d1, d18
-+ rsb r6, r12, #32
-+ vmlal.u8 q11, d5, d19
-+ sub r5, #1
-+ vmull.u8 q12, d2, d18
-+ teq r5, #0
-+ vmlal.u8 q12, d6, d19
-+ vmull.u8 q13, d3, d18
-+ vmlal.u8 q13, d7, d19
-+ vld1.8 {d16[0]}, [r1]
-+ vdup.8 d18, r6
-+ vdup.8 d19, r12
-+ vrshrn.u16 d20, q10, #5
-+ vrshrn.u16 d21, q11, #5
-+ vrshrn.u16 d22, q12, #5
-+ vrshrn.u16 d23, q13, #5
-+ vst1.8 {q10-q11}, [r0], r3
-+ bhi 2b
-+ bne 1b
-+
-+ pop {r4-r11, pc}
-+
-+endfunc
-+
-+
-+@ Chroma 8 bit 4x4 patch fns
-+ .text
-+
-+patch_h_down_c_4x4_8:
-+ ldrd r8, r9, [r2] @ Left
-+ rsb r12, r6, #32
-+ vmov d0, r8, r9
-+ vdup.8 d3, r6
-+ lsr r8, #16
-+ vdup.8 d2, r12
-+ orr r8, r8, r9, lsl #16
-+ ldr r9, [r2, #6]!
-+ vmov d1, r8, r9
-+ // drop through...
-+patch_h_down_c_4x4_8_continue:
-+ mov r5, #4
-+1:
-+ subs r12, r4
-+ vmull.u8 q2, d0, d2
-+ it mi
-+ addmi r12, #32
-+ vmlal.u8 q2, d1, d3
-+ rsb r6, r12, #32
-+ vext.8 q8, q8, q9, #8
-+ it mi
-+ lsrmi r7, r8, #16
-+ vmov d18, d19
-+ it mi
-+ vmovmi d0, r8, r9
-+ vdup.8 d2, r12
-+ it mi
-+ orrmi r8, r7, r9, lsl #16
-+ vrshrn.u16 d19, q2, #5
-+ itt mi
-+ ldrmi r9, [r2, #2]!
-+ vmovmi d1, r8, r9
-+ subs r5, #1
-+ vdup.8 d3, r6
-+ bne 1b
-+ // drop through...
-+store_tran_c_4x4_8:
-+ vzip.16 d16, d17
-+ add r6, r0, r3
-+ vzip.16 d18, d19
-+ lsl r3, #1
-+ vzip.32 q8, q9
-+ add r5, r0, r3
-+ vst1.16 {d16}, [r0]!
-+ vst1.16 {d17}, [r6], r3
-+ vst1.16 {d18}, [r5]
-+ asr r3, #1
-+ vst1.16 {d19}, [r6]
-+
-+ bx lr
-+
-+patch_h_up_c_4x4_8:
-+ ldrd r8, r9, [r2]
-+ rsb r6, r4, #32
-+ vmov d0, r8, r9
-+ vdup.8 d3, r4
-+ lsr r11, r8, #16
-+ vdup.8 d2, r6
-+ ldr r8, [r2, #-2]!
-+ orr r9, r11, r9, lsl #16
-+ vmov d1, r8, r9
-+ mov r12, r4
-+ vmull.u8 q2, d0, d2
-+ vmlal.u8 q2, d1, d3
-+patch_h_up_c_4x4_8_continue:
-+ mov r5, #4
-+1:
-+ add r12, r4
-+ cmp r12, #33
-+ it cs
-+ addcs r10, r7
-+ mov r11, #0
-+ itt cs
-+ subcs r12, #32
-+ tstcs r10, #1<<31
-+ rsb r6, r12, #32
-+ it eq
-+ asreq r11, r10, #7
-+ it cs
-+ vmovcs d0, r8, r9
-+ it eq
-+ biceq r11, #1
-+ vdup.8 d2, r6
-+ it cs
-+ lsrcs r6, r8, #16
-+ vdup.8 d3, r12
-+ vext.8 q8, q8, q9, #8
-+ itt cs
-+ orrcs r9, r6, r9, lsl #16
-+ ldrhcs r11, [r1, r11]
-+ vmov d18, d19
-+ it hi
-+ ldrhhi r11, [r2, #-2]!
-+ vrshrn.u16 d19, q2, #5
-+ itt cs
-+ orrcs r8, r11, r8, lsl #16
-+ vmovcs d1, r8, r9
-+ vmull.u8 q2, d0, d2
-+ subs r5, #1
-+ vmlal.u8 q2, d1, d3
-+ bne 1b
-+
-+ b store_tran_c_4x4_8
-+
-+
-+@ ff_hevc_rpi_pred_angular_c_4_neon_8
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride [r3]
-+@ unsigned int mode [sp, #0] 2..34
-+
-+function ff_hevc_rpi_pred_angular_c_4_neon_8, export=1
-+ ldr r12, [sp]
-+ push {r4-r11, lr}
-+ ADRT r4, angle_2 - 2
-+ ADRT r7, inv_angle - 11*2
-+ add r7, r7, r12, lsl #1
-+ lsl r3, #1
-+ ldrsb r6, [r4, r12]
-+ cmp r12, #26
-+ ldrsb r4, [r4, r12]
-+ bge 26f
-+ cmp r12, #18
-+ bge 18f
-+ cmp r12, #10
-+ bge 10f
-+
-+@ Down of Horizontal - works down left
-+ bl patch_h_down_c_4x4_8
-+ pop {r4-r11, pc}
-+
-+@ Up of Horizontal - works down up
-+10:
-+ ldrh r7, [r7]
-+ mov r10, #-128
-+ bl patch_h_up_c_4x4_8
-+ pop {r4-r11, pc}
-+
-+@ Left of vertical - works down left
-+18:
-+ ldrd r8, r9, [r1] @ Top
-+ rsb r12, r6, #32
-+ ldrh lr, [r2, #-2] @ Top-left
-+ ldrh r7, [r7]
-+ vmov d0, r8, r9
-+ lsl r9, r9, #16
-+ vdup.8 d2, r12
-+ orr r9, r9, r8, lsr #16
-+ orr r8, lr, r8, lsl #16
-+ vmov d1, r8, r9
-+ sub r1, r7, #128
-+ mov r5, #3
-+1:
-+ vdup.8 d3, r6
-+ vmull.u8 q2, d0, d2
-+ subs r12, r12, r4
-+ vmlal.u8 q2, d1, d3
-+ itttt mi
-+ addmi lr, r2, r1, asr #7
-+ bicmi lr, #1
-+ addmi r12, r12, #32
-+ vmovmi d0, r8, r9
-+ rsb r6, r12, #32
-+ itt mi
-+ lslmi r9, r9, #16
-+ ldrhmi lr, [lr]
-+ vdup.8 d2, r12
-+ vrshrn.u16 d4, q2, #5
-+ itttt mi
-+ orrmi r9, r9, r8, lsr #16
-+ orrmi r8, lr, r8, lsl #16
-+ vmovmi d1, r8, r9
-+ addmi r1, r1, r7
-+ subs r5, r5, #1
-+ vst1.16 {d4}, [r0], r3
-+ bne 1b
-+
-+ vdup.8 d3, r6
-+ vmull.u8 q2, d0, d2
-+ vmlal.u8 q2, d1, d3
-+ vrshrn.u16 d4, q2, #5
-+ vst1.16 {d4}, [r0]
-+
-+ pop {r4-r11, pc}
-+
-+@ Right of vertical - works along top - left unused
-+26:
-+ ldrd r8, r9, [r1] @ Top
-+ rsb r12, r6, #32
-+ vmov d0, r8, r9
-+ vdup.8 d3, r6
-+ mov r5, #3
-+ lsr r8, #16
-+ vdup.8 d2, r12
-+ orr r8, r8, r9, lsl #16
-+ ldr r9, [r1, #6]!
-+ vmov d1, r8, r9
-+1:
-+ vmull.u8 q2, d0, d2
-+ subs r12, r4
-+ vmlal.u8 q2, d1, d3
-+ it mi
-+ addmi r12, #32
-+ rsb r6, r12, #32
-+ itt mi
-+ vmovmi d0, r8, r9
-+ lsrmi r8, #16
-+ vdup.8 d2, r12
-+ itt mi
-+ orrmi r8, r8, r9, lsl #16
-+ ldrmi r9, [r1, #2]!
-+ vrshrn.u16 d6, q2, #5
-+ it mi
-+ vmovmi d1, r8, r9
-+ vdup.8 d3, r6
-+ subs r5, #1
-+ vst1.16 {d6}, [r0], r3
-+ bne 1b
-+
-+ vmull.u8 q2, d0, d2
-+ vmlal.u8 q2, d1, d3
-+ vrshrn.u16 d6, q2, #5
-+ vst1.16 {d6}, [r0]
-+
-+ pop {r4-r11, pc}
-+
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_angular_c_8_neon_8
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride [r3]
-+@ unsigned int mode [sp, #0] 2..34
-+
-+function ff_hevc_rpi_pred_angular_c_8_neon_8, export=1
-+ ldr r12, [sp]
-+ push {r4-r11, lr}
-+ ADRT r4, angle_2 - 2
-+ ADRT r7, inv_angle - 11*2
-+ add r7, r7, r12, lsl #1
-+ lsl r3, #1
-+ ldrsb r6, [r4, r12]
-+ cmp r12, #26
-+ ldrsb r4, [r4, r12]
-+ bge 26f
-+ cmp r12, #18
-+ bge 18f
-+ cmp r12, #10
-+ bge 10f
-+
-+@ Down of Horizontal - works down left
-+ mov r1, r2 @ save r2 - r1 unused by patch_down
-+
-+ bl patch_h_down_c_4x4_8
-+ bl patch_h_down_c_4x4_8_continue
-+
-+ add r2, r1, #4*2 @ restore r2, but 4 rows further down left
-+ sub r0, #16
-+ mov r6, r4
-+ add r0, r0, r3, lsl #2
-+
-+ bl patch_h_down_c_4x4_8
-+ bl patch_h_down_c_4x4_8_continue
-+
-+ pop {r4-r11, pc}
-+
-+@ Up of Horizontal - works down up
-+10:
-+ ldrh r7, [r7]
-+ mov r10, #-128
-+
-+ push {r2}
-+ bl patch_h_up_c_4x4_8
-+ bl patch_h_up_c_4x4_8_continue
-+ pop {r2}
-+
-+ sub r0, #16
-+ mov r10, #-128
-+ add r2, #8
-+ add r0, r0, r3, lsl #2
-+ sub r10, r10, r7, lsl #2
-+
-+ bl patch_h_up_c_4x4_8
-+ bl patch_h_up_c_4x4_8_continue
-+
-+ pop {r4-r11, pc}
-+
-+@ Left of vertical - works down left
-+18:
-+ vld1.8 {q9}, [r1]
-+ sub r1, r2, #2
-+ rsb r12, r6, #32
-+ ldrh r7, [r7]
-+ vdup.8 d6, r6
-+ vext.8 q8, q9, q9, #14
-+ sub r8, r7, #128
-+ vld1.16 {d16[0]}, [r1]
-+ vdup.8 d7, r12
-+ mov r5, #7
-+1:
-+ subs r12, r4
-+ vmull.u8 q0, d18, d7
-+ it cc
-+ asrcc r1, r8, #8
-+ vmlal.u8 q0, d16, d6
-+ it cc
-+ addcc r12, #32
-+ vmull.u8 q1, d19, d7
-+ it cc
-+ addcc r1, r2, r1, lsl #1
-+ vmlal.u8 q1, d17, d6
-+ rsb r6, r12, #32
-+ vext.8 q10, q8, q8, #14
-+ sub r5, #1
-+ vld1.16 {d20[0]}, [r1]
-+ it cc
-+ addcc r8, r7
-+ vmov q11, q8
-+ teq r5, #0
-+ vrshrn.u16 d0, q0, #5
-+ vrshrn.u16 d1, q1, #5
-+ vdup.8 d6, r6
-+ vdup.8 d7, r12
-+ vst1.8 {q0}, [r0], r3
-+ bhi 1b
-+ beq 4f
-+2:
-+ subs r12, r4
-+ vmull.u8 q0, d22, d7
-+ it cc
-+ asrcc r1, r8, #8
-+ vmlal.u8 q0, d20, d6
-+ it cc
-+ addcc r12, #32
-+ vmull.u8 q1, d23, d7
-+ it cc
-+ addcc r1, r2, r1, lsl #1
-+ vmlal.u8 q1, d21, d6
-+ rsb r6, r12, #32
-+ vext.8 q8, q10, q10, #14
-+ sub r5, #1
-+ vld1.16 {d16[0]}, [r1]
-+ it cc
-+ addcc r8, r7
-+ vmov q9, q10
-+ teq r5, #0
-+ vrshrn.u16 d0, q0, #5
-+ vrshrn.u16 d1, q1, #5
-+ vdup.8 d6, r6
-+ vdup.8 d7, r12
-+ vst1.8 {q0}, [r0], r3
-+ bhi 2b
-+ bne 1b
-+ bcc 5f
-+3:
-+ vmull.u8 q0, d22, d7
-+ vmlal.u8 q0, d20, d6
-+ vmull.u8 q1, d23, d7
-+ vmlal.u8 q1, d21, d6
-+ vrshrn.u16 d0, q0, #5
-+ vrshrn.u16 d1, q1, #5
-+ vst1.8 {q0}, [r0]
-+
-+ pop {r4-r11, pc}
-+4:
-+ bcc 3b
-+5:
-+ vmull.u8 q0, d18, d7
-+ vmlal.u8 q0, d16, d6
-+ vmull.u8 q1, d19, d7
-+ vmlal.u8 q1, d17, d6
-+ vrshrn.u16 d0, q0, #5
-+ vrshrn.u16 d1, q1, #5
-+ vst1.8 {q0}, [r0]
-+
-+ pop {r4-r11, pc}
-+
-+@ Right of vertical - works along top - left unused
-+26:
-+ vld1.8 {q9}, [r1]!
-+ rsb r12, r6, #32
-+ vdup.8 d6, r6
-+ vdup.8 d7, r12
-+ vext.8 q8, q9, q9, #2
-+ vld1.16 {d17[3]}, [r1]!
-+ mov r5, #7
-+1:
-+ vmull.u8 q0, d16, d6
-+ subs r12, r4
-+ vmlal.u8 q0, d18, d7
-+ it cc
-+ addcc r12, #32
-+ vmull.u8 q1, d17, d6
-+ rsb r6, r12, #32
-+ vmlal.u8 q1, d19, d7
-+ sub r5, #1
-+ vext.8 q10, q8, q8, #2
-+ teq r5, #0
-+ vld1.16 {d21[3]}, [r1]
-+ it cc
-+ addcc r1, #2
-+ vmov q11, q8
-+ vrshrn.u16 d0, q0, #5
-+ vrshrn.u16 d1, q1, #5
-+ vdup.8 d6, r6
-+ vdup.8 d7, r12
-+ vst1.8 {q0}, [r0], r3
-+ bhi 1b
-+ beq 4f
-+2:
-+ vmull.u8 q0, d20, d6
-+ subs r12, r4
-+ vmlal.u8 q0, d22, d7
-+ it cc
-+ addcc r12, #32
-+ vmull.u8 q1, d21, d6
-+ rsb r6, r12, #32
-+ vmlal.u8 q1, d23, d7
-+ sub r5, #1
-+ vext.8 q8, q10, q10, #2
-+ teq r5, #0
-+ vld1.16 {d17[3]}, [r1]
-+ it cc
-+ addcc r1, #2
-+ vmov q9, q10
-+ vrshrn.u16 d0, q0, #5
-+ vrshrn.u16 d1, q1, #5
-+ vdup.8 d6, r6
-+ vdup.8 d7, r12
-+ vst1.8 {q0}, [r0], r3
-+ bhi 2b
-+ bne 1b
-+ bcc 5f
-+3:
-+ vmull.u8 q0, d20, d6
-+ vmlal.u8 q0, d22, d7
-+ vmull.u8 q1, d21, d6
-+ vmlal.u8 q1, d23, d7
-+ vrshrn.u16 d0, q0, #5
-+ vrshrn.u16 d1, q1, #5
-+ vst1.8 {q0}, [r0]
-+
-+ pop {r4-r11, pc}
-+4:
-+ bcc 3b
-+5:
-+ vmull.u8 q0, d16, d6
-+ vmlal.u8 q0, d18, d7
-+ vmull.u8 q1, d17, d6
-+ vmlal.u8 q1, d19, d7
-+ vrshrn.u16 d0, q0, #5
-+ vrshrn.u16 d1, q1, #5
-+ vst1.8 {q0}, [r0]
-+
-+ pop {r4-r11, pc}
-+
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_angular_c_16_neon_8
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride [r3]
-+@ unsigned int mode [sp, #0] 2..34
-+
-+function ff_hevc_rpi_pred_angular_c_16_neon_8, export=1
-+ ldr r12, [sp]
-+ push {r4-r11, lr}
-+ ADRT r4, angle_2 - 2
-+ ADRT r7, inv_angle - 11*2
-+ add r7, r7, r12, lsl #1
-+ lsl r3, #1
-+ ldrsb r6, [r4, r12]
-+ cmp r12, #26
-+ ldrsb r4, [r4, r12]
-+ bge 26f
-+ cmp r12, #18
-+ bge 18f
-+ cmp r12, #10
-+ bge 10f
-+
-+@ Down of Horizontal - works down left
-+ mov r10, #4
-+ mov r1, r2
-+1:
-+ bl patch_h_down_c_4x4_8
-+ bl patch_h_down_c_4x4_8_continue
-+ bl patch_h_down_c_4x4_8_continue
-+ bl patch_h_down_c_4x4_8_continue
-+
-+ add r2, r1, #4*2 @ restore r2, but 4 rows further down left
-+ add r1, r1, #4*2
-+ mov r6, r4
-+ sub r0, #32
-+ subs r10, #1
-+ add r0, r0, r3, lsl #2
-+ bne 1b
-+
-+ pop {r4-r11, pc}
-+
-+@ Up of Horizontal - works down up
-+10:
-+ ldrh r7, [r7]
-+ mov r10, #-128
-+ vmov.i8 d6, #1<<2
-+1:
-+ push {r2, r10}
-+ bl patch_h_up_c_4x4_8
-+ bl patch_h_up_c_4x4_8_continue
-+ bl patch_h_up_c_4x4_8_continue
-+ bl patch_h_up_c_4x4_8_continue
-+ pop {r2, r10}
-+
-+ vmov r8, s12
-+ sub r0, #32
-+ add r2, #8
-+ add r0, r0, r3, lsl #2
-+ sub r10, r10, r7, lsl #2
-+ vshr.u8 d6, #1
-+ teq r8, #0
-+ bne 1b
-+
-+ pop {r4-r11, pc}
-+
-+@ Left of vertical - works down left
-+18:
-+ vld1.8 {q0-q1}, [r1]
-+ sub r9, r2, #2
-+ rsb r12, r6, #32
-+ ldrh r7, [r7]
-+ mov r8, #-128
-+ vdup.8 d18, r6
-+ vdup.8 d19, r12
-+ mov r5, #16
-+1:
-+ vld1.16 {d17[3]}, [r9]
-+ add r8, r7
-+ vmov q2, q0
-+ vmov q3, q1
-+ asr r9, r8, #8
-+ vext.8 q1, q0, q1, #14
-+ add r9, r2, r9, lsl #1
-+ vext.8 q0, q8, q0, #14
-+2:
-+ vmull.u8 q10, d4, d19
-+ subs r12, r4
-+ vmlal.u8 q10, d0, d18
-+ it cc
-+ addcc r12, #32
-+ vmull.u8 q11, d5, d19
-+ rsb r6, r12, #32
-+ vmlal.u8 q11, d1, d18
-+ sub r5, #1
-+ vmull.u8 q12, d6, d19
-+ teq r5, #0
-+ vmlal.u8 q12, d2, d18
-+ vmull.u8 q13, d7, d19
-+ vmlal.u8 q13, d3, d18
-+ vdup.8 d18, r6
-+ vdup.8 d19, r12
-+ vrshrn.u16 d20, q10, #5
-+ vrshrn.u16 d21, q11, #5
-+ vrshrn.u16 d22, q12, #5
-+ vrshrn.u16 d23, q13, #5
-+ vst1.8 {q10-q11}, [r0], r3
-+ bhi 2b
-+ bne 1b
-+
-+ pop {r4-r11, pc}
-+
-+@ Right of vertical - works along top - left unused
-+26:
-+ add r5, r1, #32
-+ vld1.8 {q0-q1}, [r1]!
-+ rsb r12, r6, #32
-+ vld1.16 {d16[0]}, [r5]
-+ mov r5, #16
-+ vdup.8 d18, r6
-+ vdup.8 d19, r12
-+1:
-+ vmov q2, q0
-+ add r1, #2
-+ vmov q3, q1
-+ vext.8 q0, q0, q1, #2
-+ vext.8 q1, q1, q8, #2
-+2:
-+ vmull.u8 q10, d0, d18
-+ subs r12, r4
-+ vmlal.u8 q10, d4, d19
-+ it cc
-+ addcc r12, #32
-+ vmull.u8 q11, d1, d18
-+ rsb r6, r12, #32
-+ vmlal.u8 q11, d5, d19
-+ sub r5, #1
-+ vmull.u8 q12, d2, d18
-+ teq r5, #0
-+ vmlal.u8 q12, d6, d19
-+ vmull.u8 q13, d3, d18
-+ vmlal.u8 q13, d7, d19
-+ vld1.16 {d16[0]}, [r1]
-+ vdup.8 d18, r6
-+ vdup.8 d19, r12
-+ vrshrn.u16 d20, q10, #5
-+ vrshrn.u16 d21, q11, #5
-+ vrshrn.u16 d22, q12, #5
-+ vrshrn.u16 d23, q13, #5
-+ vst1.8 {q10-q11}, [r0], r3
-+ bhi 2b
-+ bne 1b
-+
-+ pop {r4-r11, pc}
-+
-+endfunc
-+
-+@------------------------------------------------------------------------------
-+@ Data
-+
-+ .text
-+ .balign 64
-+angle_2:
-+ .byte 32
-+ .byte 26, 21, 17, 13, 9, 5, 2, 0
-+ @ Sign inverted from standards table
-+ .byte 2, 5, 9, 13, 17, 21, 26, 32
-+ .byte 26, 21, 17, 13, 9, 5, 2, 0
-+ @ Standard sign
-+ .byte 2, 5, 9, 13, 17, 21, 26, 32
-+
-+ .balign 2
-+
-+ @ Sign inverted from standards table
-+inv_angle:
-+ .short 4096, 1638, 910, 630, 482, 390, 315
-+ .short 256
-+ .short 315, 390, 482, 630, 910, 1638, 4096
-+
-+@------------------------------------------------------------------------------
-+@
-+@ 10 bit fns
-+@ Should work for 9 & 11 bit as there is no actual bit-depth specific code
-+@ but runs out of register width for 12+ bit
-+
-+ .text
-+ .balign 64
-+
-+patch_h_down_4x4_10:
-+ ldrd r8, r9, [r2] @ Left
-+ rsb r12, r6, #32
-+ vmov d0, r8, r9
-+ vdup.16 d3, r6
-+ lsr r8, #16
-+ vdup.16 d2, r12
-+ orr r8, r8, r9, lsl #16
-+ ldr r9, [r2, #6]!
-+ vmov d1, r8, r9
-+ // drop through...
-+patch_h_down_4x4_10_continue:
-+ mov r5, #4
-+1:
-+ subs r12, r4
-+ vmul.u16 d4, d0, d2
-+ it mi
-+ addmi r12, #32
-+ vmla.u16 d4, d1, d3
-+ rsb r6, r12, #32
-+ vext.16 q8, q8, q9, #4
-+ it mi
-+ lsrmi r7, r8, #16
-+ vmov d18, d19
-+ it mi
-+ vmovmi d0, r8, r9
-+ vdup.16 d2, r12
-+ it mi
-+ orrmi r8, r7, r9, lsl #16
-+ vrshr.u16 d19, d4, #5
-+ itt mi
-+ ldrmi r9, [r2, #2]!
-+ vmovmi d1, r8, r9
-+ subs r5, #1
-+ vdup.16 d3, r6
-+ bne 1b
-+ // drop through...
-+store_tran_4x4_10:
-+ vzip.16 d16, d17
-+ add r6, r0, r3
-+ vzip.16 d18, d19
-+ lsl r3, #1
-+ vzip.32 q8, q9
-+ add r5, r0, r3
-+ vst1.16 {d16}, [r0]!
-+ vst1.16 {d17}, [r6], r3
-+ vst1.16 {d18}, [r5]
-+ asr r3, #1
-+ vst1.16 {d19}, [r6]
-+
-+ bx lr
-+
-+patch_h_up_4x4_10:
-+ ldrd r8, r9, [r2]
-+ rsb r6, r4, #32
-+ vmov d0, r8, r9
-+ vdup.16 d3, r4
-+ lsr r11, r8, #16
-+ vdup.16 d2, r6
-+ ldr r8, [r2, #-2]!
-+ orr r9, r11, r9, lsl #16
-+ vmov d1, r8, r9
-+ mov r12, r4
-+ vmul.u16 d4, d0, d2
-+ vmla.u16 d4, d1, d3
-+patch_h_up_4x4_10_continue:
-+ mov r5, #4
-+1:
-+ add r12, r4
-+ cmp r12, #33
-+ it cs
-+ addcs r10, r7
-+ mov r11, #0
-+ itt cs
-+ subcs r12, #32
-+ tstcs r10, #1<<31
-+ rsb r6, r12, #32
-+ it eq
-+ asreq r11, r10, #7
-+ it cs
-+ vmovcs d0, r8, r9
-+ it eq
-+ biceq r11, #1
-+ vdup.16 d2, r6
-+ it cs
-+ lsrcs r6, r8, #16
-+ vdup.16 d3, r12
-+ vext.16 q8, q8, q9, #4
-+ itt cs
-+ orrcs r9, r6, r9, lsl #16
-+ ldrhcs r11, [r1, r11]
-+ vmov d18, d19
-+ it hi
-+ ldrhhi r11, [r2, #-2]!
-+ vrshr.u16 d19, d4, #5
-+ itt cs
-+ orrcs r8, r11, r8, lsl #16
-+ vmovcs d1, r8, r9
-+ vmul.u16 d4, d0, d2
-+ subs r5, #1
-+ vmla.u16 d4, d1, d3
-+ bne 1b
-+
-+ b store_tran_4x4_10
-+
-+
-+@ ff_hevc_rpi_pred_angular_4_neon_10
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride [r3]
-+@ unsigned int mode [sp, #0] 2..34
-+
-+function ff_hevc_rpi_pred_angular_4_neon_10, export=1
-+ ldr r12, [sp]
-+ push {r4-r11, lr}
-+ ADRT r4, angle_2 - 2
-+ ADRT r7, inv_angle - 11*2
-+ add r7, r7, r12, lsl #1
-+ lsl r3, #1
-+ ldrsb r6, [r4, r12]
-+ cmp r12, #26
-+ ldrsb r4, [r4, r12]
-+ bge 26f
-+ cmp r12, #18
-+ bge 18f
-+ cmp r12, #10
-+ bge 10f
-+
-+@ Down of Horizontal - works down left
-+ bl patch_h_down_4x4_10
-+ pop {r4-r11, pc}
-+
-+@ Up of Horizontal - works down up
-+10:
-+ ldrh r7, [r7]
-+ mov r10, #-128
-+ bl patch_h_up_4x4_10
-+ pop {r4-r11, pc}
-+
-+@ Left of vertical - works down left
-+18:
-+ ldrd r8, r9, [r1] @ Top
-+ rsb r12, r6, #32
-+ ldrh lr, [r2, #-2] @ Top-left
-+ ldrh r7, [r7]
-+ vmov d0, r8, r9
-+ lsl r9, r9, #16
-+ vdup.16 d2, r12
-+ orr r9, r9, r8, lsr #16
-+ orr r8, lr, r8, lsl #16
-+ vmov d1, r8, r9
-+ sub r1, r7, #128
-+ mov r5, #3
-+1:
-+ sel lr, lr, lr @ force pipeline 0 on Cortex-A53
-+ vdup.16 d3, r6
-+ vmul.u16 d4, d0, d2
-+ subs r12, r12, r4
-+ vmla.u16 d4, d1, d3
-+ itttt mi
-+ addmi lr, r2, r1, asr #7
-+ bicmi lr, #1
-+ addmi r12, r12, #32
-+ vmovmi d0, r8, r9
-+ rsb r6, r12, #32
-+ itt mi
-+ lslmi r9, r9, #16
-+ ldrhmi lr, [lr]
-+ vdup.16 d2, r12
-+ vrshr.u16 d4, d4, #5
-+ itttt mi
-+ orrmi r9, r9, r8, lsr #16
-+ orrmi r8, lr, r8, lsl #16
-+ vmovmi d1, r8, r9
-+ addmi r1, r1, r7
-+ subs r5, r5, #1
-+ vst1.16 {d4}, [r0], r3
-+ bne 1b
-+
-+ vdup.16 d3, r6
-+ nop @ force next insn into pipeline 0 to enable
-+ vmul.u16 d4, d0, d2 @ vmla to execute back-to-back on Cortex-A53
-+ vmla.u16 d4, d1, d3
-+ vrshr.u16 d4, d4, #5
-+ vst1.16 {d4}, [r0]
-+
-+ pop {r4-r11, pc}
-+
-+@ Right of vertical - works along top - left unused
-+26:
-+ ldrd r8, r9, [r1] @ Top
-+ rsb r12, r6, #32
-+ vmov d0, r8, r9
-+ vdup.16 d3, r6
-+ lsr r8, #16
-+ vdup.16 d2, r12
-+ orr r8, r8, r9, lsl #16
-+ ldr r9, [r1, #6]!
-+ vmov d1, r8, r9
-+ mov r5, #3
-+1:
-+ vmul.u16 d4, d0, d2
-+ subs r12, r4
-+ vmla.u16 d4, d1, d3
-+ it mi
-+ addmi r12, #32
-+ rsb r6, r12, #32
-+ itt mi
-+ vmovmi d0, r8, r9
-+ lsrmi r8, #16
-+ vdup.16 d2, r12
-+ itt mi
-+ orrmi r8, r8, r9, lsl #16
-+ ldrmi r9, [r1, #2]!
-+ vrshr.u16 d4, d4, #5
-+ it mi
-+ vmovmi d1, r8, r9
-+ vdup.16 d3, r6
-+ subs r5, #1
-+ vst1.16 {d4}, [r0], r3
-+ bne 1b
-+
-+ vmul.u16 d4, d0, d2
-+ vmla.u16 d4, d1, d3
-+ vrshr.u16 d4, d4, #5
-+ vst1.16 {d4}, [r0]
-+
-+ pop {r4-r11, pc}
-+
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_angular_8_neon_10
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride [r3]
-+@ unsigned int mode [sp, #0] 2..34
-+
-+function ff_hevc_rpi_pred_angular_8_neon_10, export=1
-+ ldr r12, [sp]
-+ push {r4-r11, lr}
-+ ADRT r4, angle_2 - 2
-+ ADRT r7, inv_angle - 11*2
-+ add r7, r7, r12, lsl #1
-+ lsl r3, #1
-+ ldrsb r6, [r4, r12]
-+ cmp r12, #26
-+ ldrsb r4, [r4, r12]
-+ bge 26f
-+ cmp r12, #18
-+ bge 18f
-+ cmp r12, #10
-+ bge 10f
-+
-+@ Down of Horizontal - works down left
-+ mov r1, r2 @ save r2 - r1 unused by patch_down
-+
-+ bl patch_h_down_4x4_10
-+ bl patch_h_down_4x4_10_continue
-+
-+ add r2, r1, #4*2 @ restore r2, but 4 rows further down left
-+ sub r0, #16
-+ mov r6, r4
-+ add r0, r0, r3, lsl #2
-+
-+ bl patch_h_down_4x4_10
-+ bl patch_h_down_4x4_10_continue
-+
-+ pop {r4-r11, pc}
-+
-+@ Up of Horizontal - works down up
-+10:
-+ ldrh r7, [r7]
-+ mov r10, #-128
-+
-+ push {r2}
-+ bl patch_h_up_4x4_10
-+ bl patch_h_up_4x4_10_continue
-+ pop {r2}
-+
-+ sub r0, #16
-+ mov r10, #-128
-+ add r2, #8
-+ add r0, r0, r3, lsl #2
-+ sub r10, r10, r7, lsl #2
-+
-+ bl patch_h_up_4x4_10
-+ bl patch_h_up_4x4_10_continue
-+
-+ pop {r4-r11, pc}
-+
-+@ Left of vertical - works down left
-+18:
-+ vld1.16 {q9}, [r1]
-+ sub r1, r2, #2
-+ rsb r12, r6, #32
-+ ldrh r7, [r7]
-+ vdup.16 q2, r6
-+ vext.16 q8, q9, q9, #7
-+ sub r8, r7, #128
-+ vld1.16 {d16[0]}, [r1]
-+ vdup.16 q3, r12
-+ mov r5, #7
-+1:
-+ vmul.u16 q0, q9, q3
-+ subs r12, r4
-+ vmla.u16 q0, q8, q2
-+ ittt cc
-+ asrcc r1, r8, #8
-+ addcc r12, #32
-+ addcc r1, r2, r1, lsl #1
-+ vext.16 q10, q8, q8, #7
-+ rsb r6, r12, #32
-+ vmov q11, q8
-+ sub r5, #1
-+ vrshr.u16 q0, q0, #5
-+ it cc
-+ addcc r8, r7
-+ vld1.16 {d20[0]}, [r1]
-+ teq r5, #0
-+ vdup.16 q2, r6
-+ vdup.16 q3, r12
-+ vst1.16 {q0}, [r0], r3
-+ bhi 1b
-+ beq 4f
-+2:
-+ vmul.u16 q0, q11, q3
-+ subs r12, r4
-+ vmla.u16 q0, q10, q2
-+ ittt cc
-+ asrcc r1, r8, #8
-+ addcc r12, #32
-+ addcc r1, r2, r1, lsl #1
-+ vext.16 q8, q10, q10, #7
-+ rsb r6, r12, #32
-+ vmov q9, q10
-+ sub r5, #1
-+ vrshr.u16 q0, q0, #5
-+ it cc
-+ addcc r8, r7
-+ vld1.16 {d16[0]}, [r1]
-+ teq r5, #0
-+ vdup.16 q2, r6
-+ vdup.16 q3, r12
-+ vst1.16 {q0}, [r0], r3
-+ bhi 2b
-+ bne 1b
-+ bcc 5f
-+3:
-+ vmul.u16 q0, q11, q3
-+ vmla.u16 q0, q10, q2
-+ vrshr.u16 q0, q0, #5
-+ vst1.16 {q0}, [r0]
-+
-+ pop {r4-r11, pc}
-+4:
-+ bcc 3b
-+5:
-+ vmul.u16 q0, q9, q3
-+ vmla.u16 q0, q8, q2
-+ vrshr.u16 q0, q0, #5
-+ vst1.16 {q0}, [r0]
-+
-+ pop {r4-r11, pc}
-+
-+@ Right of vertical - works along top - left unused
-+26:
-+ vld1.16 {q9}, [r1]!
-+ rsb r12, r6, #32
-+ vdup.16 q2, r6
-+ vdup.16 q3, r12
-+ vext.16 q8, q9, q9, #1
-+ vld1.16 {d17[3]}, [r1]!
-+ mov r5, #7
-+1:
-+ vmul.u16 q0, q8, q2
-+ subs r12, r4
-+ vmla.u16 q0, q9, q3
-+ it cc
-+ addcc r12, #32
-+ vext.16 q10, q8, q8, #1
-+ rsb r6, r12, #32
-+ vld1.16 {d21[3]}, [r1]
-+ sub r5, #1
-+ vmov q11, q8
-+ teq r5, #0
-+ vrshr.u16 q0, q0, #5
-+ it cc
-+ addcc r1, #2
-+ vdup.16 q2, r6
-+ vdup.16 q3, r12
-+ vst1.16 {q0}, [r0], r3
-+ bhi 1b
-+ beq 4f
-+2:
-+ vmul.u16 q0, q10, q2
-+ subs r12, r4
-+ vmla.u16 q0, q11, q3
-+ it cc
-+ addcc r12, #32
-+ vext.16 q8, q10, q10, #1
-+ rsb r6, r12, #32
-+ vld1.16 {d17[3]}, [r1]
-+ sub r5, #1
-+ vmov q9, q10
-+ teq r5, #0
-+ vrshr.u16 q0, q0, #5
-+ it cc
-+ addcc r1, #2
-+ vdup.16 q2, r6
-+ vdup.16 q3, r12
-+ vst1.16 {q0}, [r0], r3
-+ bhi 2b
-+ bne 1b
-+ bcc 5f
-+3:
-+ vmul.u16 q0, q10, q2
-+ vmla.u16 q0, q11, q3
-+ vrshr.u16 q0, q0, #5
-+ vst1.16 {q0}, [r0]
-+
-+ pop {r4-r11, pc}
-+4:
-+ bcc 3b
-+5:
-+ vmul.u16 q0, q8, q2
-+ vmla.u16 q0, q9, q3
-+ vrshr.u16 q0, q0, #5
-+ vst1.16 {q0}, [r0]
-+
-+ pop {r4-r11, pc}
-+
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_angular_16_neon_10
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride [r3]
-+@ unsigned int mode [sp, #0] 2..34
-+
-+function ff_hevc_rpi_pred_angular_16_neon_10, export=1
-+ ldr r12, [sp]
-+ push {r4-r11, lr}
-+ ADRT r4, angle_2 - 2
-+ ADRT r7, inv_angle - 11*2
-+ add r7, r7, r12, lsl #1
-+ lsl r3, #1
-+ ldrsb r6, [r4, r12]
-+ cmp r12, #26
-+ ldrsb r4, [r4, r12]
-+ bge 26f
-+ cmp r12, #18
-+ bge 18f
-+ cmp r12, #10
-+ bge 10f
-+
-+@ Down of Horizontal - works down left
-+ mov r10, #4
-+ mov r1, r2
-+1:
-+ bl patch_h_down_4x4_10
-+ bl patch_h_down_4x4_10_continue
-+ bl patch_h_down_4x4_10_continue
-+ bl patch_h_down_4x4_10_continue
-+
-+ add r2, r1, #4*2 @ restore r2, but 4 rows further down left
-+ add r1, r1, #4*2
-+ mov r6, r4
-+ sub r0, #32
-+ subs r10, #1
-+ add r0, r0, r3, lsl #2
-+ bne 1b
-+
-+ pop {r4-r11, pc}
-+
-+@ Up of Horizontal - works down up
-+10:
-+ ldrh r7, [r7]
-+ mov r10, #-128
-+ vmov.i8 d6, #1<<2
-+1:
-+ push {r2, r10}
-+ bl patch_h_up_4x4_10
-+ bl patch_h_up_4x4_10_continue
-+ bl patch_h_up_4x4_10_continue
-+ bl patch_h_up_4x4_10_continue
-+ pop {r2, r10}
-+
-+ vmov r8, s12
-+ sub r0, #32
-+ add r2, #8
-+ add r0, r0, r3, lsl #2
-+ sub r10, r10, r7, lsl #2
-+ vshr.u8 d6, #1
-+ teq r8, #0
-+ bne 1b
-+
-+ pop {r4-r11, pc}
-+
-+@ Left of vertical - works down left
-+18:
-+ vld1.16 {q0-q1}, [r1]
-+ sub r9, r2, #2
-+ rsb r12, r6, #32
-+ ldrh r7, [r7]
-+ mov r8, #-128
-+ vdup.16 q9, r6
-+ vdup.16 q10, r12
-+ mov r5, #16
-+1:
-+ vld1.16 {d17[3]}, [r9]
-+ add r8, r7
-+ vmov q2, q0
-+ vmov q3, q1
-+ asr r9, r8, #8
-+ vext.16 q1, q0, q1, #7
-+ add r9, r2, r9, lsl #1
-+ vext.16 q0, q8, q0, #7
-+2:
-+ vmul.u16 q11, q2, q10
-+ subs r12, r4
-+ vmla.u16 q11, q0, q9
-+ it cc
-+ addcc r12, #32
-+ vmul.u16 q12, q3, q10
-+ rsb r6, r12, #32
-+ vmla.u16 q12, q1, q9
-+ sub r5, #1
-+ teq r5, #0
-+ vdup.16 q9, r6
-+ vdup.16 q10, r12
-+ vrshr.u16 q11, q11, #5
-+ vrshr.u16 q12, q12, #5
-+ vst1.16 {q11-q12}, [r0], r3
-+ bhi 2b
-+ bne 1b
-+
-+ pop {r4-r11, pc}
-+
-+@ Right of vertical - works along top - left unused
-+26:
-+ add r5, r1, #32
-+ vld1.16 {q0-q1}, [r1]!
-+ rsb r12, r6, #32
-+ vld1.16 {d16[0]}, [r5]
-+ mov r5, #16
-+ vdup.16 q9, r6
-+ vdup.16 q10, r12
-+1:
-+ vmov q2, q0
-+ add r1, #2
-+ vmov q3, q1
-+ vext.16 q0, q0, q1, #1
-+ vext.16 q1, q1, q8, #1
-+2:
-+ vmul.u16 q11, q0, q9
-+ subs r12, r4
-+ vmla.u16 q11, q2, q10
-+ it cc
-+ addcc r12, #32
-+ vmul.u16 q12, q1, q9
-+ rsb r6, r12, #32
-+ vmla.u16 q12, q3, q10
-+ sub r5, #1
-+ vld1.16 {d16[0]}, [r1]
-+ teq r5, #0
-+ vdup.16 q9, r6
-+ vdup.16 q10, r12
-+ vrshr.u16 q11, q11, #5
-+ vrshr.u16 q12, q12, #5
-+ vst1.16 {q11-q12}, [r0], r3
-+ bhi 2b
-+ bne 1b
-+
-+ pop {r4-r11, pc}
-+
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_angular_32_neon_10
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride [r3]
-+@ unsigned int mode [sp, #0] 2..34
-+
-+function ff_hevc_rpi_pred_angular_32_neon_10, export=1
-+ ldr r12, [sp]
-+ push {r4-r11, lr}
-+ ADRT r4, angle_2 - 2
-+ ADRT r7, inv_angle - 11*2
-+ add r7, r7, r12, lsl #1
-+ lsl r3, #1
-+ vpush {d8}
-+ ldrsb r6, [r4, r12]
-+ cmp r12, #26
-+ ldrsb r4, [r4, r12]
-+ bge 26f
-+ cmp r12, #18
-+ bge 18f
-+ cmp r12, #10
-+ bge 10f
-+
-+@ Down of Horizontal - works down left
-+ add sp, #8
-+ mov r10, #8
-+ mov r1, r2
-+1:
-+ bl patch_h_down_4x4_10
-+ bl patch_h_down_4x4_10_continue
-+ bl patch_h_down_4x4_10_continue
-+ bl patch_h_down_4x4_10_continue
-+ bl patch_h_down_4x4_10_continue
-+ bl patch_h_down_4x4_10_continue
-+ bl patch_h_down_4x4_10_continue
-+ bl patch_h_down_4x4_10_continue
-+
-+ add r2, r1, #4*2 @ restore r2, but 4 rows further down left
-+ add r1, r1, #4*2
-+ mov r6, r4
-+ sub r0, #64
-+ subs r10, #1
-+ add r0, r0, r3, lsl #2
-+ bne 1b
-+
-+ pop {r4-r11, pc}
-+
-+@ Up of Horizontal - works down up
-+10:
-+ add sp, #8
-+ ldrh r7, [r7]
-+ mov r10, #-128
-+ vmov.i8 d6, #1<<6
-+1:
-+ push {r2, r10}
-+ bl patch_h_up_4x4_10
-+ bl patch_h_up_4x4_10_continue
-+ bl patch_h_up_4x4_10_continue
-+ bl patch_h_up_4x4_10_continue
-+ bl patch_h_up_4x4_10_continue
-+ bl patch_h_up_4x4_10_continue
-+ bl patch_h_up_4x4_10_continue
-+ bl patch_h_up_4x4_10_continue
-+ pop {r2, r10}
-+
-+ vmov r8, s12
-+ sub r0, #64
-+ add r2, #8
-+ add r0, r0, r3, lsl #2
-+ sub r10, r10, r7, lsl #2
-+ vshr.u8 d6, #1
-+ teq r8, #0
-+ bne 1b
-+
-+ pop {r4-r11, pc}
-+
-+@ Left of vertical - works down left
-+18:
-+ add r5, r1, #32
-+ vld1.16 {q1-q2}, [r1]
-+ rsb r12, r6, r6, lsl #16
-+ vld1.16 {q3-q4}, [r5]
-+ sub r9, r2, #2
-+ rsb r4, r12, #0
-+ rsb r12, r12, #32 << 16
-+ ldrh r7, [r7]
-+ mov r8, #-128
-+ vmov d0, d9
-+ vmov s2, r12
-+ add r10, r0, #32
-+ mov r5, #32
-+1:
-+ vld1.16 {d1[3]}, [r9]
-+ add r8, r7
-+ vmov q11, q4
-+ vmov q10, q3
-+ asr r9, r8, #8
-+ vmov q9, q2
-+ add r9, r2, r9, lsl #1
-+ vmov q8, q1
-+ vext.16 q4, q3, q4, #7
-+ vext.16 q3, q2, q3, #7
-+ vext.16 q2, q1, q2, #7
-+ vext.16 q1, q0, q1, #7
-+2:
-+ vmul.u16 q12, q8, d1[1]
-+ adds r12, r4
-+ vmla.u16 q12, q1, d1[0]
-+ it cc
-+ addcc r12, #32 << 16
-+ vmul.u16 q13, q9, d1[1]
-+ it cc
-+ subcc r12, #32
-+ vmla.u16 q13, q2, d1[0]
-+ sub r5, #1
-+ vmul.u16 q14, q10, d1[1]
-+ teq r5, #0
-+ vmla.u16 q14, q3, d1[0]
-+ vmul.u16 q15, q11, d1[1]
-+ vmla.u16 q15, q4, d1[0]
-+ vmov s2, r12
-+ vrshr.u16 q12, q12, #5
-+ vrshr.u16 q13, q13, #5
-+ vrshr.u16 q14, q14, #5
-+ vrshr.u16 q15, q15, #5
-+ vst1.16 {q12-q13}, [r0], r3
-+ vst1.16 {q14-q15}, [r10], r3
-+ bhi 2b
-+ bne 1b
-+
-+ vpop {d8}
-+ vmov d9, d0
-+ pop {r4-r11, pc}
-+
-+@ Right of vertical - works along top - left unused
-+26:
-+ add r5, r1, #32
-+ vld1.16 {q1-q2}, [r1]
-+ rsb r12, r6, r6, lsl #16
-+ vld1.16 {q3-q4}, [r5]
-+ add r1, r1, #64
-+ rsb r4, r12, #0
-+ rsb r12, r12, #32 << 16
-+ vmov d1, d9
-+ vmov s1, r12
-+ add r10, r0, #32
-+ mov r5, #32
-+1:
-+ vld1.16 {d0[0]}, [r1]!
-+ vmov q8, q1
-+ vmov q9, q2
-+ vmov q10, q3
-+ vmov q11, q4
-+ vext.16 q1, q1, q2, #1
-+ vext.16 q2, q2, q3, #1
-+ vext.16 q3, q3, q4, #1
-+ vext.16 q4, q4, q0, #1
-+2:
-+ vmul.u16 q12, q1, d0[2]
-+ adds r12, r4
-+ vmla.u16 q12, q8, d0[3]
-+ it cc
-+ addcc r12, #32 << 16
-+ vmul.u16 q13, q2, d0[2]
-+ it cc
-+ subcc r12, #32
-+ vmla.u16 q13, q9, d0[3]
-+ sub r5, #1
-+ vmul.u16 q14, q3, d0[2]
-+ teq r5, #0
-+ vmla.u16 q14, q10, d0[3]
-+ vmul.u16 q15, q4, d0[2]
-+ vmla.u16 q15, q11, d0[3]
-+ vmov s1, r12
-+ vrshr.u16 q12, q12, #5
-+ vrshr.u16 q13, q13, #5
-+ vrshr.u16 q14, q14, #5
-+ vrshr.u16 q15, q15, #5
-+ vst1.16 {q12-q13}, [r0], r3
-+ vst1.16 {q14-q15}, [r10], r3
-+ bhi 2b
-+ bne 1b
-+
-+ vpop {d8}
-+ vmov d9, d1
-+ pop {r4-r11, pc}
-+
-+endfunc
-+
-+
-+
-+@ Generate 4x4 chroma patch
-+@
-+@ In (const)
-+@ r1 Up ptr (_up only)
-+@ r3 Out stride
-+@ r4 Angle add
-+@ r7 Inv angle (_up only)
-+@
-+@ In/Out (updated)
-+@ r0 Out pointer - on exit point to start of next patch horizontally (i.e. r0 + patch width)
-+@ r2 Left ptr - updated
-+@ r6 Angle frac (init to r4 + 32)
-+@ r8 Inv angle accumulator
-+@ q2 Cur Line - load before 1st call for down - set by _up
-+@ q8 Cur Line - load before 1st call for up - set by _down
-+@
-+@ Temps
-+@ r5 Loop counter
-+@ r12
-+@ d0, q1, q12-q15
-+
-+patch_h_down_c_4x4_10:
-+ vld1.16 {q12}, [r2]!
-+ rsb r12, r6, #32
-+ vdup.16 q2, r6
-+ vdup.16 q3, r12
-+ mov r5, #4
-+1:
-+ vmov q13, q12
-+ vext.16 q12, q12, q12, #2
-+ vld1.32 {d25[1]}, [r2]!
-+patch_h_down_c_4x4_10_continue:
-+2:
-+ vmov q8, q9
-+ subs r12, r4
-+ vmul.u16 q0, q13, q3
-+ it cc
-+ addcc r12, #32
-+ vmla.u16 q0, q12, q2
-+ rsb r6, r12, #32
-+ vmov q9, q10
-+ sub r5, #1
-+ vmov q10, q11
-+ teq r5, #0
-+ vdup.16 q2, r6
-+ vdup.16 q3, r12
-+ vrshr.u16 q11, q0, #5
-+ bhi 2b
-+ bne 1b
-+
-+ bcs 3f
-+ vmov q13, q12
-+ vext.16 q12, q12, q12, #2
-+ vld1.32 {d25[1]}, [r2]!
-+3:
-+
-+store_tran_c_4x4_10:
-+T add r6, r0, r3
-+ vzip.32 q8, q10
-+A add r6, r0, r3
-+T lsl r3, #1
-+ vzip.32 q9, q11
-+A add r5, r0, r3, lsl #1
-+T add r5, r0, r3
-+ vst2.32 {d16,d18}, [r0]!
-+A lsl r3, #1
-+ vst2.32 {d17,d19}, [r6], r3
-+ asr r3, #1
-+ vst2.32 {d20,d22}, [r5]
-+ mov r5, #4
-+ vst2.32 {d21,d23}, [r6]
-+ bx lr
-+
-+patch_h_up_c_4x4_10:
-+ vld1.16 {q1}, [r2]
-+ rsb r12, r6, #32
-+ vdup.16 q2, r6
-+ vdup.16 q3, r12
-+ mov r5, #4
-+1:
-+ adds r8, r7
-+ vmov q12, q1
-+ it mi
-+ ldrmi r6, [r2, #-4]!
-+ vext.16 q1, q1, q1, #6
-+ itt pl
-+ asrpl r6, r8, #8
-+ ldrpl r6, [r1, r6, lsl #2]
-+ vmov s4, r6
-+patch_h_up_c_4x4_10_continue:
-+2:
-+ vmov q8, q9
-+ subs r12, r4
-+ vmul.u16 q0, q12, q3
-+ it cc
-+ addcc r12, #32
-+ vmla.u16 q0, q1, q2
-+ rsb r6, r12, #32
-+ vmov q9, q10
-+ sub r5, #1
-+ vmov q10, q11
-+ teq r5, #0
-+ vdup.16 q2, r6
-+ vdup.16 q3, r12
-+ vrshr.u16 q11, q0, #5
-+ bhi 2b
-+ bne 1b
-+
-+ bcs store_tran_c_4x4_10
-+ adds r8, r7
-+ vmov q12, q1
-+ it mi
-+ ldrmi r6, [r2, #-4]!
-+ vext.16 q1, q1, q1, #6
-+ itt pl
-+ asrpl r6, r8, #8
-+ ldrpl r6, [r1, r6, lsl #2]
-+ vmov s4, r6
-+ b store_tran_c_4x4_10
-+
-+
-+@ ff_hevc_rpi_pred_angular_c_4_neon_10
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride [r3]
-+@ unsigned int mode [sp, #0] 2..34
-+
-+function ff_hevc_rpi_pred_angular_c_4_neon_10, export=1
-+ ldr r12, [sp]
-+ push {r4-r8, lr}
-+ ADRT r4, angle_2 - 2
-+ ADRT r7, inv_angle - 11*2
-+ add r7, r7, r12, lsl #1
-+ lsl r3, #2
-+ ldrsb r6, [r4, r12]
-+ cmp r12, #26
-+ ldrsb r4, [r4, r12]
-+ bge 26f
-+ cmp r12, #18
-+ bge 18f
-+ cmp r12, #10
-+ bge 10f
-+
-+@ Down of Horizontal - works down left
-+ bl patch_h_down_c_4x4_10
-+ pop {r4-r8, pc}
-+
-+@ Up of Horizontal - works down up
-+10:
-+ ldrh r7, [r7]
-+ mov r8, #-128
-+ sub r8, r7
-+ bl patch_h_up_c_4x4_10
-+ pop {r4-r8, pc}
-+
-+@ Left of vertical - works down left
-+18:
-+ vld1.16 {q9}, [r1]
-+ sub r1, r2, #4
-+ rsb r12, r6, #32
-+ ldrh r7, [r7]
-+ vdup.16 q2, r6
-+ vext.16 q8, q9, q9, #6
-+ sub r8, r7, #128
-+ vld1.32 {d16[0]}, [r1]
-+ vdup.16 q3, r12
-+ mov r5, #3
-+1:
-+ vmul.u16 q0, q9, q3
-+ subs r12, r4
-+ vmla.u16 q0, q8, q2
-+ ittt cc
-+ asrcc r1, r8, #8
-+ addcc r12, #32
-+ addcc r1, r2, r1, lsl #2
-+ vext.16 q10, q8, q8, #6
-+ rsb r6, r12, #32
-+ vmov q11, q8
-+ sub r5, #1
-+ vrshr.u16 q0, q0, #5
-+ it cc
-+ addcc r8, r7
-+ vld1.32 {d20[0]}, [r1]
-+ teq r5, #0
-+ vdup.16 q2, r6
-+ vdup.16 q3, r12
-+ vst1.16 {q0}, [r0], r3
-+ bhi 1b
-+ beq 4f
-+2:
-+ vmul.u16 q0, q11, q3
-+ subs r12, r4
-+ vmla.u16 q0, q10, q2
-+ ittt cc
-+ asrcc r1, r8, #8
-+ addcc r12, #32
-+ addcc r1, r2, r1, lsl #2
-+ vext.16 q8, q10, q10, #6
-+ rsb r6, r12, #32
-+ vmov q9, q10
-+ sub r5, #1
-+ vrshr.u16 q0, q0, #5
-+ it cc
-+ addcc r8, r7
-+ vld1.32 {d16[0]}, [r1]
-+ teq r5, #0
-+ vdup.16 q2, r6
-+ vdup.16 q3, r12
-+ vst1.16 {q0}, [r0], r3
-+ bhi 2b
-+ bne 1b
-+ bcc 5f
-+3:
-+ vmul.u16 q0, q11, q3
-+ vmla.u16 q0, q10, q2
-+ vrshr.u16 q0, q0, #5
-+ vst1.16 {q0}, [r0]
-+
-+ pop {r4-r8, pc}
-+4:
-+ bcc 3b
-+5:
-+ vmul.u16 q0, q9, q3
-+ vmla.u16 q0, q8, q2
-+ vrshr.u16 q0, q0, #5
-+ vst1.16 {q0}, [r0]
-+
-+ pop {r4-r8, pc}
-+
-+@ Right of vertical - works along top - left unused
-+26:
-+ vld1.16 {q9}, [r1]!
-+ rsb r12, r6, #32
-+ vdup.16 q2, r6
-+ vdup.16 q3, r12
-+ vext.16 q8, q9, q9, #2
-+ vld1.32 {d17[1]}, [r1]!
-+ mov r5, #3
-+1:
-+ vmul.u16 q0, q8, q2
-+ subs r12, r4
-+ vmla.u16 q0, q9, q3
-+ it cc
-+ addcc r12, #32
-+ vext.16 q10, q8, q8, #2
-+ rsb r6, r12, #32
-+ vld1.32 {d21[1]}, [r1]
-+ sub r5, #1
-+ vmov q11, q8
-+ teq r5, #0
-+ vrshr.u16 q0, q0, #5
-+ it cc
-+ addcc r1, #4
-+ vdup.16 q2, r6
-+ vdup.16 q3, r12
-+ vst1.16 {q0}, [r0], r3
-+ bhi 1b
-+ beq 4f
-+2:
-+ vmul.u16 q0, q10, q2
-+ subs r12, r4
-+ vmla.u16 q0, q11, q3
-+ it cc
-+ addcc r12, #32
-+ vext.16 q8, q10, q10, #2
-+ rsb r6, r12, #32
-+ vld1.32 {d17[1]}, [r1]
-+ sub r5, #1
-+ vmov q9, q10
-+ teq r5, #0
-+ vrshr.u16 q0, q0, #5
-+ it cc
-+ addcc r1, #4
-+ vdup.16 q2, r6
-+ vdup.16 q3, r12
-+ vst1.16 {q0}, [r0], r3
-+ bhi 2b
-+ bne 1b
-+ bcc 5f
-+3:
-+ vmul.u16 q0, q10, q2
-+ vmla.u16 q0, q11, q3
-+ vrshr.u16 q0, q0, #5
-+ vst1.16 {q0}, [r0]
-+
-+ pop {r4-r8, pc}
-+4:
-+ bcc 3b
-+5:
-+ vmul.u16 q0, q8, q2
-+ vmla.u16 q0, q9, q3
-+ vrshr.u16 q0, q0, #5
-+ vst1.16 {q0}, [r0]
-+
-+ pop {r4-r8, pc}
-+
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_angular_c_8_neon_10
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride [r3]
-+@ unsigned int mode [sp, #0] 2..34
-+
-+function ff_hevc_rpi_pred_angular_c_8_neon_10, export=1
-+ ldr r12, [sp]
-+ push {r4-r8, lr}
-+ ADRT r4, angle_2 - 2
-+ ADRT r7, inv_angle - 11*2
-+ add r7, r7, r12, lsl #1
-+ lsl r3, #2
-+ ldrsb r6, [r4, r12]
-+ cmp r12, #26
-+ ldrsb r4, [r4, r12]
-+ bge 26f
-+ cmp r12, #18
-+ bge 18f
-+ cmp r12, #10
-+ bge 10f
-+
-+@ Down of Horizontal - works down left
-+ mov r1, r2 @ save r2 - r1 unused by patch_down
-+
-+ bl patch_h_down_c_4x4_10
-+ bl patch_h_down_c_4x4_10_continue
-+
-+ add r2, r1, #4*4 @ restore r2, but 4 rows further down left
-+ sub r0, #32
-+ mov r6, r4
-+ add r0, r0, r3, lsl #2
-+
-+ bl patch_h_down_c_4x4_10
-+ bl patch_h_down_c_4x4_10_continue
-+
-+ pop {r4-r8, pc}
-+
-+@ Up of Horizontal - works down up
-+10:
-+ ldrh r7, [r7]
-+ mov r8, #-128
-+ sub r8, r7
-+
-+ push {r2, r8}
-+ bl patch_h_up_c_4x4_10
-+ bl patch_h_up_c_4x4_10_continue
-+ pop {r2, r8}
-+
-+ sub r0, #32
-+ mov r6, r4
-+ add r2, #16
-+ sub r8, r8, r7, lsl #2
-+ add r0, r0, r3, lsl #2
-+
-+ bl patch_h_up_c_4x4_10
-+ bl patch_h_up_c_4x4_10_continue
-+
-+ pop {r4-r8, pc}
-+
-+@ Left of vertical - works down left
-+18:
-+ vld1.16 {q0-q1}, [r1]
-+ sub r9, r2, #4
-+ rsb r12, r6, #32
-+ ldrh r7, [r7]
-+ mov r8, #-128
-+ vdup.16 q9, r6
-+ vdup.16 q10, r12
-+ mov r5, #8
-+1:
-+ vld1.32 {d17[1]}, [r9]
-+ add r8, r7
-+ vmov q2, q0
-+ vmov q3, q1
-+ asr r9, r8, #8
-+ vext.16 q1, q0, q1, #6
-+ add r9, r2, r9, lsl #2
-+ vext.16 q0, q8, q0, #6
-+2:
-+ vmul.u16 q11, q2, q10
-+ subs r12, r4
-+ vmla.u16 q11, q0, q9
-+ it cc
-+ addcc r12, #32
-+ vmul.u16 q12, q3, q10
-+ rsb r6, r12, #32
-+ vmla.u16 q12, q1, q9
-+ sub r5, #1
-+ teq r5, #0
-+ vdup.16 q9, r6
-+ vdup.16 q10, r12
-+ vrshr.u16 q11, q11, #5
-+ vrshr.u16 q12, q12, #5
-+ vst1.16 {q11-q12}, [r0], r3
-+ bhi 2b
-+ bne 1b
-+
-+ pop {r4-r8, pc}
-+
-+@ Right of vertical - works along top - left unused
-+26:
-+ add r5, r1, #32
-+ vld1.16 {q0-q1}, [r1]!
-+ rsb r12, r6, #32
-+ vld1.32 {d16[0]}, [r5]
-+ mov r5, #8
-+ vdup.16 q9, r6
-+ vdup.16 q10, r12
-+1:
-+ vmov q2, q0
-+ add r1, #4
-+ vmov q3, q1
-+ vext.16 q0, q0, q1, #2
-+ vext.16 q1, q1, q8, #2
-+2:
-+ vmul.u16 q11, q0, q9
-+ subs r12, r4
-+ vmla.u16 q11, q2, q10
-+ it cc
-+ addcc r12, #32
-+ vmul.u16 q12, q1, q9
-+ rsb r6, r12, #32
-+ vmla.u16 q12, q3, q10
-+ sub r5, #1
-+ vld1.32 {d16[0]}, [r1]
-+ teq r5, #0
-+ vdup.16 q9, r6
-+ vdup.16 q10, r12
-+ vrshr.u16 q11, q11, #5
-+ vrshr.u16 q12, q12, #5
-+ vst1.16 {q11-q12}, [r0], r3
-+ bhi 2b
-+ bne 1b
-+
-+ pop {r4-r8, pc}
-+
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_angular_c_16_neon_10
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride [r3]
-+@ unsigned int mode [sp, #0] 2..34
-+
-+function ff_hevc_rpi_pred_angular_c_16_neon_10, export=1
-+ ldr r12, [sp]
-+ push {r4-r10, lr}
-+ ADRT r4, angle_2 - 2
-+ ADRT r7, inv_angle - 11*2
-+ add r7, r7, r12, lsl #1
-+ lsl r3, #2
-+ vpush {d8}
-+ ldrsb r6, [r4, r12]
-+ cmp r12, #26
-+ ldrsb r4, [r4, r12]
-+ bge 26f
-+ cmp r12, #18
-+ bge 18f
-+ cmp r12, #10
-+ bge 10f
-+
-+@ Down of Horizontal - works down left
-+ add sp, #8
-+ mov r10, #4
-+ mov r1, r2
-+1:
-+ bl patch_h_down_c_4x4_10
-+ bl patch_h_down_c_4x4_10_continue
-+ bl patch_h_down_c_4x4_10_continue
-+ bl patch_h_down_c_4x4_10_continue
-+
-+ add r2, r1, #4*4 @ restore r2, but 4 rows further down left
-+ add r1, r1, #4*4
-+ mov r6, r4
-+ sub r0, #64
-+ subs r10, #1
-+ add r0, r0, r3, lsl #2
-+ bne 1b
-+
-+ pop {r4-r10, pc}
-+
-+@ Up of Horizontal - works down up
-+10:
-+ add sp, #8
-+ mov r10, #4
-+ ldrh r7, [r7]
-+ mov r8, #-128
-+ sub r8, r7
-+2:
-+ push {r2, r8}
-+ bl patch_h_up_c_4x4_10
-+ bl patch_h_up_c_4x4_10_continue
-+ bl patch_h_up_c_4x4_10_continue
-+ bl patch_h_up_c_4x4_10_continue
-+ pop {r2, r8}
-+
-+ sub r0, #64
-+ mov r6, r4
-+ add r2, #16
-+ sub r8, r8, r7, lsl #2
-+ add r0, r0, r3, lsl #2
-+ subs r10, #1
-+ bne 2b
-+
-+ pop {r4-r10, pc}
-+
-+@ Left of vertical - works down left
-+18:
-+ add r5, r1, #32
-+ vld1.16 {q1-q2}, [r1]
-+ rsb r12, r6, r6, lsl #16
-+ vld1.16 {q3-q4}, [r5]
-+ sub r9, r2, #4
-+ rsb r4, r12, #0
-+ rsb r12, r12, #32 << 16
-+ ldrh r7, [r7]
-+ mov r8, #-128
-+ vmov d0, d9
-+ vmov s2, r12
-+ add r10, r0, #32
-+ mov r5, #16
-+1:
-+ vld1.32 {d1[1]}, [r9]
-+ add r8, r7
-+ vmov q11, q4
-+ vmov q10, q3
-+ asr r9, r8, #8
-+ vmov q9, q2
-+ add r9, r2, r9, lsl #2
-+ vmov q8, q1
-+ vext.16 q4, q3, q4, #6
-+ vext.16 q3, q2, q3, #6
-+ vext.16 q2, q1, q2, #6
-+ vext.16 q1, q0, q1, #6
-+2:
-+ vmul.u16 q12, q8, d1[1]
-+ adds r12, r4
-+ vmla.u16 q12, q1, d1[0]
-+ it cc
-+ addcc r12, #32 << 16
-+ vmul.u16 q13, q9, d1[1]
-+ it cc
-+ subcc r12, #32
-+ vmla.u16 q13, q2, d1[0]
-+ sub r5, #1
-+ vmul.u16 q14, q10, d1[1]
-+ teq r5, #0
-+ vmla.u16 q14, q3, d1[0]
-+ vmul.u16 q15, q11, d1[1]
-+ vmla.u16 q15, q4, d1[0]
-+ vmov s2, r12
-+ vrshr.u16 q12, q12, #5
-+ vrshr.u16 q13, q13, #5
-+ vrshr.u16 q14, q14, #5
-+ vrshr.u16 q15, q15, #5
-+ vst1.16 {q12-q13}, [r0], r3
-+ vst1.16 {q14-q15}, [r10], r3
-+ bhi 2b
-+ bne 1b
-+
-+ vpop {d8}
-+ vmov d9, d0
-+ pop {r4-r10, pc}
-+
-+@ Right of vertical - works along top - left unused
-+26:
-+ add r5, r1, #32
-+ vld1.16 {q1-q2}, [r1]
-+ rsb r12, r6, r6, lsl #16
-+ vld1.16 {q3-q4}, [r5]
-+ add r1, r1, #64
-+ rsb r4, r12, #0
-+ rsb r12, r12, #32 << 16
-+ vmov d1, d9
-+ vmov s1, r12
-+ add r10, r0, #32
-+ mov r5, #16
-+1:
-+ vld1.32 {d0[0]}, [r1]!
-+ vmov q8, q1
-+ vmov q9, q2
-+ vmov q10, q3
-+ vmov q11, q4
-+ vext.16 q1, q1, q2, #2
-+ vext.16 q2, q2, q3, #2
-+ vext.16 q3, q3, q4, #2
-+ vext.16 q4, q4, q0, #2
-+2:
-+ vmul.u16 q12, q1, d0[2]
-+ adds r12, r4
-+ vmla.u16 q12, q8, d0[3]
-+ it cc
-+ addcc r12, #32 << 16
-+ vmul.u16 q13, q2, d0[2]
-+ it cc
-+ subcc r12, #32
-+ vmla.u16 q13, q9, d0[3]
-+ sub r5, #1
-+ vmul.u16 q14, q3, d0[2]
-+ teq r5, #0
-+ vmla.u16 q14, q10, d0[3]
-+ vmul.u16 q15, q4, d0[2]
-+ vmla.u16 q15, q11, d0[3]
-+ vmov s1, r12
-+ vrshr.u16 q12, q12, #5
-+ vrshr.u16 q13, q13, #5
-+ vrshr.u16 q14, q14, #5
-+ vrshr.u16 q15, q15, #5
-+ vst1.16 {q12-q13}, [r0], r3
-+ vst1.16 {q14-q15}, [r10], r3
-+ bhi 2b
-+ bne 1b
-+
-+ vpop {d8}
-+ vmov d9, d1
-+ pop {r4-r10, pc}
-+
-+endfunc
-diff --git a/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S b/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S
-new file mode 100644
-index 0000000000..75a1789c25
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevcpred_intra_dc_neon.S
-@@ -0,0 +1,695 @@
-+/*
-+ * Copyright (c) 2017 John Cox <jc@kynesim.co.uk> (for Raspberry Pi)
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "libavutil/arm/asm.S"
-+#include "neon.S"
-+
-+
-+@ ff_hevc_rpi_pred_dc_4_neon_8
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+function ff_hevc_rpi_pred_dc_4_neon_8, export=1
-+
-+ @ Average the els of top & left
-+ ldr r2, [r2]
-+ vld1.32 {d0[0]}, [r1]
-+ mov r1, #2
-+ vmov s1, r2
-+ vmov s2, r2
-+ vmov.i16 q2, #3
-+ add r2, r0, r3
-+ vaddl.u8 q1, d0, d1 @ d2[0] = top[0] + left[0]
-+ lsl r3, #1
-+ vmovl.u8 q0, d0
-+ vmov.i64 d7, #0xffff
-+ vmov.16 d4[0], r1 @ 2, 3, 3, 3...
-+ vpadd.i16 d6, d2, d2 @ 2 (top & bottom of vector the same)
-+ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..3], left[0..3]
-+
-+ @ top line gets some smoothing
-+ @ (top[i] + 3*dc + 2) >> 2
-+ @ as does left
-+ @ top_line[0] is extra special
-+ @ (top[0] + left[0] + 2*dc + 2) >> 2
-+
-+ vmov.i64 d7, #0xff
-+ vpadd.i16 d6, d6 @ 1 (all the same)
-+ vrshr.u16 d6, #3
-+ vmla.i16 q0, q2, d6[0]
-+ vdup.8 d6, d6[0]
-+ vrshrn.i16 d0, q0, #2
-+
-+ @ Store top line
-+ vst1.32 {d0[0]}, [r0], r3
-+
-+ @ Store the rest
-+ vshr.u64 d1, d0, #5*8
-+ vshr.u64 d2, d0, #6*8
-+ vshr.u64 d3, d0, #7*8
-+ vbif d1, d6, d7
-+ vbif d2, d6, d7
-+ vst1.32 {d1[0]}, [r2], r3
-+ vbif d3, d6, d7
-+ vst1.32 {d2[0]}, [r0]
-+ vst1.32 {d3[0]}, [r2]
-+
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_dc_c_4_neon_8
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+function ff_hevc_rpi_pred_dc_c_4_neon_8, export=1
-+ @ 4-wide chroma DC prediction, 8-bit U/V pairs; no edge smoothing
-+ @ Average the els of top & left
-+ vld1.8 {d0}, [r1] @ d0 = 8 bytes of top
-+ vld1.8 {d1}, [r2] @ d1 = 8 bytes of left
-+A add r2, r0, r3, lsl #1
-+A lsl r3, #2
-+T lsl r3, #1
-+T add r2, r0, r3
-+T lsl r3, #1
-+ vaddl.u8 q0, d0, d1 @ q0 = top + left, widened to 16 bits
-+ vadd.i16 d0, d1 @ d0 has 2 val pairs
-+ vpadd.i32 d2, d0, d0 @ This adds U & V separately
-+ vpadd.i32 d3, d0, d0
-+ vrshrn.u16 d0, q1, #3 @ (sum + 4) >> 3, narrowed to bytes
-+
-+ @ Store
-+ vst1.8 {d0}, [r0], r3 @ r0 and r2 are used alternately, 4 rows in all
-+ vst1.8 {d0}, [r2], r3
-+ vst1.8 {d0}, [r0]
-+ vst1.8 {d0}, [r2]
-+
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_dc_8_neon_8
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+function ff_hevc_rpi_pred_dc_8_neon_8, export=1
-+ @ 8x8 DC prediction, 8-bit pels; row 0 and column 0 are smoothed
-+ @ Average the els of top & left
-+ vld1.8 {d0}, [r1] @ d0 = top[0..7]
-+ mov r1, #2
-+ vld1.8 {d16}, [r2] @ d16 = left[0..7]
-+ vmov.i16 q2, #3
-+ vmov.i64 d7, #0xffff @ mask selecting 16-bit lane 0
-+ vaddl.u8 q1, d0, d16 @ d2[0] = top[0] + left[0]
-+ vmovl.u8 q0, d0 @ q0 = top widened to 16 bits
-+ vadd.i16 d6, d2, d3 @ d6 has 4 vals
-+ vmov.16 d4[0], r1 @ 2, 3, 3, 3...
-+ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..7]
-+
-+ @ top line gets some smoothing
-+ @ (top[i] + 3*dc + 2) >> 2
-+ @ as does left
-+ @ top_line[0] is extra special
-+ @ (top[0] + left[0] + 2*dc + 2) >> 2
-+
-+ vmov.i64 d7, #0xff @ mask selecting byte 0
-+ vmovl.u8 q1, d16 @ q1 = left widened to 16 bits
-+ vpadd.i16 d6, d6 @ 2 (top & bottom of vector the same)
-+ vpadd.i16 d6, d6 @ 1 (all the same)
-+ vrshr.u16 d6, #4 @ dc = (sum of 16 pels + 8) >> 4
-+ vmla.i16 q1, q2, d6[0]
-+ vmla.i16 q0, q2, d6[0]
-+ vdup.8 d6, d6[0] @ d6 = dc in every byte
-+ vrshrn.i16 d2, q1, #2 @ d2 = smoothed left column
-+ vrshrn.i16 d0, q0, #2 @ d0 = smoothed top row
-+
-+ @ Store top line
-+ vst1.8 {d0}, [r0], r3
-+
-+ @ Store the rest
-+ vshr.u64 d2, #8 @ byte 0 = smoothed left[1]
-+ vbit d6, d2, d7 @ dc row with the left pel in byte 0
-+ vshr.u64 d2, #8
-+ vst1.8 {d6}, [r0], r3 @ row 1
-+ mov r1, #6 @ 6 rows remain, 2 stored per iteration
-+1:
-+ vbit d6, d2, d7
-+ vshr.u64 d2, #8
-+ vst1.8 {d6}, [r0], r3
-+ subs r1, #2
-+ vbit d6, d2, d7
-+ vshr.u64 d2, #8
-+ vst1.8 {d6}, [r0], r3
-+ bne 1b
-+
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_dc_c_8_neon_8
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+function ff_hevc_rpi_pred_dc_c_8_neon_8, export=1
-+ @ 8-wide chroma DC prediction, 8-bit U/V pairs; no edge smoothing
-+ @ Average the els of top & left
-+ vld1.8 {q0}, [r1] @ q0 = 16 bytes of top
-+ mov r1, #8 @ row counter
-+ vld1.8 {q1}, [r2] @ q1 = 16 bytes of left
-+T lsl r3, #1
-+ vaddl.u8 q0, d0, d1 @ q0 = two halves of top summed, 16 bit
-+A add r2, r0, r3, lsl #1
-+A lsl r3, #2
-+T add r2, r0, r3
-+T lsl r3, #1
-+ vaddl.u8 q1, d2, d3 @ q1 = two halves of left summed, 16 bit
-+ vadd.i16 q1, q0
-+ vadd.i16 d3, d2 @ d3 has 2 val pairs
-+ vpadd.i32 d2, d3, d3 @ This adds U & V separately
-+ vpadd.i32 d3, d3, d3
-+ vrshrn.u16 d0, q1, #4 @ (sum + 8) >> 4, narrowed to bytes
-+ vrshrn.u16 d1, q1, #4
-+
-+ @ Store
-+1:
-+ vst1.8 {q0}, [r0], r3
-+ subs r1, #4 @ 4 rows are stored per iteration
-+ vst1.8 {q0}, [r2], r3
-+ vst1.8 {q0}, [r0], r3
-+ vst1.8 {q0}, [r2], r3
-+ bne 1b
-+
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_dc_16_neon_8
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+function ff_hevc_rpi_pred_dc_16_neon_8, export=1
-+ @ 16x16 DC prediction, 8-bit pels; row 0 and column 0 are smoothed
-+ @ Average the els of top & left
-+ vld1.8 {q8}, [r1] @ q8 = top[0..15]
-+ mov r1, #2
-+ vld1.8 {q9}, [r2] @ q9 = left[0..15]
-+ vaddl.u8 q10, d16, d17 @ q10 = top[0..7] + top[8..15]
-+ vaddl.u8 q11, d16, d18 @ d22[0] = top[0] + left[0]
-+ vaddl.u8 q0, d18, d19 @ q0 = left[0..7] + left[8..15]
-+ vmov.i16 q1, #3
-+ vadd.i16 q10, q0
-+ vmovl.u8 q0, d18 @ q0 = left[0..7] widened to 16 bits
-+ vadd.i16 d20, d21
-+ vmov.i16 d2[0], r1 @ 2, 3, 3, 3...
-+
-+ @ top line gets some smoothing
-+ @ (top[i] + 3*dc + 2) >> 2
-+ @ as does left
-+ @ top_line[0] is extra special
-+ @ (top[0] + left[0] + 2*dc + 2) >> 2
-+
-+ vmovl.u8 q2, d16 @ q2 = top[0..7] widened
-+ vmovl.u8 q9, d19 @ q9 = left[8..15] widened
-+ vpadd.i16 d20, d20 @ 2 (top & bottom of vector the same)
-+ vmov.i64 d7, #0xffff @ mask selecting 16-bit lane 0
-+ vmovl.u8 q8, d17 @ q8 = top[8..15] widened
-+ vbit d4, d22, d7 @ q2 = top[0]+left[0], top[1..7]
-+ vmov.i64 d7, #0xff @ mask selecting byte 0
-+ vpadd.i16 d20, d20 @ 1 (all the same)
-+ vrshr.u16 d21, d20, #5
-+ vrshr.u16 d20, d20, #5 @ q10 = dc = (sum of 32 pels + 16) >> 5
-+ vmla.i16 q0, q10, d2[1] @ left[0..7] + 3 * dc
-+ vmla.i16 q9, q10, d2[1] @ left[8..15] + 3 * dc
-+ vmla.i16 q2, q10, q1 @ top[0..7] + dc * {2, 3, 3, ...}
-+ vmla.i16 q8, q10, d2[1] @ top[8..15] + 3 * dc
-+ vdup.8 q1, d20[0] @ q1 = dc in every byte
-+ vrshrn.i16 d0, q0, #2
-+ vrshrn.i16 d1, q9, #2 @ q0 = smoothed left column
-+ vrshrn.i16 d4, q2, #2
-+ vrshrn.i16 d5, q8, #2 @ q2 = smoothed top row
-+ vext.8 q0, q0, q0, #1 @ rotate so byte 0 = left[1]
-+
-+ @ Store top line
-+ vst1.8 {q2}, [r0], r3
-+
-+ @ Store the rest
-+ mov r1, #15 @ 15 rows remain
-+1:
-+ vbit d2, d0, d7 @ byte 0 of the dc row = next left pel
-+ vext.8 q0, q0, q0, #1
-+ subs r1, #1
-+ vst1.8 {q1}, [r0], r3
-+ bne 1b
-+
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_dc_c_16_neon_8
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+function ff_hevc_rpi_pred_dc_c_16_neon_8, export=1
-+ @ 16-wide chroma DC prediction, 8-bit U/V pairs; no edge smoothing
-+ @ Average the els of top & left
-+ vld1.8 {q0-q1}, [r1] @ 32 bytes of top
-+ mov r1, #16 @ row counter
-+ vld1.8 {q2-q3}, [r2] @ 32 bytes of left
-+T lsl r3, #1
-+ vaddl.u8 q0, d0, d1
-+A add r2, r0, r3, lsl #1
-+T add r2, r0, r3
-+ vaddl.u8 q1, d2, d3
-+A lsl r3, #2
-+T lsl r3, #1
-+ vaddl.u8 q2, d4, d5
-+ vaddl.u8 q3, d6, d7
-+ vadd.i16 q0, q1 @ q0 = partial sums of top
-+ vadd.i16 q2, q3 @ q2 = partial sums of left
-+ vadd.i16 q0, q2
-+ vadd.i16 d0, d1 @ d0 has 2 val pairs
-+ vpadd.i32 d4, d0, d0 @ This adds U & V separately
-+ vpadd.i32 d5, d0, d0
-+ vrshrn.u16 d0, q2, #5 @ (sum + 16) >> 5, narrowed to bytes
-+ vrshrn.u16 d1, q2, #5
-+ vrshrn.u16 d2, q2, #5
-+ vrshrn.u16 d3, q2, #5
-+
-+ @ Store
-+1:
-+ vst1.8 {q0-q1}, [r0], r3
-+ subs r1, #2 @ 2 rows are stored per iteration
-+ vst1.8 {q0-q1}, [r2], r3
-+ bne 1b
-+
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_dc_32_neon_8
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+function ff_hevc_rpi_pred_dc_32_neon_8, export=1
-+ @ 32x32 DC prediction, 8-bit pels; no edge smoothing is applied here
-+ @ Average the els of top & left
-+ vld1.8 {q0-q1}, [r1] @ top[0..31]
-+ mov r1, #32 @ row counter
-+ vld1.8 {q2-q3}, [r2] @ left[0..31]
-+ add r2, r0, r3 @ r2 = src + stride
-+ vaddl.u8 q0, d0, d1
-+ lsl r3, #1 @ r3 = 2 * stride
-+ vaddl.u8 q1, d2, d3
-+ vaddl.u8 q2, d4, d5
-+ vaddl.u8 q3, d6, d7
-+ vadd.i16 q0, q1
-+ vadd.i16 q2, q3
-+ vadd.i16 q0, q2
-+ vadd.i16 d0, d1 @ d0 has 4 vals
-+ vpadd.i16 d0, d0 @ 2 (top & bottom the same)
-+ vpadd.i16 d4, d0, d0 @ 1 (all the same)
-+ vpadd.i16 d5, d0, d0
-+ vrshrn.u16 d0, q2, #6 @ dc = (sum of 64 pels + 32) >> 6, as bytes
-+ vrshrn.u16 d1, q2, #6
-+ vrshrn.u16 d2, q2, #6
-+ vrshrn.u16 d3, q2, #6
-+
-+ @ Store
-+1:
-+ vst1.8 {q0-q1}, [r0], r3 @ even row
-+ subs r1, #2 @ 2 rows are stored per iteration
-+ vst1.8 {q0-q1}, [r2], r3 @ odd row
-+ bne 1b
-+
-+ bx lr
-+endfunc
-+
-+
-+@ -----------------------------------------------------------------------------
-+@
-+@ 10 Bit versions
-+@
-+@ There is no actual bit depth dependency in this code except that our
-+@ intermediate results will overflow the 16 bits they are stored in if the
-+@ depth gets too large. All these functions are good to 10 bits - with the
-+@ worst case being in dc_32 where we use all 16 bits.
-+
-+
-+@ ff_hevc_rpi_pred_dc_4_neon_10
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+function ff_hevc_rpi_pred_dc_4_neon_10, export=1
-+ @ 4x4 DC prediction, 16-bit pels; row 0 and column 0 are smoothed
-+ @ Average the els of top & left
-+ vld1.16 {d0}, [r1] @ d0 = top[0..3]
-+ mov r1, #2
-+ vld1.16 {d1}, [r2] @ d1 = left[0..3]
-+T lsl r3, #1
-+ vmov.i16 q2, #3
-+A add r2, r0, r3, lsl #1
-+T add r2, r0, r3
-+ vadd.u16 d2, d0, d1 @ d2[0] = top[0] + left[0]
-+A lsl r3, #2
-+T lsl r3, #1
-+ vmov.16 d4[0], r1 @ 2, 3, 3, 3...
-+ vmov.i64 d7, #0xffff @ mask selecting 16-bit lane 0
-+ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..3], left[0..3]
-+
-+ @ top line gets some smoothing
-+ @ (top[i] + 3*dc + 2) >> 2
-+ @ as does left
-+ @ top_line[0] is extra special
-+ @ (top[0] + left[0] + 2*dc + 2) >> 2
-+
-+ vpadd.i16 d6, d2, d2 @ 2 (top & bottom of vector the same)
-+ vpadd.i16 d6, d6 @ 1 (all the same)
-+ vrshr.u16 d6, #3 @ dc = (sum of 8 pels + 4) >> 3
-+ vmla.i16 q0, q2, d6[0] @ q0 += dc * {2, 3, 3, ...}
-+ vrshr.u16 q0, #2 @ d0 = smoothed top, d1 = smoothed left
-+
-+ @ Store top line
-+ vst1.16 {d0}, [r0], r3
-+
-+ @ Store the rest
-+ vshr.u64 d3, d1, #1*16 @ lane 0 = smoothed left[1]
-+ vshr.u64 d4, d1, #2*16 @ lane 0 = smoothed left[2]
-+ vshr.u64 d5, d1, #3*16 @ lane 0 = smoothed left[3]
-+ vbif d3, d6, d7 @ keep lane 0, dc in the other lanes
-+ vbif d4, d6, d7
-+ vst1.16 {d3}, [r2], r3
-+ vbif d5, d6, d7
-+ vst1.16 {d4}, [r0]
-+ vst1.16 {d5}, [r2]
-+
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_dc_c_4_neon_10
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3] (In pels - needs * 4)
-+
-+function ff_hevc_rpi_pred_dc_c_4_neon_10, export=1
-+ @ 4-wide chroma DC prediction, 16-bit U/V pairs; no edge smoothing
-+ @ Average the els of top & left
-+ vld1.8 {q0}, [r1] @ q0 = 16 bytes of top
-+ vld1.8 {q1}, [r2] @ q1 = 16 bytes of left
-+A add r2, r0, r3, lsl #2
-+A lsl r3, #3
-+T lsl r3, #2
-+T add r2, r0, r3
-+T lsl r3, #1
-+ vadd.i16 q0, q1 @ q0 = top + left
-+ vadd.i16 d0, d1 @ d0 has 2 val pairs
-+ vpadd.i32 d2, d0, d0 @ This adds U & V separately
-+ vpadd.i32 d3, d0, d0
-+ vrshr.u16 q0, q1, #3 @ (sum + 4) >> 3 in every lane
-+
-+ vst1.16 {q0}, [r0], r3 @ r0 and r2 are used alternately, 4 rows in all
-+ vst1.16 {q0}, [r2], r3
-+ vst1.16 {q0}, [r0]
-+ vst1.16 {q0}, [r2]
-+
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_dc_8_neon_10
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+function ff_hevc_rpi_pred_dc_8_neon_10, export=1
-+ @ 8x8 DC prediction, 16-bit pels; row 0 and column 0 are smoothed
-+ @ Average the els of top & left
-+ vld1.16 {q0}, [r1] @ q0 = top[0..7]
-+ mov r1, #2
-+ vld1.16 {q8}, [r2] @ q8 = left[0..7]
-+T lsl r3, #1
-+ vmov.i16 q2, #3
-+A add r2, r0, r3, lsl #1
-+T add r2, r0, r3
-+ vadd.i16 q1, q0, q8 @ q1[0] = top[0] + left[0]
-+A lsl r3, #2
-+T lsl r3, #1
-+ vmov.i64 d7, #0xffff @ mask selecting 16-bit lane 0
-+ vmov.16 d4[0], r1 @ 2, 3, 3, 3...
-+ vadd.i16 d6, d2, d3 @ d6 has 4 vals
-+ vbit d0, d2, d7 @ q0 = top[0]+left[0], top[1..7]
-+
-+ @ top line gets some smoothing
-+ @ (top[i] + 3*dc + 2) >> 2
-+ @ as does left
-+ @ top_line[0] is extra special
-+ @ (top[0] + left[0] + 2*dc + 2) >> 2
-+
-+ vpadd.i16 d6, d6 @ 2 (top & bottom of vector the same)
-+ vpadd.i16 d6, d6 @ 1 (all the same)
-+ vrshr.u16 d6, #4 @ dc = (sum of 16 pels + 8) >> 4
-+ vmla.i16 q8, q2, d6[0]
-+ vmla.i16 q0, q2, d6[0]
-+ vdup.16 q2, d6[0] @ q2 = dc row (written via r0)
-+ vdup.16 q9, d6[0] @ q9 = dc row (written via r2)
-+ vrshr.u16 q8, q8, #2 @ q8 = smoothed left column
-+ vrshr.u16 q0, q0, #2 @ q0 = smoothed top row
-+ vext.16 q1, q8, q8, #1 @ q1 lane 0 = smoothed left[1]
-+
-+ @ Store top line
-+ vst1.16 {q0}, [r0], r3
-+
-+ @ Store the rest
-+ vbit d18, d2, d7 @ lane 0 of the dc row = left pel
-+ vst1.16 {q9}, [r2], r3
-+ mov r1, #6 @ 6 rows remain, 2 stored per iteration
-+1:
-+ vext.16 q8, q8, q8, #2 @ advance both left copies by 2 pels
-+ subs r1, #2
-+ vext.16 q1, q1, q1, #2
-+ vbit d4, d16, d7
-+ vst1.16 {q2}, [r0], r3
-+ vbit d18, d2, d7
-+ vst1.16 {q9}, [r2], r3
-+ bne 1b
-+
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_dc_c_8_neon_10
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3] (In pels - needs * 4)
-+
-+function ff_hevc_rpi_pred_dc_c_8_neon_10, export=1
-+ @ 8-wide chroma DC prediction, 16-bit U/V pairs; no edge smoothing
-+ @ Average the els of top & left
-+ vld1.16 {q0-q1}, [r1] @ 32 bytes of top
-+ mov r1, #8 @ row counter
-+ vld1.16 {q2-q3}, [r2] @ 32 bytes of left
-+T lsl r3, #2
-+ vadd.i16 q1, q0
-+A add r2, r0, r3, lsl #2
-+A lsl r3, #3
-+T add r2, r0, r3
-+T lsl r3, #1
-+ vadd.i16 q2, q3
-+ vadd.i16 q1, q2 @ q1 = top + left partial sums
-+ vadd.i16 d3, d2 @ d3 has 2 val pairs
-+ vpadd.i32 d2, d3, d3 @ This adds U & V separately
-+ vpadd.i32 d3, d3, d3
-+ vrshr.u16 q0, q1, #4 @ (sum + 8) >> 4 in every lane
-+ vrshr.u16 q1, q1, #4
-+
-+ @ Store
-+1:
-+ vst1.8 {q0-q1}, [r0], r3
-+ subs r1, #2 @ 2 rows are stored per iteration
-+ vst1.8 {q0-q1}, [r2], r3
-+ bne 1b
-+
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_dc_16_neon_10
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+function ff_hevc_rpi_pred_dc_16_neon_10, export=1
-+ @ 16x16 DC prediction, 16-bit pels; row 0 and column 0 are smoothed
-+ @ Average the els of top & left
-+ vld1.16 {q8-q9}, [r1] @ q8, q9 = top[0..15]
-+ mov r1, #2
-+ vld1.16 {q10-q11}, [r2] @ q10, q11 = left[0..15]
-+ lsl r3, #1 @ stride given in pels
-+ vadd.i16 q0, q8, q9
-+ vadd.i16 q1, q10, q11
-+ vmov.i16 q3, #3
-+ vadd.i16 q1, q0 @ q1 = partial sums of top + left
-+ vadd.i16 d0, d16, d20 @ d0[0] = top[0] + left[0]
-+ vmov.i64 d31, #0xffff @ mask selecting 16-bit lane 0
-+ vadd.i16 d3, d2
-+ vmov.16 d6[0], r1 @ 2, 3, 3, 3...
-+
-+ @ top line gets some smoothing
-+ @ (top[i] + 3*dc + 2) >> 2
-+ @ as does left
-+ @ topline[0] is extra special
-+ @ (top[0] + left[0] + 2*dc + 2) >> 2
-+
-+ vbit d16, d0, d31 @ q8 = top[0]+left[0], top[1..7]
-+ vpadd.i16 d3, d3 @ 2 (top & bottom of vector the same)
-+ vpadd.i16 d3, d3 @ 1 (all the same)
-+ vrshr.u16 d2, d3, #5
-+ vrshr.u16 d3, d3, #5 @ q1 = dc = (sum of 32 pels + 16) >> 5
-+ vmov q0, q1 @ q0, q1 = one full row of dc
-+ vmla.i16 q10, q1, d6[1] @ left[0..7] + 3 * dc
-+ vmla.i16 q11, q1, d6[1] @ left[8..15] + 3 * dc
-+ vmla.i16 q8, q1, q3 @ top[0..7] + dc * {2, 3, 3, ...}
-+ vmla.i16 q9, q1, d6[1] @ top[8..15] + 3 * dc
-+ vrshr.u16 q2, q10, #2 @ q2 = smoothed left[0..7]
-+ vrshr.u16 q3, q11, #2 @ q3 = smoothed left[8..15]
-+ vrshr.u16 q8, #2
-+ vrshr.u16 q9, #2
-+ vext.16 q2, q2, q2, #1 @ rotate so lane 0 = left[1]
-+ mov r1, #7<<29 @ count kept in top 3 bits: 7 rows, then wraps for 8 more
-+
-+ @ Store top line
-+ vst1.16 {q8-q9}, [r0], r3
-+
-+ @ Store the rest
-+1:
-+ vbit d0, d4, d31 @ lane 0 of the dc row = next left pel
-+ vext.16 q2, q2, q2, #1
-+ subs r1, #1<<29
-+ vst1.16 {q0-q1}, [r0], r3
-+ bne 1b
-+1:
-+ vbit d0, d6, d31 @ rows 8..15 take their left pel from q3
-+ vext.16 q3, q3, q3, #1
-+ subs r1, #1<<29 @ r1 starts at 0 here so this loop runs 8 times
-+ vst1.16 {q0-q1}, [r0], r3
-+ bne 1b
-+
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_dc_c_16_neon_10
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3] (In pels - needs * 4)
-+
-+function ff_hevc_rpi_pred_dc_c_16_neon_10, export=1
-+ @ 16-wide chroma DC prediction, 16-bit U/V pairs; no edge smoothing
-+ @ Average the els of top & left
-+ vldm r1, {q0-q3} @ 64 bytes of top
-+ vldm r2, {q8-q11} @ 64 bytes of left
-+ vadd.i16 q0, q1
-+ mov r1, #16 @ row counter
-+ vadd.i16 q2, q3
-+ add r2, r0, #32 @ r2 = second 32 bytes of the same row
-+ vadd.i16 q8, q9
-+ lsl r3, #2 @ r3 = stride * 4
-+ vadd.i16 q10, q11
-+ vadd.u16 q0, q2
-+ vadd.u16 q8, q10
-+ vadd.i16 q0, q8
-+ vadd.i16 d0, d1 @ d0 has 2 val pairs
-+ vpadd.i32 d4, d0, d0 @ This adds U & V separately
-+ vpadd.i32 d5, d0, d0
-+ vrshr.u16 q0, q2, #5 @ (sum + 16) >> 5 in every lane
-+ vrshr.u16 q1, q2, #5
-+
-+ @ Store
-+1:
-+ vst1.16 {q0-q1}, [r0], r3 @ first half of row
-+ subs r1, #1
-+ vst1.16 {q0-q1}, [r2], r3 @ second half of row
-+ bne 1b
-+
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_dc_32_neon_10
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3] (In pels)
-+
-+function ff_hevc_rpi_pred_dc_32_neon_10, export=1
-+ @ 32x32 DC prediction, 16-bit pels; no edge smoothing is applied here
-+ @ Average the els of top & left
-+ @ With 10 bits we are (just) safe from overflow in i16
-+ vldm r1, {q0-q3} @ top[0..31]
-+ vldm r2, {q8-q11} @ left[0..31]
-+ vadd.i16 q0, q1
-+ mov r1, #32 @ row counter
-+ vadd.i16 q2, q3
-+ add r2, r0, #32 @ r2 = second 32 bytes of the same row
-+ vadd.i16 q8, q9
-+ lsl r3, #1 @ r3 = stride * 2
-+ vadd.i16 q10, q11
-+ vadd.u16 q0, q2
-+ vadd.u16 q8, q10
-+ vadd.i16 q0, q8
-+ vadd.i16 d0, d1 @ d0 has 4 vals
-+ vpadd.i16 d0, d0 @ 2 (top & bottom the same)
-+ vpadd.i16 d4, d0, d0 @ 1 (all the same)
-+ vpadd.i16 d5, d0, d0
-+ vrshr.u16 q0, q2, #6 @ dc = (sum of 64 pels + 32) >> 6
-+ vrshr.u16 q1, q2, #6
-+
-+ @ Store
-+1:
-+ vst1.16 {q0-q1}, [r0], r3 @ first half of row
-+ subs r1, #1
-+ vst1.16 {q0-q1}, [r2], r3 @ second half of row
-+ bne 1b
-+
-+ bx lr
-+endfunc
-+
-+
-diff --git a/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S b/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S
-new file mode 100644
-index 0000000000..21cd28c709
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevcpred_intra_filter_neon.S
-@@ -0,0 +1,872 @@
-+/*
-+ * Copyright (c) 2018 John Cox <jc@kynesim.co.uk> (for Raspberry Pi)
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "libavutil/arm/asm.S"
-+#include "neon.S"
-+
-+@ All functions have the call
-+@
-+@ int ff_hevc_rpi_intra_filter_N_neon_PW(
-+@ pixel * const left, [r0]
-+@ pixel * const top, [r1]
-+@ const unsigned int req, [r2]
-+@ const unsigned int avail, [r3]
-+@ const pixel * const src_l, [sp, #0]
-+@ const pixel * const src_u, [sp, #4]
-+@ const pixel * const src_ur, [sp, #8]
-+@ const unsigned int stride, [sp, #12] (pels)
-+@ const unsigned int top_right_size, [sp, #16]
-+@ const unsigned int down_left_size) [sp, #20]
-+@
-+@ Assumptions:
-+@ (that wouldn't apply to all frame layouts but do apply to sand, so beware
-+@ if reusing this code)
-+@
-+@ Min ctb size is 8 so we don't need to worry about tr_size or dl_size for
-+@ N==4, but do for chroma N>=8. As we share Y/C fns that means we can ignore
-+@ N==8,PW=8 (chroma always PW>8) but have to cope for larger
-+@
-+@ We always have at least 64 pixel H frame width rounding - this lets us
-+@ load UR without having to worry about exactly how many pixels are actually
-+@ within the frame. As partial loads will only occur very occasionally this
-+@ should be a win in nearly all cases.
-+@
-+@ 16 bit fns can be used as 8 bit chroma fns as chroma never filters
-+@ so we do no maths on the contents
-+@
-+@ No filtering in 32bit fns as they are chroma only
-+
-+
-+.equ AVAIL_UR, 1 @ availability bit: up-right
-+.equ AVAIL_U, 2 @ availability bit: up
-+.equ AVAIL_UL, 4 @ availability bit: up-left
-+.equ AVAIL_L, 8 @ availability bit: left
-+.equ AVAIL_DL, 16 @ availability bit: down-left
-+
-+.equ FILTER_LIGHT, 0x40 @ tested against req (r2) to select the light filter
-+.equ FILTER_STRONG, 0x80
-+
-+.equ AVAIL_S_UR_N_U_C, 32 - 1 @ lsls by this: UR -> N flag, U -> C flag
-+.equ AVAIL_S_U_N_UL_C, 32 - 2 @ lsls by this: U -> N flag, UL -> C flag
-+.equ AVAIL_S_UL_N_L_C, 32 - 3 @ lsls by this: UL -> N flag, L -> C flag
-+.equ AVAIL_S_L_N_DL_C, 32 - 4 @ lsls by this: L -> N flag, DL -> C flag
-+
-+.equ AVAIL_S_U_DL_CPSR, 31 - 4 @ Shift for u..dl to go into flags via cpsr
-+
-+@ On entry
-+@ r2 req
-+@ r3 avail
-+@ [sp, #sp_offset...] args
-+@
-+@ On Exit:
-+@
-+@ Extend values:
-+@ d_l scalar contains value for L & DL
-+@ if DL avail then this is DL[0] so we don't need to load that
-+@ d_ul scalar containing value for UL
-+@ d_u scalar containing value for U
-+@ d_ur scalar containing value for UR
-+@ If DL avail then d_l == b_dl elif L avail then d_l == a_l else...
-+@ This means that L-light-filter works even if nreq DL (we never filter
-+@ req-DL without req-L, but we do filter req-L without req-DL)
-+@ If UR avail then d_ur == a_ur so U-filter good too
-+@
-+@ Data load pointers (only load if req & avail):
-+@ r4 DL + stride
-+@ r10 L
-+@ r6 U
-+@ r5 UR
-+@
-+@ Others:
-+@ r2 req
-+@ r7 req & avail
-+@ r3 L + stride
-+@ r8 DL + stride * 2
-+@ r9 stride * 2
-+@ cs Load U
-+@ mi Load UR
-+@
-+@ Clobbered:
-+@ r12
-+
-+.macro load_pointers pw_s, log2_s, sp_offset, d_type, d_l, d_ul, d_u, d_ur
-+
-+.equ src_l\@, \sp_offset + 0 @ stack offsets of the pointer args after the caller's push
-+.equ src_u\@, \sp_offset + 4
-+.equ src_ur\@, \sp_offset + 8
-+.equ stride\@, \sp_offset + 12
-+.equ pw\@, (1 << \pw_s) @ pel width in bytes
-+.equ b_size\@, (1 << (\pw_s + \log2_s)) @ size in bytes
-+
-+@ r9 stride
-+@ r7 = ab_ul, r6 = a_u, r5 = a_ur
-+@ r4 = b_dl, r10 = b_l, r8 = b_u
-+
-+ ldr r5, [sp, #src_ur\@] @ r5 = src_ur
-+ lsl r12, r3, #AVAIL_S_U_DL_CPSR @ avail bits 4..0 -> bits 31..27
-+ ldr r10, [sp, #src_l\@] @ r10 = src_l
-+ ldr r9, [sp, #stride\@] @ r9 = stride
-+ ldr r6, [sp, #src_u\@] @ r6 = src_u
-+
-+ @ This is quite a slow instruction but it replaces
-+ @ a decent number of tests that yield a max of 2 flags/op
-+ @ It is annoying we can't branch on Q!
-+ @ If L navail (ne) then DL must be navail (pl)
-+ msr APSR_nzcvq, r12 @ n=dl, z=l, c=ul, v=u, q=ur
-+
-+ mov r4, r5 @ fallback source candidate: src_ur
-+ sub r7, r10, r9 @ r7 = src_l - stride (UL address)
-+ it vs
-+ movvs r4, r6 @ U avail: candidate = src_u
-+ add r8, r6, #b_size\@ - pw\@ @ r8 = last pel of U
-+ it cs
-+ movcs r4, r7 @ UL avail: candidate = UL address
-+ ite ne
-+ movne r10, r4 @ L not avail: r10 = candidate
-+ addeq r4, r7, r9, lsl #\log2_s @ L avail: r4 = r7 + (stride << log2_s)
-+ it cc
-+ movcc r7, r10 @ UL not avail: take UL value from r10
-+ it mi
-+ addmi r4, r10, r9, lsl #\log2_s @ DL avail: r4 = r10 + (stride << log2_s)
-+ vld1.\d_type {\d_ul}, [r7]
-+ itt vc
-+ movvc r8, r7 @ U not avail: U and its last pel both come from r7
-+ movvc r6, r7
-+ vld1.\d_type {\d_l }, [r4], r9 @ load extend value, then r4 += stride
-+ tst r3, #AVAIL_UR
-+ vld1.\d_type {\d_u }, [r6]
-+ it eq
-+ moveq r5, r8 @ UR not avail: take UR value from r8
-+ and r7, r2, r3 @ r7 = req & avail
-+ add r8, r4, r9 @ r8 = r4 + stride
-+ vld1.\d_type {\d_ur}, [r5]
-+ lsls r12, r7, #AVAIL_S_UR_N_U_C @ N = UR, C = U (of req & avail)
-+ add r3, r10, r9 @ r3 = r10 + stride
-+ lsl r9, #1 @ r9 = stride * 2
-+.endm
-+
-+
-+
-+@ int ff_hevc_rpi_intra_filter_4_neon_8(
-+@ pixel * const left, [r0]
-+@ pixel * const top, [r1]
-+@ const unsigned int req, [r2]
-+@ const unsigned int avail, [r3]
-+@ const pixel * const src_l, [sp, #0]
-+@ const pixel * const src_u, [sp, #4]
-+@ const pixel * const src_ur, [sp, #8]
-+@ const unsigned int stride, [sp, #12] (pels)
-+@ const unsigned int top_right_size, [sp, #16]
-+@ const unsigned int down_left_size) [sp, #20]
-+
-+.set sp_base, 8*4 @ bytes pushed on entry (8 regs); offset to stack args
-+.set pw_s, 0 @ log2 of pel width in bytes
-+.set pw, (1 << pw_s)
-+.set log2_s, 2 @ log2 of block size in pels
-+ @ Gather left/up-left/up edge pels for a 4x4 block of 1-byte pels
-+function ff_hevc_rpi_intra_filter_4_neon_8, export=1
-+ push {r4-r10, lr}
-+ load_pointers pw_s, log2_s, sp_base, 8, d0[], d31[7], d1[], d2[]
-+
-+ it cs
-+ vldrcs s2, [r6] @ U wanted & available: load 4 pels
-+ ite pl
-+ vmovpl s3, s4 @ no UR: use the replicated value from d2
-+ vldrmi s3, [r5] @ UR wanted & available: load 4 pels
-+
-+ lsls r7, #AVAIL_S_L_N_DL_C @ N = L, C = DL (of req & avail)
-+ add r12, r0, #-pw @ r12 = one pel before left[]
-+ bpl 1f
-+
-+ vld1.8 {d0[0]}, [r10], r9 @ L: one pel per row, r10/r3 alternate rows
-+ vld1.8 {d0[1]}, [r3], r9
-+ vld1.8 {d0[2]}, [r10]
-+ vld1.8 {d0[3]}, [r3]
-+1:
-+ bcc 1f
-+ vld1.8 {d0[5]}, [r4], r9 @ DL rows 1..3; d0[4] already loaded by load_pointers
-+ vld1.8 {d0[6]}, [r8]
-+ vld1.8 {d0[7]}, [r4]
-+1:
-+ vstr d1, [r1] @ Up
-+ vst1.8 {d31[7]}, [r12] @ Up-left
-+ vstr d0, [r0] @ Left
-+ pop {r4-r10, pc}
-+endfunc
-+
-+
-+@ int ff_hevc_rpi_intra_filter_4_neon_16(
-+@ pixel * const left, [r0]
-+@ pixel * const top, [r1]
-+@ const unsigned int req, [r2]
-+@ const unsigned int avail, [r3]
-+@ const pixel * const src_l, [sp, #0]
-+@ const pixel * const src_u, [sp, #4]
-+@ const pixel * const src_ur, [sp, #8]
-+@ const unsigned int stride, [sp, #12] (pels)
-+@ const unsigned int top_right_size, [sp, #16]
-+@ const unsigned int down_left_size) [sp, #20]
-+
-+.set sp_base, 8*4 @ bytes pushed on entry (8 regs); offset to stack args
-+.set pw_s, 1 @ log2 of pel width in bytes
-+.set pw, (1 << pw_s)
-+.set log2_s, 2 @ log2 of block size in pels
-+ @ Gather left/up-left/up edge pels for a 4x4 block of 2-byte pels
-+function ff_hevc_rpi_intra_filter_4_neon_16, export=1
-+ push {r4-r10, lr}
-+ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], d2[], d3[]
-+
-+ it cs
-+ vldrcs d2, [r6] @ U wanted & available: load 4 pels
-+ it mi
-+ vldrmi d3, [r5] @ UR wanted & available: load 4 pels
-+ lsls r7, #AVAIL_S_L_N_DL_C @ N = L, C = DL (of req & avail)
-+ add r12, r0, #-pw @ r12 = one pel before left[]
-+ bpl 1f
-+ vld1.16 {d0[0]}, [r10], r9 @ L: one pel per row, r10/r3 alternate rows
-+ vld1.16 {d0[1]}, [r3], r9
-+ vld1.16 {d0[2]}, [r10]
-+ vld1.16 {d0[3]}, [r3]
-+1:
-+ bcc 1f
-+ vld1.16 {d1[1]}, [r4], r9 @ DL rows 1..3; d1[0] already loaded by load_pointers
-+ vld1.16 {d1[2]}, [r8]
-+ vld1.16 {d1[3]}, [r4]
-+1:
-+ vst1.16 {q1}, [r1] @ Up
-+ vst1.16 {d31[3]}, [r12] @ Up-left
-+ vst1.16 {q0}, [r0] @ Left
-+ pop {r4-r10, pc}
-+endfunc
-+
-+
-+@ int ff_hevc_rpi_intra_filter_8_neon_8(
-+@ pixel * const left, [r0]
-+@ pixel * const top, [r1]
-+@ const unsigned int req, [r2]
-+@ const unsigned int avail, [r3]
-+@ const pixel * const src_l, [sp, #0]
-+@ const pixel * const src_u, [sp, #4]
-+@ const pixel * const src_ur, [sp, #8]
-+@ const unsigned int stride, [sp, #12] (pels)
-+@ const unsigned int top_right_size, [sp, #16]
-+@ const unsigned int down_left_size) [sp, #20]
-+
-+.set sp_base, 8*4 @ bytes pushed on entry (8 regs); offset to stack args
-+.set pw_s, 0 @ log2 of pel width in bytes
-+.set pw, (1 << pw_s)
-+.set log2_s, 3 @ log2 of block size in pels
-+ @ Gather (and optionally light-filter) edge pels for an 8x8 block of 1-byte pels
-+function ff_hevc_rpi_intra_filter_8_neon_8, export=1
-+ push {r4-r10, lr}
-+ load_pointers pw_s, log2_s, sp_base, 8, "d0[],d1[]", d31[7], d4[], d5[]
-+
-+ it cs
-+ vldrcs d4, [r6] @ U wanted & available: load 8 pels
-+ it mi
-+ vldrmi d5, [r5] @ UR wanted & available: load 8 pels
-+
-+ lsls r7, #AVAIL_S_L_N_DL_C @ N = L, C = DL (of req & avail)
-+ bpl 1f
-+ vld1.8 {d0[0]}, [r10], r9 @ L: one pel per row, r10/r3 alternate rows
-+ vld1.8 {d0[1]}, [r3], r9
-+ vld1.8 {d0[2]}, [r10], r9
-+ vld1.8 {d0[3]}, [r3], r9
-+ vld1.8 {d0[4]}, [r10], r9
-+ vld1.8 {d0[5]}, [r3], r9
-+ vld1.8 {d0[6]}, [r10]
-+ vld1.8 {d0[7]}, [r3]
-+1:
-+ bcc 1f
-+ vld1.8 {d1[1]}, [r4], r9 @ DL rows 1..7; d1[0] already loaded by load_pointers
-+ vld1.8 {d1[2]}, [r8], r9
-+ vld1.8 {d1[3]}, [r4], r9
-+ vld1.8 {d1[4]}, [r8], r9
-+ vld1.8 {d1[5]}, [r4], r9
-+ vld1.8 {d1[6]}, [r8]
-+ vld1.8 {d1[7]}, [r4]
-+1:
-+ tst r2, #FILTER_LIGHT
-+ add r12, r0, #-pw @ r12 = one pel before left[]
-+ beq 10f
-+
-+ @ Luma light filter
-+ vext.8 q8, q15, q2, #15 @ q8 = UL, up[0..14] (previous pel of each up pel)
-+ vext.8 q12, q15, q0, #15 @ q12 = UL, left[0..14]
-+ vaddl.u8 q9, d17, d5 @ q8, q9 = up[i-1] + up[i]
-+ vaddl.u8 q8, d16, d4
-+ vaddl.u8 q13, d25, d1 @ q12, q13 = left[i-1] + left[i]
-+ vaddl.u8 q12, d24, d0
-+ vmov.u8 r3, d5[7] @ Save final pel
-+ vmov.u8 r2, d1[7] @ Save final pel
-+
-+ vext.16 q2, q8, q9, #1 @ shift sums down one so adding gives p[i-1] + 2p[i] + p[i+1]
-+ vext.16 q3, q9, q9, #1
-+ vext.16 q0, q12, q13, #1
-+ vext.16 q1, q13, q13, #1
-+ vadd.u16 d30, d16, d24 @ d30[0] = l[0] + 2ul + u[0]
-+ vadd.u16 q2, q8
-+ vadd.u16 q3, q9
-+ vadd.u16 q0, q12
-+ vadd.u16 q1, q13
-+
-+ vrshrn.u16 d4, q2, #2 @ (sum + 2) >> 2, narrowed back to bytes
-+ vrshrn.u16 d5, q3, #2
-+ vrshrn.u16 d0, q0, #2
-+ vrshrn.u16 d1, q1, #2
-+ vrshr.u16 d30, #2
-+ vmov.u8 d5[7], r3 @ Restore final pel
-+ vmov.u8 d1[7], r2 @ Restore final pel
-+ vdup.u8 d31, d30[0] @ d31[7] = d30[0] (every byte of d31 is set)
-+
-+10:
-+ vst1.8 {q2 }, [r1] @ Up
-+ vst1.8 {d31[7]}, [r12] @ Up-left
-+ vst1.8 {q0 }, [r0] @ Left
-+ pop {r4-r10, pc}
-+endfunc
-+
-+
-+@ int ff_hevc_rpi_intra_filter_8_neon_16(
-+@ pixel * const left, [r0]
-+@ pixel * const top, [r1]
-+@ const unsigned int req, [r2]
-+@ const unsigned int avail, [r3]
-+@ const pixel * const src_l, [sp, #0]
-+@ const pixel * const src_u, [sp, #4]
-+@ const pixel * const src_ur, [sp, #8]
-+@ const unsigned int stride, [sp, #12] (pels)
-+@ const unsigned int top_right_size, [sp, #16]
-+@ const unsigned int down_left_size) [sp, #20]
-+
-+.set sp_base, 8*4 @ bytes pushed on entry (8 regs); offset to stack args
-+.set ur_size, sp_base + 16 @ stack offset of top_right_size
-+.set dl_size, sp_base + 20 @ stack offset of down_left_size
-+.set pw_s, 1 @ log2 of pel width in bytes
-+.set pw, (1 << pw_s)
-+.set log2_s, 3 @ log2 of block size in pels
-+.set p_size, (1 << log2_s) @ size in pels
-+ @ Gather (and optionally light-filter) edge pels for an 8x8 block of 2-byte pels
-+function ff_hevc_rpi_intra_filter_8_neon_16, export=1
-+ push {r4-r10, lr}
-+ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], "d4[],d5[]", "d6[],d7[]"
-+
-+ it cs
-+ vldmcs r6, {d4, d5} @ U wanted & available: load 8 pels
-+ ldr r12, [sp, #ur_size]
-+ bpl 1f @ N still from load_pointers: skip if no UR
-+ cmp r12, #4
-+ vldm r5, {d6, d7}
-+ bgt 1f
-+ vdup.16 d7, d6[3] @ ur_size <= 4: replicate the last valid pel
-+1:
-+ lsls r12, r7, #AVAIL_S_L_N_DL_C @ N = L, C = DL (of req & avail)
-+ vdup.16 q1, d0[0] @ default DL half = replicated extend value
-+ bpl 1f
-+ vld1.16 {d0[0]}, [r10], r9 @ L: one pel per row, r10/r3 alternate rows
-+ vld1.16 {d0[1]}, [r3], r9
-+ vld1.16 {d0[2]}, [r10], r9
-+ vld1.16 {d0[3]}, [r3], r9
-+ vld1.16 {d1[0]}, [r10], r9
-+ vld1.16 {d1[1]}, [r3], r9
-+ vld1.16 {d1[2]}, [r10]
-+ vld1.16 {d1[3]}, [r3]
-+1:
-+ bcc 1f
-+ ldr r12, [sp, #dl_size]
-+ vld1.16 {d2[1]}, [r4], r9 @ DL rows 1..; d2[0] came from the vdup above
-+ cmp r12, #p_size
-+ vld1.16 {d2[2]}, [r8], r9
-+ vld1.16 {d2[3]}, [r4], r9
-+ blt 2f @ dl_size < 8: only the first 4 rows are loaded
-+ vld1.16 {d3[0]}, [r8], r9
-+ vld1.16 {d3[1]}, [r4], r9
-+ vld1.16 {d3[2]}, [r8]
-+ vld1.16 {d3[3]}, [r4]
-+ b 1f
-+2:
-+ vdup.16 d3, d2[3] @ replicate the last loaded DL pel
-+1:
-+ tst r2, #FILTER_LIGHT
-+ add r12, r0, #-pw @ r12 = one pel before left[]
-+ beq 10f
-+
-+ @ Luma light filter
-+ vext.16 q9, q2, q3, #7 @ q8, q9 = UL, up[0..14] (previous pel of each up pel)
-+ vext.16 q8, q15, q2, #7
-+ vext.16 q13, q0, q1, #7 @ q12, q13 = UL, left[0..14]
-+ vext.16 q12, q15, q0, #7
-+ vadd.u16 q9, q3 @ q8, q9 = up[i-1] + up[i]
-+ vadd.u16 q8, q2
-+ vadd.u16 q13, q1 @ q12, q13 = left[i-1] + left[i]
-+ vadd.u16 q12, q0
-+ vmov.u16 r3, d7[3] @ Save final pel
-+ vmov.u16 r2, d3[3] @ Save final pel
-+
-+ vext.16 q2, q8, q9, #1 @ shift sums down one so adding gives p[i-1] + 2p[i] + p[i+1]
-+ vext.16 q3, q9, q9, #1
-+ vext.16 q0, q12, q13, #1
-+ vext.16 q1, q13, q13, #1
-+ vadd.u16 d30, d16, d24 @ d30[0] = l[0] + 2ul + u[0]
-+ vadd.u16 q2, q8
-+ vadd.u16 q3, q9
-+ vadd.u16 q0, q12
-+ vadd.u16 q1, q13
-+
-+ vrshr.u16 q2, #2 @ (sum + 2) >> 2
-+ vrshr.u16 q3, #2
-+ vrshr.u16 q0, #2
-+ vrshr.u16 q1, #2
-+ vrshr.u16 d30, #2
-+ vmov.u16 d7[3], r3 @ Restore final pel
-+ vmov.u16 d3[3], r2 @ Restore final pel
-+ vdup.u16 d31, d30[0] @ d31[3] = d30[0]
-+
-+10:
-+ vst1.16 {q2, q3}, [r1] @ Up
-+ vst1.16 {d31[3]}, [r12] @ Up-left
-+ vst1.16 {q0, q1}, [r0] @ Left
-+ pop {r4-r10, pc}
-+endfunc
-+
-+@ int ff_hevc_rpi_intra_filter_16_neon_16(
-+@ pixel * const left, [r0]
-+@ pixel * const top, [r1]
-+@ const unsigned int req, [r2]
-+@ const unsigned int avail, [r3]
-+@ const pixel * const src_l, [sp, #0]
-+@ const pixel * const src_u, [sp, #4]
-+@ const pixel * const src_ur, [sp, #8]
-+@ const unsigned int stride, [sp, #12] (pels)
-+@ const unsigned int top_right_size, [sp, #16]
-+@ const unsigned int down_left_size) [sp, #20]
-+
-+.set sp_base, 8*4 @ bytes pushed on entry (8 regs); offset to stack args
-+.set ur_size, sp_base + 16 @ stack offset of top_right_size
-+.set dl_size, sp_base + 20 @ stack offset of down_left_size
-+.set pw_s, 1 @ log2 of pel width in bytes
-+.set pw, (1 << pw_s)
-+.set log2_s, 4 @ log2 of block size in pels
-+.set p_size, (1 << log2_s) @ size in pels
-+ @ Gather (and optionally light-filter) edge pels for a 16x16 block of 2-byte pels
-+function ff_hevc_rpi_intra_filter_16_neon_16, export=1
-+ push {r4-r10, lr}
-+ load_pointers pw_s, log2_s, sp_base, 16, "d0[],d1[]", d31[3], "d16[],d17[]", "d20[],d21[]"
-+
-+ vdup.16 q9, d16[0] @ extend replicated U value to all 16 pels
-+ vdup.16 q11, d20[0] @ extend replicated UR value to all 16 pels
-+
-+ it cs
-+ vldmcs r6, {d16-d19} @ U wanted & available: load 16 pels
-+ ldr r12, [sp, #ur_size]
-+ bpl 1f @ N still from load_pointers: skip if no UR
-+ cmp r12, #12
-+ @ Given chroma frame layout, if UR exists then it is always legit to
-+ @ load all of it even if most of it is outside the frame.
-+ vldm r5, {d20-d23}
-+ bgt 1f @ ur_size > 12: all 16 pels valid
-+ bge 4f @ ur_size == 12: only the last 4 need replicating
-+ cmp r12, #8
-+ bge 3f
-+ vdup.16 d21, d20[3] @ each step replicates the last pel of the previous 4
-+3: vdup.16 d22, d21[3]
-+4: vdup.16 d23, d22[3]
-+
-+1:
-+ lsls r7, #AVAIL_S_L_N_DL_C @ N = L, C = DL (of req & avail)
-+ ldr r12, [sp, #dl_size]
-+ vdup.16 q1, d0[0] @ default rest of L/DL = replicated extend value
-+ vdup.16 q2, d0[0]
-+ vdup.16 q3, d0[0]
-+ bpl 1f
-+ vld1.16 {d0[0]}, [r10], r9 @ L: one pel per row, r10/r3 alternate rows
-+ vld1.16 {d0[1]}, [r3], r9
-+ vld1.16 {d0[2]}, [r10], r9
-+ vld1.16 {d0[3]}, [r3], r9
-+ vld1.16 {d1[0]}, [r10], r9
-+ vld1.16 {d1[1]}, [r3], r9
-+ vld1.16 {d1[2]}, [r10], r9
-+ vld1.16 {d1[3]}, [r3], r9
-+ vld1.16 {d2[0]}, [r10], r9
-+ vld1.16 {d2[1]}, [r3], r9
-+ vld1.16 {d2[2]}, [r10], r9
-+ vld1.16 {d2[3]}, [r3], r9
-+ vld1.16 {d3[0]}, [r10], r9
-+ vld1.16 {d3[1]}, [r3], r9
-+ vld1.16 {d3[2]}, [r10]
-+ vld1.16 {d3[3]}, [r3]
-+1:
-+ bcc 1f
-+ vld1.16 {d4[1]}, [r4], r9 @ DL rows 1..; d4[0] came from the vdup above
-+ cmp r12, #4
-+ vld1.16 {d4[2]}, [r8], r9
-+ vld1.16 {d4[3]}, [r4], r9
-+ ble 2f @ dl_size <= 4
-+ vld1.16 {d5[0]}, [r8], r9
-+ vld1.16 {d5[1]}, [r4], r9
-+ cmp r12, #12
-+ vld1.16 {d5[2]}, [r8], r9
-+ vld1.16 {d5[3]}, [r4], r9
-+ blt 3f @ dl_size < 12
-+ vld1.16 {d6[0]}, [r8], r9
-+ vld1.16 {d6[1]}, [r4], r9
-+ vld1.16 {d6[2]}, [r8], r9
-+ vld1.16 {d6[3]}, [r4], r9
-+ ble 4f @ dl_size == 12 (flags still from cmp #12)
-+ vld1.16 {d7[0]}, [r8], r9
-+ vld1.16 {d7[1]}, [r4], r9
-+ vld1.16 {d7[2]}, [r8]
-+ vld1.16 {d7[3]}, [r4]
-+ b 1f
-+2: vdup.16 d5, d4[3] @ each step replicates the last pel of the previous 4
-+3: vdup.16 d6, d5[3]
-+4: vdup.16 d7, d6[3]
-+1:
-+ tst r2, #FILTER_LIGHT
-+ add r12, r0, #-pw @ r12 = one pel before left[]
-+ beq 10f
-+
-+ vpush {q5} @ q5 is used as scratch below and restored by the vpop
-+ @ Luma light filter
-+ @ Left
-+ vext.16 q5, q2, q3, #7 @ q12,q13,q14,q5 = UL, left[0..30] (previous pels)
-+ vext.16 q14, q1, q2, #7
-+ vext.16 q13, q0, q1, #7
-+ vext.16 q12, q15, q0, #7
-+
-+ vadd.u16 q5, q3 @ pairwise sums left[i-1] + left[i]
-+ vadd.u16 q14, q2
-+ vadd.u16 q13, q1
-+ vadd.u16 q12, q0
-+ vmov.u16 r2, d7[3] @ Save final pel
-+
-+ vext.16 q0, q12, q13, #1 @ shift sums down one so adding gives p[i-1] + 2p[i] + p[i+1]
-+ vext.16 q1, q13, q14, #1
-+ vext.16 q2, q14, q5, #1
-+ vext.16 q3, q5, q5, #1
-+
-+ vmov d30, d24 @ d30[0] = l[0] + ul
-+ vadd.u16 q0, q12
-+ vadd.u16 q1, q13
-+ vadd.u16 q2, q14
-+ vadd.u16 q3, q5
-+
-+ vrshr.u16 q0, #2 @ (sum + 2) >> 2
-+ vrshr.u16 q1, #2
-+ vrshr.u16 q2, #2
-+ vrshr.u16 q3, #2
-+
-+ @ Up
-+ vext.16 q5, q10, q11, #7 @ q12,q13,q14,q5 = UL, up[0..30] (previous pels)
-+ vext.16 q14, q9, q10, #7
-+ vext.16 q13, q8, q9, #7
-+ vext.16 q12, q15, q8, #7
-+
-+ vadd.u16 q5, q11 @ pairwise sums up[i-1] + up[i]
-+ vadd.u16 q14, q10
-+ vadd.u16 q13, q9
-+ vadd.u16 q12, q8
-+ vmov.u16 r3, d23[3] @ Save final pel
-+
-+ vext.16 q8, q12, q13, #1
-+ vext.16 q9, q13, q14, #1
-+ vext.16 q10, q14, q5, #1
-+ vext.16 q11, q5, q5, #1
-+
-+ vadd.u16 d30, d24 @ d30[0] = l[0] + 2ul + u[0]
-+ vadd.u16 q8, q12
-+ vadd.u16 q9, q13
-+ vadd.u16 q10, q14
-+ vadd.u16 q11, q5
-+
-+ vrshr.u16 q8, #2 @ (sum + 2) >> 2
-+ vrshr.u16 q9, #2
-+ vrshr.u16 q10, #2
-+ vrshr.u16 q11, #2
-+
-+ @ Misc
-+ vrshr.u16 d30, #2
-+ vmov.u16 d7[3], r2 @ Restore final pel
-+ vmov.u16 d23[3], r3 @ Restore final pel
-+ vdup.u16 d31, d30[0] @ d31[3] = d30[0]
-+ vpop {q5}
-+
-+10:
-+ vstm r1, {d16-d23} @ Up
-+ vst1.16 {d31[3]}, [r12] @ Up-left
-+ vstm r0, { d0-d7 } @ Left
-+ pop {r4-r10, pc}
-+endfunc
-+
-+@ int ff_hevc_rpi_intra_filter_4_neon_32(
-+@ pixel * const left, [r0]
-+@ pixel * const top, [r1]
-+@ const unsigned int req, [r2]
-+@ const unsigned int avail, [r3]
-+@ const pixel * const src_l, [sp, #0]
-+@ const pixel * const src_u, [sp, #4]
-+@ const pixel * const src_ur, [sp, #8]
-+@ const unsigned int stride, [sp, #12] (pels)
-+@ const unsigned int top_right_size, [sp, #16]
-+@ const unsigned int down_left_size) [sp, #20]
-+
-+.set sp_base, 8*4 @ bytes pushed on entry (8 regs); offset to stack args
-+.set pw_s, 2 @ log2 of pel width in bytes
-+.set pw, (1 << pw_s)
-+.set log2_s, 2 @ log2 of block size in pels
-+ @ Gather left/up-left/up edge pels for a 4x4 block of 4-byte pels (no filtering)
-+function ff_hevc_rpi_intra_filter_4_neon_32, export=1
-+ push {r4-r10, lr}
-+ load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d31[1], "d4[],d5[]", "d6[],d7[]"
-+
-+ it cs
-+ vldmcs r6, {d4, d5} @ U wanted & available: load 4 pels
-+ it mi
-+ vldmmi r5, {d6, d7} @ UR wanted & available: load 4 pels
-+ lsls r7, #AVAIL_S_L_N_DL_C @ N = L, C = DL (of req & avail)
-+ vdup.32 q1, d0[0] @ default DL half = replicated extend value
-+ add r12, r0, #-pw @ r12 = one pel before left[]
-+ bpl 1f
-+ vld1.32 {d0[0]}, [r10], r9 @ L: one pel per row, r10/r3 alternate rows
-+ vld1.32 {d0[1]}, [r3], r9
-+ vld1.32 {d1[0]}, [r10]
-+ vld1.32 {d1[1]}, [r3]
-+1:
-+ bcc 1f
-+ vld1.32 {d2[1]}, [r4], r9 @ DL rows 1..3; d2[0] came from the vdup above
-+ vld1.32 {d3[0]}, [r8]
-+ vld1.32 {d3[1]}, [r4]
-+1:
-+ vst1.32 {q2, q3 }, [r1] @ Up
-+ vst1.32 {d31[1]}, [r12] @ Up-left
-+ vst1.32 {q0, q1 }, [r0] @ Left
-+ pop {r4-r10, pc}
-+endfunc
-+
-+
-+@ int ff_hevc_rpi_intra_filter_8_neon_32(
-+@ pixel * const left, [r0]
-+@ pixel * const top, [r1]
-+@ const unsigned int req, [r2]
-+@ const unsigned int avail, [r3]
-+@ const pixel * const src_l, [sp, #0]
-+@ const pixel * const src_u, [sp, #4]
-+@ const pixel * const src_ur, [sp, #8]
-+@ const unsigned int stride, [sp, #12] (pels)
-+@ const unsigned int top_right_size, [sp, #16]
-+@ const unsigned int down_left_size) [sp, #20]
-+
-+.set sp_base, 8*4 @ bytes pushed on entry (8 regs); offset to stack args
-+.set ur_size, sp_base + 16 @ stack offset of top_right_size
-+.set dl_size, sp_base + 20 @ stack offset of down_left_size
-+.set pw_s, 2 @ log2 of pel width in bytes
-+.set pw, (1 << pw_s)
-+.set log2_s, 3 @ log2 of block size in pels
-+.set p_size, (1 << log2_s) @ size in pels
-+ @ Gather left/up-left/up edge pels for an 8x8 block of 4-byte pels (no filtering)
-+function ff_hevc_rpi_intra_filter_8_neon_32, export=1
-+ push {r4-r10, lr}
-+ load_pointers pw_s, log2_s, sp_base, 32, "d0[],d1[]", d31[1], "d16[],d17[]", "d20[],d21[]"
-+
-+ vdup.32 q9, d16[0] @ extend replicated U value to all 8 pels
-+ vdup.32 q11, d20[0] @ extend replicated UR value to all 8 pels
-+
-+ it cs
-+ vldmcs r6, {q8, q9 }
-+ ldr r12, [sp, #ur_size]
-+ bpl 1f @ N still from load_pointers: skip if no UR
-+ cmp r12, #p_size
-+ vldm r5, {q10, q11}
-+ bge 1f
-+ vdup.32 q11, d21[1] @ ur_size < 8: replicate the last pel of the first 4
-+1:
-+ lsls r7, #AVAIL_S_L_N_DL_C @ N = L, C = DL (of req & avail)
-+ vdup.32 q1, d0[0] @ default rest of L/DL = replicated extend value
-+ vdup.32 q2, d0[0]
-+ vdup.32 q3, d0[0]
-+ bpl 1f
-+ vld1.32 {d0[0]}, [r10], r9 @ L: one pel per row, r10/r3 alternate rows
-+ vld1.32 {d0[1]}, [r3], r9
-+ vld1.32 {d1[0]}, [r10], r9
-+ vld1.32 {d1[1]}, [r3], r9
-+ vld1.32 {d2[0]}, [r10], r9
-+ vld1.32 {d2[1]}, [r3], r9
-+ vld1.32 {d3[0]}, [r10]
-+ vld1.32 {d3[1]}, [r3]
-+1:
-+ bcc 1f
-+ ldr r12, [sp, #dl_size]
-+ vld1.32 {d4[1]}, [r4], r9 @ DL rows 1..; d4[0] came from the vdup above
-+ cmp r12, #p_size
-+ vld1.32 {d5[0]}, [r8], r9
-+ vld1.32 {d5[1]}, [r4], r9
-+ blt 2f @ dl_size < 8: only the first 4 rows are loaded
-+ vld1.32 {d6[0]}, [r8], r9
-+ vld1.32 {d6[1]}, [r4], r9
-+ vld1.32 {d7[0]}, [r8]
-+ vld1.32 {d7[1]}, [r4]
-+ b 1f
-+2:
-+ vdup.32 q3, d5[1] @ replicate the last loaded DL pel
-+1:
-+ add r12, r0, #-pw @ r12 = one pel before left[]
-+ vstm r1, { q8-q11} @ Up
-+ vst1.32 {d31[1]}, [r12] @ Up-left
-+ vstm r0, { q0-q3 } @ Left
-+ pop {r4-r10, pc}
-+endfunc
-+
-+
-+@ int ff_hevc_rpi_intra_filter_16_neon_32(
-+@ pixel * const left, [r0]
-+@ pixel * const top, [r1]
-+@ const unsigned int req, [r2]
-+@ const unsigned int avail, [r3]
-+@ const pixel * const src_l, [sp, #0]
-+@ const pixel * const src_u, [sp, #4]
-+@ const pixel * const src_ur, [sp, #8]
-+@ const unsigned int stride, [sp, #12] (pels)
-+@ const unsigned int top_right_size, [sp, #16]
-+@ const unsigned int down_left_size) [sp, #20]
-+
-+@ Stack/size constants for the 16-pel, 32-bit-per-pel variant below.
-+.set sp_base, 8*4 @ bytes pushed by "push {r4-r10, lr}" (8 regs * 4)
-+.set ur_size, sp_base + 16 @ stack offset of top_right_size after the push
-+.set dl_size, sp_base + 20 @ stack offset of down_left_size after the push
-+.set pw_s, 2
-+.set pw, (1 << pw_s) @ bytes per pel
-+.set log2_s, 4
-+.set p_size, (1 << log2_s) @ size in pels
-+
-+@ Builds the Up (32 pels at r1), Left (32 pels at r0) and corner (one pel
-+@ just before r0) reference buffers for a 16-pel block of 32-bit pels.
-+@ Each q register holds 4 pels, so a partially available up-right or
-+@ down-left run (4, 8 or 12 pels) is padded by replicating the last valid
-+@ 32-bit pel into the remaining q registers.
-+function ff_hevc_rpi_intra_filter_16_neon_32, export=1
-+ push {r4-r10, lr}
-+ load_pointers pw_s, log2_s, sp_base, 32, d30[0], d30[1], d31[0], d31[1]
-+ @ NOTE(review): load_pointers is defined outside this view. The code below
-+ @ relies on it leaving source pointers in r3-r6/r8/r10, the row step in r9,
-+ @ availability bits in r7 and the C/N flags set - confirm against the macro.
-+
-+ @ Once we get this big we have run out of neon regs to store
-+ @ everything at once so do in pieces
-+
-+ @ Up (have)
-+ it cs
-+ vldmcs r6, { q0-q3 }
-+ ldr r12, [sp, #ur_size]
-+ it mi
-+ vldmmi r5, { q8-q11}
-+ it cs
-+ vstmcs r1, { q0-q3 }
-+ bpl 1f
-+ @ Pad the up-right run. Pels are 32 bits wide, so the replication must be
-+ @ vdup.32 into q registers (as in the down-left path below); d10 is
-+ @ callee-saved and is never stored, so it must not be written here.
-+ cmp r12, #12
-+ add lr, r1, #(pw << log2_s)
-+ bgt 2f @ 16 pels valid: store as loaded
-+ beq 3f @ 12 pels valid: pad q11 only
-+ cmp r12, #8
-+ bge 4f @ 8 pels valid: pad q10-q11
-+ vdup.32 q9, d17[1] @ 4 pels valid: pad q9-q11
-+4: vdup.32 q10, d19[1]
-+3: vdup.32 q11, d21[1]
-+2: vstm lr, { q8-q11}
-+1:
-+
-+ @ Left (have)
-+ add lr, r0, #-pw
-+ lsls r12, r7, #AVAIL_S_L_N_DL_C @ new N/C flags gate the two sections below
-+ vst1.32 {d30[1]}, [lr] @ UL
-+ bpl 1f
-+ vld1.32 { d0[0]}, [r10], r9 @ one pel per row, alternating r10 / r3
-+ vld1.32 { d0[1]}, [r3], r9
-+ vld1.32 { d1[0]}, [r10], r9
-+ vld1.32 { d1[1]}, [r3], r9
-+ vld1.32 { d2[0]}, [r10], r9
-+ vld1.32 { d2[1]}, [r3], r9
-+ vld1.32 { d3[0]}, [r10], r9
-+ vld1.32 { d3[1]}, [r3], r9
-+ vld1.32 { d4[0]}, [r10], r9
-+ vld1.32 { d4[1]}, [r3], r9
-+ vld1.32 { d5[0]}, [r10], r9
-+ vld1.32 { d5[1]}, [r3], r9
-+ vld1.32 { d6[0]}, [r10], r9
-+ vld1.32 { d6[1]}, [r3], r9
-+ vld1.32 { d7[0]}, [r10]
-+ vld1.32 { d7[1]}, [r3]
-+ vstm r0, { q0-q3 }
-+1:
-+ bcc 1f
-+ ldr r12, [sp, #dl_size]
-+ vdup.32 d16, d30[0] @ d16[0] = d30[0]
-+ add lr, r0, #(pw << log2_s)
-+ vld1.32 {d16[1]}, [r4], r9
-+ cmp r12, #4
-+ vld1.32 {d17[0]}, [r8], r9
-+ vld1.32 {d17[1]}, [r4], r9
-+ ble 2f @ 4 pels valid: pad q9-q11
-+ vld1.32 {d18[0]}, [r8], r9
-+ vld1.32 {d18[1]}, [r4], r9
-+ cmp r12, #12
-+ vld1.32 {d19[0]}, [r8], r9
-+ vld1.32 {d19[1]}, [r4], r9
-+ blt 3f @ 8 pels valid: pad q10-q11
-+ vld1.32 {d20[0]}, [r8], r9
-+ vld1.32 {d20[1]}, [r4], r9
-+ vld1.32 {d21[0]}, [r8], r9
-+ vld1.32 {d21[1]}, [r4], r9
-+ ble 4f @ 12 pels valid: pad q11 only
-+ vld1.32 {d22[0]}, [r8], r9
-+ vld1.32 {d22[1]}, [r4], r9
-+ vld1.32 {d23[0]}, [r8]
-+ vld1.32 {d23[1]}, [r4]
-+ b 5f
-+2: vdup.32 q9, d17[1]
-+3: vdup.32 q10, d19[1]
-+4: vdup.32 q11, d21[1]
-+5: vstm lr, { q8-q11}
-+1:
-+ eors r7, r2 @ r7 ^ r2 == 0: no further fill needed, exit
-+ beq 99f
-+
-+ @ Fill sections selected by the remaining bits with replicated pels
-+ lsls r12, r7, #AVAIL_S_UR_N_U_C
-+ vdup.32 q0, d31[0]
-+ vdup.32 q1, d31[0]
-+ vdup.32 q2, d31[0]
-+ vdup.32 q3, d31[0]
-+ add lr, r1, #(pw << log2_s)
-+ vdup.32 q8, d31[1]
-+ vdup.32 q9, d31[1]
-+ vdup.32 q10, d31[1]
-+ vdup.32 q11, d31[1]
-+ it cs
-+ vstmcs r1, { q0-q3 }
-+ it mi
-+ vstmmi lr, { q8-q11}
-+
-+ lsls r7, #AVAIL_S_L_N_DL_C
-+ vdup.32 q0, d30[0]
-+ vdup.32 q1, d30[0]
-+ vdup.32 q2, d30[0]
-+ vdup.32 q3, d30[0]
-+ add lr, r0, #(pw << log2_s)
-+ it mi
-+ vstmmi r0, { q0-q3 }
-+ it cs
-+ vstmcs lr, { q0-q3 }
-+
-+99:
-+ pop {r4-r10, pc}
-+endfunc
-+
-+
-+
-+
-diff --git a/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S b/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S
-new file mode 100644
-index 0000000000..67192e7213
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevcpred_intra_hv_neon.S
-@@ -0,0 +1,911 @@
-+/*
-+ * Copyright (c) 2018 John Cox <jc@kynesim.co.uk> (for Raspberry Pi)
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+/*
-+ * Horizontal & Vertical special cases of angular intra pred
-+ *
-+ * Split out because:
-+ * Vertical, at least, is relatively common
-+ * Much simpler code than the general angular case
-+ * Luma with size < 32 has extra filtering that doesn't happen anywhere else
-+ *
-+ * *** Currently luma filtering is mandatory where it occurs, but there are
-+ * cases where it should be turned off (rdpcm & an extension sps flag).
-+ * These don't occur in the standard conformance suite for Main Profile
-+ */
-+
-+#include "libavutil/arm/asm.S"
-+#include "neon.S"
-+
-+@ ff_hevc_rpi_pred_vertical_4_neon_8
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+@ 4x4, 8-bit: every row is a copy of the 4 top bytes, except that byte 0 of
-+@ row y is top[0] + ((left[y] - topleft) >> 1), saturated to 0..255.
-+function ff_hevc_rpi_pred_vertical_4_neon_8, export=1
-+ ldrb ip, [r2, #-1] @ Top-left
-+ vld1.32 {d0[0]}, [r2 :32] @ Left
-+ add r2, r0, r3 @ r2 = row 1; r0/r2 then step two rows at a time
-+ vld1.8 {d1[]}, [r1] @ top[0] in every lane
-+ lsl r3, #1
-+ vdup.8 d4, ip
-+ vmov.i8 d2, #128
-+ vhsub.u8 d4, d0, d4 @ (left[y] - topleft) >> 1
-+ veor d1, d2 @ bias by 128 so the signed saturating add clips at 0/255
-+ vld1.32 {d0[0]}, [r1 :32] @ Top
-+ vqadd.s8 d1, d4
-+ vmov.i64 d3, #0xff @ vbit mask: byte 0 only
-+ vmov d4, d0
-+ veor d5, d1, d2 @ remove the bias; d1 and d5 both hold the filtered column
-+ veor d1, d1, d2
-+ vbit d0, d1, d3 @ row 0: filtered byte 0 + top[1..3]
-+ vshr.u64 d5, #8
-+ vst1.32 {d0[0]}, [r0], r3
-+ vshr.u64 d1, #16
-+ vbit d4, d5, d3 @ row 1
-+ vshr.u64 d5, #16
-+ vst1.32 {d4[0]}, [r2], r3
-+ vbit d0, d1, d3 @ row 2
-+ vst1.32 {d0[0]}, [r0]
-+ vbit d4, d5, d3 @ row 3
-+ vst1.32 {d4[0]}, [r2]
-+
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_vertical_8_neon_8
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+@ 8x8, 8-bit: every row is a copy of the 8 top bytes, except that byte 0 of
-+@ row y is top[0] + ((left[y] - topleft) >> 1), saturated to 0..255.
-+function ff_hevc_rpi_pred_vertical_8_neon_8, export=1
-+ ldrb ip, [r2, #-1] @ Top-left
-+ vld1.8 {d0}, [r2 :64] @ Left
-+ vmov.i8 d1, #128
-+ vld1.8 {d2[]}, [r1] @ top[0] in every lane
-+ vld1.8 {d3}, [r1 :64] @ Top
-+ vdup.8 d4, ip
-+ vhsub.u8 d4, d0, d4 @ (left[y] - topleft) >> 1
-+ veor d2, d1 @ bias by 128 so the signed saturating add clips at 0/255
-+ vmov.i64 d0, #0xff @ vbit mask: byte 0 only
-+ mov r1, #8 @ row counter, two rows per iteration
-+ vqadd.s8 d2, d4, d2
-+ veor d1, d2, d1 @ remove the bias: d1 = filtered first column
-+1:
-+ vbit d3, d1, d0 @ insert this row's filtered byte 0
-+ vshr.u64 d1, #8 @ move the next row's value into byte 0
-+ vst1.8 {d3}, [r0 :64], r3
-+ subs r1, #2
-+ vbit d3, d1, d0
-+ vshr.u64 d1, #8
-+ vst1.8 {d3}, [r0 :64], r3
-+ bne 1b
-+
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_vertical_16_neon_8
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+@ 16x16, 8-bit: every row is a copy of the 16 top bytes, except that byte 0
-+@ of row y is top[0] + ((left[y] - topleft) >> 1), saturated to 0..255.
-+function ff_hevc_rpi_pred_vertical_16_neon_8, export=1
-+ ldrb ip, [r2, #-1] @ Top-left
-+ vld1.8 {q0}, [r2 :128] @ Left
-+ vdup.8 q1, ip
-+ vld1.8 {d4[],d5[]}, [r1] @ top[0] in every lane
-+ vhsub.u8 q0, q1 @ (left[y] - topleft) >> 1
-+ vmov.i8 q1, #128
-+ veor q2, q1 @ bias by 128 so the signed saturating add clips at 0/255
-+ vmov.i64 d16, #0xff @ vbit mask: byte 0 only
-+ vqadd.s8 q0, q2
-+ vld1.8 {q3}, [r1 :128] @ Top
-+ mov r1, #16 @ row counter, two rows per iteration
-+ veor q0, q1 @ remove the bias: q0 = filtered first column
-+ vmov q1, q3 @ q1 = even-row image, q3 = odd-row image
-+ vext.8 q2, q0, q0, #1 @ q2 = column rotated by one: odd rows in byte 0
-+1:
-+ vbit d2, d0, d16
-+ vbit d6, d4, d16
-+ vext.8 q0, q0, q0, #2 @ rotate to the next even row
-+ subs r1, #2
-+ vst1.8 {q1}, [r0 :128], r3
-+ vext.8 q2, q2, q2, #2 @ rotate to the next odd row
-+ vst1.8 {q3}, [r0 :128], r3
-+ bne 1b
-+
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_vertical_32_neon_8
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+@ 32x32, 8-bit: copies the 32 top bytes into each of 32 rows (no filtering).
-+function ff_hevc_rpi_pred_vertical_32_neon_8, export=1
-+ vld1.8 {q0, q1 }, [r1 :128] @ Up
-+ add r2, r0, r3 @ r2 = row 1
-+ lsl r3, #1 @ r0 and r2 each step two rows
-+ mov r1, #16 @ 16 iterations * 2 rows
-+1:
-+ vst1.8 {q0, q1 }, [r0 :128], r3
-+ subs r1, #1
-+ vst1.8 {q0, q1 }, [r2 :128], r3
-+ bne 1b
-+
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_vertical_c_4_neon_8
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+@ Copies the 8 bytes at top into each of 4 rows; rows are 2*r3 bytes apart.
-+function ff_hevc_rpi_pred_vertical_c_4_neon_8, export=1
-+ vld1.16 {d0 }, [r1 :64] @ Up
-+ add r2, r0, r3, lsl #1 @ r2 = row 1
-+ lsl r3, #2 @ step of two rows
-+
-+ vst1.16 {d0 }, [r0 :64], r3
-+ vst1.16 {d0 }, [r2 :64], r3
-+ vst1.16 {d0 }, [r0 :64]
-+ vst1.16 {d0 }, [r2 :64]
-+
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_vertical_c_8_neon_8
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+@ Copies the 16 bytes at top into each of 8 rows; rows are 2*r3 bytes apart.
-+function ff_hevc_rpi_pred_vertical_c_8_neon_8, export=1
-+ vld1.16 {q0 }, [r1 :128] @ Up
-+ add r2, r0, r3, lsl #1 @ r2 = row 1
-+ lsl r3, #2 @ step of two rows
-+ mov r1, #4 @ 2 iterations (counter drops by 2), 4 rows each
-+1:
-+ vst1.16 {q0 }, [r0 :128], r3
-+ subs r1, #2
-+ vst1.16 {q0 }, [r2 :128], r3
-+ vst1.16 {q0 }, [r0 :128], r3
-+ vst1.16 {q0 }, [r2 :128], r3
-+ bne 1b
-+
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_vertical_c_16_neon_8
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+@ Copies the 32 bytes at top into each of 16 rows; rows are 2*r3 bytes apart.
-+function ff_hevc_rpi_pred_vertical_c_16_neon_8, export=1
-+ vld1.16 {q0, q1 }, [r1 :128] @ Up
-+ add r2, r0, r3, lsl #1 @ r2 = row 1
-+ lsl r3, #2 @ step of two rows
-+ mov r1, #8 @ 8 iterations * 2 rows
-+1:
-+ vst1.16 {q0, q1 }, [r0 :128], r3
-+ subs r1, #1
-+ vst1.16 {q0, q1 }, [r2 :128], r3
-+ bne 1b
-+
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_horizontal_4_neon_8
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+@ ? Might be faster as simple arm
-+
-+@ 4x4, 8-bit: row y is left[y] replicated, except that row 0 is
-+@ left[0] + ((top[x] - topleft) >> 1), saturated to 0..255.
-+function ff_hevc_rpi_pred_horizontal_4_neon_8, export=1
-+ ldrb ip, [r2, #-1] @ Top-left
-+ vld1.32 {d0[0]}, [r1 :32] @ Top
-+ add r1, r2, #3 @ r1 = &left[3]
-+ vld1.8 {d1[]}, [r2]! @ left[0] in every lane
-+ vdup.8 d2, ip
-+ vmov.i8 d3, #128
-+ vhsub.u8 d0, d2 @ (top[x] - topleft) >> 1
-+ veor d1, d3 @ bias by 128 so the signed saturating add clips at 0/255
-+ vld1.8 {d2[]}, [r2]! @ left[1]
-+ add ip, r0, r3 @ ip = row 1
-+ vqadd.s8 d0, d0, d1
-+ lsl r3, #1 @ step of two rows
-+ vld1.8 {d1[]}, [r2] @ left[2]
-+ vld1.8 {d4[]}, [r1] @ left[3]
-+ veor d0, d3 @ remove the bias: d0 = filtered row 0
-+ vst1.32 {d0[0]}, [r0 :32], r3
-+ vst1.32 {d2[0]}, [ip :32], r3
-+ vst1.32 {d1[0]}, [r0 :32]
-+ vst1.32 {d4[0]}, [ip :32]
-+
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_horizontal_8_neon_8
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+@ 8x8, 8-bit: row y is left[y] replicated, except that row 0 is
-+@ left[0] + ((top[x] - topleft) >> 1), saturated to 0..255.
-+function ff_hevc_rpi_pred_horizontal_8_neon_8, export=1
-+ ldrb ip, [r2, #-1] @ Top-left
-+ vld1.8 {d0}, [r1 :64] @ Top
-+ vmov.i8 d1, #128
-+ vld1.8 {d2[]}, [r2]! @ left[0] in every lane
-+ mov r1, #8-2 @ loop covers rows 1..6; row 0 and row 7 are outside it
-+ vdup.8 d3, ip
-+ vhsub.u8 d0, d3 @ (top[x] - topleft) >> 1
-+ veor d2, d1 @ bias by 128 so the signed saturating add clips at 0/255
-+ vqadd.s8 d0, d2
-+ vld1.8 {d2[]}, [r2]! @ left[1]
-+ veor d0, d1 @ remove the bias: d0 = filtered row 0
-+ vst1.8 {d0}, [r0], r3
-+1:
-+ vld1.8 {d0[]}, [r2]!
-+ subs r1, #2
-+ vst1.8 {d2}, [r0 :64], r3
-+ vld1.8 {d2[]}, [r2]!
-+ vst1.8 {d0}, [r0 :64], r3
-+ bne 1b
-+
-+ vst1.8 {d2}, [r0 :64] @ last row
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_horizontal_16_neon_8
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+@ 16x16, 8-bit: row y is left[y] replicated, except that row 0 is
-+@ left[0] + ((top[x] - topleft) >> 1), saturated to 0..255.
-+function ff_hevc_rpi_pred_horizontal_16_neon_8, export=1
-+ ldrb ip, [r2, #-1] @ Top-left
-+ vld1.8 {q0}, [r1 :64] @ Top
-+ mov r1, #16-2 @ loop covers rows 1..14; row 0 and row 15 are outside it
-+ vld1.8 {d4[],d5[]}, [r2]! @ left[0] in every lane
-+ vdup.8 q3, ip
-+ vhsub.u8 q0, q3 @ (top[x] - topleft) >> 1
-+ vmov.i8 q1, #128
-+ veor q2, q1 @ bias by 128 so the signed saturating add clips at 0/255
-+ vqadd.s8 q0, q2
-+ vld1.8 {d4[],d5[]}, [r2]! @ left[1]
-+ veor q0, q1 @ remove the bias: q0 = filtered row 0
-+ vst1.8 {q0}, [r0], r3
-+1:
-+ vld1.8 {d0[],d1[]}, [r2]!
-+ subs r1, #2
-+ vst1.8 {q2}, [r0 :64], r3
-+ vld1.8 {d4[],d5[]}, [r2]!
-+ vst1.8 {q0}, [r0 :64], r3
-+ bne 1b
-+
-+ vst1.8 {q2}, [r0 :64] @ last row
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_horizontal_32_neon_8
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+@ 32x32, 8-bit: row y is left[y] replicated across 32 bytes (no filtering).
-+@ Each row is written as two 16-byte halves through r0 and ip = r0 + 16.
-+function ff_hevc_rpi_pred_horizontal_32_neon_8, export=1
-+ vld1.8 {d0[],d1[]}, [r2]!
-+ add ip, r0, #16
-+ mov r1, #32-2 @ loop covers rows 1..30; row 0 and row 31 are outside it
-+ vld1.8 {d2[],d3[]}, [r2]!
-+ vst1.8 {q0}, [r0 :128], r3
-+ vst1.8 {q0}, [ip :128], r3
-+1:
-+ vld1.8 {d0[],d1[]}, [r2]!
-+ subs r1, #2
-+ vst1.8 {q1}, [r0 :128], r3
-+ vst1.8 {q1}, [ip :128], r3
-+ vld1.8 {d2[],d3[]}, [r2]!
-+ vst1.8 {q0}, [r0 :128], r3
-+ vst1.8 {q0}, [ip :128], r3
-+ bne 1b
-+
-+ vst1.8 {q1}, [r0 :128] @ last row
-+ vst1.8 {q1}, [ip :128]
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_horizontal_c_4_neon_8
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+@ 4 rows of 8 bytes: row y is the 16-bit element left[y] replicated 4 times.
-+@ NOTE(review): the A/T-prefixed lines are two encodings of the same address
-+@ arithmetic (both end with r2 = row 1 and r3 = two-row step); presumably A is
-+@ assembled for ARM and T for Thumb via macros in asm.S - confirm.
-+function ff_hevc_rpi_pred_horizontal_c_4_neon_8, export=1
-+ add r1, r2, #2 @ r1 walks the odd elements, r2 the even ones
-+ vld1.16 {d0[]}, [r2]
-+ add r2, #4
-+ vld1.16 {d1[]}, [r1]
-+ add r1, #4
-+ vld1.16 {d2[]}, [r2]
-+A add r2, r0, r3, lsl #1
-+T lsl r3, #1
-+T add r2, r0, r3
-+ vld1.16 {d3[]}, [r1]
-+A lsl r3, #2
-+T lsl r3, #1
-+ vst1.16 {d0}, [r0 :64], r3
-+ vst1.16 {d1}, [r2 :64], r3
-+ vst1.16 {d2}, [r0 :64]
-+ vst1.16 {d3}, [r2 :64]
-+
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_horizontal_c_8_neon_8
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+@ 8 rows of 16 bytes: row y is the 16-bit element left[y] replicated 8 times.
-+function ff_hevc_rpi_pred_horizontal_c_8_neon_8, export=1
-+ vld1.16 {d0[],d1[]}, [r2]!
-+ lsl r3, #1 @ row step in bytes = 2 * r3
-+ vld1.16 {d2[],d3[]}, [r2]!
-+ mov r1, #8-2 @ loop covers rows 1..6; row 0 and row 7 are outside it
-+ vst1.16 {q0}, [r0 :64], r3
-+1:
-+ vld1.16 {d0[],d1[]}, [r2]!
-+ subs r1, #2
-+ vst1.16 {q1}, [r0 :64], r3
-+ vld1.16 {d2[],d3[]}, [r2]!
-+ vst1.16 {q0}, [r0 :64], r3
-+ bne 1b
-+
-+ vst1.16 {q1}, [r0 :64] @ last row
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_horizontal_c_16_neon_8
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+@ 16 rows of 32 bytes: row y is the 16-bit element left[y] replicated 16
-+@ times, written as two 16-byte halves through r0 and ip = r0 + 16.
-+function ff_hevc_rpi_pred_horizontal_c_16_neon_8, export=1
-+ vld1.16 {d0[],d1[]}, [r2]!
-+ lsl r3, #1 @ row step in bytes = 2 * r3
-+ add ip, r0, #16
-+ mov r1, #16-2 @ loop covers rows 1..14; row 0 and row 15 are outside it
-+ vld1.16 {d2[],d3[]}, [r2]!
-+ vst1.16 {q0}, [r0 :128], r3
-+ vst1.16 {q0}, [ip :128], r3
-+1:
-+ vld1.16 {d0[],d1[]}, [r2]!
-+ subs r1, #2
-+ vst1.16 {q1}, [r0 :128], r3
-+ vst1.16 {q1}, [ip :128], r3
-+ vld1.16 {d2[],d3[]}, [r2]!
-+ vst1.16 {q0}, [r0 :128], r3
-+ vst1.16 {q0}, [ip :128], r3
-+ bne 1b
-+
-+ vst1.16 {q1}, [r0 :128] @ last row
-+ vst1.16 {q1}, [ip :128]
-+ bx lr
-+endfunc
-+
-+
-+@------------------------------------------------------------------------------
-+@
-+@ 10 Bit
-+@ Has clipping constants so 10-bit only but could easily be macroed up to
-+@ 14-bit before we run out of bits
-+
-+
-+@ ff_hevc_rpi_pred_vertical_4_neon_10
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+@ 4x4, 16-bit pels: every row is a copy of the 4 top pels, except that pel 0
-+@ of row y is top[0] + ((left[y] - topleft) >> 1), clamped to 0..0x3ff.
-+@ NOTE(review): the A/T-prefixed lines are two encodings of the same address
-+@ arithmetic (both end with r2 = row 1 and r3 = two-row step); presumably A is
-+@ assembled for ARM and T for Thumb via macros in asm.S - confirm.
-+function ff_hevc_rpi_pred_vertical_4_neon_10, export=1
-+ ldrh ip, [r2, #-2] @ Top-left
-+ vld1.16 {d0}, [r2 :64] @ Left
-+ vmov.i16 d2, #0 @ lower clamp
-+ vld1.16 {d1[]}, [r1] @ top[0] in every lane
-+T lsl r3, #1
-+ vdup.16 d4, ip
-+ vmov.i16 d3, #0x3ff @ upper clamp
-+ vld1.16 {d5}, [r1 :64] @ Top
-+ vhsub.u16 d4, d0, d4 @ (left[y] - topleft) >> 1
-+ vmov.i64 d0, #0xffff @ vbit mask: pel 0 only
-+A add r2, r0, r3, lsl #1
-+T add r2, r0, r3
-+ vadd.i16 d1, d1, d4
-+ vmov d6, d5 @ d5 = even-row image, d6 = odd-row image
-+ vmax.s16 d1, d1, d2
-+ vmin.s16 d2, d1, d3 @ d1 and d2 both hold the clamped first column
-+ vmin.s16 d1, d1, d3
-+ vbit d5, d1, d0 @ row 0
-+A lsl r3, #2
-+T lsl r3, #1
-+ vshr.u64 d2, #16
-+ vshr.u64 d1, #32
-+ vbit d6, d2, d0 @ row 1
-+ vst1.16 {d5}, [r0], r3
-+ vshr.u64 d2, #32
-+ vst1.16 {d6}, [r2], r3
-+ vbit d5, d1, d0 @ row 2
-+ vst1.16 {d5}, [r0]
-+ vbit d6, d2, d0 @ row 3
-+ vst1.16 {d6}, [r2]
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_vertical_8_neon_10
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+@ 8x8, 16-bit pels: every row is a copy of the 8 top pels, except that pel 0
-+@ of row y is top[0] + ((left[y] - topleft) >> 1), clamped to 0..0x3ff.
-+function ff_hevc_rpi_pred_vertical_8_neon_10, export=1
-+ ldrh ip, [r2, #-2] @ Top-left
-+ vld1.16 {q0}, [r2 :128] @ Left
-+ lsl r3, #1 @ row step in bytes = 2 * r3
-+ vdup.16 q1, ip
-+ vld1.16 {d4[],d5[]}, [r1] @ top[0] in every lane
-+ vhsub.u16 q0, q0, q1 @ (left[y] - topleft) >> 1
-+ vmov.i16 q1, #0
-+ vadd.i16 q0, q2
-+ vmov.i16 q2, #0x3ff
-+ vld1.16 {q3}, [r1 :128] @ Top
-+ mov r1, #8 @ row counter, two rows per iteration
-+ vmax.s16 q0, q1 @ clamp below at 0
-+ vmov q1, q3 @ q1 = even-row image, q3 = odd-row image
-+ vmin.s16 q0, q2 @ clamp above at 0x3ff
-+ vmov.i64 d16, #0xffff @ vbit mask: pel 0 only
-+ vext.16 q2, q0, q0, #1 @ q2 = column rotated by one: odd rows in lane 0
-+1:
-+ vbit d2, d0, d16
-+ vbit d6, d4, d16
-+ vext.16 q0, q0, q0, #2 @ rotate to the next even row
-+ subs r1, #2
-+ vst1.16 {q1}, [r0 :128], r3
-+ vext.16 q2, q2, q2, #2 @ rotate to the next odd row
-+ vst1.16 {q3}, [r0 :128], r3
-+ bne 1b
-+
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_vertical_16_neon_10
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+@ 16x16, 16-bit pels: every row is a copy of the 16 top pels, except that
-+@ pel 0 of row y is top[0] + ((left[y] - topleft) >> 1), clamped to 0..0x3ff.
-+@ NOTE(review): the A/T-prefixed lines are two encodings of the same address
-+@ arithmetic (both end with r2 = row 1 and r3 = two-row step); presumably A is
-+@ assembled for ARM and T for Thumb via macros in asm.S - confirm.
-+function ff_hevc_rpi_pred_vertical_16_neon_10, export=1
-+ ldrh ip, [r2, #-2] @ Top-left
-+ vld1.16 {q0-q1}, [r2 :128] @ Left
-+T lsl r3, #1
-+ vdup.16 q2, ip
-+A add r2, r0, r3, lsl #1
-+T add r2, r0, r3
-+ vld1.16 {d6[],d7[]}, [r1] @ top[0] in every lane
-+A lsl r3, #2
-+T lsl r3, #1
-+ vhsub.u16 q0, q2 @ (left[y] - topleft) >> 1
-+ vhsub.u16 q1, q2
-+ vadd.i16 q0, q3
-+ vadd.i16 q1, q3
-+ vmov.i16 q2, #0
-+ vld1.16 {q8-q9}, [r1 :128] @ Top
-+ mov r1, #0 @ with "subs r1, #1<<30" this wraps to 0 after 4 iterations
-+ vmov.i16 q3, #0x3ff
-+ vmax.s16 q0, q2 @ clamp below at 0
-+ vmax.s16 q1, q2
-+ vmin.s16 q0, q3 @ clamp above at 0x3ff
-+ vmin.s16 q1, q3
-+ vmov q10, q8 @ q8-q9 = even-row image, q10-q11 = odd-row image
-+ vmov q11, q9
-+ vext.16 q2, q0, q1, #1 @ odd-row column values for rows 1..7
-+ vext.16 q3, q1, q1, #1 @ odd-row column values for rows 9..15
-+ vmov.i64 d24, #0xffff @ vbit mask: pel 0 only
-+1:
-+ @ rows 0..7: column values come from q0 (even) and q2 (odd)
-+ vbit d16, d0, d24
-+ vbit d20, d4, d24
-+ vext.16 q0, q0, q0, #2
-+ subs r1, #1<<30
-+ vst1.16 {q8-q9}, [r0 :128], r3
-+ vext.16 q2, q2, q2, #2
-+ vst1.16 {q10-q11}, [r2 :128], r3
-+ bne 1b
-+1:
-+ @ rows 8..15: column values come from q1 (even) and q3 (odd)
-+ vbit d16, d2, d24
-+ vbit d20, d6, d24
-+ vext.16 q1, q1, q1, #2
-+ subs r1, #1<<30
-+ vst1.16 {q8-q9}, [r0 :128], r3
-+ vext.16 q3, q3, q3, #2
-+ vst1.16 {q10-q11}, [r2 :128], r3
-+ bne 1b
-+
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_vertical_32_neon_10
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+@ Copies the 64 bytes at top into each of 32 rows (no filtering); each row is
-+@ written as two 32-byte halves through r0 and r2 = r0 + 32.
-+function ff_hevc_rpi_pred_vertical_32_neon_10, export=1
-+ vldm r1, { q0-q3 } @ Up
-+ lsl r3, #1 @ row step in bytes = 2 * r3
-+ mov r1, #32 @ one row per iteration
-+ add r2, r0, #32
-+1:
-+ vst1.16 {q0-q1}, [r0 :128], r3
-+ subs r1, #1
-+ vst1.16 {q2-q3}, [r2 :128], r3
-+ bne 1b
-+
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_vertical_c_4_neon_10
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+@ Copies the 16 bytes at top into each of 4 rows; rows are 4*r3 bytes apart.
-+function ff_hevc_rpi_pred_vertical_c_4_neon_10, export=1
-+ vld1.16 {q0 }, [r1 :128] @ Up
-+ add r2, r0, r3, lsl #2 @ r2 = row 1
-+ lsl r3, #3 @ step of two rows
-+
-+ vst1.16 {q0 }, [r0 :128], r3
-+ vst1.16 {q0 }, [r2 :128], r3
-+ vst1.16 {q0 }, [r0 :128]
-+ vst1.16 {q0 }, [r2 :128]
-+
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_vertical_c_8_neon_10
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+@ Copies the 32 bytes at top into each of 8 rows; rows are 4*r3 bytes apart.
-+function ff_hevc_rpi_pred_vertical_c_8_neon_10, export=1
-+ vld1.16 {q0, q1 }, [r1 :128] @ Up
-+ add r2, r0, r3, lsl #2 @ r2 = row 1
-+ lsl r3, #3 @ step of two rows
-+ mov r1, #4 @ 4 iterations * 2 rows
-+1:
-+ vst1.16 {q0, q1 }, [r0 :128], r3
-+ subs r1, #1
-+ vst1.16 {q0, q1 }, [r2 :128], r3
-+ bne 1b
-+
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_vertical_c_16_neon_10
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+@ Copies the 64 bytes at top into each of 16 rows; each row is written as two
-+@ 32-byte halves through r0 and r2 = r0 + 32. Rows are 4*r3 bytes apart.
-+function ff_hevc_rpi_pred_vertical_c_16_neon_10, export=1
-+ vldm r1, { q0-q3 } @ Up
-+ lsl r3, #2
-+ mov r1, #16 @ one row per iteration
-+ add r2, r0, #32
-+1:
-+ vst1.16 {q0-q1}, [r0 :128], r3
-+ subs r1, #1
-+ vst1.16 {q2-q3}, [r2 :128], r3
-+ bne 1b
-+
-+ bx lr
-+endfunc
-+
-+@ ff_hevc_rpi_pred_horizontal_4_neon_10
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+@ 4x4, 16-bit pels: row y is left[y] replicated, except that row 0 is
-+@ left[0] + ((top[x] - topleft) >> 1), clamped to 0..0x3ff.
-+@ NOTE(review): the A/T-prefixed lines are two encodings of the same address
-+@ arithmetic (both end with ip = row 1 and r3 = two-row step); presumably A is
-+@ assembled for ARM and T for Thumb via macros in asm.S - confirm.
-+function ff_hevc_rpi_pred_horizontal_4_neon_10, export=1
-+ ldrh ip, [r2, #-2] @ Top-left
-+ vld1.16 {d0}, [r1 :64] @ Top
-+ vmov.i16 d1, #0 @ lower clamp
-+ vld1.16 {d2[]}, [r2]! @ left[0] in every lane
-+T lsl r3, #1
-+ vdup.16 d3, ip
-+ vmov.i16 d4, #0x3ff @ upper clamp
-+ vhsub.u16 d0, d3 @ (top[x] - topleft) >> 1
-+A add ip, r0, r3, lsl #1
-+T add ip, r0, r3
-+ vld1.16 {d3[]}, [r2]! @ left[1]
-+A lsl r3, #2
-+T lsl r3, #1
-+ vadd.i16 d0, d2
-+ vld1.16 {d2[]}, [r2]! @ left[2]
-+ vmax.s16 d0, d1
-+ vld1.16 {d1[]}, [r2] @ left[3]
-+ vmin.s16 d0, d4
-+ vst1.16 {d0}, [r0 :64], r3
-+ vst1.16 {d3}, [ip :64], r3
-+ vst1.16 {d2}, [r0 :64]
-+ vst1.16 {d1}, [ip :64]
-+
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_horizontal_8_neon_10
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+@ 8x8, 16-bit pels: row y is left[y] replicated, except that row 0 is
-+@ left[0] + ((top[x] - topleft) >> 1), clamped to 0..0x3ff.
-+function ff_hevc_rpi_pred_horizontal_8_neon_10, export=1
-+ ldrh ip, [r2, #-2] @ Top-left
-+ vld1.16 {q0}, [r1 :128] @ Top
-+ lsl r3, #1 @ row step in bytes = 2 * r3
-+ vdup.16 q1, ip
-+ mov r1, #8-2 @ loop covers rows 1..6; row 0 and row 7 are outside it
-+ vhsub.u16 q0, q1 @ (top[x] - topleft) >> 1
-+ vld1.16 {d2[],d3[]}, [r2]! @ left[0] in every lane
-+ vmov.i16 q2, #0
-+ vadd.i16 q0, q1
-+ vmov.i16 q1, #0x3ff
-+ vmax.s16 q0, q2 @ clamp below at 0
-+ vld1.16 {d4[],d5[]}, [r2]! @ left[1]
-+ vmin.s16 q0, q1 @ clamp above at 0x3ff
-+ vst1.16 {q0}, [r0 :128], r3
-+1:
-+ vld1.16 {d0[],d1[]}, [r2]!
-+ subs r1, #2
-+ vst1.16 {q2}, [r0 :128], r3
-+ vld1.16 {d4[],d5[]}, [r2]!
-+ vst1.16 {q0}, [r0 :128], r3
-+ bne 1b
-+
-+ vst1.16 {q2}, [r0 :128] @ last row
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_horizontal_16_neon_10
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+@ 16x16, 16-bit pels: row y is left[y] replicated, except that row 0 is
-+@ left[0] + ((top[x] - topleft) >> 1), clamped to 0..0x3ff.
-+function ff_hevc_rpi_pred_horizontal_16_neon_10, export=1
-+ ldrh ip, [r2, #-2] @ Top-left
-+ vld1.16 {q0-q1}, [r1 :128] @ Top
-+ lsl r3, #1 @ row step in bytes = 2 * r3
-+ vdup.16 q2, ip
-+ add ip, r0, r3
-+ vhsub.u16 q0, q2 @ (top[x] - topleft) >> 1
-+ add ip, #16 @ ip = second 16-byte half of row 1
-+ vhsub.u16 q1, q2
-+ mov r1, #16-2 @ loop covers rows 1..14; row 0 and row 15 are outside it
-+ vld1.16 {d4[],d5[]}, [r2]! @ left[0] in every lane
-+ vmov.i16 q3, #0
-+ vadd.u16 q0, q2
-+ vadd.i16 q1, q2
-+ vmov.i16 q2, #0x3ff
-+ vmax.s16 q0, q3 @ clamp below at 0
-+ vmax.s16 q1, q3
-+ vld1.16 {d6[],d7[]}, [r2]! @ left[1]
-+ vmin.s16 q0, q2 @ clamp above at 0x3ff
-+ vmin.s16 q1, q2
-+ vst1.16 {q0-q1}, [r0 :128], r3 @ row 0 (both halves); r0 now at row 1
-+1:
-+ vld1.16 {d0[],d1[]}, [r2]!
-+ subs r1, #2
-+ vst1.16 {q3}, [r0 :128], r3
-+ vst1.16 {q3}, [ip :128], r3
-+ vld1.16 {d6[],d7[]}, [r2]!
-+ vst1.16 {q0}, [r0 :128], r3
-+ vst1.16 {q0}, [ip :128], r3
-+ bne 1b
-+
-+ vst1.16 {q3}, [r0 :128] @ last row
-+ vst1.16 {q3}, [ip :128]
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_horizontal_32_neon_10
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+@ 32 rows of 64 bytes: row y is the 16-bit pel left[y] replicated 32 times
-+@ (no filtering). Each row takes four 16-byte stores: r0 and ip = r0 + 16
-+@ first step by lr = 32 within the row, then by r3 to reach the next row.
-+function ff_hevc_rpi_pred_horizontal_32_neon_10, export=1
-+ vld1.16 {d0[],d1[]}, [r2]!
-+ add ip, r0, #16
-+ push {lr}
-+ mov lr, #32
-+ vld1.16 {d2[],d3[]}, [r2]!
-+ lsl r3, #1
-+ vst1.16 {q0}, [r0 :128], lr
-+ sub r3, #32 @ r3 = row step minus the 32 already added via lr
-+ vst1.16 {q0}, [ip :128], lr
-+ mov r1, #32-2 @ loop covers rows 1..30; row 0 and row 31 are outside it
-+ vst1.16 {q0}, [r0 :128], r3
-+ vst1.16 {q0}, [ip :128], r3
-+1:
-+ vld1.16 {d0[],d1[]}, [r2]!
-+ subs r1, #2
-+ vst1.16 {q1}, [r0 :128], lr
-+ vst1.16 {q1}, [ip :128], lr
-+ vst1.16 {q1}, [r0 :128], r3
-+ vst1.16 {q1}, [ip :128], r3
-+ vld1.16 {d2[],d3[]}, [r2]!
-+ vst1.16 {q0}, [r0 :128], lr
-+ vst1.16 {q0}, [ip :128], lr
-+ vst1.16 {q0}, [r0 :128], r3
-+ vst1.16 {q0}, [ip :128], r3
-+ bne 1b
-+
-+ vst1.16 {q1}, [r0 :128], lr @ last row
-+ vst1.16 {q1}, [ip :128], lr
-+ vst1.16 {q1}, [r0 :128]
-+ vst1.16 {q1}, [ip :128]
-+ pop {pc}
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_horizontal_c_4_neon_10
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+@ 4 rows of 16 bytes: row y is the 32-bit element left[y] replicated 4 times.
-+@ NOTE(review): the A/T-prefixed lines are two encodings of the same address
-+@ arithmetic (both end with r2 = row 1 and r3 = two-row step); presumably A is
-+@ assembled for ARM and T for Thumb via macros in asm.S - confirm.
-+function ff_hevc_rpi_pred_horizontal_c_4_neon_10, export=1
-+ add r1, r2, #4 @ r1 walks the odd elements, r2 the even ones
-+ vld1.32 {d0[],d1[]}, [r2]
-+ add r2, #8
-+ vld1.32 {d2[],d3[]}, [r1]
-+ add r1, #8
-+ vld1.32 {d4[],d5[]}, [r2]
-+A add r2, r0, r3, lsl #2
-+T lsl r3, #2
-+T add r2, r0, r3
-+ vld1.32 {d6[],d7[]}, [r1]
-+A lsl r3, #3
-+T lsl r3, #1
-+ vst1.32 {q0}, [r0 :128], r3
-+ vst1.32 {q1}, [r2 :128], r3
-+ vst1.32 {q2}, [r0 :128]
-+ vst1.32 {q3}, [r2 :128]
-+
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_horizontal_c_8_neon_10
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+@ 8 rows of 32 bytes: row y is the 32-bit element left[y] replicated 8 times,
-+@ written as two 16-byte halves through r0 and ip = r0 + 16.
-+function ff_hevc_rpi_pred_horizontal_c_8_neon_10, export=1
-+ vld1.32 {d0[],d1[]}, [r2]!
-+ lsl r3, #2 @ row step in bytes = 4 * r3
-+ add ip, r0, #16
-+ mov r1, #8-2 @ loop covers rows 1..6; row 0 and row 7 are outside it
-+ vld1.32 {d2[],d3[]}, [r2]!
-+ vst1.32 {q0}, [r0 :128], r3
-+ vst1.32 {q0}, [ip :128], r3
-+1:
-+ vld1.32 {d0[],d1[]}, [r2]!
-+ subs r1, #2
-+ vst1.32 {q1}, [r0 :128], r3
-+ vst1.32 {q1}, [ip :128], r3
-+ vld1.32 {d2[],d3[]}, [r2]!
-+ vst1.32 {q0}, [r0 :128], r3
-+ vst1.32 {q0}, [ip :128], r3
-+ bne 1b
-+
-+ vst1.32 {q1}, [r0 :128] @ last row
-+ vst1.32 {q1}, [ip :128]
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_horizontal_c_16_neon_10
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+@ 16 rows of 64 bytes: row y is the 32-bit element left[y] replicated 16
-+@ times. Each row takes four 16-byte stores: r0 and ip = r0 + 16 first step
-+@ by lr = 32 within the row, then by r3 to reach the next row.
-+function ff_hevc_rpi_pred_horizontal_c_16_neon_10, export=1
-+ vld1.32 {d0[],d1[]}, [r2]!
-+ add ip, r0, #16
-+ push {lr}
-+ mov lr, #32
-+ vld1.32 {d2[],d3[]}, [r2]!
-+ lsl r3, #2
-+ vst1.32 {q0}, [r0 :128], lr
-+ sub r3, #32 @ r3 = row step minus the 32 already added via lr
-+ vst1.32 {q0}, [ip :128], lr
-+ mov r1, #16-2 @ loop covers rows 1..14; row 0 and row 15 are outside it
-+ vst1.32 {q0}, [r0 :128], r3
-+ vst1.32 {q0}, [ip :128], r3
-+1:
-+ vld1.32 {d0[],d1[]}, [r2]!
-+ subs r1, #2
-+ vst1.32 {q1}, [r0 :128], lr
-+ vst1.32 {q1}, [ip :128], lr
-+ vst1.32 {q1}, [r0 :128], r3
-+ vst1.32 {q1}, [ip :128], r3
-+ vld1.32 {d2[],d3[]}, [r2]!
-+ vst1.32 {q0}, [r0 :128], lr
-+ vst1.32 {q0}, [ip :128], lr
-+ vst1.32 {q0}, [r0 :128], r3
-+ vst1.32 {q0}, [ip :128], r3
-+ bne 1b
-+
-+ vst1.32 {q1}, [r0 :128], lr @ last row
-+ vst1.32 {q1}, [ip :128], lr
-+ vst1.32 {q1}, [r0 :128]
-+ vst1.32 {q1}, [ip :128]
-+ pop {pc}
-+endfunc
-+
-+
-+
-diff --git a/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S b/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S
-new file mode 100644
-index 0000000000..e35896a102
---- /dev/null
-+++ b/libavcodec/arm/rpi_hevcpred_intra_planar_neon.S
-@@ -0,0 +1,1034 @@
-+/*
-+ * Copyright (c) 2017 John Cox <jc@kynesim.co.uk> (for Raspberry Pi)
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "libavutil/arm/asm.S"
-+#include "neon.S"
-+
-+@ Planar intra pred (8.4.4.2.4)
-+@
-+@ predSamples[ x ][ y ] =
-+@ ( ( nTbS - 1 - x ) * p[ -1 ][ y ] +
-+@ ( x + 1 ) * p[ nTbS ][ -1 ] +
-+@ ( nTbS - 1 - y ) * p[ x ][ -1 ] +
-+@ ( y + 1 ) * p[ -1 ][ nTbS ] + nTbS ) >> ( Log2( nTbS ) + 1 )
-+
-+@ All 10-bit functions would work with 9
-+
-+
-+@ ff_hevc_rpi_pred_planar_4_neon_8
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+@ 4x4, 8-bit planar prediction. Two rows are computed per q register (one row
-+@ in each d half): q8 carries rows 0/1 and q2 rows 2/3. The final rounding
-+@ shift by 3 is the formula's ">> (Log2(nTbS) + 1)" with nTbS = 4.
-+function ff_hevc_rpi_pred_planar_4_neon_8, export=1
-+
-+ vld1.8 {d0}, [r1] @ Top
-+ adr ip, nb_3_0_1_4
-+ vld1.8 {d1}, [r2] @ Left
-+ vmov.i64 d2, #0xffffffff
-+ vldr d3, [ip, #8] @ {1,2,3,4,1,2,3,4}
-+ add r1, r0, r3 @ r1 = row 1; r0/r1 then step two rows
-+ vdup.32 d4, d0[0] @ {t0,t1,t2,t3,t0,t1,t2,t3}
-+ vdup.8 d0, d0[4] @ {t4,t4,t4,t4,t4,t4,t4,t4}
-+ vdup.8 d5, d1[4] @ {l4,l4,l4,l4,l4,l4,l4,l4}
-+ vdup.8 d6, d1[0] @ {l0,l0,l0,l0,l0,l0,l0,l0}
-+ vshll.u8 q8, d4, #2
-+ lsl r3, #1
-+ vsubl.u8 q2, d5, d4
-+ vmlal.u8 q8, d0, d3
-+ vld1.8 {d0}, [ip] @ {3,2,1,0,3,2,1,0}
-+ vdup.8 d7, d1[1] @ {l1,l1,l1,l1,l1,l1,l1,l1}
-+ vshl.s16 q9, q2, #1
-+ vbif d6, d7, d2 @ {l0,l0,l0,l0,l1,l1,l1,l1}
-+ vadd.i16 d16, d4
-+ vdup.8 d7, d1[2] @ {l2,l2,l2,l2,l2,l2,l2,l2}
-+ vadd.i16 d17, d18
-+ vdup.8 d1, d1[3] @ {l3,l3,l3,l3,l3,l3,l3,l3}
-+ vadd.i16 q2, q8, q9
-+ vmlal.u8 q8, d0, d6
-+ vbif d7, d1, d2 @ {l2,l2,l2,l2,l3,l3,l3,l3}
-+ vmlal.u8 q2, d0, d7
-+ vrshrn.i16 d0, q8, #3
-+ vst1.32 d0[0], [r0 :32], r3
-+ vst1.32 d0[1], [r1 :32], r3
-+ vrshrn.i16 d0, q2, #3
-+ vst1.32 d0[0], [r0 :32]
-+ vst1.32 d0[1], [r1 :32]
-+
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_planar_4_neon_10
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+@ 4x4 planar prediction on 16-bit pels. q9 carries rows 0/1 and q2 rows 2/3
-+@ (one row per d register). The final rounding shift by 3 is the formula's
-+@ ">> (Log2(nTbS) + 1)" with nTbS = 4.
-+@ NOTE(review): the A/T-prefixed lines are two encodings of the same address
-+@ arithmetic (both end with r1 = row 1 and r3 = two-row step); presumably A is
-+@ assembled for ARM and T for Thumb via macros in asm.S - confirm.
-+function ff_hevc_rpi_pred_planar_4_neon_10, export=1
-+ @ Weights are read from the 16-bit table nbh_3_0_1_4 and used directly,
-+ @ so no byte->halfword widening is needed in this variant
-+ vld1.16 {q0}, [r1 :64] @ Top
-+ adr ip, nbh_3_0_1_4
-+ vldr d2, [r2, #8] @ Left (lower)
-+ vldr d3, [ip, #8] @ {1,2,3,4}
-+T lsl r3, #1
-+ vshl.s16 d4, d0, #2
-+ vdup.16 d1, d1[0] @ {t4,t4,t4,t4}
-+ vldr d5, [r2] @ Left (upper)
-+ vdup.16 d2, d2[0] @ {l4,l4,l4,l4}
-+ vldr d6, [ip] @ {3,2,1,0}
-+ vmla.i16 d4, d3, d1 @ Acc set up
-+ vsub.i16 d0, d2, d0 @ Add set up
-+ vmov d7, d6
-+ vdup.16 d2, d5[0]
-+ vdup.16 d3, d5[1]
-+ vdup.16 d16, d5[2]
-+ vadd.i16 d18, d0, d4
-+ vshl.s16 d0, #1 @ x2
-+ vadd.i16 d19, d0, d4
-+ vdup.16 d17, d5[3]
-+ vadd.i16 d4, d0, d18
-+A add r1, r0, r3, lsl #1
-+T add r1, r0, r3
-+ vadd.i16 d5, d0, d19
-+A lsl r3, #2
-+T lsl r3, #1
-+ vmla.i16 q9, q1, q3
-+ vmla.i16 q2, q8, q3
-+ vrshr.u16 q0, q9, #3
-+ vst1.16 {d0}, [r0], r3
-+ vrshr.u16 d2, d4, #3
-+ vst1.16 {d1}, [r1], r3
-+ vrshr.u16 d3, d5, #3
-+ vst1.16 {d2}, [r0]
-+ vst1.16 {d3}, [r1]
-+
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_planar_8_neon_8
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+@ 8x8, 8-bit planar prediction, two rows per loop iteration. The final
-+@ rounding shift by 4 is the formula's ">> (Log2(nTbS) + 1)" with nTbS = 8.
-+function ff_hevc_rpi_pred_planar_8_neon_8, export=1
-+
-+ vld1.8 {q0}, [r1] @ Top
-+ adr ip, nb_7_0_1_8
-+ vldr d2, [r2, #8] @ Left (lower)
-+ mov r1, #8 @ row counter, two rows per iteration
-+ vldr d3, [ip, #8] @ {1,2,3,4,5,6,7,8}
-+ vshll.u8 q2, d0, #3
-+ vdup.8 d1, d1[0] @ {t8,t8,t8,t8,t8,t8,t8,t8}
-+ vdup.8 d2, d2[0] @ {l8,l8,l8,l8,l8,l8,l8,l8}
-+ vldr d6, [r2] @ Left (upper)
-+ vmlal.u8 q2, d3, d1
-+ vsubl.u8 q0, d2, d0
-+ vldr d7, [ip] @ {7,6,5,4,3,2,1,0}
-+
-+@ u8 7..0 [1] d7
-+@ u8 left[y] [1] d6
-+@ u16 acc [2] q2 (even rows) or q8 (odd rows) = (x+1)*p[nTbS][-1] + 8*p[x][-1] initially
-+@ u16 add [2] q0 = p[-1][nTbS] - p[x][-1]
-+
-+ vdup.8 d2, d6[0]
-+ vadd.i16 q2, q0
-+ vdup.8 d3, d6[1]
-+ vadd.i16 q8, q2, q0
-+1:
-+ vmlal.u8 q2, d7, d2
-+ subs r1, #2
-+ vadd.i16 q9, q8, q0
-+ vmlal.u8 q8, d7, d3
-+ vdup.8 d2, d6[2] @ left values for the next pair of rows
-+ vdup.8 d3, d6[3]
-+ vrshrn.i16 d20, q2, #4
-+ vshr.u64 d6, #16 @ drop the two left values just consumed
-+ vmov q2, q9
-+ vst1.8 {d20}, [r0], r3
-+ vrshrn.i16 d20, q8, #4
-+ vadd.i16 q8, q2, q0
-+ vst1.8 {d20}, [r0], r3
-+ bne 1b
-+
-+ bx lr
-+
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_planar_8_neon_10
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+@ 8x8 planar prediction on 16-bit pels. The loop is software-pipelined: the
-+@ first two rows are set up before it and the last two are rounded and stored
-+@ after it. The final rounding shift by 4 is the formula's
-+@ ">> (Log2(nTbS) + 1)" with nTbS = 8.
-+function ff_hevc_rpi_pred_planar_8_neon_10, export=1
-+
-+ adr ip, nb_7_0_1_8
-+ vld1.16 {q0}, [r1 :128]! @ Top (left)
-+ lsl r3, #1
-+ vld1.16 {q1}, [ip :128] @ {7,6,5,4,3,2,1,0,1,2,3,4,5,6,7,8}
-+ add ip, r2, #16
-+ vld1.16 {d4[],d5[]}, [r1] @ Top (right)
-+ mov r1, #8-2 @ loop stores rows 0..5; rows 6 and 7 are stored after it
-+ vshl.s16 q3, q0, #3
-+ vmovl.u8 q8, d3 @ {1,2,3,4,5,6,7,8}
-+ vld1.16 {d18[],d19[]}, [ip] @ Left (lower)
-+ vmla.i16 q3, q8, q2 @ Acc set up
-+ vsub.i16 q0, q9, q0 @ Add set up
-+ vmovl.u8 q1, d2 @ {7,6,5,4,3,2,1,0}
-+ vadd.i16 q2, q3, q0
-+
-+@ u16 7..0 [1] q1
-+@ u16 left[y] [1] [r2]
-+@ u16 acc [1] q3 = (x+1)*p[nTbS][-1] + 8*p[x][-1] initially
-+@ u16 add [1] q0 = p[-1][nTbS] - p[x][-1]
-+
-+ vld1.16 {d6[],d7[]}, [r2]!
-+ vadd.i16 q8, q2, q0
-+ vld1.16 {d18[],d19[]}, [r2]!
-+ vmla.i16 q2, q1, q3
-+ vadd.i16 q3, q8, q0
-+ vmla.i16 q8, q1, q9
-+1:
-+ vrshr.u16 q9, q2, #4
-+ subs r1, #2
-+ vmov q2, q3
-+ vrshr.u16 q10, q8, #4
-+ vld1.16 {d6[],d7[]}, [r2]!
-+ vst1.16 {q9}, [r0 :128], r3
-+ vadd.i16 q8, q2, q0
-+ vld1.16 {d18[],d19[]}, [r2]!
-+ vmla.i16 q2, q1, q3
-+ vadd.i16 q3, q8, q0
-+ vmla.i16 q8, q1, q9
-+ vst1.16 {q10}, [r0 :128], r3
-+ bne 1b
-+
-+ vrshr.u16 q9, q2, #4
-+ add r3, r0 @ r3 = address of the last row
-+ vrshr.u16 q10, q8, #4
-+ vst1.16 {q9}, [r0 :128]
-+ vst1.16 {q10}, [r3 :128]
-+
-+ bx lr
-+endfunc
-+
-+
-+@------------------------------------------------------------------------------
-+@
-+@ Data - has to be in two lumps to ensure we can always reach using adr
-+
-+ .balign 64
-+
-+@ Byte weight tables: a descending run followed by the matching ascending run
-+nb_31_0_1_32:
-+ .byte 31, 30, 29, 28, 27, 26, 25, 24
-+ .byte 23, 22, 21, 20, 19, 18, 17, 16
-+nb_15_0_1_16:
-+ .byte 15, 14, 13, 12, 11, 10, 9, 8
-+ .byte 7, 6, 5, 4, 3, 2, 1, 0
-+ .byte 1, 2, 3, 4, 5, 6, 7, 8
-+ .byte 9, 10, 11, 12, 13, 14, 15, 16
-+ .byte 17, 18, 19, 20, 21, 22, 23, 24
-+ .byte 25, 26, 27, 28, 29, 30, 31, 32
-+
-+ @ should be back on a 64-byte boundary here
-+
-+ @ These could be extracted from the above array, but are separated
-+ @ out for better (16 byte) alignment
-+nb_3_0_1_4:
-+ .byte 3, 2, 1, 0, 3, 2, 1, 0
-+ .byte 1, 2, 3, 4, 1, 2, 3, 4
-+nb_7_0_1_8:
-+ .byte 7, 6, 5, 4, 3, 2, 1, 0
-+ .byte 1, 2, 3, 4, 5, 6, 7, 8
-+nbh_3_0_1_4:
-+ .short 3, 2, 1, 0, 1, 2, 3, 4
-+
-+@------------------------------------------------------------------------------
-+
-+
-+@ ff_hevc_rpi_pred_planar_16_neon_8
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+function ff_hevc_rpi_pred_planar_16_neon_8, export=1
-+
-+ adr ip, nb_15_0_1_16 + 16
-+ vld1.8 {q0}, [r1 :128]! @ Top (left)
-+ add r2, #16
-+ vld1.8 {q1}, [ip: 128] @ {1,2,3...16}
-+ vld1.8 {d4[]}, [r1] @ Top (right)
-+ sub ip, #16
-+ vshll.u8 q3, d0, #4
-+ mov r1, #16
-+ vshll.u8 q8, d1, #4
-+ vld1.8 {d5[]}, [r2] @ Left (lower)
-+ sub r2, #16
-+ vmlal.u8 q3, d2, d4
-+ vmlal.u8 q8, d3, d4 @ Acc set up
-+ vsubl.u8 q1, d5, d0
-+ vsubl.u8 q0, d5, d1 @ Add set up
-+ vld1.8 {q2}, [ip :128] @ {15,14,13...0}
-+
-+@ u8 15..0 [1] q2
-+@ u8 left[y] [1] [r2]
-+@ u16 acc [2] q3,q8 = (x+1)*p[nTbS][-1] + 16*p[x][-1] initially
-+@ u16 add [2] q1,q0 = p[-1][nTbs] - p[x][-1]
-+
-+ vadd.i16 q3, q1
-+ vadd.i16 q8, q0
-+1:
-+ vadd.i16 q10, q3, q1
-+ subs r1, #2
-+ vld1.8 {d18[]}, [r2]!
-+ vadd.i16 q11, q8, q0
-+ vld1.8 {d19[]}, [r2]!
-+ vmlal.u8 q3, d4, d18
-+ vmlal.u8 q8, d5, d18
-+ vadd.i16 q12, q10, q1
-+ vmlal.u8 q10, d4, d19
-+ vadd.i16 q13, q11, q0
-+ vmlal.u8 q11, d5, d19
-+ vrshrn.u16 d18, q3, #5
-+ vrshrn.u16 d19, q8, #5
-+ vmov q3, q12
-+ vst1.8 {q9}, [r0 :128], r3
-+ vrshrn.u16 d18, q10, #5
-+ vrshrn.u16 d19, q11, #5
-+ vmov q8, q13
-+ vst1.8 {q9}, [r0 :128], r3
-+ bne 1b
-+
-+ bx lr
-+
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_planar_16_neon_10
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+function ff_hevc_rpi_pred_planar_16_neon_10, export=1
-+
-+ @ Load from bytes & expand later - at the very least this uses less
-+ @ memory than having a short table
-+ adr ip, nb_15_0_1_16 + 16
-+ vld1.16 {q0-q1}, [r1 :128]! @ Top (left)
-+ add r2, #32
-+ vld1.8 {q2}, [ip :128] @ {1,2,3...16}
-+ lsl r3, #1
-+ vld1.16 {d6[],d7[]}, [r1] @ Top (right)
-+ sub ip, #16
-+ vmovl.u8 q8, d4
-+ mov r1, #16
-+ vshl.i16 q9, q0, #4
-+ vmovl.u8 q2, d5
-+ vshl.i16 q10, q1, #4
-+ vld1.16 {d22[],d23[]}, [r2] @ Left (lower)
-+ sub r2, #32
-+ vld1.8 {q12}, [ip] @ {15,14,13...0}
-+ vmla.i16 q9, q8, q3
-+ vmla.i16 q10, q2, q3 @ Acc set up
-+ vsub.i16 q0, q11, q0
-+ vsub.i16 q1, q11, q1 @ Add set up
-+ vadd.i16 q2, q9, q0
-+ vadd.i16 q3, q10, q1
-+ vmovl.u8 q8, d24
-+ vmovl.u8 q9, d25
-+
-+@ u16 15..0 [2] q8,q9
-+@ u32 left[y] [2] [r2]
-+@ u16 acc [2] q2,q3 = (x+1)*p[nTbS][-1] + 16*p[x][-1] initially
-+@ u16 add [2] q0,q1 = p[-1][nTbs] - p[x][-1]
-+
-+1:
-+ vadd.i16 q10, q2, q0
-+ subs r1, #2
-+ vld1.16 {d24[],d25[]}, [r2]!
-+ vadd.i16 q11, q3, q1
-+ vld1.16 {d28[],d29[]}, [r2]!
-+ vmla.i16 q2, q8, q12
-+ vmla.i16 q3, q9, q12
-+ vadd.i16 q12, q10, q0
-+ vmla.i16 q10, q8, q14
-+ vadd.i16 q13, q11, q1
-+ vmla.i16 q11, q9, q14
-+ vrshr.u16 q14, q2, #5
-+ vrshr.u16 q15, q3, #5
-+ vmov q2, q12
-+ vst1.16 {q14-q15}, [r0 :128], r3
-+ vrshr.u16 q14, q10, #5
-+ vrshr.u16 q15, q11, #5
-+ vmov q3, q13
-+ vst1.16 {q14-q15}, [r0 :128], r3
-+ bne 1b
-+
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_planar_32_neon_8
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+function ff_hevc_rpi_pred_planar_32_neon_8, export=1
-+
-+ vld1.8 {q0-q1}, [r1 :128]! @ Top (left)
-+ adr ip, nb_31_0_1_32 + 32
-+ vpush {d8-d12}
-+ vld1.8 {q2-q3}, [ip :128] @ {1,2,3...32}
-+ add r2, #32
-+ vld1.8 {d8[]}, [r1] @ Top (right)
-+ sub ip, #32
-+ vshll.u8 q8, d0, #5
-+ mov r1, #32
-+ vld1.8 {d9[]}, [r2] @ Left (lower)
-+ sub r2, #32
-+ vshll.u8 q9, d1, #5
-+ vshll.u8 q10, d2, #5
-+ vshll.u8 q11, d3, #5
-+ vmlal.u8 q8, d4, d8
-+ vsubl.u8 q12, d9, d0
-+ vmlal.u8 q9, d5, d8
-+ vsubl.u8 q13, d9, d1
-+ vmlal.u8 q10, d6, d8
-+ vsubl.u8 q14, d9, d2
-+ vmlal.u8 q11, d7, d8 @ Acc set up
-+ vsubl.u8 q15, d9, d3 @ Add set up
-+ vadd.i16 q8, q12
-+ vadd.i16 q9, q13
-+ vadd.i16 q10, q14
-+ vadd.i16 q11, q15
-+ vld1.8 {q4-q5}, [ip :128] @ {31,30,29...0}
-+
-+@ u8 31..0 [2] q4,q5
-+@ u8 left[y] [2] [r2]
-+@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially
-+@ u16 add [4] q12-q15 = p[-1][nTbs] - p[x][-1]
-+
-+ vld1.8 {d12[]}, [r2]!
-+ vadd.i16 q0, q8, q12
-+ b 2f
-+1:
-+ vld1.8 {d12[]}, [r2]!
-+ vrshrn.u16 d3, q1, #6
-+ vrshrn.u16 d2, q0, #6
-+ vadd.i16 q0, q8, q12
-+ vrshrn.u16 d4, q2, #6
-+ vrshrn.u16 d5, q3, #6
-+ vst1.8 {q1-q2}, [r0 :128], r3
-+2: vadd.i16 q1, q9, q13
-+ subs r1, #2
-+ vadd.i16 q2, q10, q14
-+ vadd.i16 q3, q11, q15
-+ vmlal.u8 q8, d8, d12
-+ vmlal.u8 q9, d9, d12
-+ vmlal.u8 q10, d10, d12
-+ vmlal.u8 q11, d11, d12
-+ vld1.8 {d12[]}, [r2]!
-+ vrshrn.u16 d19, q9, #6
-+ vrshrn.u16 d18, q8, #6
-+ vadd.i16 q8, q0, q12
-+ vrshrn.u16 d20, q10, #6
-+ vrshrn.u16 d21, q11, #6
-+ vst1.8 {q9-q10}, [r0 :128], r3
-+ vadd.i16 q9, q1, q13
-+ vadd.i16 q10, q2, q14
-+ vadd.i16 q11, q3, q15
-+ vmlal.u8 q0, d8, d12
-+ vmlal.u8 q1, d9, d12
-+ vmlal.u8 q2, d10, d12
-+ vmlal.u8 q3, d11, d12
-+
-+ bne 1b
-+
-+ vpop {d8-d12}
-+
-+ vrshrn.u16 d3, q1, #6
-+ vrshrn.u16 d2, q0, #6
-+ vrshrn.u16 d4, q2, #6
-+ vrshrn.u16 d5, q3, #6
-+ vst1.8 {q1-q2}, [r0 :128]
-+
-+ bx lr
-+
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_planar_32_neon_10
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+function ff_hevc_rpi_pred_planar_32_neon_10, export=1
-+
-+ @ Load from bytes & expand later - at the very least this uses less
-+ @ memory than having a short table
-+ vld1.16 {q0-q1}, [r1 :128]! @ Top (left)
-+ adr ip, nb_31_0_1_32 + 32
-+ vpush {q4-q7}
-+ vld1.16 {q2-q3}, [r1 :128]! @ Top (centre)
-+ add r2, #64
-+ vld1.8 {q14-q15}, [ip :128] @ {1,2,3...32}
-+T lsl r3, #1
-+ vld1.16 {d8[],d9[]}, [r1] @ Top (right)
-+ sub ip, #32
-+ vmovl.u8 q12, d28
-+ mov r1, #32
-+ vmovl.u8 q13, d29
-+ vld1.8 {q6-q7}, [ip :128] @ {31,30,29...0}
-+ vmovl.u8 q14, d30
-+ vmovl.u8 q15, d31
-+ vld1.16 {d10[],d11[]}, [r2] @ Left (lower)
-+ sub r2, #64
-+ vshl.i16 q8, q0, #5
-+ vshl.i16 q9, q1, #5
-+ vshl.i16 q10, q2, #5
-+ vshl.i16 q11, q3, #5
-+ vmla.i16 q8, q12, q4
-+ vsub.i16 q0, q5, q0
-+ vmla.i16 q9, q13, q4
-+ vsub.i16 q1, q5, q1
-+ vmla.i16 q10, q14, q4
-+ vmov.u16 ip, d0[0]
-+ vsub.i16 q2, q5, q2
-+ vmla.i16 q11, q15, q4 @ Acc set up
-+ vsub.i16 q3, q5, q3 @ Add set up
-+ vadd.i16 q8, q0
-+ vadd.i16 q9, q1
-+ vadd.i16 q10, q2
-+ vadd.i16 q11, q3
-+ vmovl.u8 q4, d12
-+ vmovl.u8 q5, d13
-+ vmovl.u8 q6, d14
-+ vmovl.u8 q7, d15
-+
-+@ u16 31..0 [4] q4-q7
-+@ u16 left[y] [4] [r2]
-+@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 32*p[x][-1] initially
-+@ u16 add [4] q0-q3 = p[-1][nTbs] - p[x][-1]
-+
-+ vadd.i16 q12, q8, q0
-+A sub r0, r0, r3, lsl #1
-+T sub r0, r3
-+1:
-+ vld1.16 {d0[0]}, [r2]!
-+A add r0, r0, r3, lsl #1
-+T add r0, r3
-+ vadd.i16 q13, q9, q1
-+ subs r1, #2
-+ vadd.i16 q14, q10, q2
-+ vadd.i16 q15, q11, q3
-+ vmla.i16 q8, q4, d0[0]
-+ vmla.i16 q9, q5, d0[0]
-+ vmla.i16 q10, q6, d0[0]
-+ vmla.i16 q11, q7, d0[0]
-+ vmov.16 d0[0], ip
-+ vrshr.u16 q8, #6
-+ vrshr.u16 q9, #6
-+ vrshr.u16 q10, #6
-+ vrshr.u16 q11, #6
-+ vstm r0, {q8-q11}
-+ vadd.i16 q8, q12, q0
-+A add r0, r0, r3, lsl #1
-+T add r0, r3
-+ vld1.16 {d0[0]}, [r2]!
-+ vadd.i16 q9, q13, q1
-+ vadd.i16 q10, q14, q2
-+ vadd.i16 q11, q15, q3
-+ vmla.i16 q12, q4, d0[0]
-+ vmla.i16 q13, q5, d0[0]
-+ vmla.i16 q14, q6, d0[0]
-+ vmla.i16 q15, q7, d0[0]
-+ vmov.16 d0[0], ip
-+ vrshr.u16 q12, #6
-+ vrshr.u16 q13, #6
-+ vrshr.u16 q14, #6
-+ vrshr.u16 q15, #6
-+ vstm r0, {q12-q15}
-+ vadd.i16 q12, q8, q0
-+ bne 1b
-+
-+ vpop {q4-q7}
-+ bx lr
-+
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_planar_c_4_neon_8
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+function ff_hevc_rpi_pred_planar_c_4_neon_8, export=1
-+
-+ vld1.8 {q0}, [r1] @ Top
-+ adr ip, nbx2_3_0_1_4
-+ vldr d2, [r2, #8] @ Left (lower)
-+ mov r1, #4
-+ vldr d3, [ip, #8] @ {1,1,2,2,3,3,4,4}
-+ lsl r3, #1
-+ vshll.u8 q2, d0, #2
-+ vdup.16 d1, d1[0] @ {t4,t4,t4,t4,t4,t4,t4,t4}
-+ vdup.16 d2, d2[0] @ {l4,l4,l4,l4,l4,l4,l4,l4}
-+ vldr d6, [r2] @ Left (upper)
-+ vmlal.u8 q2, d3, d1
-+ vsubl.u8 q0, d2, d0
-+ vldr d7, [ip] @ {3,3,2,2,1,1,0,0}
-+
-+@ u8 3..0 [1] d7
-+@ u8 left[y] [1] d6
-+@ u16 acc [2] q2 (even rows) or q8 (odd rows) = (x+1)*p[nTbS][-1] + 4*p[x][-1] initially
-+@ u16 add [2] q0 = p[-1][nTbs] - p[x][-1]
-+
-+ vdup.16 d2, d6[0]
-+ vadd.i16 q2, q0
-+ vdup.16 d3, d6[1]
-+ vadd.i16 q8, q2, q0
-+1:
-+ vmlal.u8 q2, d7, d2
-+ subs r1, #2
-+ vadd.i16 q9, q8, q0
-+ vmlal.u8 q8, d7, d3
-+ vdup.16 d2, d6[2]
-+ vdup.16 d3, d6[3]
-+ vrshrn.i16 d20, q2, #3
-+ vmov q2, q9
-+ vst1.8 {d20}, [r0], r3
-+ vrshrn.i16 d20, q8, #3
-+ vadd.i16 q8, q2, q0
-+ vst1.8 {d20}, [r0], r3
-+ bne 1b
-+
-+ bx lr
-+
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_planar_c_4_neon_10
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+function ff_hevc_rpi_pred_planar_c_4_neon_10, export=1
-+
-+ adr ip, nbx2_3_0_1_4
-+ vld1.16 {q0}, [r1 :128]! @ Top (left)
-+ lsl r3, #2
-+ vld1.16 {q1}, [ip :128] @ {3,3,2,2,1,1,0,0,1,1,2,2,3,3,4,4}
-+ add ip, r2, #16
-+ vld1.32 {d4[],d5[]}, [r1] @ Top (right)
-+ vshl.s16 q3, q0, #2
-+ vmovl.u8 q8, d3 @ {1,1,2,2,3,3,4,4}
-+ vld1.32 {d18[],d19[]}, [ip] @ Left (lower)
-+ vmla.i16 q3, q8, q2 @ Acc set up
-+ vsub.i16 q0, q9, q0 @ Add set up
-+ vmovl.u8 q1, d2 @ {3,3,2,2,1,1,0,0}
-+ vadd.i16 q2, q3, q0
-+
-+@ u16 3..0 [1] q1
-+@ u32 left[y] [1] [r2]
-+@ u16 acc [1] q3 = (x+1)*p[nTbS][-1] + 4*p[x][-1] initially
-+@ u16 add [1] q0 = p[-1][nTbs] - p[x][-1]
-+
-+ vld1.32 {d6[],d7[]}, [r2]!
-+ vadd.i16 q8, q2, q0
-+ vld1.32 {d18[],d19[]}, [r2]!
-+ vmla.i16 q2, q1, q3
-+ vadd.i16 q3, q8, q0
-+ vmla.i16 q8, q1, q9
-+
-+ vrshr.u16 q9, q2, #3
-+ vmov q2, q3
-+ vrshr.u16 q10, q8, #3
-+ vld1.32 {d6[],d7[]}, [r2]!
-+ vst1.16 {q9}, [r0 :128], r3
-+ vadd.i16 q8, q2, q0
-+ vld1.32 {d18[],d19[]}, [r2]!
-+ vmla.i16 q2, q1, q3
-+ vadd.i16 q3, q8, q0
-+ vmla.i16 q8, q1, q9
-+ vst1.16 {q10}, [r0 :128], r3
-+
-+ vrshr.u16 q9, q2, #3
-+ add r3, r0
-+ vrshr.u16 q10, q8, #3
-+ vst1.16 {q9}, [r0 :128]
-+ vst1.16 {q10}, [r3 :128]
-+
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_planar_c_8_neon_8
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+function ff_hevc_rpi_pred_planar_c_8_neon_8, export=1
-+
-+ adr ip, nbx2_7_0_1_8 + 16
-+ vld1.8 {q0}, [r1 :128]! @ Top (left)
-+ add r2, #16
-+ vld1.8 {q1}, [ip: 128] @ {1,1,2,2,3,3...8,8}
-+ lsl r3, #1
-+ vld1.16 {d4[]}, [r1] @ Top (right)
-+ sub ip, #16
-+ vshll.u8 q3, d0, #3
-+ mov r1, #8
-+ vshll.u8 q8, d1, #3
-+ vld1.16 {d5[]}, [r2] @ Left (lower)
-+ sub r2, #16
-+ vmlal.u8 q3, d2, d4
-+ vmlal.u8 q8, d3, d4 @ Acc set up
-+ vsubl.u8 q1, d5, d0
-+ vsubl.u8 q0, d5, d1 @ Add set up
-+ vld1.8 {q2}, [ip :128] @ {7,7,6,6,5,5...0,0}
-+
-+@ u8 7..0 [1] q2
-+@ u8 left[y] [1] [r2]
-+@ u16 acc [2] q3,q8 = (x+1)*p[nTbS][-1] + 8*p[x][-1] initially
-+@ u16 add [2] q1,q0 = p[-1][nTbs] - p[x][-1]
-+
-+ vadd.i16 q3, q1
-+ vadd.i16 q8, q0
-+1:
-+ vadd.i16 q10, q3, q1
-+ subs r1, #2
-+ vld1.16 {d18[]}, [r2]!
-+ vadd.i16 q11, q8, q0
-+ vld1.16 {d19[]}, [r2]!
-+ vmlal.u8 q3, d4, d18
-+ vmlal.u8 q8, d5, d18
-+ vadd.i16 q12, q10, q1
-+ vmlal.u8 q10, d4, d19
-+ vadd.i16 q13, q11, q0
-+ vmlal.u8 q11, d5, d19
-+ vrshrn.u16 d18, q3, #4
-+ vrshrn.u16 d19, q8, #4
-+ vmov q3, q12
-+ vst1.8 {q9}, [r0 :128], r3
-+ vrshrn.u16 d18, q10, #4
-+ vrshrn.u16 d19, q11, #4
-+ vmov q8, q13
-+ vst1.8 {q9}, [r0 :128], r3
-+ bne 1b
-+
-+ bx lr
-+
-+endfunc
-+
-+
-+@------------------------------------------------------------------------------
-+@
-+@ Data - has to be in two lumps to ensure we can always reach using adr
-+
-+ .balign 64
-+
-+nbx2_15_0_1_16:
-+ .byte 15, 15, 14, 14, 13, 13, 12, 12
-+ .byte 11, 11, 10, 10, 9, 9, 8, 8
-+nbx2_7_0_1_8:
-+ .byte 7, 7, 6, 6, 5, 5, 4, 4
-+ .byte 3, 3, 2, 2, 1, 1, 0, 0
-+ .byte 1, 1, 2, 2, 3, 3, 4, 4
-+ .byte 5, 5, 6, 6, 7, 7, 8, 8
-+ .byte 9, 9, 10, 10, 11, 11, 12, 12
-+ .byte 13, 13, 14, 14, 15, 15, 16, 16
-+
-+ @ should be back on a 64-byte boundary here
-+
-+nbx2_3_0_1_4:
-+ .byte 3, 3, 2, 2, 1, 1, 0, 0
-+ .byte 1, 1, 2, 2, 3, 3, 4, 4
-+
-+@------------------------------------------------------------------------------
-+
-+
-+@ ff_hevc_rpi_pred_planar_c_8_neon_10
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+function ff_hevc_rpi_pred_planar_c_8_neon_10, export=1
-+
-+ @ Load from bytes & expand later - at the very least this uses less
-+ @ memory than having a short table
-+ adr ip, nbx2_7_0_1_8 + 16
-+ vld1.16 {q0-q1}, [r1 :128]! @ Top (left)
-+ add r2, #32
-+ vld1.8 {q2}, [ip :128] @ {1,1,2,2,3,3...8,8}
-+ lsl r3, #2
-+ vld1.32 {d6[],d7[]}, [r1] @ Top (right)
-+ sub ip, #16
-+ vmovl.u8 q8, d4
-+ mov r1, #8
-+ vshl.i16 q9, q0, #3
-+ vmovl.u8 q2, d5
-+ vshl.i16 q10, q1, #3
-+ vld1.32 {d22[],d23[]}, [r2] @ Left (lower)
-+ sub r2, #32
-+ vld1.8 {q12}, [ip] @ {7,7,6,6,5,5...0,0}
-+ vmla.i16 q9, q8, q3
-+ vmla.i16 q10, q2, q3 @ Acc set up
-+ vsub.i16 q0, q11, q0
-+ vsub.i16 q1, q11, q1 @ Add set up
-+ vadd.i16 q2, q9, q0
-+ vadd.i16 q3, q10, q1
-+ vmovl.u8 q8, d24
-+ vmovl.u8 q9, d25
-+
-+@ u16 7..0 [2] q8,q9
-+@ u32 left[y] [2] [r2]
-+@ u16 acc [2] q2,q3 = (x+1)*p[nTbS][-1] + 8*p[x][-1] initially
-+@ u16 add [2] q0,q1 = p[-1][nTbs] - p[x][-1]
-+
-+1:
-+ vadd.i16 q10, q2, q0
-+ subs r1, #2
-+ vld1.32 {d24[],d25[]}, [r2]!
-+ vadd.i16 q11, q3, q1
-+ vld1.32 {d28[],d29[]}, [r2]!
-+ vmla.i16 q2, q8, q12
-+ vmla.i16 q3, q9, q12
-+ vadd.i16 q12, q10, q0
-+ vmla.i16 q10, q8, q14
-+ vadd.i16 q13, q11, q1
-+ vmla.i16 q11, q9, q14
-+ vrshr.u16 q14, q2, #4
-+ vrshr.u16 q15, q3, #4
-+ vmov q2, q12
-+ vst1.16 {q14-q15}, [r0 :128], r3
-+ vrshr.u16 q14, q10, #4
-+ vrshr.u16 q15, q11, #4
-+ vmov q3, q13
-+ vst1.16 {q14-q15}, [r0 :128], r3
-+ bne 1b
-+
-+ bx lr
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_planar_c_16_neon_8
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+function ff_hevc_rpi_pred_planar_c_16_neon_8, export=1
-+
-+ vld1.8 {q0-q1}, [r1 :128]! @ Top (left)
-+ adr ip, nbx2_15_0_1_16 + 32
-+ vpush {d8-d12}
-+ vld1.8 {q2-q3}, [ip :128] @ {1,1,2,2,3,3...16,16}
-+ add r2, #32
-+ vld1.16 {d8[]}, [r1] @ Top (right)
-+ sub ip, #32
-+ vshll.u8 q8, d0, #4
-+ mov r1, #16
-+ vld1.16 {d9[]}, [r2] @ Left (lower)
-+ sub r2, #32
-+ vshll.u8 q9, d1, #4
-+ lsl r3, #1
-+ vshll.u8 q10, d2, #4
-+ vshll.u8 q11, d3, #4
-+ vmlal.u8 q8, d4, d8
-+ vsubl.u8 q12, d9, d0
-+ vmlal.u8 q9, d5, d8
-+ vsubl.u8 q13, d9, d1
-+ vmlal.u8 q10, d6, d8
-+ vsubl.u8 q14, d9, d2
-+ vmlal.u8 q11, d7, d8 @ Acc set up
-+ vsubl.u8 q15, d9, d3 @ Add set up
-+ vadd.i16 q8, q12
-+ vadd.i16 q9, q13
-+ vadd.i16 q10, q14
-+ vadd.i16 q11, q15
-+ vld1.8 {q4-q5}, [ip :128] @ {15,15,14,14,13,13...0,0}
-+
-+@ u8 15..0 [2] q4,q5
-+@ u8 left[y] [2] [r2]
-+@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 16*p[x][-1] initially
-+@ u16 add [4] q12-q15 = p[-1][nTbs] - p[x][-1]
-+
-+ vld1.16 {d12[]}, [r2]!
-+ vadd.i16 q0, q8, q12
-+ b 2f
-+1:
-+ vld1.16 {d12[]}, [r2]!
-+ vrshrn.u16 d3, q1, #5
-+ vrshrn.u16 d2, q0, #5
-+ vadd.i16 q0, q8, q12
-+ vrshrn.u16 d4, q2, #5
-+ vrshrn.u16 d5, q3, #5
-+ vst1.8 {q1-q2}, [r0 :128], r3
-+2: vadd.i16 q1, q9, q13
-+ subs r1, #2
-+ vadd.i16 q2, q10, q14
-+ vadd.i16 q3, q11, q15
-+ vmlal.u8 q8, d8, d12
-+ vmlal.u8 q9, d9, d12
-+ vmlal.u8 q10, d10, d12
-+ vmlal.u8 q11, d11, d12
-+ vld1.16 {d12[]}, [r2]!
-+ vrshrn.u16 d19, q9, #5
-+ vrshrn.u16 d18, q8, #5
-+ vadd.i16 q8, q0, q12
-+ vrshrn.u16 d20, q10, #5
-+ vrshrn.u16 d21, q11, #5
-+ vst1.8 {q9-q10}, [r0 :128], r3
-+ vadd.i16 q9, q1, q13
-+ vadd.i16 q10, q2, q14
-+ vadd.i16 q11, q3, q15
-+ vmlal.u8 q0, d8, d12
-+ vmlal.u8 q1, d9, d12
-+ vmlal.u8 q2, d10, d12
-+ vmlal.u8 q3, d11, d12
-+
-+ bne 1b
-+
-+ vpop {d8-d12}
-+
-+ vrshrn.u16 d3, q1, #5
-+ vrshrn.u16 d2, q0, #5
-+ vrshrn.u16 d4, q2, #5
-+ vrshrn.u16 d5, q3, #5
-+ vst1.8 {q1-q2}, [r0 :128]
-+
-+ bx lr
-+
-+endfunc
-+
-+
-+@ ff_hevc_rpi_pred_planar_c_16_neon_10
-+@ uint8_t *_src, [r0]
-+@ const uint8_t *_top, [r1]
-+@ const uint8_t *_left, [r2]
-+@ ptrdiff_t stride) [r3]
-+
-+function ff_hevc_rpi_pred_planar_c_16_neon_10, export=1
-+
-+ @ Load from bytes & expand later - at the very least this uses less
-+ @ memory than having a short table
-+ vld1.16 {q0-q1}, [r1 :128]! @ Top (left)
-+ adr ip, nbx2_15_0_1_16 + 32
-+ vpush {q4-q7}
-+ vld1.16 {q2-q3}, [r1 :128]! @ Top (centre)
-+ add r2, #64
-+ vld1.8 {q14-q15}, [ip :128] @ {1,1,2,2,3,3...16,16}
-+T lsl r3, #2
-+ vld1.32 {d8[],d9[]}, [r1] @ Top (right)
-+ sub ip, #32
-+ vmovl.u8 q12, d28
-+ mov r1, #16
-+ vmovl.u8 q13, d29
-+ vld1.8 {q6-q7}, [ip :128] @ {15,15,14,14,13,13...0,0}
-+ vmovl.u8 q14, d30
-+ vmovl.u8 q15, d31
-+ vld1.32 {d10[],d11[]}, [r2] @ Left (lower)
-+ sub r2, #64
-+ vshl.i16 q8, q0, #4
-+ vshl.i16 q9, q1, #4
-+ vshl.i16 q10, q2, #4
-+ vshl.i16 q11, q3, #4
-+ vmla.i16 q8, q12, q4
-+ vsub.i16 q0, q5, q0
-+ vmla.i16 q9, q13, q4
-+ vpush {q0}
-+ vsub.i16 q1, q5, q1
-+ vmla.i16 q10, q14, q4
-+ vsub.i16 q2, q5, q2
-+ vmla.i16 q11, q15, q4 @ Acc set up
-+ vsub.i16 q3, q5, q3 @ Add set up
-+ vadd.i16 q8, q0
-+ vadd.i16 q9, q1
-+ vadd.i16 q10, q2
-+ vadd.i16 q11, q3
-+ vmovl.u8 q4, d12
-+ vmovl.u8 q5, d13
-+ vmovl.u8 q6, d14
-+ vmovl.u8 q7, d15
-+
-+@ u16 15..0 [4] q4-q7
-+@ u32 left[y] [4] [r2]
-+@ u16 acc [4] q8-q11 = (x+1)*p[nTbS][-1] + 16*p[x][-1] initially
-+@ u16 add [4] q0-q3 = p[-1][nTbs] - p[x][-1]
-+
-+ vadd.i16 q12, q8, q0
-+A sub r0, r0, r3, lsl #2
-+T sub r0, r3
-+1:
-+ vld1.32 {d0[],d1[]}, [r2]!
-+A add r0, r0, r3, lsl #2
-+T add r0, r3
-+ vadd.i16 q13, q9, q1
-+ subs r1, #2
-+ vadd.i16 q14, q10, q2
-+ vadd.i16 q15, q11, q3
-+ vmla.i16 q8, q4, q0
-+ vmla.i16 q9, q5, q0
-+ vmla.i16 q10, q6, q0
-+ vmla.i16 q11, q7, q0
-+ vld1.16 {q0}, [sp]
-+ vrshr.u16 q8, #5
-+ vrshr.u16 q9, #5
-+ vrshr.u16 q10, #5
-+ vrshr.u16 q11, #5
-+ vstm r0, {q8-q11}
-+ vadd.i16 q8, q12, q0
-+A add r0, r0, r3, lsl #2
-+T add r0, r3
-+ vld1.32 {d0[],d1[]}, [r2]!
-+ vadd.i16 q9, q13, q1
-+ vadd.i16 q10, q14, q2
-+ vadd.i16 q11, q15, q3
-+ vmla.i16 q12, q4, q0
-+ vmla.i16 q13, q5, q0
-+ vmla.i16 q14, q6, q0
-+ vmla.i16 q15, q7, q0
-+ vld1.16 {q0}, [sp]
-+ vrshr.u16 q12, #5
-+ vrshr.u16 q13, #5
-+ vrshr.u16 q14, #5
-+ vrshr.u16 q15, #5
-+ vstm r0, {q12-q15}
-+ vadd.i16 q12, q8, q0
-+ bne 1b
-+
-+ vpop {q3-q7}
-+ bx lr
-+
-+endfunc
-diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
-index fb0c6fae70..9f2ebb16f3 100644
---- a/libavcodec/avcodec.h
-+++ b/libavcodec/avcodec.h
-@@ -3208,7 +3208,13 @@ typedef struct AVCodecContext {
- #endif
-
- /**
-- * Audio only. The amount of padding (in samples) appended by the encoder to
-+ * Opaque pointer for use by replacement get_buffer2 code
-+ *
-+ * @author jc (08/02/2016)
-+ */
-+ void * get_buffer_context;
-+
-+ /** Audio only. The amount of padding (in samples) appended by the encoder to
- * the end of the audio. I.e. this number of decoded samples must be
- * discarded by the caller from the end of the stream to get the original
- * audio without any trailing padding.
-@@ -4593,6 +4599,17 @@ void av_packet_rescale_ts(AVPacket *pkt, AVRational tb_src, AVRational tb_dst);
- */
- AVCodec *avcodec_find_decoder(enum AVCodecID id);
-
-+/**
-+ * Find a registered decoder with a matching codec ID and pix_fmt.
-+ * A decoder with pix_fmt set to NULL will match any fmt.
-+ * A fmt of AV_PIX_FMT_NONE will only match a decoder with pix_fmt NULL.
-+ *
-+ * @param id AVCodecID of the requested decoder
-+ * @param fmt AVPixelFormat that must be supported by decoder
-+ * @return A decoder if one was found, NULL otherwise.
-+ */
-+AVCodec *avcodec_find_decoder_by_id_and_fmt(enum AVCodecID id, enum AVPixelFormat fmt);
-+
- /**
- * Find a registered decoder with the specified name.
- *
-diff --git a/libavcodec/cabac.h b/libavcodec/cabac.h
-index 1bf1c620d6..ccfa991f60 100644
---- a/libavcodec/cabac.h
-+++ b/libavcodec/cabac.h
-@@ -43,7 +43,14 @@ extern const uint8_t ff_h264_cabac_tables[512 + 4*2*64 + 4*64 + 63];
- typedef struct CABACContext{
- int low;
- int range;
-- int outstanding_count;
-+ union
-+ {
-+ int outstanding_count;
-+ struct {
-+ uint16_t bits;
-+ uint16_t range;
-+ } by22;
-+ };
- const uint8_t *bytestream_start;
- const uint8_t *bytestream;
- const uint8_t *bytestream_end;
-diff --git a/libavcodec/mmaldec.c b/libavcodec/mmaldec.c
-index 647a22ef7c..4ed35d1126 100644
---- a/libavcodec/mmaldec.c
-+++ b/libavcodec/mmaldec.c
-@@ -24,6 +24,9 @@
- * MMAL Video Decoder
- */
-
-+#pragma GCC diagnostic push
-+// Many many redundant decls in the header files
-+#pragma GCC diagnostic ignored "-Wredundant-decls"
- #include <bcm_host.h>
- #include <interface/mmal/mmal.h>
- #include <interface/mmal/mmal_parameters_video.h>
-@@ -31,6 +34,7 @@
- #include <interface/mmal/util/mmal_util_params.h>
- #include <interface/mmal/util/mmal_default_components.h>
- #include <interface/mmal/vc/mmal_vc_api.h>
-+#pragma GCC diagnostic pop
- #include <stdatomic.h>
-
- #include "avcodec.h"
-diff --git a/libavcodec/raw.c b/libavcodec/raw.c
-index 8da2a9735e..9089f9b4ea 100644
---- a/libavcodec/raw.c
-+++ b/libavcodec/raw.c
-@@ -283,6 +283,10 @@ const PixelFormatTag ff_raw_pix_fmt_tags[] = {
- { AV_PIX_FMT_YUV444P16LE, MKTAG('I', '4', 'F', 'L') },
- { AV_PIX_FMT_YUV444P16BE, MKTAG('I', '4', 'F', 'B') },
-
-+ /* RPI (Might as well define for everything) */
-+ { AV_PIX_FMT_SAND128, MKTAG('S', 'A', 'N', 'D') },
-+ { AV_PIX_FMT_SAND64_10, MKTAG('S', 'N', 'D', 'A') },
-+
- /* special */
- { AV_PIX_FMT_RGB565LE,MKTAG( 3 , 0 , 0 , 0 ) }, /* flipped RGB565LE */
- { AV_PIX_FMT_YUV444P, MKTAG('Y', 'V', '2', '4') }, /* YUV444P, swapped UV */
-diff --git a/libavcodec/rawenc.c b/libavcodec/rawenc.c
-index d181b74570..c52c450956 100644
---- a/libavcodec/rawenc.c
-+++ b/libavcodec/rawenc.c
-@@ -24,6 +24,7 @@
- * Raw Video Encoder
- */
-
-+#include "config.h"
- #include "avcodec.h"
- #include "raw.h"
- #include "internal.h"
-@@ -31,6 +32,10 @@
- #include "libavutil/intreadwrite.h"
- #include "libavutil/imgutils.h"
- #include "libavutil/internal.h"
-+#include "libavutil/avassert.h"
-+#if CONFIG_SAND
-+#include "libavutil/rpi_sand_fns.h"
-+#endif
-
- static av_cold int raw_encode_init(AVCodecContext *avctx)
- {
-@@ -49,6 +54,55 @@ FF_ENABLE_DEPRECATION_WARNINGS
- return 0;
- }
-
-+#if CONFIG_SAND
-+static int raw_sand8_as_yuv420(AVCodecContext *avctx, AVPacket *pkt,
-+ const AVFrame *frame)
-+{
-+ const int width = av_frame_cropped_width(frame);
-+ const int height = av_frame_cropped_height(frame);
-+ const int x0 = frame->crop_left;
-+ const int y0 = frame->crop_top;
-+ const int size = width * height * 3 / 2;
-+ uint8_t * dst;
-+ int ret;
-+
-+ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0)
-+ return ret;
-+
-+ dst = pkt->data;
-+
-+ av_rpi_sand_to_planar_y8(dst, width, frame->data[0], frame->linesize[0], frame->linesize[3], x0, y0, width, height);
-+ dst += width * height;
-+ av_rpi_sand_to_planar_c8(dst, width / 2, dst + width * height / 4, width / 2,
-+ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0 / 2, y0 / 2, width / 2, height / 2);
-+ return 0;
-+}
-+
-+static int raw_sand16_as_yuv420(AVCodecContext *avctx, AVPacket *pkt,
-+ const AVFrame *frame)
-+{
-+ const int width = av_frame_cropped_width(frame);
-+ const int height = av_frame_cropped_height(frame);
-+ const int x0 = frame->crop_left;
-+ const int y0 = frame->crop_top;
-+ const int size = width * height * 3;
-+ uint8_t * dst;
-+ int ret;
-+
-+ if ((ret = ff_alloc_packet2(avctx, pkt, size, size)) < 0)
-+ return ret;
-+
-+ dst = pkt->data;
-+
-+ av_rpi_sand_to_planar_y16(dst, width * 2, frame->data[0], frame->linesize[0], frame->linesize[3], x0 * 2, y0, width * 2, height);
-+ dst += width * height * 2;
-+ av_rpi_sand_to_planar_c16(dst, width, dst + width * height / 2, width,
-+ frame->data[1], frame->linesize[1], av_rpi_sand_frame_stride2(frame), x0, y0 / 2, width, height / 2);
-+ return 0;
-+}
-+#endif
-+
-+
- static int raw_encode(AVCodecContext *avctx, AVPacket *pkt,
- const AVFrame *frame, int *got_packet)
- {
-@@ -58,6 +112,14 @@ static int raw_encode(AVCodecContext *avctx, AVPacket *pkt,
- if (ret < 0)
- return ret;
-
-+#if CONFIG_SAND
-+ if (av_rpi_is_sand_frame(frame)) {
-+ ret = av_rpi_is_sand8_frame(frame) ? raw_sand8_as_yuv420(avctx, pkt, frame) : raw_sand16_as_yuv420(avctx, pkt, frame);
-+ *got_packet = (ret == 0);
-+ return ret;
-+ }
-+#endif
-+
- if ((ret = ff_alloc_packet2(avctx, pkt, ret, ret)) < 0)
- return ret;
- if ((ret = av_image_copy_to_buffer(pkt->data, pkt->size,
-diff --git a/libavcodec/rpi_hevc_cabac.c b/libavcodec/rpi_hevc_cabac.c
-new file mode 100644
-index 0000000000..552c2e349e
---- /dev/null
-+++ b/libavcodec/rpi_hevc_cabac.c
-@@ -0,0 +1,2255 @@
-+/*
-+ * HEVC CABAC decoding
-+ *
-+ * Copyright (C) 2012 - 2013 Guillaume Martres
-+ * Copyright (C) 2012 - 2013 Gildas Cocherel
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#define UNCHECKED_BITSTREAM_READER 1
-+
-+#include "libavutil/attributes.h"
-+#include "libavutil/common.h"
-+
-+#include "cabac_functions.h"
-+#include "rpi_hevc_data.h"
-+#include "hevc.h"
-+#include "rpi_hevcdec.h"
-+#include "rpi_hevc_cabac_fns.h"
-+
-+#include "libavutil/rpi_sand_fns.h"
-+
-+// BY22 is probably faster than simple bypass if the processor has
-+// either a fast 32-bit divide or a fast 32x32->64[63:32] instruction
-+// x86 has fast int divide
-+// Arm doesn't have divide or general fast 64 bit, but does have the multiply
-+// * Beware: ARCH_xxx isn't set if configure --disable-asm is used
-+#define USE_BY22 (HAVE_FAST_64BIT || ARCH_ARM || ARCH_X86)
-+// Use native divide if we have a fast one - otherwise use mpy 1/x
-+// x86 has a fast integer divide - arm doesn't - unsure about other
-+// architectures
-+#define USE_BY22_DIV ARCH_X86
-+
-+// Special case blocks with a single significant coeff
-+// Decreases the complexity of the code for a common case but increases the
-+// code size.
-+#define USE_N_END_1 1
-+
-+#if !USE_BY22_DIV
-+// * 1/x @ 32 bits gets us 22 bits of accuracy
-+#define CABAC_BY22_PEEK_BITS 22
-+#else
-+// A real 32-bit divide gets us another bit
-+// If we have a 64 bit int & a unit time divider then we should get a lot
-+// of bits (55) but that is untested and it is unclear if it would give
-+// us a large advantage
-+#define CABAC_BY22_PEEK_BITS 23
-+#endif
-+
-+#define CABAC_MAX_BIN 31
-+
-+
-+#if USE_BY22 && !USE_BY22_DIV
-+#define I(x) (uint32_t)((0x10000000000ULL / (uint64_t)(x)) + 1ULL)
-+
-+static const uint32_t cabac_by22_inv_range[256] = {
-+ 0, I(257), I(258), I(259),
-+ I(260), I(261), I(262), I(263), I(264), I(265), I(266), I(267), I(268), I(269),
-+ I(270), I(271), I(272), I(273), I(274), I(275), I(276), I(277), I(278), I(279),
-+ I(280), I(281), I(282), I(283), I(284), I(285), I(286), I(287), I(288), I(289),
-+ I(290), I(291), I(292), I(293), I(294), I(295), I(296), I(297), I(298), I(299),
-+ I(300), I(301), I(302), I(303), I(304), I(305), I(306), I(307), I(308), I(309),
-+ I(310), I(311), I(312), I(313), I(314), I(315), I(316), I(317), I(318), I(319),
-+ I(320), I(321), I(322), I(323), I(324), I(325), I(326), I(327), I(328), I(329),
-+ I(330), I(331), I(332), I(333), I(334), I(335), I(336), I(337), I(338), I(339),
-+ I(340), I(341), I(342), I(343), I(344), I(345), I(346), I(347), I(348), I(349),
-+ I(350), I(351), I(352), I(353), I(354), I(355), I(356), I(357), I(358), I(359),
-+ I(360), I(361), I(362), I(363), I(364), I(365), I(366), I(367), I(368), I(369),
-+ I(370), I(371), I(372), I(373), I(374), I(375), I(376), I(377), I(378), I(379),
-+ I(380), I(381), I(382), I(383), I(384), I(385), I(386), I(387), I(388), I(389),
-+ I(390), I(391), I(392), I(393), I(394), I(395), I(396), I(397), I(398), I(399),
-+ I(400), I(401), I(402), I(403), I(404), I(405), I(406), I(407), I(408), I(409),
-+ I(410), I(411), I(412), I(413), I(414), I(415), I(416), I(417), I(418), I(419),
-+ I(420), I(421), I(422), I(423), I(424), I(425), I(426), I(427), I(428), I(429),
-+ I(430), I(431), I(432), I(433), I(434), I(435), I(436), I(437), I(438), I(439),
-+ I(440), I(441), I(442), I(443), I(444), I(445), I(446), I(447), I(448), I(449),
-+ I(450), I(451), I(452), I(453), I(454), I(455), I(456), I(457), I(458), I(459),
-+ I(460), I(461), I(462), I(463), I(464), I(465), I(466), I(467), I(468), I(469),
-+ I(470), I(471), I(472), I(473), I(474), I(475), I(476), I(477), I(478), I(479),
-+ I(480), I(481), I(482), I(483), I(484), I(485), I(486), I(487), I(488), I(489),
-+ I(490), I(491), I(492), I(493), I(494), I(495), I(496), I(497), I(498), I(499),
-+ I(500), I(501), I(502), I(503), I(504), I(505), I(506), I(507), I(508), I(509),
-+ I(510), I(511)
-+};
-+#undef I
-+#endif // USE_BY22
-+
-+#if ARCH_ARM
-+#include "arm/rpi_hevc_cabac.h"
-+#endif
-+
-+/**
-+ * number of bin by SyntaxElement.
-+ */
-+static const int8_t num_bins_in_se[] = {
-+ 1, // sao_merge_flag
-+ 1, // sao_type_idx
-+ 0, // sao_eo_class
-+ 0, // sao_band_position
-+ 0, // sao_offset_abs
-+ 0, // sao_offset_sign
-+ 0, // end_of_slice_flag
-+ 3, // split_coding_unit_flag
-+ 1, // cu_transquant_bypass_flag
-+ 3, // skip_flag
-+ 3, // cu_qp_delta
-+ 1, // pred_mode
-+ 4, // part_mode
-+ 0, // pcm_flag
-+ 1, // prev_intra_luma_pred_mode
-+ 0, // mpm_idx
-+ 0, // rem_intra_luma_pred_mode
-+ 2, // intra_chroma_pred_mode
-+ 1, // merge_flag
-+ 1, // merge_idx
-+ 5, // inter_pred_idc
-+ 2, // ref_idx_l0
-+ 2, // ref_idx_l1
-+ 2, // abs_mvd_greater0_flag
-+ 2, // abs_mvd_greater1_flag
-+ 0, // abs_mvd_minus2
-+ 0, // mvd_sign_flag
-+ 1, // mvp_lx_flag
-+ 1, // no_residual_data_flag
-+ 3, // split_transform_flag
-+ 2, // cbf_luma
-+ 4, // cbf_cb, cbf_cr
-+ 2, // transform_skip_flag[][]
-+ 2, // explicit_rdpcm_flag[][]
-+ 2, // explicit_rdpcm_dir_flag[][]
-+ 18, // last_significant_coeff_x_prefix
-+ 18, // last_significant_coeff_y_prefix
-+ 0, // last_significant_coeff_x_suffix
-+ 0, // last_significant_coeff_y_suffix
-+ 4, // significant_coeff_group_flag
-+ 44, // significant_coeff_flag
-+ 24, // coeff_abs_level_greater1_flag
-+ 6, // coeff_abs_level_greater2_flag
-+ 0, // coeff_abs_level_remaining
-+ 0, // coeff_sign_flag
-+ 8, // log2_res_scale_abs
-+ 2, // res_scale_sign_flag
-+ 1, // cu_chroma_qp_offset_flag
-+ 1, // cu_chroma_qp_offset_idx
-+};
-+
-+/**
-+ * Offset to ctxIdx 0 in init_values and states, indexed by SyntaxElement.
-+ */
-+static const int elem_offset[sizeof(num_bins_in_se)] = {
-+ 0, // sao_merge_flag
-+ 1, // sao_type_idx
-+ 2, // sao_eo_class
-+ 2, // sao_band_position
-+ 2, // sao_offset_abs
-+ 2, // sao_offset_sign
-+ 2, // end_of_slice_flag
-+ 2, // split_coding_unit_flag
-+ 5, // cu_transquant_bypass_flag
-+ 6, // skip_flag
-+ 9, // cu_qp_delta
-+ 12, // pred_mode
-+ 13, // part_mode
-+ 17, // pcm_flag
-+ 17, // prev_intra_luma_pred_mode
-+ 18, // mpm_idx
-+ 18, // rem_intra_luma_pred_mode
-+ 18, // intra_chroma_pred_mode
-+ 20, // merge_flag
-+ 21, // merge_idx
-+ 22, // inter_pred_idc
-+ 27, // ref_idx_l0
-+ 29, // ref_idx_l1
-+ 31, // abs_mvd_greater0_flag
-+ 33, // abs_mvd_greater1_flag
-+ 35, // abs_mvd_minus2
-+ 35, // mvd_sign_flag
-+ 35, // mvp_lx_flag
-+ 36, // no_residual_data_flag
-+ 37, // split_transform_flag
-+ 40, // cbf_luma
-+ 42, // cbf_cb, cbf_cr
-+ 46, // transform_skip_flag[][]
-+ 48, // explicit_rdpcm_flag[][]
-+ 50, // explicit_rdpcm_dir_flag[][]
-+ 52, // last_significant_coeff_x_prefix
-+ 70, // last_significant_coeff_y_prefix
-+ 88, // last_significant_coeff_x_suffix
-+ 88, // last_significant_coeff_y_suffix
-+ 88, // significant_coeff_group_flag
-+ 92, // significant_coeff_flag
-+ 136, // coeff_abs_level_greater1_flag
-+ 160, // coeff_abs_level_greater2_flag
-+ 166, // coeff_abs_level_remaining
-+ 166, // coeff_sign_flag
-+ 166, // log2_res_scale_abs
-+ 174, // res_scale_sign_flag
-+ 176, // cu_chroma_qp_offset_flag
-+ 177, // cu_chroma_qp_offset_idx
-+};
-+
-+#define CNU 154
-+/**
-+ * Indexed by init_type
-+ */
-+static const uint8_t init_values[3][HEVC_CONTEXTS] = {
-+ { // sao_merge_flag
-+ 153,
-+ // sao_type_idx
-+ 200,
-+ // split_coding_unit_flag
-+ 139, 141, 157,
-+ // cu_transquant_bypass_flag
-+ 154,
-+ // skip_flag
-+ CNU, CNU, CNU,
-+ // cu_qp_delta
-+ 154, 154, 154,
-+ // pred_mode
-+ CNU,
-+ // part_mode
-+ 184, CNU, CNU, CNU,
-+ // prev_intra_luma_pred_mode
-+ 184,
-+ // intra_chroma_pred_mode
-+ 63, 139,
-+ // merge_flag
-+ CNU,
-+ // merge_idx
-+ CNU,
-+ // inter_pred_idc
-+ CNU, CNU, CNU, CNU, CNU,
-+ // ref_idx_l0
-+ CNU, CNU,
-+ // ref_idx_l1
-+ CNU, CNU,
-+ // abs_mvd_greater0_flag
-+ CNU, CNU,
-+ // abs_mvd_greater1_flag
-+ CNU, CNU,
-+ // mvp_lx_flag
-+ CNU,
-+ // no_residual_data_flag
-+ CNU,
-+ // split_transform_flag
-+ 153, 138, 138,
-+ // cbf_luma
-+ 111, 141,
-+ // cbf_cb, cbf_cr
-+ 94, 138, 182, 154,
-+ // transform_skip_flag
-+ 139, 139,
-+ // explicit_rdpcm_flag
-+ 139, 139,
-+ // explicit_rdpcm_dir_flag
-+ 139, 139,
-+ // last_significant_coeff_x_prefix
-+ 110, 110, 124, 125, 140, 153, 125, 127, 140, 109, 111, 143, 127, 111,
-+ 79, 108, 123, 63,
-+ // last_significant_coeff_y_prefix
-+ 110, 110, 124, 125, 140, 153, 125, 127, 140, 109, 111, 143, 127, 111,
-+ 79, 108, 123, 63,
-+ // significant_coeff_group_flag
-+ 91, 171, 134, 141,
-+ // significant_coeff_flag
-+ 111, 111, 125, 110, 110, 94, 124, 108, 124, 107, 125, 141, 179, 153,
-+ 125, 107, 125, 141, 179, 153, 125, 107, 125, 141, 179, 153, 125, 140,
-+ 139, 182, 182, 152, 136, 152, 136, 153, 136, 139, 111, 136, 139, 111,
-+ 141, 111,
-+ // coeff_abs_level_greater1_flag
-+ 140, 92, 137, 138, 140, 152, 138, 139, 153, 74, 149, 92, 139, 107,
-+ 122, 152, 140, 179, 166, 182, 140, 227, 122, 197,
-+ // coeff_abs_level_greater2_flag
-+ 138, 153, 136, 167, 152, 152,
-+ // log2_res_scale_abs
-+ 154, 154, 154, 154, 154, 154, 154, 154,
-+ // res_scale_sign_flag
-+ 154, 154,
-+ // cu_chroma_qp_offset_flag
-+ 154,
-+ // cu_chroma_qp_offset_idx
-+ 154,
-+ },
-+ { // sao_merge_flag
-+ 153,
-+ // sao_type_idx
-+ 185,
-+ // split_coding_unit_flag
-+ 107, 139, 126,
-+ // cu_transquant_bypass_flag
-+ 154,
-+ // skip_flag
-+ 197, 185, 201,
-+ // cu_qp_delta
-+ 154, 154, 154,
-+ // pred_mode
-+ 149,
-+ // part_mode
-+ 154, 139, 154, 154,
-+ // prev_intra_luma_pred_mode
-+ 154,
-+ // intra_chroma_pred_mode
-+ 152, 139,
-+ // merge_flag
-+ 110,
-+ // merge_idx
-+ 122,
-+ // inter_pred_idc
-+ 95, 79, 63, 31, 31,
-+ // ref_idx_l0
-+ 153, 153,
-+ // ref_idx_l1
-+ 153, 153,
-+ // abs_mvd_greater0_flag
-+ 140, 198,
-+ // abs_mvd_greater1_flag
-+ 140, 198,
-+ // mvp_lx_flag
-+ 168,
-+ // no_residual_data_flag
-+ 79,
-+ // split_transform_flag
-+ 124, 138, 94,
-+ // cbf_luma
-+ 153, 111,
-+ // cbf_cb, cbf_cr
-+ 149, 107, 167, 154,
-+ // transform_skip_flag
-+ 139, 139,
-+ // explicit_rdpcm_flag
-+ 139, 139,
-+ // explicit_rdpcm_dir_flag
-+ 139, 139,
-+ // last_significant_coeff_x_prefix
-+ 125, 110, 94, 110, 95, 79, 125, 111, 110, 78, 110, 111, 111, 95,
-+ 94, 108, 123, 108,
-+ // last_significant_coeff_y_prefix
-+ 125, 110, 94, 110, 95, 79, 125, 111, 110, 78, 110, 111, 111, 95,
-+ 94, 108, 123, 108,
-+ // significant_coeff_group_flag
-+ 121, 140, 61, 154,
-+ // significant_coeff_flag
-+ 155, 154, 139, 153, 139, 123, 123, 63, 153, 166, 183, 140, 136, 153,
-+ 154, 166, 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170,
-+ 153, 123, 123, 107, 121, 107, 121, 167, 151, 183, 140, 151, 183, 140,
-+ 140, 140,
-+ // coeff_abs_level_greater1_flag
-+ 154, 196, 196, 167, 154, 152, 167, 182, 182, 134, 149, 136, 153, 121,
-+ 136, 137, 169, 194, 166, 167, 154, 167, 137, 182,
-+ // coeff_abs_level_greater2_flag
-+ 107, 167, 91, 122, 107, 167,
-+ // log2_res_scale_abs
-+ 154, 154, 154, 154, 154, 154, 154, 154,
-+ // res_scale_sign_flag
-+ 154, 154,
-+ // cu_chroma_qp_offset_flag
-+ 154,
-+ // cu_chroma_qp_offset_idx
-+ 154,
-+ },
-+ { // sao_merge_flag
-+ 153,
-+ // sao_type_idx
-+ 160,
-+ // split_coding_unit_flag
-+ 107, 139, 126,
-+ // cu_transquant_bypass_flag
-+ 154,
-+ // skip_flag
-+ 197, 185, 201,
-+ // cu_qp_delta
-+ 154, 154, 154,
-+ // pred_mode
-+ 134,
-+ // part_mode
-+ 154, 139, 154, 154,
-+ // prev_intra_luma_pred_mode
-+ 183,
-+ // intra_chroma_pred_mode
-+ 152, 139,
-+ // merge_flag
-+ 154,
-+ // merge_idx
-+ 137,
-+ // inter_pred_idc
-+ 95, 79, 63, 31, 31,
-+ // ref_idx_l0
-+ 153, 153,
-+ // ref_idx_l1
-+ 153, 153,
-+ // abs_mvd_greater0_flag
-+ 169, 198,
-+ // abs_mvd_greater1_flag
-+ 169, 198,
-+ // mvp_lx_flag
-+ 168,
-+ // no_residual_data_flag
-+ 79,
-+ // split_transform_flag
-+ 224, 167, 122,
-+ // cbf_luma
-+ 153, 111,
-+ // cbf_cb, cbf_cr
-+ 149, 92, 167, 154,
-+ // transform_skip_flag
-+ 139, 139,
-+ // explicit_rdpcm_flag
-+ 139, 139,
-+ // explicit_rdpcm_dir_flag
-+ 139, 139,
-+ // last_significant_coeff_x_prefix
-+ 125, 110, 124, 110, 95, 94, 125, 111, 111, 79, 125, 126, 111, 111,
-+ 79, 108, 123, 93,
-+ // last_significant_coeff_y_prefix
-+ 125, 110, 124, 110, 95, 94, 125, 111, 111, 79, 125, 126, 111, 111,
-+ 79, 108, 123, 93,
-+ // significant_coeff_group_flag
-+ 121, 140, 61, 154,
-+ // significant_coeff_flag
-+ 170, 154, 139, 153, 139, 123, 123, 63, 124, 166, 183, 140, 136, 153,
-+ 154, 166, 183, 140, 136, 153, 154, 166, 183, 140, 136, 153, 154, 170,
-+ 153, 138, 138, 122, 121, 122, 121, 167, 151, 183, 140, 151, 183, 140,
-+ 140, 140,
-+ // coeff_abs_level_greater1_flag
-+ 154, 196, 167, 167, 154, 152, 167, 182, 182, 134, 149, 136, 153, 121,
-+ 136, 122, 169, 208, 166, 167, 154, 152, 167, 182,
-+ // coeff_abs_level_greater2_flag
-+ 107, 167, 91, 107, 107, 167,
-+ // log2_res_scale_abs
-+ 154, 154, 154, 154, 154, 154, 154, 154,
-+ // res_scale_sign_flag
-+ 154, 154,
-+ // cu_chroma_qp_offset_flag
-+ 154,
-+ // cu_chroma_qp_offset_idx
-+ 154,
-+ },
-+};
-+
-+static const uint8_t scan_1x1[1] = {
-+ 0,
-+};
-+
-+static const uint8_t horiz_scan2x2_x[4] = {
-+ 0, 1, 0, 1,
-+};
-+
-+static const uint8_t horiz_scan2x2_y[4] = {
-+ 0, 0, 1, 1
-+};
-+
-+static const uint8_t horiz_scan4x4_x[16] = {
-+ 0, 1, 2, 3,
-+ 0, 1, 2, 3,
-+ 0, 1, 2, 3,
-+ 0, 1, 2, 3,
-+};
-+
-+static const uint8_t horiz_scan4x4_y[16] = {
-+ 0, 0, 0, 0,
-+ 1, 1, 1, 1,
-+ 2, 2, 2, 2,
-+ 3, 3, 3, 3,
-+};
-+
-+static const uint8_t horiz_scan8x8_inv[8][8] = {
-+ { 0, 1, 2, 3, 16, 17, 18, 19, },
-+ { 4, 5, 6, 7, 20, 21, 22, 23, },
-+ { 8, 9, 10, 11, 24, 25, 26, 27, },
-+ { 12, 13, 14, 15, 28, 29, 30, 31, },
-+ { 32, 33, 34, 35, 48, 49, 50, 51, },
-+ { 36, 37, 38, 39, 52, 53, 54, 55, },
-+ { 40, 41, 42, 43, 56, 57, 58, 59, },
-+ { 44, 45, 46, 47, 60, 61, 62, 63, },
-+};
-+
-+static const uint8_t diag_scan2x2_x[4] = {
-+ 0, 0, 1, 1,
-+};
-+
-+static const uint8_t diag_scan2x2_y[4] = {
-+ 0, 1, 0, 1,
-+};
-+
-+static const uint8_t diag_scan2x2_inv[2][2] = {
-+ { 0, 2, },
-+ { 1, 3, },
-+};
-+
-+static const uint8_t diag_scan4x4_inv[4][4] = {
-+ { 0, 2, 5, 9, },
-+ { 1, 4, 8, 12, },
-+ { 3, 7, 11, 14, },
-+ { 6, 10, 13, 15, },
-+};
-+
-+static const uint8_t diag_scan8x8_inv[8][8] = {
-+ { 0, 2, 5, 9, 14, 20, 27, 35, },
-+ { 1, 4, 8, 13, 19, 26, 34, 42, },
-+ { 3, 7, 12, 18, 25, 33, 41, 48, },
-+ { 6, 11, 17, 24, 32, 40, 47, 53, },
-+ { 10, 16, 23, 31, 39, 46, 52, 57, },
-+ { 15, 22, 30, 38, 45, 51, 56, 60, },
-+ { 21, 29, 37, 44, 50, 55, 59, 62, },
-+ { 28, 36, 43, 49, 54, 58, 61, 63, },
-+};
-+
-+
-+typedef struct
-+{
-+ uint16_t coeff;
-+ uint16_t scale;
-+} xy_off_t;
-+
-+#define XYT_C(x,y,t) ((x) + ((y) << (t)))
-+#define SCALE_TRAFO(t) ((t) > 3 ? 3 : (t))
-+#define SCALE_SHR(t) ((t) - SCALE_TRAFO(t))
-+#define XYT_S(x,y,t) (((x) >> SCALE_SHR(t)) + (((y) >> SCALE_SHR(t)) << SCALE_TRAFO(t)))
-+
-+#define XYT(x,y,t) {XYT_C(x,y,t), XYT_S(x,y,t)}
-+
-+#define OFF_DIAG(t) {\
-+ XYT(0,0,t), XYT(0,1,t), XYT(1,0,t), XYT(0,2,t),\
-+ XYT(1,1,t), XYT(2,0,t), XYT(0,3,t), XYT(1,2,t),\
-+ XYT(2,1,t), XYT(3,0,t), XYT(1,3,t), XYT(2,2,t),\
-+ XYT(3,1,t), XYT(2,3,t), XYT(3,2,t), XYT(3,3,t)\
-+}
-+
-+#define OFF_HORIZ(t) {\
-+ XYT(0,0,t), XYT(1,0,t), XYT(2,0,t), XYT(3,0,t),\
-+ XYT(0,1,t), XYT(1,1,t), XYT(2,1,t), XYT(3,1,t),\
-+ XYT(0,2,t), XYT(1,2,t), XYT(2,2,t), XYT(3,2,t),\
-+ XYT(0,3,t), XYT(1,3,t), XYT(2,3,t), XYT(3,3,t)\
-+}
-+
-+#define OFF_VERT(t) {\
-+ XYT(0,0,t), XYT(0,1,t), XYT(0,2,t), XYT(0,3,t),\
-+ XYT(1,0,t), XYT(1,1,t), XYT(1,2,t), XYT(1,3,t),\
-+ XYT(2,0,t), XYT(2,1,t), XYT(2,2,t), XYT(2,3,t),\
-+ XYT(3,0,t), XYT(3,1,t), XYT(3,2,t), XYT(3,3,t)\
-+}
-+
-+static const xy_off_t off_xys[3][4][16] =
-+{
-+ {OFF_DIAG(2), OFF_DIAG(3), OFF_DIAG(4), OFF_DIAG(5)},
-+ {OFF_HORIZ(2), OFF_HORIZ(3), OFF_HORIZ(4), OFF_HORIZ(5)},
-+ {OFF_VERT(2), OFF_VERT(3), OFF_VERT(4), OFF_VERT(5)}
-+};
-+
-+
-+// Helper fns
-+#ifndef hevc_mem_bits32
-+static av_always_inline uint32_t hevc_mem_bits32(const void * buf, const unsigned int offset)
-+{
-+ return AV_RB32((const uint8_t *)buf + (offset >> 3)) << (offset & 7);
-+}
-+#endif
-+
-+#if AV_GCC_VERSION_AT_LEAST(3,4) && !defined(hevc_clz32)
-+#define hevc_clz32 hevc_clz32_builtin
-+static av_always_inline unsigned int hevc_clz32_builtin(const uint32_t x)
-+{
-+ // __builtin_clz says it works on ints - so adjust if int is >32 bits long
-+ return __builtin_clz(x) - (sizeof(int) * 8 - 32);
-+}
-+#endif
-+
-+// It is unlikely that we will ever need this but include for completeness
-+#ifndef hevc_clz32
-+static inline unsigned int hevc_clz32(unsigned int x)
-+{
-+    // Binary search for the highest set bit among the low 32 bits of x:
-+    // test the top 16, 8, 4 and 2 bits in turn, shifting x up whenever
-+    // the tested field is empty.  An input of 0 yields 31.
-+    unsigned int zeros = 1;
-+    unsigned int step;
-+
-+    for (step = 16; step >= 2; step >>= 1) {
-+        const unsigned int top_mask = (0xffffffffU << (32 - step)) & 0xffffffffU;
-+
-+        if (!(x & top_mask)) {
-+            zeros += step;
-+            x <<= step;
-+        }
-+    }
-+
-+    // The count started at 1; take it back if bit 31 is now set
-+    return zeros - ((x >> 31) & 1);
-+}
-+#endif
-+
-+static inline int cabac_overflow(const CABACContext * const cc)
-+{
-+ av_assert0(cc->bytestream >= cc->bytestream_start);
-+ return cc->bytestream >= cc->bytestream_end + 4;
-+}
-+
-+int ff_hevc_rpi_cabac_overflow(const HEVCRpiLocalContext * const lc)
-+{
-+ return cabac_overflow(&lc->cc);
-+}
-+
-+#if !USE_BY22
-+// If no by22 then _by22 functions will revert to normal and so _peek/_flush
-+// will no longer be called but the setup calls will still exist and we want
-+// to null them out
-+#define bypass_start(s)
-+#define bypass_finish(s)
-+#else
-+// Use BY22 for residual bypass block
-+
-+#define bypass_start(cc) get_cabac_by22_start(cc)
-+#define bypass_finish(cc) get_cabac_by22_finish(cc)
-+
-+// BY22 notes that bypass is simply a divide into the bitstream and so we
-+// can peek out large quantities of bits at once and treat the result as if
-+// it was VLC. In many cases this will lead to O(1) processing rather than
-+// O(n) though the setup and teardown is sufficiently expensive that it is
-+// only worth using if we expect to be dealing with more than a few bits
-+// The definition of "a few bits" will vary from platform to platform but
-+// tests on ARM show that it probably isn't worth it for a single coded
-+// residual, but is for >1 - it also seems likely that if there are
-+// more residuals then they are likely to be bigger and this will make the
-+// O(1) nature of the code more worthwhile.
-+
-+
-+// Bypass block start
-+// Must be called before _by22_peek is used as it sets the CABAC environment
-+// into the correct state. _by22_finish must be called to return to 'normal'
-+// (i.e. non-bypass) cabac decoding
-+#ifndef get_cabac_by22_start
-+static inline void get_cabac_by22_start(CABACContext * const c)
-+{
-+ const unsigned int bits = __builtin_ctz(c->low);
-+ const uint32_t m = hevc_mem_bits32(c->bytestream, 0);
-+ uint32_t x = (c->low << (22 - CABAC_BITS)) ^ ((m ^ 0x80000000U) >> (9 + CABAC_BITS - bits));
-+#if !USE_BY22_DIV
-+ const uint32_t inv = cabac_by22_inv_range[c->range & 0xff];
-+#endif
-+
-+ c->bytestream -= (CABAC_BITS / 8);
-+ c->by22.bits = bits;
-+#if !USE_BY22_DIV
-+ c->by22.range = c->range;
-+ c->range = inv;
-+#endif
-+ c->low = x;
-+}
-+#endif
-+
-+// Bypass block finish
-+// Must be called at the end of the bypass block to return to normal operation
-+static inline void get_cabac_by22_finish(CABACContext * const c)
-+{
-+ unsigned int used = c->by22.bits;
-+ unsigned int bytes_used = (used / CABAC_BITS) * (CABAC_BITS / 8);
-+ unsigned int bits_used = used & (CABAC_BITS == 16 ? 15 : 7);
-+
-+ c->bytestream += bytes_used + (CABAC_BITS / 8);
-+ c->low = (((uint32_t)c->low >> (22 - CABAC_BITS + bits_used)) | 1) << bits_used;
-+#if !USE_BY22_DIV
-+ c->range = c->by22.range;
-+#endif
-+}
-+
-+// Peek bypass bits
-+// _by22_start must be called before _by22_peek is called and _by22_flush
-+// must be called afterwards to flush any used bits
-+// The actual number of valid bits returned is
-+// min(<coded bypass block length>, CABAC_BY22_PEEK_BITS). CABAC_BY22_PEEK_BITS
-+// will be at least 22 which should be long enough for any prefix or suffix
-+// though probably not long enough for the worst case combination
-+#ifndef get_cabac_by22_peek
-+static inline uint32_t get_cabac_by22_peek(const CABACContext * const c)
-+{
-+#if USE_BY22_DIV
-+ return ((unsigned int)c->low / (unsigned int)c->range) << 9;
-+#else
-+ uint32_t x = c->low & ~1U;
-+ const uint32_t inv = c->range;
-+
-+ if (inv != 0)
-+ x = (uint32_t)(((uint64_t)x * (uint64_t)inv) >> 32);
-+
-+ return x << 1;
-+#endif
-+}
-+#endif
-+
-+// Flush bypass bits peeked by _by22_peek
-+// Flush n bypass bits. n must be >= 1 to guarantee correct operation
-+// val is an unmodified copy of whatever _by22_peek returned
-+#ifndef get_cabac_by22_flush
-+static inline void get_cabac_by22_flush(CABACContext * c, const unsigned int n, const uint32_t val)
-+{
-+ // Subtract the bits used & reshift up to the top of the word
-+#if USE_BY22_DIV
-+ const uint32_t low = (((unsigned int)c->low << n) - (((val >> (32 - n)) * (unsigned int)c->range) << 23));
-+#else
-+ const uint32_t low = (((uint32_t)c->low << n) - (((val >> (32 - n)) * c->by22.range) << 23));
-+#endif
-+
-+ // and refill lower bits
-+ // We will probably OR over some existing bits but that doesn't matter
-+ c->by22.bits += n;
-+ c->low = low | (hevc_mem_bits32(c->bytestream, c->by22.bits) >> 9);
-+}
-+#endif
-+
-+#endif // USE_BY22
-+
-+
-+void ff_hevc_rpi_save_states(HEVCRpiContext *s, const HEVCRpiLocalContext * const lc)
-+{
-+ memcpy(s->cabac_save->rice, lc->stat_coeff, 4);
-+ memcpy(s->cabac_save->state, lc->cabac_state, HEVC_CONTEXTS);
-+}
-+
-+static void load_states(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
-+{
-+ memcpy(lc->stat_coeff, s->cabac_save->rice, 4);
-+ memcpy(lc->cabac_state, s->cabac_save->state, HEVC_CONTEXTS);
-+}
-+
-+int ff_hevc_rpi_cabac_init_decoder(HEVCRpiLocalContext * const lc)
-+{
-+ GetBitContext * const gb = &lc->gb;
-+ skip_bits(gb, 1);
-+ align_get_bits(gb);
-+ return ff_init_cabac_decoder(&lc->cc,
-+ gb->buffer + get_bits_count(gb) / 8,
-+ (get_bits_left(gb) + 7) / 8);
-+}
-+
-+static void cabac_init_state(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
-+{
-+    // Fill lc->cabac_state from the init_values row selected by the slice
-+    // type (and cabac_init_flag) and zero the four stat_coeff entries.
-+    const int qp = av_clip(s->sh.slice_qp, 0, 51);
-+    int init_type = 2 - s->sh.slice_type;
-+    int ctx;
-+
-+    // XOR with 3 exchanges init types 1 and 2
-+    if (s->sh.slice_type != HEVC_SLICE_I && s->sh.cabac_init_flag)
-+        init_type ^= 3;
-+
-+    for (ctx = 0; ctx < HEVC_CONTEXTS; ctx++) {
-+        const int init_value = init_values[init_type][ctx];
-+        const int slope = (init_value >> 4) * 5 - 45;       // from the high nibble
-+        const int offset = ((init_value & 15) << 3) - 16;   // from the low nibble
-+        int pre = 2 * (((slope * qp) >> 4) + offset) - 127;
-+
-+        // Bitwise-complement negative values
-+        pre ^= pre >> 31;
-+        // Cap at 124/125, keeping the low bit
-+        if (pre > 124)
-+            pre = 124 + (pre & 1);
-+        lc->cabac_state[ctx] = pre;
-+    }
-+
-+    for (ctx = 0; ctx < 4; ctx++)
-+        lc->stat_coeff[ctx] = 0;
-+}
-+
-+void ff_hevc_rpi_cabac_init(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, const unsigned int ctb_flags)
-+{
-+    // A pending init request or the CIREQ flag re-initialises the context
-+    // states; otherwise the CLOAD flag restores the saved ones.  Either
-+    // action also resets qPy_pred to the slice QP.  The pending request is
-+    // always cleared.
-+    const int do_init = lc->cabac_init_req == 1 || (ctb_flags & CTB_TS_FLAGS_CIREQ) != 0;
-+    const int do_load = !do_init && (ctb_flags & CTB_TS_FLAGS_CLOAD) != 0;
-+
-+    if (do_init || do_load) {
-+        lc->qPy_pred = s->sh.slice_qp;
-+        if (do_init)
-+            cabac_init_state(s, lc);
-+        else
-+            load_states(s, lc);
-+    }
-+
-+    lc->cabac_init_req = 0;
-+}
-+
-+#define GET_CABAC_LC(ctx) get_cabac(&lc->cc, lc->cabac_state + (ctx))
-+
-+int ff_hevc_rpi_get_cabac(CABACContext * const c, uint8_t * const state)
-+{
-+ return get_cabac_inline(c, state);
-+}
-+
-+int ff_hevc_rpi_get_cabac_terminate(CABACContext * const c)
-+{
-+ return get_cabac_terminate(c);
-+}
-+
-+int ff_hevc_rpi_sao_type_idx_decode(HEVCRpiLocalContext * const lc)
-+{
-+    // One context-coded bin: 0 means return 0 without reading further
-+    if (GET_CABAC_LC(elem_offset[SAO_TYPE_IDX]) == 0)
-+        return 0;
-+
-+    // One bypass bin then picks between band and edge
-+    return get_cabac_bypass(&lc->cc) ? SAO_EDGE : SAO_BAND;
-+}
-+
-+int ff_hevc_rpi_sao_band_position_decode(HEVCRpiLocalContext * const lc)
-+{
-+    // Five bypass bins, most significant first
-+    int value = 0;
-+    int bins_left = 5;
-+
-+    do {
-+        value = (value << 1) | get_cabac_bypass(&lc->cc);
-+    } while (--bins_left != 0);
-+
-+    return value;
-+}
-+
-+int ff_hevc_rpi_sao_offset_abs_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
-+{
-+    // Count leading 1 bypass bins, stopping at the first 0 or at a cap
-+    // derived from the bit depth (itself limited to 10)
-+    const int cap = (1 << (FFMIN(s->ps.sps->bit_depth, 10) - 5)) - 1;
-+    int count;
-+
-+    for (count = 0; count < cap; count++) {
-+        if (!get_cabac_bypass(&lc->cc))
-+            break;
-+    }
-+
-+    return count;
-+}
-+
-+int ff_hevc_rpi_sao_offset_sign_decode(HEVCRpiLocalContext * const lc)
-+{
-+ return get_cabac_bypass(&lc->cc);
-+}
-+
-+int ff_hevc_rpi_sao_eo_class_decode(HEVCRpiLocalContext * const lc)
-+{
-+    // Two bypass bins, high bit read first
-+    const int hi = get_cabac_bypass(&lc->cc);
-+    const int lo = get_cabac_bypass(&lc->cc);
-+
-+    return (hi << 1) | lo;
-+}
-+
-+int ff_hevc_rpi_cu_qp_delta(HEVCRpiLocalContext * const lc)
-+{
-+ int val = 1;
-+
-+ if (get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_QP_DELTA) == 0)
-+ return 0;
-+
-+ while (val < 5 &&
-+ get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_QP_DELTA + 1) != 0)
-+ val++;
-+
-+ if (val >= 5) {
-+ unsigned int k = 0;
-+ while (k < CABAC_MAX_BIN && get_cabac_bypass(&lc->cc)) {
-+ val += 1 << k;
-+ k++;
-+ }
-+// if (k == CABAC_MAX_BIN)
-+// av_log(s->avctx, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", k);
-+
-+ while (k--)
-+ val += get_cabac_bypass(&lc->cc) << k;
-+ }
-+ return get_cabac_bypass(&lc->cc) ? -val : val;
-+}
-+
-+int ff_hevc_rpi_cu_chroma_qp_offset_idx(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
-+{
-+    // Truncated-unary decode: count context-coded 1 bins up to c_max.
-+    //
-+    // The bound must follow the PPS list length - H.265 binarises this
-+    // element with cMax = chroma_qp_offset_list_len_minus1.  The previous
-+    // FFMAX(5, ...) always evaluated to at least 5, so for shorter lists an
-+    // extra bin was consumed after the maximum index had been reached.
-+    // FFMIN keeps 5 as a hard upper limit while honouring the signalled
-+    // length.
-+    const int c_max = FFMIN(5, s->ps.pps->chroma_qp_offset_list_len_minus1);
-+    int i = 0;
-+
-+    while (i < c_max && GET_CABAC_LC(elem_offset[CU_CHROMA_QP_OFFSET_IDX]))
-+        i++;
-+
-+    return i;
-+}
-+
-+int ff_hevc_rpi_part_mode_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int log2_cb_size)
-+{
-+ if (GET_CABAC_LC(elem_offset[PART_MODE])) // 1
-+ return PART_2Nx2N;
-+ if (log2_cb_size == s->ps.sps->log2_min_cb_size) {
-+ if (lc->cu.pred_mode == MODE_INTRA) // 0
-+ return PART_NxN;
-+ if (GET_CABAC_LC(elem_offset[PART_MODE] + 1)) // 01
-+ return PART_2NxN;
-+ if (log2_cb_size == 3) // 00
-+ return PART_Nx2N;
-+ if (GET_CABAC_LC(elem_offset[PART_MODE] + 2)) // 001
-+ return PART_Nx2N;
-+ return PART_NxN; // 000
-+ }
-+
-+ if (!s->ps.sps->amp_enabled_flag) {
-+ if (GET_CABAC_LC(elem_offset[PART_MODE] + 1)) // 01
-+ return PART_2NxN;
-+ return PART_Nx2N;
-+ }
-+
-+ if (GET_CABAC_LC(elem_offset[PART_MODE] + 1)) { // 01X, 01XX
-+ if (GET_CABAC_LC(elem_offset[PART_MODE] + 3)) // 011
-+ return PART_2NxN;
-+ if (get_cabac_bypass(&lc->cc)) // 0101
-+ return PART_2NxnD;
-+ return PART_2NxnU; // 0100
-+ }
-+
-+ if (GET_CABAC_LC(elem_offset[PART_MODE] + 3)) // 001
-+ return PART_Nx2N;
-+ if (get_cabac_bypass(&lc->cc)) // 0001
-+ return PART_nRx2N;
-+ return PART_nLx2N; // 0000
-+}
-+
-+int ff_hevc_rpi_mpm_idx_decode(HEVCRpiLocalContext * const lc)
-+{
-+    // Unary code of at most two bypass bins: 0 -> 0, 10 -> 1, 11 -> 2
-+    if (!get_cabac_bypass(&lc->cc))
-+        return 0;
-+
-+    return get_cabac_bypass(&lc->cc) ? 2 : 1;
-+}
-+
-+int ff_hevc_rpi_rem_intra_luma_pred_mode_decode(HEVCRpiLocalContext * const lc)
-+{
-+    // Five bypass bins, most significant first
-+    int value = 0;
-+    int bins_left = 5;
-+
-+    do {
-+        value = (value << 1) | get_cabac_bypass(&lc->cc);
-+    } while (--bins_left != 0);
-+
-+    return value;
-+}
-+
-+int ff_hevc_rpi_intra_chroma_pred_mode_decode(HEVCRpiLocalContext * const lc)
-+{
-+    // A 1 on the context-coded bin is followed by a 2-bit bypass value
-+    // (high bit first); a 0 yields 4.
-+    if (GET_CABAC_LC(elem_offset[INTRA_CHROMA_PRED_MODE])) {
-+        const int hi = get_cabac_bypass(&lc->cc);
-+        const int lo = get_cabac_bypass(&lc->cc);
-+
-+        return (hi << 1) | lo;
-+    }
-+
-+    return 4;
-+}
-+
-+int ff_hevc_rpi_merge_idx_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
-+{
-+    // The first bin is context coded; if it is set, further 1 bins are
-+    // counted in bypass mode until a 0 is read or the index reaches
-+    // max_num_merge_cand - 1.
-+    int idx = GET_CABAC_LC(elem_offset[MERGE_IDX]);
-+
-+    if (idx == 0)
-+        return idx;
-+
-+    for (; idx < s->sh.max_num_merge_cand-1; idx++) {
-+        if (!get_cabac_bypass(&lc->cc))
-+            break;
-+    }
-+
-+    return idx;
-+}
-+
-+int ff_hevc_rpi_inter_pred_idc_decode(HEVCRpiLocalContext * const lc, int nPbW, int nPbH)
-+{
-+    const int ctx_base = elem_offset[INTER_PRED_IDC];
-+
-+    // Blocks whose width + height is 12 skip the PRED_BI bin; all others
-+    // read it first, using ct_depth as the context index.
-+    if (nPbW + nPbH != 12 && GET_CABAC_LC(ctx_base + lc->ct_depth))
-+        return PRED_BI;
-+
-+    // The remaining choice is one bin on context 4
-+    return GET_CABAC_LC(ctx_base + 4);
-+}
-+
-+int ff_hevc_rpi_ref_idx_lx_decode(HEVCRpiLocalContext * const lc, const int num_ref_idx_lx)
-+{
-+    // Truncated-unary index in [0, num_ref_idx_lx - 1]: up to two
-+    // context-coded bins, then bypass bins for anything beyond 2.
-+    const int last_idx = num_ref_idx_lx - 1;
-+    const int ctx_limit = FFMIN(last_idx, 2);
-+    int idx;
-+
-+    for (idx = 0; idx < ctx_limit; idx++) {
-+        // A 0 bin here always ends the code (idx is below 2)
-+        if (!GET_CABAC_LC(elem_offset[REF_IDX_L0] + idx))
-+            return idx;
-+    }
-+
-+    if (idx == 2) {
-+        for (; idx < last_idx; idx++) {
-+            if (!get_cabac_bypass(&lc->cc))
-+                break;
-+        }
-+    }
-+
-+    return idx;
-+}
-+
-+static av_always_inline int abs_mvd_greater0_flag_decode(HEVCRpiLocalContext * const lc)
-+{
-+ return GET_CABAC_LC(elem_offset[ABS_MVD_GREATER0_FLAG]);
-+}
-+
-+static av_always_inline int abs_mvd_greater1_flag_decode(HEVCRpiLocalContext * const lc)
-+{
-+ return GET_CABAC_LC(elem_offset[ABS_MVD_GREATER1_FLAG] + 1);
-+}
-+
-+#if !USE_BY22
-+static av_always_inline int mvd_decode(HEVCRpiLocalContext * const lc)
-+{
-+ int ret = 2;
-+ int k = 1;
-+
-+ while (k < CABAC_MAX_BIN && get_cabac_bypass(&lc->cc)) {
-+ ret += 1U << k;
-+ k++;
-+ }
-+ if (k == CABAC_MAX_BIN) {
-+ av_log(NULL, AV_LOG_ERROR, "CABAC_MAX_BIN : %d\n", k);
-+ return 0;
-+ }
-+
-+ while (k--)
-+ ret += get_cabac_bypass(&lc->cc) << k;
-+ return get_cabac_bypass_sign(&lc->cc, -ret);
-+}
-+#endif
-+
-+static av_always_inline int mvd_sign_flag_decode(HEVCRpiLocalContext * const lc)
-+{
-+ return get_cabac_bypass_sign(&lc->cc, -1);
-+}
-+
-+static int hevc_transform_skip_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz)
-+{
-+ return GET_CABAC_LC(elem_offset[TRANSFORM_SKIP_FLAG] + c_idx_nz);
-+}
-+
-+static int explicit_rdpcm_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz)
-+{
-+ return GET_CABAC_LC(elem_offset[EXPLICIT_RDPCM_FLAG] + c_idx_nz);
-+}
-+
-+static int explicit_rdpcm_dir_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz)
-+{
-+ return GET_CABAC_LC(elem_offset[EXPLICIT_RDPCM_DIR_FLAG] + c_idx_nz);
-+}
-+
-+
-+int ff_hevc_rpi_log2_res_scale_abs(HEVCRpiLocalContext * const lc, const int idx) {
-+    // Count up to four context-coded 1 bins; each idx has its own group
-+    // of four consecutive contexts.
-+    const int ctx_base = elem_offset[LOG2_RES_SCALE_ABS] + 4 * idx;
-+    int count;
-+
-+    for (count = 0; count < 4; count++) {
-+        if (!GET_CABAC_LC(ctx_base + count))
-+            break;
-+    }
-+
-+    return count;
-+}
-+
-+static av_always_inline void last_significant_coeff_xy_prefix_decode(HEVCRpiLocalContext * const lc, int c_idx_nz,
-+ int log2_size, int *last_scx_prefix, int *last_scy_prefix)
-+{
-+ int i = 0;
-+ int max = (log2_size << 1) - 1;
-+ int ctx_offset, ctx_shift;
-+
-+ if (!c_idx_nz) {
-+ ctx_offset = 3 * (log2_size - 2) + ((log2_size - 1) >> 2);
-+ ctx_shift = (log2_size + 1) >> 2;
-+ } else {
-+ ctx_offset = 15;
-+ ctx_shift = log2_size - 2;
-+ }
-+ while (i < max &&
-+ GET_CABAC_LC(elem_offset[LAST_SIGNIFICANT_COEFF_X_PREFIX] + (i >> ctx_shift) + ctx_offset))
-+ i++;
-+ *last_scx_prefix = i;
-+
-+ i = 0;
-+ while (i < max &&
-+ GET_CABAC_LC(elem_offset[LAST_SIGNIFICANT_COEFF_Y_PREFIX] + (i >> ctx_shift) + ctx_offset))
-+ i++;
-+ *last_scy_prefix = i;
-+}
-+
-+static av_always_inline int last_significant_coeff_suffix_decode(HEVCRpiLocalContext * const lc,
-+                                                   int last_significant_coeff_prefix)
-+{
-+    // Reads (prefix >> 1) - 1 bypass bins, most significant first.
-+    // One bin is always consumed, even if that length is below 1.
-+    int more_bins = (last_significant_coeff_prefix >> 1) - 2;
-+    int value = get_cabac_bypass(&lc->cc);
-+
-+    while (more_bins-- > 0)
-+        value = (value << 1) | get_cabac_bypass(&lc->cc);
-+
-+    return value;
-+}
-+
-+static av_always_inline int significant_coeff_group_flag_decode(HEVCRpiLocalContext * const lc, int c_idx_nz, int ctx_cg)
-+{
-+    // Context index: bit 0 is set when ctx_cg is non-zero, c_idx_nz is
-+    // shifted in above it.
-+    const int ctx_inc = (c_idx_nz << 1) + (ctx_cg != 0 ? 1 : 0);
-+
-+    return GET_CABAC_LC(elem_offset[SIGNIFICANT_COEFF_GROUP_FLAG] + ctx_inc);
-+}
-+
-+static av_always_inline int significant_coeff_flag_decode_0(HEVCRpiLocalContext * const lc, int offset)
-+{
-+ return GET_CABAC_LC(elem_offset[SIGNIFICANT_COEFF_FLAG] + offset);
-+}
-+
-+#if !USE_BY22
-+#define coeff_abs_level_remaining_decode_bypass(s,r) coeff_abs_level_remaining_decode(s, r)
-+#endif
-+
-+
-+#ifndef coeff_abs_level_remaining_decode_bypass
-+static int coeff_abs_level_remaining_decode_bypass(CABACContext * const c, const unsigned int rice_param)
-+{
-+ uint32_t y;
-+ unsigned int prefix;
-+ unsigned int last_coeff_abs_level_remaining;
-+ unsigned int n;
-+
-+ y = get_cabac_by22_peek(c);
-+ prefix = hevc_clz32(~y);
-+ // y << prefix will always have top bit 0
-+
-+ if (prefix < 3) {
-+ const unsigned int suffix = (y << prefix) >> (31 - rice_param);
-+ last_coeff_abs_level_remaining = (prefix << rice_param) + suffix;
-+ n = prefix + 1 + rice_param;
-+ }
-+ else if (prefix * 2 + rice_param <= CABAC_BY22_PEEK_BITS + 2)
-+ {
-+ const uint32_t suffix = ((y << prefix) | 0x80000000) >> (34 - (prefix + rice_param));
-+
-+ last_coeff_abs_level_remaining = (2 << rice_param) + suffix;
-+ n = prefix * 2 + rice_param - 2;
-+ }
-+ else {
-+ unsigned int suffix;
-+
-+ get_cabac_by22_flush(c, prefix, y);
-+ y = get_cabac_by22_peek(c);
-+
-+ suffix = (y | 0x80000000) >> (34 - (prefix + rice_param));
-+ last_coeff_abs_level_remaining = (2 << rice_param) + suffix;
-+ n = prefix + rice_param - 2;
-+ }
-+
-+ get_cabac_by22_flush(c, n, y);
-+
-+ return last_coeff_abs_level_remaining;
-+}
-+#endif
-+
-+static int coeff_abs_level_remaining_decode(CABACContext * const c, int rc_rice_param)
-+{
-+    // Bin-by-bin bypass decode: a unary prefix, then a suffix whose length
-+    // and base value depend on whether the prefix is below 3.
-+    int prefix = 0;
-+    int suffix = 0;
-+    int suffix_bits;
-+    int base;
-+
-+    while (prefix < CABAC_MAX_BIN && get_cabac_bypass(c))
-+        prefix++;
-+
-+    // Prefix ran to the limit: give up and report 0
-+    if (prefix == CABAC_MAX_BIN)
-+        return 0;
-+
-+    if (prefix < 3) {
-+        suffix_bits = rc_rice_param;
-+        base = prefix << rc_rice_param;
-+    } else {
-+        const int prefix_minus3 = prefix - 3;
-+
-+        suffix_bits = prefix_minus3 + rc_rice_param;
-+        base = ((1 << prefix_minus3) + 3 - 1) << rc_rice_param;
-+    }
-+
-+    while (suffix_bits-- > 0)
-+        suffix = (suffix << 1) | get_cabac_bypass(c);
-+
-+    return base + suffix;
-+}
-+
-+#if !USE_BY22
-+#define coeff_sign_flag_decode_bypass coeff_sign_flag_decode
-+static inline uint32_t coeff_sign_flag_decode(CABACContext * const c, const unsigned int nb)
-+{
-+ unsigned int i;
-+ uint32_t ret = 0;
-+
-+ for (i = 0; i < nb; i++)
-+ ret = (ret << 1) | get_cabac_bypass(c);
-+
-+ return ret << (32 - nb);
-+}
-+#endif
-+
-+#ifndef coeff_sign_flag_decode_bypass
-+static inline uint32_t coeff_sign_flag_decode_bypass(CABACContext * const c, const unsigned int nb)
-+{
-+ uint32_t y;
-+ y = get_cabac_by22_peek(c);
-+ get_cabac_by22_flush(c, nb, y);
-+ return y & ~(0xffffffffU >> nb);
-+}
-+#endif
-+
-+
-+#ifndef get_cabac_greater1_bits
-+static inline unsigned int get_cabac_greater1_bits(CABACContext * const c, const unsigned int n,
-+    uint8_t * const state0)
-+{
-+    // Decode n bins and pack them MSB-first into the return value.
-+    // Context selection: state0[0] once any 1 has been decoded, otherwise
-+    // state0[1], [2], [3] for the first three bins and [3] from then on.
-+    unsigned int bits = 0;
-+    unsigned int pos;
-+
-+    for (pos = 0; pos < n; pos++) {
-+        unsigned int ctx;
-+
-+        if (bits != 0)
-+            ctx = 0;
-+        else if (pos < 3)
-+            ctx = pos + 1;
-+        else
-+            ctx = 3;
-+
-+        bits = (bits << 1) | (unsigned int)get_cabac(c, state0 + ctx);
-+    }
-+
-+    return bits;
-+}
-+#endif
-+
-+
-+// N.B. levels returned are the values assuming coeff_abs_level_remaining
-+// is uncoded, so 1 must be added if it is coded. sum_abs also reflects
-+// this version of events.
-+static inline uint32_t get_greaterx_bits(HEVCRpiLocalContext * const lc, const unsigned int n_end, int * const levels,
-+ int * const pprev_subset_coded, int * const psum,
-+ const unsigned int idx0_gt1, const unsigned int idx_gt2)
-+{
-+ CABACContext * const c = &lc->cc;
-+ uint8_t * const state0 = lc->cabac_state + idx0_gt1;
-+ uint8_t * const state_gt2 = lc->cabac_state + idx_gt2;
-+ unsigned int rv;
-+ unsigned int i;
-+ const unsigned int n = FFMIN(n_end, 8);
-+
-+ // Really this is i != n but the simple unconditional loop is cheaper
-+ // and faster
-+ for (i = 0; i != 8; ++i)
-+ levels[i] = 1;
-+
-+ rv = get_cabac_greater1_bits(c, n, state0);
-+
-+ *pprev_subset_coded = 0;
-+ *psum = n;
-+
-+ rv <<= (32 - n);
-+ if (rv != 0)
-+ {
-+ *pprev_subset_coded = 1;
-+ *psum = n + 1;
-+ i = hevc_clz32(rv);
-+ levels[i] = 2;
-+ if (get_cabac(c, state_gt2) == 0)
-+ {
-+ // Unset first coded bit
-+ rv &= ~(0x80000000U >> i);
-+ }
-+ }
-+
-+ if (n_end > 8) {
-+ const unsigned int g8 = n_end - 8;
-+ rv |= ((1 << g8) - 1) << (24 - g8);
-+ for (i = 0; i != g8; ++i) {
-+ levels[i + 8] = 0;
-+ }
-+ }
-+
-+ return rv;
-+}
-+
-+// extended_precision_processing_flag must be false given we are
-+// putting the result into a 16-bit array
-+// So trans_coeff_level must fit in 16 bits too (7.4.9.1 definition of coeff_abs_level_remaining)
-+// scale_m is uint8_t
-+//
-+// scale is [40 - 72] << [0..12] based on qp- worst case is (45 << 12)
-+// or it can be 2 (if we have transquant_bypass)
-+// shift is set to one less than we really want but would normally be
-+// s->ps.sps->bit_depth (max 16, min 8) + log2_trafo_size (max 5, min 2?) - 5 = max 16 min 5?
-+// however the scale shift is subtracted from shift to a min 0 so scale_m worst = 45 << 6
-+// This can still theoretically lead to overflow but the coding would have to be very odd (& inefficient)
-+// to achieve it
-+
-+#ifndef trans_scale_sat
-+static inline int trans_scale_sat(const int level, const unsigned int scale, const unsigned int scale_m, const unsigned int shift)
-+{
-+ return av_clip_int16((((level * (int)(scale * scale_m)) >> shift) + 1) >> 1);
-+}
-+#endif
-+
-+
-+#ifndef update_rice
-+static inline void update_rice(uint8_t * const stat_coeff,
-+    const unsigned int last_coeff_abs_level_remaining,
-+    const unsigned int c_rice_param)
-+{
-+    // Adapt the statistic: step it up when twice the coded value, scaled
-+    // down by the Rice parameter, reaches 6; step it down (never below 0)
-+    // when that scaled value is 0; otherwise leave it alone.
-+    const unsigned int scaled = (last_coeff_abs_level_remaining * 2) >> c_rice_param;
-+
-+    if (scaled >= 6) {
-+        *stat_coeff += 1;
-+        return;
-+    }
-+
-+    if (scaled == 0 && *stat_coeff != 0)
-+        *stat_coeff -= 1;
-+}
-+#endif
-+
-+
-+// n must be > 0 on entry
-+#ifndef get_cabac_sig_coeff_flag_idxs
-+static inline uint8_t * get_cabac_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
-+    unsigned int n,
-+    const uint8_t * const ctx_map,
-+    uint8_t * p)
-+{
-+    // Walk n down to 1, decoding one bin per position with the context
-+    // state0[ctx_map[n]]; every position whose bin is set is appended to p.
-+    // Returns the pointer one past the last entry written.
-+    //
-+    // ctx_map was declared "const uint8_t const *", which merely repeats
-+    // the qualifier on the pointee; "const uint8_t * const" is what was
-+    // intended and makes the pointer itself read-only as well.
-+    //
-+    // n is decremented before it is tested, so n == 0 on entry would wrap
-+    do {
-+        if (get_cabac(c, state0 + ctx_map[n]))
-+            *p++ = n;
-+    } while (--n != 0);
-+    return p;
-+}
-+#endif
-+
-+
-+static int get_sig_coeff_flag_idxs(CABACContext * const c, uint8_t * const state0,
-+    unsigned int n,
-+    const uint8_t * const ctx_map,
-+    uint8_t * const flag_idx)
-+{
-+    // Thin wrapper over get_cabac_sig_coeff_flag_idxs() that returns how
-+    // many entries were written to flag_idx rather than the end pointer.
-+    //
-+    // ctx_map was declared "const uint8_t const *" (duplicated qualifier);
-+    // it is now the intended pointer-to-const held in a const pointer.
-+    return (int)(get_cabac_sig_coeff_flag_idxs(c, state0, n, ctx_map, flag_idx) - flag_idx);
-+}
-+
-+#define H4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
-+ x0, x1, x2, x3,\
-+ x4, x5, x6, x7,\
-+ x8, x9, x10, x11,\
-+ x12, x13, x14, x15}
-+
-+#define V4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
-+ x0, x4, x8, x12,\
-+ x1, x5, x9, x13,\
-+ x2, x6, x10, x14,\
-+ x3, x7, x11, x15}
-+
-+#define D4x4(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {\
-+ x0, x4, x1, x8,\
-+ x5, x2, x12, x9,\
-+ x6, x3, x13, x10,\
-+ x7, x14, x11, x15}
-+
-+
-+static inline int next_subset(HEVCRpiLocalContext * const lc, int i, const int c_idx_nz,
-+ uint8_t * const significant_coeff_group_flag,
-+ const uint8_t * const scan_x_cg, const uint8_t * const scan_y_cg,
-+ int * const pPrev_sig)
-+{
-+ while (--i >= 0) {
-+ uint8_t * const gf_y = scan_y_cg[i] + significant_coeff_group_flag;
-+ const unsigned int x_cg = scan_x_cg[i];
-+
-+ // For the flag decode we only care about Z/NZ but
-+ // we use the full Right * 2 + Down when calculating
-+ // significant coeff flags so we obtain it here.
-+ //
-+ // The group flag array is one longer than it needs to
-+ // be so we don't need to check for y_cg limits
-+ const unsigned int prev_sig = ((gf_y[0] >> x_cg) & 2) | ((gf_y[1] >> x_cg) & 1);
-+
-+ if (i == 0 ||
-+ significant_coeff_group_flag_decode(lc, c_idx_nz, prev_sig))
-+ {
-+ gf_y[0] |= (1 << x_cg);
-+ *pPrev_sig = prev_sig;
-+ break;
-+ }
-+ }
-+
-+ return i;
-+}
-+
-+static void rpi_add_residual(const HEVCRpiContext *const s, HEVCRpiJob * const jb,
-+ const unsigned int log2_trafo_size, const unsigned int c_idx,
-+ const unsigned int x0, const unsigned int y0, const int16_t * const coeffs)
-+{
-+ const AVFrame * const frame = s->frame;
-+ const unsigned int stride = frame_stride1(s->frame, c_idx);
-+ const unsigned int x = x0 >> ctx_hshift(s, c_idx);
-+ const unsigned int y = y0 >> ctx_vshift(s, c_idx);
-+ const int is_sliced = 1; // av_rpi_is_sand_frame(frame);
-+ uint8_t * const dst = !is_sliced ?
-+ s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) :
-+ c_idx == 0 ?
-+ av_rpi_sand_frame_pos_y(frame, x, y) :
-+ av_rpi_sand_frame_pos_c(frame, x, y);
-+
-+ const unsigned int i = jb->intra.n;
-+ HEVCPredCmd *const pc = jb->intra.cmds + i - 1;
-+
-+ if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U &&
-+ pc->ta.dst == dst)
-+ {
-+ av_assert1(pc->size == log2_trafo_size &&
-+ pc->c_idx == 1 &&
-+ pc->ta.stride == stride);
-+
-+ pc->type = RPI_PRED_ADD_RESIDUAL_C;
-+ }
-+ else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U &&
-+ pc->dc.dst == dst)
-+ {
-+ const int16_t dc = (int16_t)pc->dc.dc; // Discard top bits
-+ av_assert1(pc->size == log2_trafo_size &&
-+ pc->c_idx == 1 &&
-+ pc->dc.stride == stride);
-+
-+ // Rewrite as add residual - must rewrite all fields as different union member
-+ pc->type = RPI_PRED_ADD_RESIDUAL_V;
-+ pc->ta.buf = coeffs;
-+ pc->ta.dst = dst;
-+ pc->ta.stride = stride;
-+ pc->ta.dc = dc;
-+ }
-+ else
-+ {
-+ HEVCPredCmd * const cmd = pc + 1;
-+ jb->intra.n = i + 1;
-+
-+ cmd->type = RPI_PRED_ADD_RESIDUAL + (is_sliced ? c_idx : 0);
-+ cmd->size = log2_trafo_size;
-+ cmd->ta.buf = coeffs;
-+ cmd->ta.dst = dst;
-+ cmd->ta.stride = stride;
-+ cmd->ta.dc = 0;
-+ }
-+}
-+
-+
-+static void rpi_add_dc(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
-+ const unsigned int log2_trafo_size, const unsigned int c_idx,
-+ const unsigned int x0, const unsigned int y0, const int16_t * const coeffs)
-+{
-+ const AVFrame * const frame = s->frame;
-+ const unsigned int stride = frame_stride1(s->frame, c_idx);
-+ const unsigned int x = x0 >> ctx_hshift(s, c_idx);
-+ const unsigned int y = y0 >> ctx_vshift(s, c_idx);
-+ const int is_sliced = 1;
-+ uint8_t * const dst = !is_sliced ?
-+ s->frame->data[c_idx] + y * stride + (x << s->ps.sps->pixel_shift) :
-+ c_idx == 0 ?
-+ av_rpi_sand_frame_pos_y(frame, x, y) :
-+ av_rpi_sand_frame_pos_c(frame, x, y);
-+
-+ const unsigned int shift = FFMAX(14 - s->ps.sps->bit_depth, 0);
-+ const int coeff = (coeffs[0] + (1 | (1 << shift))) >> (shift + 1);
-+
-+ const unsigned int i = jb->intra.n;
-+ HEVCPredCmd *const pc = jb->intra.cmds + i - 1;
-+
-+ if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_RESIDUAL_U &&
-+ pc->ta.dst == dst)
-+ {
-+ av_assert1(pc->size == log2_trafo_size &&
-+ pc->c_idx == 1 &&
-+ pc->ta.stride == stride);
-+
-+ pc->ta.dc = (int16_t)coeff;
-+ }
-+ else if (i != 0 && c_idx == 2 && pc->type == RPI_PRED_ADD_DC_U &&
-+ pc->dc.dst == dst)
-+ {
-+ av_assert1(pc->size == log2_trafo_size &&
-+ pc->c_idx == 1 &&
-+ pc->dc.stride == stride &&
-+ (pc->dc.dc & ~0xffff) == 0);
-+
-+ pc->dc.dc |= (coeff << 16);
-+ }
-+ else
-+ {
-+ HEVCPredCmd * const cmd = pc + 1;
-+ jb->intra.n = i + 1;
-+
-+ cmd->type = RPI_PRED_ADD_DC + c_idx;
-+ cmd->size = log2_trafo_size;
-+ cmd->dc.dst = dst;
-+ cmd->dc.stride = stride;
-+ cmd->dc.dc = c_idx == 0 ? coeff : c_idx == 2 ? coeff << 16 : coeff & 0xffff;
-+ }
-+}
-+
-+
-+void ff_hevc_rpi_hls_residual_coding(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
-+ const int x0, const int y0,
-+ const int log2_trafo_size, const enum ScanType scan_idx,
-+ const int c_idx)
-+{
-+ int trans_skip_or_bypass = lc->cu.cu_transquant_bypass_flag;
-+
-+ int last_significant_coeff_x, last_significant_coeff_y;
-+ int num_coeff = 0;
-+ int prev_subset_coded = 0;
-+
-+ int num_last_subset;
-+ int x_cg_last_sig, y_cg_last_sig;
-+
-+ const uint8_t *scan_x_cg, *scan_y_cg;
-+ const xy_off_t * const scan_xy_off = off_xys[scan_idx][log2_trafo_size - 2];
-+
-+ int use_vpu;
-+#if RPI_COMPRESS_COEFFS
-+ int num_nonzero = 0;
-+ int use_compress = 0;
-+ int *coeffs32;
-+#endif
-+ int use_dc = 0;
-+ int16_t *coeffs;
-+ uint8_t significant_coeff_group_flag[9] = {0}; // Allow 1 final byte that is always zero
-+ int explicit_rdpcm_flag = 0;
-+ int explicit_rdpcm_dir_flag;
-+
-+ int i;
-+ int shift,scale;
-+ const uint8_t *scale_matrix = NULL;
-+ uint8_t dc_scale;
-+ const int c_idx_nz = (c_idx != 0);
-+ const int pred_mode_intra = c_idx_nz ? lc->tu.intra_pred_mode_c : lc->tu.intra_pred_mode;
-+ int prev_sig = 0;
-+ int may_hide_sign;
-+
-+ int16_t dummy_coeffs[16];
-+
-+ // Derive QP for dequant
-+ if (!lc->cu.cu_transquant_bypass_flag) {
-+ may_hide_sign = s->ps.pps->sign_data_hiding_flag;
-+
-+ if (s->ps.pps->transform_skip_enabled_flag &&
-+ log2_trafo_size <= s->ps.pps->log2_max_transform_skip_block_size) {
-+ int transform_skip_flag = hevc_transform_skip_flag_decode(lc, c_idx_nz);
-+ if (transform_skip_flag) {
-+ trans_skip_or_bypass = 1;
-+ if (lc->cu.pred_mode == MODE_INTRA &&
-+ s->ps.sps->implicit_rdpcm_enabled_flag &&
-+ (pred_mode_intra == 10 || pred_mode_intra == 26)) {
-+ may_hide_sign = 0;
-+ }
-+ }
-+ }
-+
-+ {
-+ static const uint8_t level_scale[8] = {
-+ 40, 45, 51, 57, 64, 72, 0, 0 // Pad to 8
-+ };
-+ const int qp6 = (int8_t)lc->tu.qp_divmod6[c_idx][lc->qp_y];
-+
-+ // Shift is set to one less than will actually occur as the scale
-+ // and saturate step adds 1 and then shifts right again
-+ scale = level_scale[qp6 & 7];
-+// shift = s->ps.sps->bit_depth + log2_trafo_size - (int)(qp6 >> 3);
-+ shift = log2_trafo_size - (qp6 >> 3);
-+
-+ if (shift < 0) {
-+ scale <<= -shift;
-+ shift = 0;
-+ }
-+ }
-+
-+ if (s->ps.sps->scaling_list_enable_flag && !(trans_skip_or_bypass && log2_trafo_size > 2)) {
-+ const ScalingList * const sl = s->ps.pps->scaling_list_data_present_flag ?
-+ &s->ps.pps->scaling_list : &s->ps.sps->scaling_list;
-+ const unsigned int matrix_id =
-+ lc->cu.pred_mode != MODE_INTRA ? 3 + c_idx : c_idx;
-+
-+ scale_matrix = sl->sl[log2_trafo_size - 2][matrix_id];
-+ dc_scale = scale_matrix[0];
-+ if (log2_trafo_size >= 4)
-+ dc_scale = sl->sl_dc[log2_trafo_size - 4][matrix_id];
-+ }
-+ else
-+ {
-+ static const uint8_t sixteen_scale[64] = {
-+ 16, 16, 16, 16, 16, 16, 16, 16,
-+ 16, 16, 16, 16, 16, 16, 16, 16,
-+ 16, 16, 16, 16, 16, 16, 16, 16,
-+ 16, 16, 16, 16, 16, 16, 16, 16,
-+ 16, 16, 16, 16, 16, 16, 16, 16,
-+ 16, 16, 16, 16, 16, 16, 16, 16,
-+ 16, 16, 16, 16, 16, 16, 16, 16,
-+ 16, 16, 16, 16, 16, 16, 16, 16
-+ };
-+ scale_matrix = sixteen_scale;
-+ dc_scale = 16;
-+ }
-+ } else {
-+ static const uint8_t unit_scale[64] = {
-+ 1, 1, 1, 1, 1, 1, 1, 1,
-+ 1, 1, 1, 1, 1, 1, 1, 1,
-+ 1, 1, 1, 1, 1, 1, 1, 1,
-+ 1, 1, 1, 1, 1, 1, 1, 1,
-+ 1, 1, 1, 1, 1, 1, 1, 1,
-+ 1, 1, 1, 1, 1, 1, 1, 1,
-+ 1, 1, 1, 1, 1, 1, 1, 1,
-+ 1, 1, 1, 1, 1, 1, 1, 1,
-+ };
-+ scale_matrix = unit_scale;
-+ shift = 0;
-+ scale = 2; // We will shift right to kill this
-+ dc_scale = 1;
-+
-+ may_hide_sign = 0;
-+ }
-+
-+
-+
-+
-+ if (lc->cu.pred_mode == MODE_INTER && s->ps.sps->explicit_rdpcm_enabled_flag &&
-+ trans_skip_or_bypass) {
-+ explicit_rdpcm_flag = explicit_rdpcm_flag_decode(lc, c_idx_nz);
-+ if (explicit_rdpcm_flag) {
-+ may_hide_sign = 0;
-+ explicit_rdpcm_dir_flag = explicit_rdpcm_dir_flag_decode(lc, c_idx_nz);
-+ }
-+ }
-+
-+ last_significant_coeff_xy_prefix_decode(lc, c_idx_nz, log2_trafo_size,
-+ &last_significant_coeff_x, &last_significant_coeff_y);
-+
-+ if (last_significant_coeff_x > 3) {
-+ int suffix = last_significant_coeff_suffix_decode(lc, last_significant_coeff_x);
-+ last_significant_coeff_x = (1 << ((last_significant_coeff_x >> 1) - 1)) *
-+ (2 + (last_significant_coeff_x & 1)) +
-+ suffix;
-+ }
-+
-+ if (last_significant_coeff_y > 3) {
-+ int suffix = last_significant_coeff_suffix_decode(lc, last_significant_coeff_y);
-+ last_significant_coeff_y = (1 << ((last_significant_coeff_y >> 1) - 1)) *
-+ (2 + (last_significant_coeff_y & 1)) +
-+ suffix;
-+ }
-+
-+ if (scan_idx == SCAN_VERT)
-+ FFSWAP(int, last_significant_coeff_x, last_significant_coeff_y);
-+
-+ x_cg_last_sig = last_significant_coeff_x >> 2;
-+ y_cg_last_sig = last_significant_coeff_y >> 2;
-+
-+ switch (scan_idx) {
-+ case SCAN_DIAG: {
-+ int last_x_c = last_significant_coeff_x & 3;
-+ int last_y_c = last_significant_coeff_y & 3;
-+
-+ num_coeff = diag_scan4x4_inv[last_y_c][last_x_c];
-+
-+ switch (log2_trafo_size) {
-+ case 2:
-+ scan_x_cg = scan_1x1;
-+ scan_y_cg = scan_1x1;
-+ break;
-+ case 3:
-+ num_coeff += diag_scan2x2_inv[y_cg_last_sig][x_cg_last_sig] << 4;
-+ scan_x_cg = diag_scan2x2_x;
-+ scan_y_cg = diag_scan2x2_y;
-+ break;
-+ case 4:
-+ num_coeff += diag_scan4x4_inv[y_cg_last_sig][x_cg_last_sig] << 4;
-+ scan_x_cg = ff_hevc_rpi_diag_scan4x4_x;
-+ scan_y_cg = ff_hevc_rpi_diag_scan4x4_y;
-+ break;
-+ case 5:
-+ default:
-+ num_coeff += diag_scan8x8_inv[y_cg_last_sig][x_cg_last_sig] << 4;
-+ scan_x_cg = ff_hevc_rpi_diag_scan8x8_x;
-+ scan_y_cg = ff_hevc_rpi_diag_scan8x8_y;
-+ break;
-+ }
-+ break;
-+ }
-+ case SCAN_HORIZ:
-+ scan_x_cg = horiz_scan2x2_x;
-+ scan_y_cg = horiz_scan2x2_y;
-+ num_coeff = horiz_scan8x8_inv[last_significant_coeff_y][last_significant_coeff_x];
-+ break;
-+ default: //SCAN_VERT
-+ scan_x_cg = horiz_scan2x2_y;
-+ scan_y_cg = horiz_scan2x2_x;
-+ num_coeff = horiz_scan8x8_inv[last_significant_coeff_x][last_significant_coeff_y];
-+ break;
-+ }
-+ num_coeff++;
-+ num_last_subset = (num_coeff - 1) >> 4;
-+
-+ significant_coeff_group_flag[y_cg_last_sig] = 1 << x_cg_last_sig; // 1st subset always significant
-+
-+ {
-+ const unsigned int ccount = 1 << (log2_trafo_size * 2);
-+ const int special = trans_skip_or_bypass /* || lc->tu.cross_pf */; // These need special processing
-+ use_vpu = 0;
-+ use_dc = (num_coeff == 1) && !special &&
-+ !(lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2);
-+
-+ if (use_dc) {
-+ // Just need a little empty space
-+ coeffs = dummy_coeffs;
-+ // No need to clear
-+ }
-+ else
-+ {
-+ use_vpu = !special && log2_trafo_size >= 4;
-+#if RPI_COMPRESS_COEFFS
-+ use_compress = use_vpu && lc->jb0->coeffs.s[log2_trafo_size - 2].packed;
-+#endif
-+ coeffs = rpi_alloc_coeff_buf(lc->jb0, !use_vpu ? 0 : log2_trafo_size - 2, ccount);
-+#if RPI_COMPRESS_COEFFS
-+ coeffs32 = (int*)coeffs;
-+ if (!use_compress)
-+#endif
-+#if HAVE_NEON
-+ rpi_zap_coeff_vals_neon(coeffs, log2_trafo_size - 2);
-+#else
-+ memset(coeffs, 0, ccount * sizeof(int16_t));
-+#endif
-+ }
-+ }
-+
-+ i = num_last_subset;
-+ do {
-+ int implicit_non_zero_coeff = 0;
-+ int n_end;
-+
-+ uint8_t significant_coeff_flag_idx[16];
-+ unsigned int nb_significant_coeff_flag = 0;
-+
-+ if (i == num_last_subset) {
-+ // First time through
-+ int last_scan_pos = num_coeff - (i << 4) - 1;
-+ n_end = last_scan_pos - 1;
-+ significant_coeff_flag_idx[0] = last_scan_pos;
-+ nb_significant_coeff_flag = 1;
-+ } else {
-+ n_end = 15;
-+ implicit_non_zero_coeff = (i != 0);
-+ }
-+
-+ if (n_end >= 0) {
-+ static const uint8_t ctx_idx_maps_ts2[3][16] = {
-+ D4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2
-+ H4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8), // log2_trafo_size == 2
-+ V4x4(0, 1, 4, 5, 2, 3, 4, 5, 6, 6, 8, 8, 7, 7, 8, 8) // log2_trafo_size == 2
-+ };
-+ // N.B. prev_sig = Right * 2 + Down
-+ static const uint8_t ctx_idx_maps[3][4][16] = {
-+ {
-+ D4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
-+ D4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1
-+ D4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2
-+ D4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default
-+ },
-+ {
-+ H4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
-+ H4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1
-+ H4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2
-+ H4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default
-+ },
-+ {
-+ V4x4(1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 0
-+ V4x4(2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0, 2, 1, 0, 0), // prev_sig == 1
-+ V4x4(2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0), // prev_sig == 2
-+ V4x4(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2) // prev_sig == 3, default
-+ }
-+ };
-+ const uint8_t *ctx_idx_map_p;
-+ int scf_offset = 0;
-+
-+ if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) {
-+ ctx_idx_map_p = ctx_idx_maps[0][3];
-+ scf_offset = 40 + c_idx_nz;
-+ } else {
-+ if (c_idx_nz != 0)
-+ scf_offset = 27;
-+
-+ if (log2_trafo_size == 2) {
-+ ctx_idx_map_p = ctx_idx_maps_ts2[scan_idx];
-+ } else {
-+ ctx_idx_map_p = ctx_idx_maps[scan_idx][prev_sig];
-+ if (!c_idx_nz) {
-+ if (i != 0)
-+ scf_offset += 3;
-+
-+ if (log2_trafo_size == 3) {
-+ scf_offset += (scan_idx == SCAN_DIAG) ? 9 : 15;
-+ } else {
-+ scf_offset += 21;
-+ }
-+ } else {
-+ if (log2_trafo_size == 3)
-+ scf_offset += 9;
-+ else
-+ scf_offset += 12;
-+ }
-+ }
-+ }
-+
-+ if (n_end > 0) {
-+ int cnt = get_sig_coeff_flag_idxs(&lc->cc,
-+ lc->cabac_state + elem_offset[SIGNIFICANT_COEFF_FLAG] + scf_offset,
-+ n_end, ctx_idx_map_p,
-+ significant_coeff_flag_idx + nb_significant_coeff_flag);
-+
-+ nb_significant_coeff_flag += cnt;
-+ if (cnt != 0) {
-+ implicit_non_zero_coeff = 0;
-+ }
-+ }
-+
-+ if (implicit_non_zero_coeff == 0) {
-+ if (s->ps.sps->transform_skip_context_enabled_flag && trans_skip_or_bypass) {
-+ scf_offset = 42 + c_idx_nz;
-+ } else {
-+ if (i == 0) {
-+ scf_offset = c_idx_nz ? 27 : 0;
-+ } else {
-+ scf_offset = 2 + scf_offset;
-+ }
-+ }
-+ if (significant_coeff_flag_decode_0(lc, scf_offset) == 1) {
-+ significant_coeff_flag_idx[nb_significant_coeff_flag] = 0;
-+ nb_significant_coeff_flag++;
-+ }
-+ } else {
-+ significant_coeff_flag_idx[nb_significant_coeff_flag] = 0;
-+ nb_significant_coeff_flag++;
-+ }
-+ }
-+#if RPI_COMPRESS_COEFFS
-+ if (use_compress && (nb_significant_coeff_flag + num_nonzero + 1 >= (1<<(2*log2_trafo_size-1)))) { // Overflow when half-full!
-+ int16_t temp[32*32];
-+ const unsigned int ccount = 1 << (log2_trafo_size * 2);
-+ lc->jb0->coeffs.s[log2_trafo_size - 2].packed = 0;
-+ lc->jb0->coeffs.s[log2_trafo_size - 2].packed_n = lc->jb0->coeffs.s[log2_trafo_size - 2].n - ccount; // Don't want to unpack the last buffer
-+ memcpy(temp, coeffs, sizeof(int)*num_nonzero);
-+ coeffs32 = (int *)temp;
-+ memset(coeffs, 0, ccount * sizeof(int16_t));
-+ num_nonzero--;
-+ while (num_nonzero >= 0) {
-+ const unsigned int res = coeffs32[num_nonzero];
-+ const unsigned int offset = res & 0xffff;
-+ coeffs[ offset ] = res >> 16;
-+ num_nonzero--;
-+ }
-+ use_compress = 0;
-+ }
-+#endif
-+
-+ if (nb_significant_coeff_flag != 0) {
-+ const unsigned int gt1_idx_delta = (c_idx_nz << 2) |
-+ ((i != 0 && !c_idx_nz) ? 2 : 0) |
-+ prev_subset_coded;
-+ const unsigned int idx0_gt1 = elem_offset[COEFF_ABS_LEVEL_GREATER1_FLAG] +
-+ (gt1_idx_delta << 2);
-+ const unsigned int idx_gt2 = elem_offset[COEFF_ABS_LEVEL_GREATER2_FLAG] +
-+ gt1_idx_delta;
-+
-+ const unsigned int x_cg = scan_x_cg[i];
-+ const unsigned int y_cg = scan_y_cg[i];
-+ int16_t * const blk_coeffs = coeffs +
-+ ((x_cg + (y_cg << log2_trafo_size)) << 2);
-+ // This calculation is 'wrong' for log2_traffo_size == 2
-+ // but that doesn't matter as in this case x_cg & y_cg
-+ // are always 0 so result is correct (0) anyway
-+ const uint8_t * const blk_scale = scale_matrix +
-+ (((x_cg + (y_cg << 3)) << (5 - log2_trafo_size)));
-+
-+ // * The following code block doesn't deal with these flags:
-+ // (nor did the one it replaces)
-+ //
-+ // cabac_bypass_alignment_enabled_flag
-+ // This should be easy but I can't find a test case
-+ // extended_precision_processing_flag
-+ // This can extend the required precision past 16bits
-+ // so is probably tricky - also no example found yet
-+
-+#if USE_N_END_1
-+ if (nb_significant_coeff_flag == 1) {
-+ // There is a small gain to be had from special casing the single
-+ // transform coefficient case. The reduction in complexity
-+ // makes up for the code duplicatioon.
-+
-+ int trans_coeff_level = 1;
-+ int coeff_sign_flag;
-+ int coded_val = 0;
-+
-+ // initialize first elem of coeff_bas_level_greater1_flag
-+ prev_subset_coded = 0;
-+
-+ if (get_cabac(&lc->cc, lc->cabac_state + idx0_gt1 + 1)) {
-+ trans_coeff_level = 2;
-+ prev_subset_coded = 1;
-+ coded_val = get_cabac(&lc->cc, lc->cabac_state + idx_gt2);
-+ }
-+
-+ // Probably not worth the overhead of starting by22 for just one value
-+ coeff_sign_flag = get_cabac_bypass(&lc->cc);
-+
-+ if (coded_val)
-+ {
-+ if (!s->ps.sps->persistent_rice_adaptation_enabled_flag) {
-+ trans_coeff_level = 3 + coeff_abs_level_remaining_decode(&lc->cc, 0);
-+ } else {
-+ uint8_t * const stat_coeff =
-+ lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1);
-+ const unsigned int c_rice_param = *stat_coeff >> 2;
-+ const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode(&lc->cc, c_rice_param);
-+
-+ trans_coeff_level = 3 + last_coeff_abs_level_remaining;
-+ update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
-+ }
-+ }
-+
-+ {
-+ const xy_off_t * const xy_off = scan_xy_off + significant_coeff_flag_idx[0];
-+ const int k = (int32_t)(coeff_sign_flag << 31) >> 31;
-+ const unsigned int scale_m = blk_scale[xy_off->scale];
-+ const int res = trans_scale_sat(
-+ (trans_coeff_level ^ k) - k, // Apply sign
-+ scale,
-+ i == 0 && xy_off->coeff == 0 ? dc_scale : scale_m,
-+ shift);
-+#if RPI_COMPRESS_COEFFS
-+ if (use_compress)
-+ coeffs32[num_nonzero++] = (res<<16) + (&blk_coeffs[xy_off->coeff] - coeffs);
-+ else
-+#endif
-+ blk_coeffs[xy_off->coeff] = res;
-+ }
-+ }
-+ else
-+#endif
-+ {
-+ int sign_hidden = may_hide_sign;
-+ int levels[16]; // Should be able to get away with int16_t but that fails some tests
-+ uint32_t coeff_sign_flags;
-+ uint32_t coded_vals = 0;
-+ // Sum(abs(level[]))
-+ // In fact we only need the bottom bit and in some future
-+ // version that may be all we calculate
-+ unsigned int sum_abs;
-+
-+ coded_vals = get_greaterx_bits(lc, nb_significant_coeff_flag, levels,
-+ &prev_subset_coded, &sum_abs, idx0_gt1, idx_gt2);
-+
-+ if (significant_coeff_flag_idx[0] - significant_coeff_flag_idx[nb_significant_coeff_flag - 1] <= 3)
-+ sign_hidden = 0;
-+
-+ // -- Start bypass block
-+
-+ bypass_start(&lc->cc);
-+
-+ coeff_sign_flags = coeff_sign_flag_decode_bypass(&lc->cc, nb_significant_coeff_flag - sign_hidden);
-+
-+ if (coded_vals != 0)
-+ {
-+ const int rice_adaptation_enabled = s->ps.sps->persistent_rice_adaptation_enabled_flag;
-+ uint8_t * stat_coeff = !rice_adaptation_enabled ? NULL :
-+ lc->stat_coeff + trans_skip_or_bypass + 2 - ((c_idx_nz) << 1);
-+ int c_rice_param = !rice_adaptation_enabled ? 0 : *stat_coeff >> 2;
-+ int * level = levels - 1;
-+
-+ do {
-+ {
-+ const unsigned int z = hevc_clz32(coded_vals) + 1;
-+ level += z;
-+ coded_vals <<= z;
-+ }
-+
-+ {
-+ const int last_coeff_abs_level_remaining = coeff_abs_level_remaining_decode_bypass(&lc->cc, c_rice_param);
-+ const int trans_coeff_level = *level + last_coeff_abs_level_remaining + 1;
-+
-+ sum_abs += last_coeff_abs_level_remaining + 1;
-+ *level = trans_coeff_level;
-+
-+ if (stat_coeff != NULL)
-+ update_rice(stat_coeff, last_coeff_abs_level_remaining, c_rice_param);
-+ stat_coeff = NULL;
-+
-+ if (trans_coeff_level > (3 << c_rice_param) &&
-+ (c_rice_param < 4 || rice_adaptation_enabled))
-+ ++c_rice_param;
-+ }
-+ } while (coded_vals != 0);
-+ }
-+
-+ // sign_hidden = 0 or 1 so we can combine the tests
-+ if ((sign_hidden & sum_abs) != 0) {
-+ levels[nb_significant_coeff_flag - 1] = -levels[nb_significant_coeff_flag - 1];
-+ }
-+
-+ bypass_finish(&lc->cc);
-+
-+ // -- Finish bypass block
-+
-+ // Scale loop
-+ {
-+ int m = nb_significant_coeff_flag - 1;
-+
-+ // Deal with DC component (if any) first
-+ if (i == 0 && significant_coeff_flag_idx[m] == 0)
-+ {
-+ const int k = (int32_t)(coeff_sign_flags << m) >> 31;
-+ const int res = trans_scale_sat(
-+ (levels[m] ^ k) - k, scale, dc_scale, shift);
-+#if RPI_COMPRESS_COEFFS
-+ if (use_compress)
-+ {
-+ coeffs32[num_nonzero++] = (res<<16) + (blk_coeffs - coeffs);
-+ }
-+ else
-+#endif
-+ {
-+ blk_coeffs[0] = res;
-+ }
-+ --m;
-+ }
-+
-+#if !USE_N_END_1
-+ // If N_END_1 set then m was at least 1 initially
-+ if (m >= 0)
-+#endif
-+ {
-+ do {
-+ const xy_off_t * const xy_off = scan_xy_off +
-+ significant_coeff_flag_idx[m];
-+ const int k = (int32_t)(coeff_sign_flags << m) >> 31;
-+ const int res = trans_scale_sat(
-+ (levels[m] ^ k) - k,
-+ scale,
-+ blk_scale[xy_off->scale],
-+ shift);
-+#if RPI_COMPRESS_COEFFS
-+ if (use_compress) {
-+ coeffs32[num_nonzero++] = (res<<16) + (&blk_coeffs[xy_off->coeff] - coeffs);
-+ } else
-+#endif
-+ blk_coeffs[xy_off->coeff] = res;
-+ } while (--m >= 0);
-+ }
-+ }
-+
-+ }
-+ }
-+ } while ((i = next_subset(lc, i, c_idx_nz,
-+ significant_coeff_group_flag, scan_x_cg, scan_y_cg, &prev_sig)) >= 0 &&
-+ !cabac_overflow(&lc->cc));
-+
-+ if (lc->cu.cu_transquant_bypass_flag) {
-+ if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
-+ (pred_mode_intra == 10 || pred_mode_intra == 26))) {
-+ int mode = s->ps.sps->implicit_rdpcm_enabled_flag ? (pred_mode_intra == 26) : explicit_rdpcm_dir_flag;
-+
-+ s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
-+ }
-+ } else {
-+ if (trans_skip_or_bypass) { // Must be trans_skip as we've already dealt with bypass
-+ int rot = s->ps.sps->transform_skip_rotation_enabled_flag &&
-+ log2_trafo_size == 2 &&
-+ lc->cu.pred_mode == MODE_INTRA;
-+ if (rot) {
-+ for (i = 0; i < 8; i++)
-+ FFSWAP(int16_t, coeffs[i], coeffs[16 - i - 1]);
-+ }
-+
-+ s->hevcdsp.dequant(coeffs, log2_trafo_size);
-+
-+ if (explicit_rdpcm_flag || (s->ps.sps->implicit_rdpcm_enabled_flag &&
-+ lc->cu.pred_mode == MODE_INTRA &&
-+ (pred_mode_intra == 10 || pred_mode_intra == 26))) {
-+ int mode = explicit_rdpcm_flag ? explicit_rdpcm_dir_flag : (pred_mode_intra == 26);
-+
-+ s->hevcdsp.transform_rdpcm(coeffs, log2_trafo_size, mode);
-+ }
-+ } else if (lc->cu.pred_mode == MODE_INTRA && c_idx == 0 && log2_trafo_size == 2) {
-+ s->hevcdsp.transform_4x4_luma(coeffs);
-+ }
-+ else if (!use_vpu)
-+ {
-+ int max_xy = FFMAX(last_significant_coeff_x, last_significant_coeff_y);
-+ if (max_xy == 0)
-+ {
-+ if (use_dc)
-+ rpi_add_dc(s, lc->jb0, log2_trafo_size, c_idx, x0, y0, coeffs);
-+ else
-+ s->hevcdsp.idct_dc[log2_trafo_size - 2](coeffs);
-+ }
-+ else {
-+ int col_limit = last_significant_coeff_x + last_significant_coeff_y + 4;
-+ if (max_xy < 4)
-+ col_limit = FFMIN(4, col_limit);
-+ else if (max_xy < 8)
-+ col_limit = FFMIN(8, col_limit);
-+ else if (max_xy < 12)
-+ col_limit = FFMIN(24, col_limit);
-+ s->hevcdsp.idct[log2_trafo_size - 2](coeffs, col_limit);
-+ }
-+ }
-+ }
-+
-+#if 0
-+ // Mildly rotted - we support no mode where cross is valid
-+ if (lc->tu.cross_pf) {
-+ int16_t * const coeffs_y = (int16_t*)lc->edge_emu_buffer;
-+ const int ccount = 1 << (log2_trafo_size * 2);
-+
-+ for (i = 0; i < ccount; i++) {
-+ coeffs[i] = coeffs[i] + ((lc->tu.res_scale_val * coeffs_y[i]) >> 3);
-+ }
-+ }
-+#endif
-+
-+ if (!use_dc) {
-+#if RPI_COMPRESS_COEFFS
-+ if (use_compress) {
-+ coeffs32[num_nonzero] = 0;
-+ }
-+#endif
-+ rpi_add_residual(s, lc->jb0, log2_trafo_size, c_idx, x0, y0, coeffs);
-+ }
-+}
-+
-+#if !USE_BY22
-+// Stores results to lc
-+MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc)
-+{
-+ int x = abs_mvd_greater0_flag_decode(lc);
-+ int y = abs_mvd_greater0_flag_decode(lc);
-+
-+ if (x)
-+ x += abs_mvd_greater1_flag_decode(lc);
-+ if (y)
-+ y += abs_mvd_greater1_flag_decode(lc);
-+
-+ switch (x) {
-+ case 2: x = mvd_decode(lc); break;
-+ case 1: x = mvd_sign_flag_decode(lc); break;
-+ case 0: x = 0; break;
-+ }
-+
-+ switch (y) {
-+ case 2: y = mvd_decode(lc); break;
-+ case 1: y = mvd_sign_flag_decode(lc); break;
-+ case 0: y = 0; break;
-+ }
-+ return MV_XY(x,y);
-+}
-+#else
-+MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc)
-+{
-+ int x = abs_mvd_greater0_flag_decode(lc);
-+ int y = abs_mvd_greater0_flag_decode(lc);
-+
-+ if ((x | y) == 0)
-+ return 0;
-+
-+ if (x != 0)
-+ x += abs_mvd_greater1_flag_decode(lc);
-+ if (y != 0)
-+ y += abs_mvd_greater1_flag_decode(lc);
-+
-+ if ((x | y) == 1)
-+ {
-+ // Not worth starting BY22
-+ if (x != 0)
-+ x = mvd_sign_flag_decode(lc);
-+ if (y != 0)
-+ y = mvd_sign_flag_decode(lc);
-+ }
-+ else
-+ {
-+ CABACContext * const cc = &lc->cc;
-+ uint32_t val;
-+ uint32_t b;
-+ unsigned int n = 0;
-+
-+ bypass_start(cc);
-+ b = val = get_cabac_by22_peek(cc);
-+
-+ if (x == 1) {
-+ x = ((int32_t)b >> 31) | 1;
-+ n = 1;
-+ b <<= 1;
-+ }
-+ else if (x == 2) {
-+ // EG1 so we have (leading one bits + 1) of suffix
-+ // This makes prefix & suffix lengths the same
-+ const unsigned int k = hevc_clz32(~b) + 1;
-+ int s;
-+
-+ av_assert2(k <= 15);
-+
-+ b <<= k;
-+ n = 2 * k + 1; // Includes suffix & sign
-+
-+ // We need to have k*2 + 2 (prefix, suffix, sign, y-sign) bits peeked
-+ // if we are going to do this without a flush
-+ if (k > CABAC_BY22_PEEK_BITS / 2 - 1)
-+ {
-+ // Need too many bits - flush
-+ // n = k
-+ get_cabac_by22_flush(cc, k, val);
-+ b = val = get_cabac_by22_peek(cc);
-+ n = k + 1;
-+ }
-+
-+ x = (b >> (32 - k)) + (1 << k);
-+ b <<= k;
-+ s = (int32_t)b >> 31;
-+ x = (x ^ s) - s;
-+ b <<= 1;
-+
-+ // Max abs value of an mv is 2^15 - 1 (i.e. a prefix len of 15 bits)
-+ if (y > 1 && n > CABAC_BY22_PEEK_BITS - 15)
-+ {
-+ get_cabac_by22_flush(cc, n, val);
-+ b = val = get_cabac_by22_peek(cc);
-+ n = 0;
-+ }
-+ }
-+
-+ if (y == 1) {
-+ y = ((int32_t)b >> 31) | 1;
-+ ++n;
-+ // don't care about b anymore
-+ }
-+ else if (y == 2) {
-+ const unsigned int k = hevc_clz32(~b) + 1;
-+ int s;
-+
-+ av_assert2(k <= 15);
-+
-+ // We need to have k*2 + 1 (prefix, suffix, sign) bits peeked
-+ // if we are going to do this without a flush
-+ b <<= k;
-+ n += 2 * k + 1;
-+
-+ if (n > CABAC_BY22_PEEK_BITS)
-+ {
-+ // Need too many bits - flush
-+ get_cabac_by22_flush(cc, n - (k + 1), val);
-+ b = val = get_cabac_by22_peek(cc);
-+ n = k + 1;
-+ }
-+
-+ y = (b >> (32 - k)) + (1 << k);
-+ s = (int32_t)(b << k) >> 31;
-+ y = (y ^ s) - s;
-+ // don't care about b anymore
-+ }
-+
-+ get_cabac_by22_flush(cc, n, val);
-+ bypass_finish(cc);
-+ }
-+
-+ return MV_XY(x, y);
-+}
-+#endif
-diff --git a/libavcodec/rpi_hevc_cabac_fns.h b/libavcodec/rpi_hevc_cabac_fns.h
-new file mode 100644
-index 0000000000..a6587616ae
---- /dev/null
-+++ b/libavcodec/rpi_hevc_cabac_fns.h
-@@ -0,0 +1,191 @@
-+#ifndef AVCODEC_RPI_HEVC_CABAC_FNS_H
-+#define AVCODEC_RPI_HEVC_CABAC_FNS_H
-+
-+#include "config.h"
-+#include "rpi_hevcdec.h"
-+
-+void ff_hevc_rpi_save_states(HEVCRpiContext *s, const HEVCRpiLocalContext * const lc);
-+int ff_hevc_rpi_cabac_init_decoder(HEVCRpiLocalContext * const lc);
-+void ff_hevc_rpi_cabac_init(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, const unsigned int ctb_flags);
-+int ff_hevc_rpi_sao_type_idx_decode(HEVCRpiLocalContext * const lc);
-+int ff_hevc_rpi_sao_band_position_decode(HEVCRpiLocalContext * const lc);
-+int ff_hevc_rpi_sao_offset_abs_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc);
-+int ff_hevc_rpi_sao_offset_sign_decode(HEVCRpiLocalContext * const lc);
-+int ff_hevc_rpi_sao_eo_class_decode(HEVCRpiLocalContext * const lc);
-+int ff_hevc_rpi_part_mode_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int log2_cb_size);
-+int ff_hevc_rpi_mpm_idx_decode(HEVCRpiLocalContext * const lc);
-+int ff_hevc_rpi_rem_intra_luma_pred_mode_decode(HEVCRpiLocalContext * const lc);
-+int ff_hevc_rpi_intra_chroma_pred_mode_decode(HEVCRpiLocalContext * const lc);
-+int ff_hevc_rpi_merge_idx_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc);
-+int ff_hevc_rpi_inter_pred_idc_decode(HEVCRpiLocalContext * const lc, int nPbW, int nPbH);
-+int ff_hevc_rpi_ref_idx_lx_decode(HEVCRpiLocalContext * const lc, const int num_ref_idx_lx);
-+int ff_hevc_rpi_log2_res_scale_abs(HEVCRpiLocalContext * const lc, const int idx);
-+
-+//int ff_hevc_rpi_cu_qp_delta_sign_flag(HEVCRpiLocalContext * const lc);
-+int ff_hevc_rpi_cu_qp_delta(HEVCRpiLocalContext * const lc);
-+int ff_hevc_rpi_cu_chroma_qp_offset_idx(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc);
-+void ff_hevc_rpi_hls_residual_coding(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
-+ const int x0, const int y0,
-+ const int log2_trafo_size, const enum ScanType scan_idx,
-+ const int c_idx);
-+
-+MvXY ff_hevc_rpi_hls_mvd_coding(HEVCRpiLocalContext * const lc);
-+int ff_hevc_rpi_cabac_overflow(const HEVCRpiLocalContext * const lc);
-+
-+#define HEVC_BIN_SAO_MERGE_FLAG 0
-+#define HEVC_BIN_SAO_TYPE_IDX 1
-+#define HEVC_BIN_SAO_EO_CLASS 2
-+#define HEVC_BIN_SAO_BAND_POSITION 2
-+#define HEVC_BIN_SAO_OFFSET_ABS 2
-+#define HEVC_BIN_SAO_OFFSET_SIGN 2
-+#define HEVC_BIN_END_OF_SLICE_FLAG 2
-+#define HEVC_BIN_SPLIT_CODING_UNIT_FLAG 2
-+#define HEVC_BIN_CU_TRANSQUANT_BYPASS_FLAG 5
-+#define HEVC_BIN_SKIP_FLAG 6
-+#define HEVC_BIN_CU_QP_DELTA 9
-+#define HEVC_BIN_PRED_MODE 12
-+#define HEVC_BIN_PART_MODE 13
-+#define HEVC_BIN_PCM_FLAG 17
-+#define HEVC_BIN_PREV_INTRA_LUMA_PRED_MODE 17
-+#define HEVC_BIN_MPM_IDX 18
-+#define HEVC_BIN_REM_INTRA_LUMA_PRED_MODE 18
-+#define HEVC_BIN_INTRA_CHROMA_PRED_MODE 18
-+#define HEVC_BIN_MERGE_FLAG 20
-+#define HEVC_BIN_MERGE_IDX 21
-+#define HEVC_BIN_INTER_PRED_IDC 22
-+#define HEVC_BIN_REF_IDX_L0 27
-+#define HEVC_BIN_REF_IDX_L1 29
-+#define HEVC_BIN_ABS_MVD_GREATER0_FLAG 31
-+#define HEVC_BIN_ABS_MVD_GREATER1_FLAG 33
-+#define HEVC_BIN_ABS_MVD_MINUS2 35
-+#define HEVC_BIN_MVD_SIGN_FLAG 35
-+#define HEVC_BIN_MVP_LX_FLAG 35
-+#define HEVC_BIN_NO_RESIDUAL_DATA_FLAG 36
-+#define HEVC_BIN_SPLIT_TRANSFORM_FLAG 37
-+#define HEVC_BIN_CBF_LUMA 40
-+#define HEVC_BIN_CBF_CB_CR 42
-+#define HEVC_BIN_TRANSFORM_SKIP_FLAG 46
-+#define HEVC_BIN_EXPLICIT_RDPCM_FLAG 48
-+#define HEVC_BIN_EXPLICIT_RDPCM_DIR_FLAG 50
-+#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_X_PREFIX 52
-+#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_Y_PREFIX 70
-+#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_X_SUFFIX 88
-+#define HEVC_BIN_LAST_SIGNIFICANT_COEFF_Y_SUFFIX 88
-+#define HEVC_BIN_SIGNIFICANT_COEFF_GROUP_FLAG 88
-+#define HEVC_BIN_SIGNIFICANT_COEFF_FLAG 92
-+#define HEVC_BIN_COEFF_ABS_LEVEL_GREATER1_FLAG 136
-+#define HEVC_BIN_COEFF_ABS_LEVEL_GREATER2_FLAG 160
-+#define HEVC_BIN_COEFF_ABS_LEVEL_REMAINING 166
-+#define HEVC_BIN_COEFF_SIGN_FLAG 166
-+#define HEVC_BIN_LOG2_RES_SCALE_ABS 166
-+#define HEVC_BIN_RES_SCALE_SIGN_FLAG 174
-+#define HEVC_BIN_CU_CHROMA_QP_OFFSET_FLAG 176
-+#define HEVC_BIN_CU_CHROMA_QP_OFFSET_IDX 177
-+
-+
-+int ff_hevc_rpi_get_cabac(CABACContext * const c, uint8_t * const state);
-+int ff_hevc_rpi_get_cabac_terminate(CABACContext * const c);
-+
-+static inline const uint8_t* ff_hevc_rpi_cabac_skip_bytes(CABACContext * const c, int n) {
-+ const uint8_t *ptr = c->bytestream;
-+
-+ if (c->low & 0x1)
-+ ptr--;
-+#if CABAC_BITS == 16
-+ if (c->low & 0x1FF)
-+ ptr--;
-+#endif
-+ if ((int) (c->bytestream_end - ptr) < n)
-+ return NULL;
-+ if (ff_init_cabac_decoder(c, ptr + n, c->bytestream_end - ptr - n) < 0)
-+ return NULL;
-+
-+ return ptr;
-+}
-+
-+static inline int ff_hevc_rpi_sao_merge_flag_decode(HEVCRpiLocalContext * const lc)
-+{
-+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SAO_MERGE_FLAG);
-+}
-+
-+static inline int ff_hevc_rpi_cu_transquant_bypass_flag_decode(HEVCRpiLocalContext * const lc)
-+{
-+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_TRANSQUANT_BYPASS_FLAG);
-+}
-+
-+static inline int ff_hevc_rpi_cu_chroma_qp_offset_flag(HEVCRpiLocalContext * const lc)
-+{
-+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CU_CHROMA_QP_OFFSET_FLAG);
-+}
-+
-+static inline int ff_hevc_rpi_split_coding_unit_flag_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
-+ const unsigned int ct_depth,
-+ const unsigned int x0, const unsigned int y0)
-+{
-+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SPLIT_CODING_UNIT_FLAG +
-+ ((s->cabac_stash_left[y0 >> 3] >> 1) > ct_depth) +
-+ ((s->cabac_stash_up[x0 >> 3] >> 1) > ct_depth));
-+}
-+
-+static inline int ff_hevc_rpi_skip_flag_decode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
-+ const int x0, const int y0, const int x_cb, const int y_cb)
-+{
-+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SKIP_FLAG +
-+ (s->cabac_stash_left[y0 >> 3] & 1) +
-+ (s->cabac_stash_up[x0 >> 3] & 1));
-+}
-+
-+static inline int ff_hevc_rpi_pred_mode_decode(HEVCRpiLocalContext * const lc)
-+{
-+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_PRED_MODE);
-+}
-+
-+static inline int ff_hevc_rpi_pcm_flag_decode(HEVCRpiLocalContext * const lc)
-+{
-+ return ff_hevc_rpi_get_cabac_terminate(&lc->cc);
-+}
-+
-+static inline int ff_hevc_rpi_prev_intra_luma_pred_flag_decode(HEVCRpiLocalContext * const lc)
-+{
-+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_PREV_INTRA_LUMA_PRED_MODE);
-+}
-+
-+static inline int ff_hevc_rpi_merge_flag_decode(HEVCRpiLocalContext * const lc)
-+{
-+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_MERGE_FLAG);
-+}
-+
-+static inline int ff_hevc_rpi_mvp_lx_flag_decode(HEVCRpiLocalContext * const lc)
-+{
-+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_MVP_LX_FLAG);
-+}
-+
-+static inline int ff_hevc_rpi_no_residual_syntax_flag_decode(HEVCRpiLocalContext * const lc)
-+{
-+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_NO_RESIDUAL_DATA_FLAG);
-+}
-+
-+static inline int ff_hevc_rpi_cbf_cb_cr_decode(HEVCRpiLocalContext * const lc, const int trafo_depth)
-+{
-+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CBF_CB_CR + trafo_depth);
-+}
-+
-+static inline int ff_hevc_rpi_cbf_luma_decode(HEVCRpiLocalContext * const lc, const int trafo_depth)
-+{
-+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_CBF_LUMA + !trafo_depth);
-+}
-+
-+static inline int ff_hevc_rpi_split_transform_flag_decode(HEVCRpiLocalContext * const lc, const int log2_trafo_size)
-+{
-+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_SPLIT_TRANSFORM_FLAG + 5 - log2_trafo_size);
-+}
-+
-+static inline int ff_hevc_rpi_res_scale_sign_flag(HEVCRpiLocalContext *const lc, const int idx)
-+{
-+ return ff_hevc_rpi_get_cabac(&lc->cc, lc->cabac_state + HEVC_BIN_RES_SCALE_SIGN_FLAG + idx);
-+}
-+
-+
-+
-+#endif
-+
-diff --git a/libavcodec/rpi_hevc_data.c b/libavcodec/rpi_hevc_data.c
-new file mode 100644
-index 0000000000..341bb77d9d
---- /dev/null
-+++ b/libavcodec/rpi_hevc_data.c
-@@ -0,0 +1,75 @@
-+/*
-+ * HEVC shared tables
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include <stdint.h>
-+
-+#include "rpi_hevc_data.h"
-+
-+const uint8_t ff_hevc_rpi_diag_scan4x4_x[16] = {
-+ 0, 0, 1, 0,
-+ 1, 2, 0, 1,
-+ 2, 3, 1, 2,
-+ 3, 2, 3, 3,
-+};
-+
-+const uint8_t ff_hevc_rpi_diag_scan4x4_y[16] = {
-+ 0, 1, 0, 2,
-+ 1, 0, 3, 2,
-+ 1, 0, 3, 2,
-+ 1, 3, 2, 3,
-+};
-+
-+const uint8_t ff_hevc_rpi_diag_scan8x8_x[64] = {
-+ 0, 0, 1, 0,
-+ 1, 2, 0, 1,
-+ 2, 3, 0, 1,
-+ 2, 3, 4, 0,
-+ 1, 2, 3, 4,
-+ 5, 0, 1, 2,
-+ 3, 4, 5, 6,
-+ 0, 1, 2, 3,
-+ 4, 5, 6, 7,
-+ 1, 2, 3, 4,
-+ 5, 6, 7, 2,
-+ 3, 4, 5, 6,
-+ 7, 3, 4, 5,
-+ 6, 7, 4, 5,
-+ 6, 7, 5, 6,
-+ 7, 6, 7, 7,
-+};
-+
-+const uint8_t ff_hevc_rpi_diag_scan8x8_y[64] = {
-+ 0, 1, 0, 2,
-+ 1, 0, 3, 2,
-+ 1, 0, 4, 3,
-+ 2, 1, 0, 5,
-+ 4, 3, 2, 1,
-+ 0, 6, 5, 4,
-+ 3, 2, 1, 0,
-+ 7, 6, 5, 4,
-+ 3, 2, 1, 0,
-+ 7, 6, 5, 4,
-+ 3, 2, 1, 7,
-+ 6, 5, 4, 3,
-+ 2, 7, 6, 5,
-+ 4, 3, 7, 6,
-+ 5, 4, 7, 6,
-+ 5, 7, 6, 7,
-+};
-diff --git a/libavcodec/rpi_hevc_data.h b/libavcodec/rpi_hevc_data.h
-new file mode 100644
-index 0000000000..0aee673d8b
---- /dev/null
-+++ b/libavcodec/rpi_hevc_data.h
-@@ -0,0 +1,31 @@
-+/*
-+ * HEVC shared data tables
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#ifndef AVCODEC_RPI_HEVC_DATA_H
-+#define AVCODEC_RPI_HEVC_DATA_H
-+
-+#include <stdint.h>
-+
-+extern const uint8_t ff_hevc_rpi_diag_scan4x4_x[16];
-+extern const uint8_t ff_hevc_rpi_diag_scan4x4_y[16];
-+extern const uint8_t ff_hevc_rpi_diag_scan8x8_x[64];
-+extern const uint8_t ff_hevc_rpi_diag_scan8x8_y[64];
-+
-+#endif /* AVCODEC_RPI_HEVC_DATA_H */
-diff --git a/libavcodec/rpi_hevc_filter.c b/libavcodec/rpi_hevc_filter.c
-new file mode 100644
-index 0000000000..dd5f65b5c4
---- /dev/null
-+++ b/libavcodec/rpi_hevc_filter.c
-@@ -0,0 +1,1206 @@
-+/*
-+ * HEVC video decoder
-+ *
-+ * Copyright (C) 2012 - 2013 Guillaume Martres
-+ * Copyright (C) 2013 Seppo Tomperi
-+ * Copyright (C) 2013 Wassim Hamidouche
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+//#define DISABLE_SAO
-+//#define DISABLE_DEBLOCK
-+//#define DISABLE_STRENGTHS
-+// define DISABLE_DEBLOCK_NONREF for a 6% speed boost (by skipping deblocking on unimportant frames)
-+//#define DISABLE_DEBLOCK_NONREF
-+
-+#include "libavutil/common.h"
-+#include "libavutil/internal.h"
-+
-+#include "rpi_hevcdec.h"
-+
-+#include "bit_depth_template.c"
-+
-+#include "rpi_qpu.h"
-+#include "rpi_zc.h"
-+#include "libavutil/rpi_sand_fns.h"
-+
-+#define LUMA 0
-+#define CB 1
-+#define CR 2
-+
-+// tc lookup. Index = qp (0..51) + tc_offset (-12..12) + (bs-1)*2 (0 or 2)
-+// so -12..65 overall; tctable points 60 entries into tctablex, giving zero padding down to -60
-+static const uint8_t tctablex[] = {
-+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -ve quant padding (-60..-13)
-+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-+
-+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -12..-1
-+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, // QP 0...18
-+ 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, // QP 19...37
-+ 5, 5, 6, 6, 7, 8, 9, 10, 11, 13, 14, 16, 18, 20, 22, 24, // QP 38...53
-+ 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24 // 54..65 (12 entries, saturated at 24)
-+};
-+#define tctable (tctablex + 12 + 6*8)
-+
-+static const uint8_t betatablex[] = {
-+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -ve quant padding (-60..-13)
-+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-+
-+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // -12..-1
-+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8, // QP 0...18
-+ 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, // QP 19...37
-+ 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, // QP 38...51
-+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 // 52..63 (12 entries, saturated at 64)
-+};
-+#define betatable (betatablex + 12 + 6*8)
-+
-+static inline int chroma_tc(const HEVCRpiContext * const s, const int qp_y,
-+ const int c_idx, const int tc_offset)
-+{
-+ return tctable[(int)s->ps.pps->qp_dblk_x[c_idx][qp_y] + tc_offset + 2];
-+}
-+
-+static inline int get_qPy_pred(const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc,
-+ const unsigned int xBase, const unsigned int yBase)
-+{
-+ const unsigned int ctb_size_mask = (1 << s->ps.sps->log2_ctb_size) - 1;
-+ const unsigned int MinCuQpDeltaSizeMask = ~0U << s->ps.pps->log2_min_cu_qp_delta_size;
-+ const unsigned int xQgBase = xBase & MinCuQpDeltaSizeMask;
-+ const unsigned int yQgBase = yBase & MinCuQpDeltaSizeMask;
-+ const unsigned int min_cb_width = s->ps.sps->min_cb_width;
-+ const unsigned int x_cb = xQgBase >> s->ps.sps->log2_min_cb_size;
-+ const unsigned int y_cb = yQgBase >> s->ps.sps->log2_min_cb_size;
-+ const int qPy_pred = lc->qPy_pred;
-+
-+ return (((xQgBase & ctb_size_mask) == 0 ? qPy_pred :
-+ s->qp_y_tab[(x_cb - 1) + y_cb * min_cb_width]) +
-+ ((yQgBase & ctb_size_mask) == 0 ? qPy_pred :
-+ s->qp_y_tab[x_cb + (y_cb - 1) * min_cb_width]) + 1) >> 1;
-+}
-+
-+// * Only called from bitstream decode in foreground
-+// so should be safe
-+void ff_hevc_rpi_set_qPy(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int xBase, int yBase)
-+{
-+ const int qp_y = get_qPy_pred(s, lc, xBase, yBase);
-+
-+ if (lc->tu.cu_qp_delta != 0) {
-+ // ?? I suspect that the -bd_offset here leads to us adding it elsewhere
-+ int off = s->ps.sps->qp_bd_offset;
-+ lc->qp_y = FFUMOD(qp_y + lc->tu.cu_qp_delta + 52 + 2 * off,
-+ 52 + off) - off;
-+ } else
-+ lc->qp_y = qp_y;
-+}
-+
-+static inline unsigned int pixel_shift(const HEVCRpiContext * const s, const unsigned int c_idx)
-+{
-+ return c_idx != 0 ? 1 + s->ps.sps->pixel_shift : s->ps.sps->pixel_shift;
-+}
-+
-+// "DSP" these?
-+static void copy_pixel(uint8_t *dst, const uint8_t *src, int pixel_shift)
-+{
-+ switch (pixel_shift)
-+ {
-+ case 2:
-+ *(uint32_t *)dst = *(uint32_t *)src;
-+ break;
-+ case 1:
-+ *(uint16_t *)dst = *(uint16_t *)src;
-+ break;
-+ default:
-+ *dst = *src;
-+ break;
-+ }
-+}
-+
-+static void copy_CTB_to_hv(const HEVCRpiContext * const s, const uint8_t * const src,
-+ ptrdiff_t stride_src, int x, int y, int width, int height,
-+ int c_idx, int x_ctb, int y_ctb)
-+{
-+ const unsigned int sh = pixel_shift(s, c_idx);
-+ const unsigned int w = s->ps.sps->width >> ctx_hshift(s, c_idx);
-+ const unsigned int h = s->ps.sps->height >> ctx_vshift(s, c_idx);
-+
-+ /* copy horizontal edges */
-+ memcpy(s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb) * w + x) << sh),
-+ src, width << sh);
-+ memcpy(s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 1) * w + x) << sh),
-+ src + stride_src * (height - 1), width << sh);
-+
-+ /* copy vertical edges */
-+ ff_hevc_rpi_copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb) * h + y) << sh), src, sh, height, 1 << sh, stride_src);
-+
-+ ff_hevc_rpi_copy_vert(s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 1) * h + y) << sh), src + ((width - 1) << sh), sh, height, 1 << sh, stride_src);
-+}
-+
-+// N.B. Src & dst are swapped as this is a restore!
-+// x0 & y0 are in luma coords
-+// Width & height are in Y/C pels as appropriate
-+// * Clear scope for optimisation here but not used enough to be worth it
-+static void restore_tqb_pixels(const HEVCRpiContext * const s,
-+ uint8_t *src1, const uint8_t *dst1,
-+ const ptrdiff_t stride_src, const ptrdiff_t stride_dst,
-+ const unsigned int x0, const unsigned int y0,
-+ const unsigned int width, const int height,
-+ const int c_idx)
-+{
-+ if (s->ps.pps->transquant_bypass_enable_flag ||
-+ s->ps.sps->pcm.loop_filter_disable_flag)
-+ {
-+ const uint8_t *pcm = s->is_pcm + (x0 >> 6) + (y0 >> 3) * s->ps.sps->pcm_width;
-+ int blks_y = height >> (c_idx == 0 ? 3 : 2);
-+ const unsigned int bwidth = 8 << s->ps.sps->pixel_shift; // Y & C have the same width in sand
-+ const unsigned int bheight = (c_idx == 0) ? 8 : 4;
-+ const unsigned int sh = ((x0 >> 3) & 7);
-+ const unsigned int mask = (1 << (width >> (c_idx == 0 ? 3 : 2))) - 1;
-+
-+ do {
-+ unsigned int m = (*pcm >> sh) & mask;
-+ uint8_t * bd = src1;
-+ const uint8_t * bs = dst1;
-+ while (m != 0) {
-+ if ((m & 1) != 0) {
-+ s->hevcdsp.cpy_blk(bd, stride_src, bs, stride_dst, bwidth, bheight);
-+ }
-+ m >>= 1;
-+ bs += bwidth;
-+ bd += bwidth;
-+ }
-+ src1 += stride_src * bheight;
-+ dst1 += stride_dst * bheight;
-+ pcm += s->ps.sps->pcm_width;
-+ } while (--blks_y > 0);
-+ }
-+}
-+
-+#define CTB(tab, x, y) ((tab)[(y) * s->ps.sps->ctb_width + (x)])
-+
-+static void sao_filter_CTB(const HEVCRpiContext * const s, const int x, const int y)
-+{
-+#if SAO_FILTER_N == 5
-+ static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 2 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */};
-+#elif SAO_FILTER_N == 6
-+ static const uint8_t sao_tab[8] = { 0 /* 8 */, 1 /* 16 */, 5 /* 24 */, 2 /* 32 */, 3, 3 /* 48 */, 4, 4 /* 64 */};
-+#else
-+#error Confused by size of sao fn array
-+#endif
-+ int c_idx;
-+ int edges[4]; // 0 left 1 top 2 right 3 bottom
-+ int x_ctb = x >> s->ps.sps->log2_ctb_size;
-+ int y_ctb = y >> s->ps.sps->log2_ctb_size;
-+ int ctb_addr_rs = y_ctb * s->ps.sps->ctb_width + x_ctb;
-+ int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs];
-+ RpiSAOParams *sao = &CTB(s->sao, x_ctb, y_ctb);
-+ // flags indicating unfilterable edges
-+ uint8_t vert_edge[] = { 0, 0 };
-+ uint8_t horiz_edge[] = { 0, 0 };
-+ uint8_t diag_edge[] = { 0, 0, 0, 0 };
-+ uint8_t lfase = CTB(s->filter_slice_edges, x_ctb, y_ctb);
-+ uint8_t no_tile_filter = s->ps.pps->tiles_enabled_flag &&
-+ !s->ps.pps->loop_filter_across_tiles_enabled_flag;
-+ uint8_t restore = no_tile_filter || !lfase;
-+ uint8_t left_tile_edge = 0;
-+ uint8_t right_tile_edge = 0;
-+ uint8_t up_tile_edge = 0;
-+ uint8_t bottom_tile_edge = 0;
-+ const int sliced = 1;
-+ const int plane_count = sliced ? 2 : (ctx_cfmt(s) != 0 ? 3 : 1);
-+
-+ edges[0] = x_ctb == 0;
-+ edges[1] = y_ctb == 0;
-+ edges[2] = x_ctb == s->ps.sps->ctb_width - 1;
-+ edges[3] = y_ctb == s->ps.sps->ctb_height - 1;
-+
-+#ifdef DISABLE_SAO
-+ return;
-+#endif
-+
-+ if (restore) {
-+ if (!edges[0]) {
-+ left_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs-1]];
-+ vert_edge[0] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb)) || left_tile_edge;
-+ }
-+ if (!edges[2]) {
-+ right_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs+1]];
-+ vert_edge[1] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb)) || right_tile_edge;
-+ }
-+ if (!edges[1]) {
-+ up_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs - s->ps.sps->ctb_width]];
-+ horiz_edge[0] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb, y_ctb - 1)) || up_tile_edge;
-+ }
-+ if (!edges[3]) {
-+ bottom_tile_edge = no_tile_filter && s->ps.pps->tile_id[ctb_addr_ts] != s->ps.pps->tile_id[s->ps.pps->ctb_addr_rs_to_ts[ctb_addr_rs + s->ps.sps->ctb_width]];
-+ horiz_edge[1] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb, y_ctb + 1)) || bottom_tile_edge;
-+ }
-+ if (!edges[0] && !edges[1]) {
-+ diag_edge[0] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb - 1)) || left_tile_edge || up_tile_edge;
-+ }
-+ if (!edges[1] && !edges[2]) {
-+ diag_edge[1] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb - 1)) || right_tile_edge || up_tile_edge;
-+ }
-+ if (!edges[2] && !edges[3]) {
-+ diag_edge[2] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb + 1, y_ctb + 1)) || right_tile_edge || bottom_tile_edge;
-+ }
-+ if (!edges[0] && !edges[3]) {
-+ diag_edge[3] = (!lfase && CTB(s->tab_slice_address, x_ctb, y_ctb) != CTB(s->tab_slice_address, x_ctb - 1, y_ctb + 1)) || left_tile_edge || bottom_tile_edge;
-+ }
-+ }
-+
-+ for (c_idx = 0; c_idx < plane_count; c_idx++) {
-+ const unsigned int vshift = ctx_vshift(s, c_idx);
-+ const unsigned int hshift = ctx_hshift(s, c_idx);
-+ const int x0 = x >> hshift;
-+ const int y0 = y >> vshift;
-+ const ptrdiff_t stride_src = frame_stride1(s->frame, c_idx);
-+ const int ctb_size_h = (1 << (s->ps.sps->log2_ctb_size)) >> hshift;
-+ const int ctb_size_v = (1 << (s->ps.sps->log2_ctb_size)) >> vshift;
-+ const int width = FFMIN(ctb_size_h, (s->ps.sps->width >> hshift) - x0);
-+ const int height = FFMIN(ctb_size_v, (s->ps.sps->height >> vshift) - y0);
-+ int tab = sao_tab[(FFALIGN(width, 8) >> 3) - 1];
-+ ptrdiff_t stride_dst;
-+ uint8_t *dst;
-+
-+ const unsigned int sh = s->ps.sps->pixel_shift + (sliced && c_idx != 0);
-+ const int wants_lr = sao->type_idx[c_idx] == SAO_EDGE && sao->eo_class[c_idx] != 1 /* Vertical */;
-+ uint8_t * const src = !sliced ?
-+ &s->frame->data[c_idx][y0 * stride_src + (x0 << sh)] :
-+ c_idx == 0 ?
-+ av_rpi_sand_frame_pos_y(s->frame, x0, y0) :
-+ av_rpi_sand_frame_pos_c(s->frame, x0, y0);
-+ const uint8_t * const src_l = edges[0] || !wants_lr ? NULL :
-+ !sliced ? src - (1 << sh) :
-+ c_idx == 0 ?
-+ av_rpi_sand_frame_pos_y(s->frame, x0 - 1, y0) :
-+ av_rpi_sand_frame_pos_c(s->frame, x0 - 1, y0);
-+ const uint8_t * const src_r = edges[2] || !wants_lr ? NULL :
-+ !sliced ? src + (width << sh) :
-+ c_idx == 0 ?
-+ av_rpi_sand_frame_pos_y(s->frame, x0 + width, y0) :
-+ av_rpi_sand_frame_pos_c(s->frame, x0 + width, y0);
-+
-+ if (sliced && c_idx > 1) {
-+ break;
-+ }
-+
-+// if (c_idx == 1)
-+// printf("%d: %dx%d %d,%d: lr=%d\n", c_idx, width, height, x0, y0, wants_lr);
-+
-+ switch (sao->type_idx[c_idx]) {
-+ case SAO_BAND:
-+ copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx,
-+ x_ctb, y_ctb);
-+ if (s->ps.pps->transquant_bypass_enable_flag ||
-+ s->ps.sps->pcm.loop_filter_disable_flag)
-+ {
-+ // Can't use the edge buffer here as it may be in use by the foreground
-+ DECLARE_ALIGNED(64, uint8_t, dstbuf)
-+ [2*MAX_PB_SIZE*MAX_PB_SIZE];
-+ dst = dstbuf;
-+ stride_dst = 2*MAX_PB_SIZE;
-+ s->hevcdsp.cpy_blk(dst, stride_dst, src, stride_src, width << sh, height);
-+ if (sliced && c_idx != 0)
-+ {
-+ s->hevcdsp.sao_band_filter_c[tab](src, dst, stride_src, stride_dst,
-+ sao->offset_val[1], sao->band_position[1],
-+ sao->offset_val[2], sao->band_position[2],
-+ width, height);
-+ }
-+ else
-+ {
-+ s->hevcdsp.sao_band_filter[tab](src, dst, stride_src, stride_dst,
-+ sao->offset_val[c_idx], sao->band_position[c_idx],
-+ width, height);
-+ }
-+ restore_tqb_pixels(s, src, dst, stride_src, stride_dst,
-+ x, y, width, height, c_idx);
-+ } else {
-+ if (sliced && c_idx != 0)
-+ {
-+ s->hevcdsp.sao_band_filter_c[tab](src, src, stride_src, stride_src,
-+ sao->offset_val[1], sao->band_position[1],
-+ sao->offset_val[2], sao->band_position[2],
-+ width, height);
-+ }
-+ else
-+ {
-+ s->hevcdsp.sao_band_filter[tab](src, src, stride_src, stride_src,
-+ sao->offset_val[c_idx], sao->band_position[c_idx],
-+ width, height);
-+ }
-+ }
-+ sao->type_idx[c_idx] = SAO_APPLIED;
-+ break;
-+ case SAO_EDGE:
-+ {
-+ const int w = s->ps.sps->width >> hshift;
-+ const int h = s->ps.sps->height >> vshift;
-+ int top_edge = edges[1];
-+ int bottom_edge = edges[3];
-+ // Can't use the edge buffer here as it may be in use by the foreground
-+ DECLARE_ALIGNED(64, uint8_t, dstbuf)
-+ [RPI_HEVC_SAO_BUF_STRIDE * (MAX_PB_SIZE + 2) + 64];
-+
-+ stride_dst = RPI_HEVC_SAO_BUF_STRIDE;
-+ dst = dstbuf + stride_dst + 32;
-+
-+ if (!top_edge) {
-+ uint8_t *dst1;
-+ int src_idx;
-+ const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb - 1) * w + x0) << sh);
-+
-+ dst1 = dst - stride_dst;
-+
-+ if (src_l != NULL) {
-+ src_idx = (CTB(s->sao, x_ctb-1, y_ctb-1).type_idx[c_idx] ==
-+ SAO_APPLIED);
-+ copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l - stride_src, sh);
-+ }
-+
-+ src_idx = (CTB(s->sao, x_ctb, y_ctb-1).type_idx[c_idx] ==
-+ SAO_APPLIED);
-+ memcpy(dst1, src_idx ? src_spb : src - stride_src, width << sh);
-+
-+ if (src_r != NULL) {
-+ src_idx = (CTB(s->sao, x_ctb+1, y_ctb-1).type_idx[c_idx] ==
-+ SAO_APPLIED);
-+ copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r - stride_src, sh);
-+ }
-+ }
-+ if (!bottom_edge) {
-+ uint8_t * const dst1 = dst + height * stride_dst;
-+ int src_idx;
-+ const uint8_t * const src_spb = s->sao_pixel_buffer_h[c_idx] + (((2 * y_ctb + 2) * w + x0) << sh);
-+ const unsigned int hoff = height * stride_src;
-+
-+ if (src_l != NULL) {
-+ src_idx = (CTB(s->sao, x_ctb-1, y_ctb+1).type_idx[c_idx] ==
-+ SAO_APPLIED);
-+ copy_pixel(dst1 - (1 << sh), src_idx ? src_spb - (1 << sh) : src_l + hoff, sh);
-+ }
-+
-+ src_idx = (CTB(s->sao, x_ctb, y_ctb+1).type_idx[c_idx] ==
-+ SAO_APPLIED);
-+ memcpy(dst1, src_idx ? src_spb : src + hoff, width << sh);
-+
-+ if (src_r != NULL) {
-+ src_idx = (CTB(s->sao, x_ctb+1, y_ctb+1).type_idx[c_idx] ==
-+ SAO_APPLIED);
-+ copy_pixel(dst1 + (width << sh), src_idx ? src_spb + (width << sh) : src_r + hoff, sh);
-+ }
-+ }
-+ if (src_l != NULL) {
-+ if (CTB(s->sao, x_ctb-1, y_ctb).type_idx[c_idx] == SAO_APPLIED) {
-+ ff_hevc_rpi_copy_vert(dst - (1 << sh),
-+ s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb - 1) * h + y0) << sh),
-+ sh, height, stride_dst, 1 << sh);
-+ } else {
-+ ff_hevc_rpi_copy_vert(dst - (1 << sh),
-+ src_l,
-+ sh, height, stride_dst, stride_src);
-+ }
-+ }
-+ if (src_r != NULL) {
-+ if (CTB(s->sao, x_ctb+1, y_ctb).type_idx[c_idx] == SAO_APPLIED) {
-+ ff_hevc_rpi_copy_vert(dst + (width << sh),
-+ s->sao_pixel_buffer_v[c_idx] + (((2 * x_ctb + 2) * h + y0) << sh),
-+ sh, height, stride_dst, 1 << sh);
-+ } else {
-+ ff_hevc_rpi_copy_vert(dst + (width << sh),
-+ src_r,
-+ sh, height, stride_dst, stride_src);
-+ }
-+ }
-+
-+ s->hevcdsp.cpy_blk(dst, stride_dst, src, stride_src, width << sh, height);
-+
-+ copy_CTB_to_hv(s, src, stride_src, x0, y0, width, height, c_idx,
-+ x_ctb, y_ctb);
-+ if (sliced && c_idx != 0)
-+ {
-+ // Class always the same for both U & V (which is just as well :-))
-+ s->hevcdsp.sao_edge_filter_c[tab](src, dst, stride_src,
-+ sao->offset_val[1], sao->offset_val[2], sao->eo_class[1],
-+ width, height);
-+ s->hevcdsp.sao_edge_restore_c[restore](src, dst,
-+ stride_src, stride_dst,
-+ sao,
-+ edges, width,
-+ height, c_idx,
-+ vert_edge,
-+ horiz_edge,
-+ diag_edge);
-+ }
-+ else
-+ {
-+ s->hevcdsp.sao_edge_filter[tab](src, dst, stride_src, sao->offset_val[c_idx],
-+ sao->eo_class[c_idx], width, height);
-+ s->hevcdsp.sao_edge_restore[restore](src, dst,
-+ stride_src, stride_dst,
-+ sao,
-+ edges, width,
-+ height, c_idx,
-+ vert_edge,
-+ horiz_edge,
-+ diag_edge);
-+ }
-+ restore_tqb_pixels(s, src, dst, stride_src, stride_dst,
-+ x, y, width, height, c_idx);
-+ sao->type_idx[c_idx] = SAO_APPLIED;
-+ break;
-+ }
-+ }
-+ }
-+
-+#if RPI_ZC_SAND_8_IN_10_BUF
-+ if (s->frame->format == AV_PIX_FMT_SAND64_10 && s->frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL &&
-+ (((x + (1 << (s->ps.sps->log2_ctb_size))) & 255) == 0 || edges[2]))
-+ {
-+ const unsigned int stride1 = frame_stride1(s->frame, 1);
-+ const unsigned int stride2 = av_rpi_sand_frame_stride2(s->frame);
-+ const unsigned int xoff = (x >> 8) * stride2 * stride1;
-+ const unsigned int ctb_size = (1 << s->ps.sps->log2_ctb_size);
-+ const uint8_t * const sy = s->frame->data[0] + xoff * 4 + y * stride1;
-+ uint8_t * const dy = s->frame->buf[4]->data + xoff * 2 + y * stride1;
-+ const uint8_t * const sc = s->frame->data[1] + xoff * 4 + (y >> 1) * stride1;
-+ uint8_t * const dc = s->frame->buf[4]->data + (s->frame->data[1] - s->frame->data[0]) + xoff * 2 + (y >> 1) * stride1;
-+ const unsigned int wy = !edges[2] ? 256 : s->ps.sps->width - (x & ~255);
-+ const unsigned int hy = !edges[3] ? ctb_size : s->ps.sps->height - y;
-+
-+// printf("dy=%p/%p, stride1=%d, stride2=%d, sy=%p/%p, wy=%d, hy=%d, x=%d, y=%d, cs=%d\n", dy, dc, stride1, stride2, sy, sc, wy, hy, x, y, ctb_size);
-+ av_rpi_sand16_to_sand8(dy, stride1, stride2, sy, stride1, stride2, wy, hy, 3);
-+ av_rpi_sand16_to_sand8(dc, stride1, stride2, sc, stride1, stride2, wy, hy >> 1, 3);
-+ }
-+#endif
-+}
-+
-+// When bits are delivered to deblock we want them
-+//#define TL 1
-+//#define TR 2
-+//#define BL 4
-+//#define BR 8
-+
-+// pcm4 gathers is_pcm flag bits around (x, y) and returns them as b0 = tl, b1 = tr, b16 = bl, b17 = br
-+// so we need to rearrange before passing on
-+// Reads 2 bytes at the (x >> 6, y >> 3) position and 2 bytes one pcm_width further on, then shifts right by (x >> 3) & 7
-+static inline uint32_t pcm4(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y)
-+{
-+ const uint8_t * const pcm = s->is_pcm + (x >> 6) + (y >> 3) * s->ps.sps->pcm_width;
-+ return (pcm[0] |
-+ (pcm[1] << 8) |
-+ (pcm[s->ps.sps->pcm_width] << 16) |
-+ ((uint32_t)pcm[s->ps.sps->pcm_width + 1] << 24)) >> ((x >> 3) & 7); // widen first: uint8_t << 24 would overflow a signed int
-+}
-+
-+static inline uint32_t pcm2(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y)
-+{
-+ const uint8_t * const pcm = s->is_pcm + (x >> 6) + (y >> 3) * s->ps.sps->pcm_width;
-+ return (pcm[0] | (pcm[1] << 8)) >> ((x >> 3) & 7);
-+}
-+
-+// We cast away const here as we want this to work for both get and set
-+static inline uint32_t * bs_ptr32(const uint8_t * bs, const unsigned int stride2, const unsigned int x, const unsigned int y)
-+{
-+ return (uint32_t *)(bs +
-+#if (~3U & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) != 0
-+#warning Unexpected masks
-+ // As it happens we end up with stride1 = sizeof(uint32_t) so this expr vanishes
-+ ((x >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) &
-+ (~3 & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT))) +
-+#elif HEVC_RPI_BS_STRIDE1_BYTES < 4
-+#error Stride1 < return size
-+#endif
-+ ((y >> HEVC_RPI_BS_Y_SHR) << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) +
-+ (x >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT) * stride2);
-+}
-+
-+static inline uint8_t * bs_ptr8(const uint8_t * bs, const unsigned int stride2, const unsigned int x, const unsigned int y)
-+{
-+ return (uint8_t *)(bs +
-+ ((x >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT) &
-+ (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) +
-+ ((y >> HEVC_RPI_BS_Y_SHR) << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT) +
-+ (x >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT) * stride2);
-+}
-+
-+
-+// Get block strength
-+// Given how we call we will always get within the 32bit boundaries
-+static inline uint32_t bs_get32(const uint8_t * bs, unsigned int stride2,
-+ unsigned int xl, unsigned int xr, const unsigned int y)
-+{
-+ if (xr <= xl) {
-+ return 0;
-+ }
-+ else
-+ {
-+#if HAVE_ARMV6T2_INLINE
-+#if (~3U & (HEVC_RPI_BS_STRIDE1_PEL_MASK >> HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)) != 0
-+#error This case not yet handled in bs_get32
-+#elif HEVC_RPI_BS_STRIDE1_BYTES < 4
-+#error Stride1 < return size
-+#endif
-+ uint32_t tmp;
-+ __asm__ (
-+ "lsr %[tmp], %[xl], %[xl_shift] \n\t"
-+ "rsb %[xr], %[xl], %[xr] \n\t"
-+ "mla %[stride2], %[stride2], %[tmp], %[bs] \n\t"
-+ "add %[xr], %[xr], #7 \n\t"
-+ "lsr %[bs], %[y], %[y_shift1] \n\t"
-+ "bic %[xr], %[xr], #7 \n\t"
-+ "ubfx %[xl], %[xl], #1, #5 \n\t"
-+ "lsr %[xr], %[xr], #1 \n\t"
-+ "cmp %[xr], #32 \n\t"
-+ "mvn %[tmp], #0 \n\t"
-+ "ldr %[bs], [%[stride2], %[bs], lsl %[y_shift2]] \n\t"
-+ "lsl %[tmp], %[tmp], %[xr] \n\t"
-+ "lsr %[xl], %[bs], %[xl] \n\t"
-+ "it ne \n\t"
-+ "bicne %[bs], %[xl], %[tmp] \n\t"
-+ : // Outputs
-+ [bs]"+r"(bs),
-+ [stride2]"+r"(stride2),
-+ [xl]"+r"(xl),
-+ [xr]"+r"(xr),
-+ [tmp]"=&r"(tmp)
-+ : // Inputs
-+ [y]"r"(y),
-+ [xl_shift]"M"(HEVC_RPI_BS_STRIDE1_PEL_SHIFT),
-+ [y_shift1]"M"(HEVC_RPI_BS_Y_SHR),
-+ [y_shift2]"M"(HEVC_RPI_BS_STRIDE1_BYTE_SHIFT)
-+ : // Clobbers
-+ "cc"
-+ );
-+ return (uint32_t) bs;
-+#else
-+ const uint32_t a = *bs_ptr32(bs, stride2, xl, y);
-+ const unsigned int n = ((xr - xl + 7) & ~7) >> 1;
-+
-+ return n == 32 ? a :
-+ (a >> ((xl >> 1) & 31)) & ~(~0U << n);
-+#endif
-+ }
-+}
-+
-+static inline uint32_t hbs_get32(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y)
-+{
-+ av_assert2(((xl ^ (xr - 1)) >> s->ps.sps->log2_ctb_size) == 0);
-+ return bs_get32(s->bs_horizontal, s->bs_stride2, xl, xr, y);
-+}
-+
-+static inline uint32_t vbs_get32(const HEVCRpiContext * const s, const unsigned int xl, const unsigned int xr, const unsigned int y)
-+{
-+ av_assert2(((xl ^ (xr - 1)) >> s->ps.sps->log2_ctb_size) == 0);
-+ return bs_get32(s->bs_vertical, s->bs_stride2, xl, xr, y);
-+}
-+
-+
-+static void deblock_y_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int end_x, const int end_y)
-+{
-+ const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size;
-+ const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size;
-+ const unsigned int ctb_size = (1 << log2_ctb_size);
-+ const unsigned int cb_r = FFMIN(bounds.x + bounds.w, s->ps.sps->width) - (end_x ? 0 : 1);
-+ const unsigned int ctb_n = (bounds.x + bounds.y * s->ps.sps->ctb_width) >> log2_ctb_size;
-+ const DBParams * cb_dbp = s->deblock + ctb_n;
-+ const unsigned int b_b = FFMIN(bounds.y + bounds.h, s->ps.sps->height) - (end_y ? 0 : 8);
-+
-+ unsigned int cb_x;
-+
-+ // Do in CTB-shaped blocks
-+ for (cb_x = bounds.x; cb_x < cb_r; cb_x += ctb_size, ++cb_dbp)
-+ {
-+ const unsigned int bv_r = FFMIN(cb_x + ctb_size, cb_r);
-+ const unsigned int bv_l = FFMAX(cb_x, 8);
-+ const unsigned int bh_r = cb_x + ctb_size >= cb_r ? cb_r - 8 : cb_x + ctb_size - 9;
-+ const unsigned int bh_l = bv_l - 8;
-+ unsigned int y;
-+
-+ // Main body
-+ for (y = (bounds.y == 0 ? 0 : bounds.y - 8); y < b_b; y += 8)
-+ {
-+ uint32_t vbs = vbs_get32(s, bv_l, bv_r, y);
-+
-+ const DBParams * const dbp = y < bounds.y ? cb_dbp - s->ps.sps->ctb_width : cb_dbp;
-+ const int8_t * const qta = s->qp_y_tab + ((y - 1) >> log2_min_cb_size) * s->ps.sps->min_cb_width;
-+ const int8_t * const qtb = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width;
-+
-+ if (vbs != 0)
-+ {
-+ const uint8_t * const tcv = tctable + dbp->tc_offset;
-+ const uint8_t * const betav = betatable + dbp->beta_offset;
-+ unsigned int pcmfa = pcm2(s, bv_l - 1, y);
-+ unsigned int x;
-+
-+ for (x = bv_l; vbs != 0; x += 8, vbs >>= 4, pcmfa >>= 1)
-+ {
-+ if ((vbs & 0xf) != 0 && (pcmfa & 3) != 3)
-+ {
-+ const int qp = (qtb[(x - 1) >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
-+ s->hevcdsp.hevc_v_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y),
-+ frame_stride1(s->frame, LUMA),
-+ betav[qp],
-+ ((vbs & 3) == 0 ? 0 : tcv[qp + (int)(vbs & 2)]) |
-+ (((vbs & 0xc) == 0 ? 0 : tcv[qp + (int)((vbs >> 2) & 2)]) << 16),
-+ pcmfa & 3,
-+ av_rpi_sand_frame_pos_y(s->frame, x - 4, y));
-+ }
-+ }
-+ }
-+
-+ if (y != 0)
-+ {
-+ uint32_t hbs;
-+
-+ // H left - mostly separated out so we only need a uint32_t hbs
-+ if ((hbs = hbs_get32(s, bh_l, cb_x, y)) != 0)
-+ {
-+ const unsigned int x = bh_l;
-+ const unsigned int pcmfa = pcm4(s, bh_l, y - 1);
-+ const int qp = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
-+ const DBParams * const dbph = dbp - 1;
-+ const uint8_t * const tc = tctable + dbph->tc_offset + qp;
-+
-+ av_assert2(cb_x - bh_l == 8);
-+
-+ s->hevcdsp.hevc_h_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y),
-+ frame_stride1(s->frame, LUMA),
-+ betatable[qp + dbph->beta_offset],
-+ ((hbs & 3) == 0 ? 0 : tc[hbs & 2]) |
-+ (((hbs & 0xc) == 0 ? 0 : tc[(hbs >> 2) & 2]) << 16),
-+ (pcmfa & 1) | ((pcmfa & 0x10000) >> 15));
-+ }
-+
-+ // H
-+ if ((hbs = hbs_get32(s, cb_x, bh_r + 1, y)) != 0) // Will give (x <= bh_r) in for loop
-+ {
-+ unsigned int x;
-+ unsigned int pcmfa = pcm4(s, cb_x, y - 1);
-+
-+ for (x = cb_x; hbs != 0; x += 8, hbs >>= 4, pcmfa >>= 1)
-+ {
-+ if ((hbs & 0xf) != 0 && (~pcmfa & 0x10001) != 0)
-+ {
-+ const int qp = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
-+ const uint8_t * const tc = tctable + dbp->tc_offset + qp;
-+ s->hevcdsp.hevc_h_loop_filter_luma2(av_rpi_sand_frame_pos_y(s->frame, x, y),
-+ frame_stride1(s->frame, LUMA),
-+ betatable[qp + dbp->beta_offset],
-+ ((hbs & 3) == 0 ? 0 : tc[hbs & 2]) |
-+ (((hbs & 0xc) == 0 ? 0 : tc[(hbs >> 2) & 2]) << 16),
-+ (pcmfa & 1) | ((pcmfa & 0x10000) >> 15));
-+ }
-+ }
-+ }
-+ }
-+
-+ }
-+ }
-+}
-+// Rounded mean of the two qp_y_tab entries at columns x - 1 and x in the row holding y
-+static av_always_inline int q2h(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y)
-+{
-+ const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size;
-+ const int8_t * const qt = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width; // Table is indexed in min-CB units
-+ return (qt[(x - 1) >> log2_min_cb_size] + qt[x >> log2_min_cb_size] + 1) >> 1;
-+}
-+// Deblock the chroma samples for the area given by bounds, walking it in CTB-wide columns
-+static void deblock_uv_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int end_x, const int end_y)
-+{
-+ const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size;
-+ const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size;
-+ const unsigned int ctb_size = (1 << log2_ctb_size);
-+ const unsigned int cb_r = FFMIN(bounds.x + bounds.w, s->ps.sps->width) - (end_x ? 0 : 8); // Right limit: held 8 pels short unless end_x is set
-+ const unsigned int ctb_n = (bounds.x + bounds.y * s->ps.sps->ctb_width) >> log2_ctb_size;
-+ const DBParams * dbp = s->deblock + ctb_n;
-+ const unsigned int b_b = FFMIN(bounds.y + bounds.h, s->ps.sps->height) - (end_y ? 0 : 8); // Bottom limit: held 8 pels short unless end_y is set
-+ const uint8_t * const tcq_u = s->ps.pps->qp_dblk_x[1];
-+ const uint8_t * const tcq_v = s->ps.pps->qp_dblk_x[2];
-+
-+ unsigned int cb_x;
-+
-+ av_assert1((bounds.x & (ctb_size - 1)) == 0);
-+ av_assert1((bounds.y & (ctb_size - 1)) == 0);
-+ av_assert1(bounds.h <= ctb_size);
-+
-+ // Do in CTB-shaped blocks
-+ for (cb_x = bounds.x; cb_x < cb_r; cb_x += ctb_size, ++dbp) {
-+ const unsigned int bv_r = FFMIN(cb_x + ctb_size, cb_r);
-+ const unsigned int bv_l = FFMAX(cb_x, 16);
-+ unsigned int y;
-+
-+ // V above
-+ if (bounds.y != 0) {
-+ // Deblock V up 8
-+ // CTB above current
-+ // Top-half only (tc4 & ~0xffff == 0) is special cased in asm
-+ const unsigned int y = bounds.y - 8; // Shadows the outer y inside this block only
-+ uint32_t vbs = vbs_get32(s, bv_l, bv_r, y) & 0x02020202U;
-+
-+ if (vbs != 0)
-+ {
-+ unsigned int pcmfa = pcm2(s, bv_l - 1, y);
-+ const uint8_t * const tc = tctable + 2 + (dbp - s->ps.sps->ctb_width)->tc_offset; // Parameters of the CTB one row up
-+ unsigned int x;
-+
-+ for (x = bv_l; vbs != 0; x += 16, vbs >>= 8, pcmfa >>= 2)
-+ {
-+ if ((vbs & 2) != 0 && (~pcmfa & 3) != 0)
-+ {
-+ const int qp0 = q2h(s, x, y);
-+ s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
-+ frame_stride1(s->frame, 1),
-+ tc[tcq_u[qp0]] | (tc[tcq_v[qp0]] << 8),
-+ av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1),
-+ pcmfa & 3);
-+ }
-+ }
-+ }
-+ }
-+
-+ for (y = bounds.y; y < b_b; y += 16)
-+ {
-+ uint32_t vbs = (vbs_get32(s, bv_l, bv_r, y) & 0x02020202U) |
-+ (y + 16 > b_b ? 0 : (vbs_get32(s, bv_l, bv_r, y + 8) & 0x02020202U) << 4); // Second row of 8 only if it lies inside b_b
-+
-+ // V
-+ if (vbs != 0)
-+ {
-+ unsigned int x;
-+ unsigned int pcmfa =
-+ (y + 16 > b_b ?
-+ pcm2(s, bv_l - 1, y) | 0xffff0000 :
-+ pcm4(s, bv_l - 1, y));
-+ const uint8_t * const tc = tctable + 2 + dbp->tc_offset;
-+
-+ for (x = bv_l; vbs != 0; x += 16, vbs >>= 8, pcmfa >>= 2)
-+ {
-+ if ((vbs & 0xff) != 0 && (~pcmfa & 0x30003) != 0)
-+ {
-+ const int qp0 = q2h(s, x, y);
-+ const int qp1 = q2h(s, x, y + 8);
-+ s->hevcdsp.hevc_v_loop_filter_uv2(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
-+ frame_stride1(s->frame, 1),
-+ ((vbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) |
-+ ((vbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)),
-+ av_rpi_sand_frame_pos_c(s->frame, (x >> 1) - 2, y >> 1),
-+ (pcmfa & 3) | ((pcmfa >> 14) & 0xc));
-+ }
-+ }
-+ }
-+
-+ // H
-+ if (y != 0)
-+ {
-+ uint32_t hbs;
-+ const unsigned int bh_l = bv_l - 16;
-+ const unsigned int bh_r = cb_x + ctb_size >= cb_r ? cb_r : cb_x + ctb_size - 16;
-+ const int8_t * const qta = s->qp_y_tab + ((y - 1) >> log2_min_cb_size) * s->ps.sps->min_cb_width; // QP row for the line above the edge
-+ const int8_t * const qtb = s->qp_y_tab + (y >> log2_min_cb_size) * s->ps.sps->min_cb_width; // QP row for the line below the edge
-+
-+ // H left - mostly separated out so we only need a uint32_t hbs
-+ // Stub is width 8 to the left of bounds, but width 16 internally
-+ if ((hbs = hbs_get32(s, bh_l, cb_x, y) & 0x22U) != 0)
-+ {
-+ unsigned int pcmfa = pcm4(s, bh_l, y - 1);
-+
-+ // Chop off bits we don't want...
-+ if (bh_l < bounds.x) {
-+ pcmfa |= 0x10001; // TL|BL pre rearrangement
-+ hbs &= ~3; // Make BS 0
-+ }
-+
-+ // Double check we still want this
-+ if (hbs != 0 && (~pcmfa & 0x30003) != 0)
-+ {
-+ const unsigned int x = bh_l;
-+ const int qp0 = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
-+ const int qp1 = (qta[(x + 8) >> log2_min_cb_size] + qtb[(x + 8) >> log2_min_cb_size] + 1) >> 1;
-+ const uint8_t * const tc = tctable + 2 + (dbp - 1)->tc_offset; // Parameters of the CTB to the left
-+
-+ s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
-+ frame_stride1(s->frame, 1),
-+ ((hbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) |
-+ ((hbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)),
-+ (pcmfa & 3) | ((pcmfa >> 14) & 0xc));
-+ }
-+ }
-+
-+ // H main
-+ if ((hbs = (hbs_get32(s, cb_x, bh_r, y) & 0x22222222U)) != 0)
-+ {
-+ unsigned int x;
-+ unsigned int pcmfa = pcm4(s, cb_x, y - 1); // Might like to mask out far right writes but probably not worth it
-+
-+ for (x = cb_x; hbs != 0; x += 16, hbs >>= 8, pcmfa >>= 2)
-+ {
-+ if ((hbs & 0xff) != 0 && (~pcmfa & 0x30003) != 0)
-+ {
-+ const int qp0 = (qta[x >> log2_min_cb_size] + qtb[x >> log2_min_cb_size] + 1) >> 1;
-+ const int qp1 = (qta[(x + 8) >> log2_min_cb_size] + qtb[(x + 8) >> log2_min_cb_size] + 1) >> 1;
-+ const uint8_t * const tc = tctable + 2 + dbp->tc_offset;
-+
-+ s->hevcdsp.hevc_h_loop_filter_uv(av_rpi_sand_frame_pos_c(s->frame, x >> 1, y >> 1),
-+ frame_stride1(s->frame, 1),
-+ ((hbs & 2) == 0 ? 0 : (tc[tcq_u[qp0]] << 0) | (tc[tcq_v[qp0]] << 8)) |
-+ ((hbs & 0x20) == 0 ? 0 : (tc[tcq_u[qp1]] << 16) | (tc[tcq_v[qp1]] << 24)),
-+ (pcmfa & 3) | ((pcmfa >> 14) & 0xc));
-+ }
-+ }
-+ }
-+ }
-+ }
-+ }
-+}
-+// Returns the low log2_n bits of x: non-zero iff x is not a multiple of (1 << log2_n)
-+static inline unsigned int off_boundary(const unsigned int x, const unsigned int log2_n)
-+{
-+ return x & ~(~0U << log2_n); // Mask with log2_n low bits set; log2_n must be below the width of unsigned int
-+}
-+// OR the bits of bsf selected by mask into the 32-bit bs_horizontal word for (x, y)
-+static inline void hbs_set(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf)
-+{
-+ av_assert2((y & 7) == 0);
-+
-+ // This doesn't have the same simultaneous update issues that bsf_stash
-+ // does (other threads will have a different y) so we can do it the easy way
-+ if ((bsf &= mask) != 0)
-+ *bs_ptr32(s->bs_horizontal, s->bs_stride2, x, y) |= bsf << ((x >> 1) & 31); // One bit position per 2 pels of x within the word
-+}
-+
-+// OR the bits of bsf selected by mask into the byte-wide bs_vertical entries starting at (x, y)
-+static void vbs_set(const HEVCRpiContext * const s, const unsigned int x, const unsigned int y, const uint32_t mask, uint32_t bsf)
-+{
-+ // We arrange this in a slightly odd fashion but it lines up with
-+ // how we are going to use it in the actual deblock code & it is easier
-+ // to do the contortions here than there
-+ //
-+ // Arrange (LE) {x0y0, x0y4, x8y0, x8,y4}, {x16y0, x16y4, x24y0, x24y4},...
-+
-+ av_assert2((x & 7) == 0);
-+
-+ if ((bsf &= mask) != 0)
-+ {
-+ uint8_t *p = bs_ptr8(s->bs_vertical, s->bs_stride2, x, y);
-+ const unsigned int sh = ((x & 8) | (y & 4)) >> 1; // 0, 2, 4 or 6: which 2-bit slot of the byte to use
-+
-+ if (mask <= 0xf)
-+ {
-+ *p |= (bsf << sh); // At most 4 bits of bsf: everything fits in this one byte
-+ }
-+ else
-+ {
-+ do {
-+ *p |= (bsf & 0xf) << sh; // 4 bits per step, one step per HEVC_RPI_BS_STRIDE1_BYTES
-+ p += HEVC_RPI_BS_STRIDE1_BYTES;
-+ } while ((bsf >>= 4) != 0);
-+ }
-+ }
-+}
-+// Thin wrapper over hevcdsp.hevc_deblocking_boundary_strengths: converts the two mvf strides from elements to bytes
-+static inline uint32_t bsf_mv(const HEVCRpiContext * const s,
-+ const unsigned int rep, const unsigned int dup,
-+ const unsigned int mvf_stride0,
-+ const unsigned int mvf_stride1,
-+ const RefPicList * const rpl_p, const RefPicList * const rpl_q,
-+ const HEVCRpiMvField * const mvf_p, const HEVCRpiMvField * const mvf_q)
-+{
-+ return s->hevcdsp.hevc_deblocking_boundary_strengths(rep, dup,
-+ mvf_p, mvf_q,
-+ rpl_p[0].list, rpl_p[1].list, rpl_q[0].list, rpl_q[1].list, // Both lists (index 0 and 1) of each side
-+ sizeof(HEVCRpiMvField) * mvf_stride0, sizeof(HEVCRpiMvField) * mvf_stride1);
-+}
-+
-+// Work out the boundary-strength bits for the block at (x0, y0) and record them with hbs_set() / vbs_set()
-+void ff_hevc_rpi_deblocking_boundary_strengths(const HEVCRpiContext * const s,
-+ const HEVCRpiLocalContext * const lc,
-+ const unsigned int x0, const unsigned int y0,
-+ const unsigned int log2_trafo_size,
-+ const int is_coded_block)
-+{
-+ const HEVCRpiMvField * const mvf_curr = mvf_stash_ptr(s, lc, x0, y0);
-+ const unsigned int log2_min_pu_size = LOG2_MIN_PU_SIZE;
-+ const RefPicList * const rpl = s->refPicList;
-+ // Rep count for bsf_mv when running with min_pu chunks
-+ const unsigned int log2_rep_min_pu = log2_trafo_size <= log2_min_pu_size ? 0 : log2_trafo_size - log2_min_pu_size;
-+ const unsigned int boundary_flags = s->sh.no_dblk_boundary_flags & lc->boundary_flags;
-+ const unsigned int trafo_size = (1U << log2_trafo_size);
-+ const uint32_t bsf_mask = log2_trafo_size > 5 ? ~0U : (1U << (trafo_size >> 1)) - 1; // trafo_size / 2 low bits set; the size-64 case would need a 32-bit shift so is special cased
-+ const uint32_t bsf_cbf = (bsf_mask & 0x55555555); // Even bits (b0 of each 2-bit element) only
-+
-+ // Do we cover a pred split line?
-+ const int has_x_split = x0 < lc->cu.x_split && x0 + trafo_size > lc->cu.x_split;
-+ const int has_y_split = y0 < lc->cu.y_split && y0 + trafo_size > lc->cu.y_split;
-+
-+ uint32_t bsf_h;
-+ uint32_t bsf_v;
-+
-+#ifdef DISABLE_STRENGTHS
-+ return;
-+#endif
-+
-+ // We are always on a size boundary
-+ av_assert2((x0 & (trafo_size - 1)) == 0);
-+ av_assert2((y0 & (trafo_size - 1)) == 0);
-+ // log2_trafo_size not really a transform size; we can have to deal
-+ // with size 2^6 blocks
-+ av_assert2(log2_trafo_size >= 2 && log2_trafo_size <= 6);
-+
-+ // Retrieve and update coded (b0), intra (b1) bs flags
-+ //
-+ // Store on min width (rather than uint32_t) to avoid possible issues
-+ // with another thread on another core running wpp using the same
-+ // memory (min CTB = 16 pels = 4 bsf els = 8 bits)
-+ //
-+ // In bsf BS=2 is represented by 3 as it is much easier to test & set
-+ // and the actual deblock code tests for 0 and b1 set/not-set so 2 and
-+ // 3 will work the same
-+ {
-+ // Given where we are called from is_cbf_luma & is_intra will be constant over the block
-+ const uint32_t bsf0 = (lc->cu.pred_mode == MODE_INTRA) ? bsf_mask : is_coded_block ? bsf_cbf : 0;
-+ uint8_t *const p = s->bsf_stash_up + (x0 >> 4);
-+ uint8_t *const q = s->bsf_stash_left + (y0 >> 4);
-+
-+ switch (log2_trafo_size)
-+ {
-+ case 2:
-+ case 3:
-+ {
-+ const unsigned int sh_h = (x0 >> 1) & 7;
-+ const unsigned int sh_v = (y0 >> 1) & 7;
-+ bsf_h = *p;
-+ bsf_v = *q;
-+ *p = (bsf_h & ~(bsf_mask << sh_h)) | (bsf0 << sh_h); // Replace only this block's bits within the shared byte
-+ *q = (bsf_v & ~(bsf_mask << sh_v)) | (bsf0 << sh_v);
-+ bsf_h >>= sh_h;
-+ bsf_v >>= sh_v;
-+ break;
-+ }
-+ case 4:
-+ bsf_h = *p;
-+ bsf_v = *q;
-+ *p = bsf0;
-+ *q = bsf0;
-+ break;
-+ case 5:
-+ bsf_h = *(uint16_t *)p; // NOTE(review): accesses the uint8_t stash through a wider type - relies on p being suitably aligned
-+ bsf_v = *(uint16_t *)q;
-+ *(uint16_t *)p = bsf0;
-+ *(uint16_t *)q = bsf0;
-+ break;
-+ case 6:
-+ default:
-+ bsf_h = *(uint32_t *)p;
-+ bsf_v = *(uint32_t *)q;
-+ *(uint32_t *)p = bsf0;
-+ *(uint32_t *)q = bsf0;
-+ break;
-+ }
-+
-+ bsf_h |= bsf0; // Previous stash contents combined with this block's own flags
-+ bsf_v |= bsf0;
-+ }
-+
-+ // Do Horizontal
-+ if ((y0 & 7) == 0)
-+ {
-+ // Boundary upper
-+ if (y0 != 0 &&
-+ (off_boundary(y0, s->ps.sps->log2_ctb_size) ||
-+ (boundary_flags & (BOUNDARY_UPPER_SLICE | BOUNDARY_UPPER_TILE)) == 0))
-+ {
-+ // Look at MVs (BS=1) if we don't already have a full set of bs bits
-+ if ((~bsf_h & bsf_cbf) != 0 && (y0 == lc->cu.y || y0 == lc->cu.y_split))
-+ {
-+ // If we aren't on the top boundary we must be in the middle
-+ // and in that case we know where mvf can change
-+ const unsigned int log2_rep = (y0 == lc->cu.y) ? log2_rep_min_pu : has_x_split ? 1 : 0;
-+ const RefPicList *const rpl_top = !off_boundary(y0, s->ps.sps->log2_ctb_size) ?
-+ s->rpl_up[x0 >> s->ps.sps->log2_ctb_size] :
-+ rpl;
-+
-+ bsf_h |= bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep),
-+ trafo_size >> (log2_min_pu_size + log2_rep),
-+ trafo_size >> (log2_min_pu_size + log2_rep),
-+ rpl, rpl_top,
-+ mvf_curr, mvf_ptr(s, lc, x0, y0, x0, y0 - 1));
-+ }
-+
-+ // Finally put the results into bs
-+ hbs_set(s, x0, y0, bsf_mask, bsf_h);
-+ }
-+
-+ // Max of 1 pu internal split - ignore if not on 8pel boundary
-+ if (has_y_split && !off_boundary(lc->cu.y_split, 3))
-+ {
-+ const HEVCRpiMvField * const mvf = mvf_stash_ptr(s, lc, x0, lc->cu.y_split);
-+ // If we have the x split as well then it must be in the middle
-+ const unsigned int log2_rep = has_x_split ? 1 : 0;
-+
-+ hbs_set(s, x0, lc->cu.y_split, bsf_mask,
-+ bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep),
-+ trafo_size >> (log2_min_pu_size + log2_rep),
-+ trafo_size >> (log2_min_pu_size + log2_rep),
-+ rpl, rpl,
-+ mvf, mvf - MVF_STASH_WIDTH_PU)); // Compare against the stash row directly above the split
-+ }
-+ }
-+
-+ // And again for vertical - same logic as horizontal just in the other direction
-+ if ((x0 & 7) == 0)
-+ {
-+ // Boundary left
-+ if (x0 != 0 &&
-+ (off_boundary(x0, s->ps.sps->log2_ctb_size) ||
-+ (boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0))
-+ {
-+ if ((~bsf_v & bsf_cbf) != 0 && (x0 == lc->cu.x || x0 == lc->cu.x_split))
-+ {
-+ const unsigned int log2_rep = (x0 == lc->cu.x) ? log2_rep_min_pu : has_y_split ? 1 : 0;
-+ const RefPicList *const rpl_left = !off_boundary(x0, s->ps.sps->log2_ctb_size) ?
-+ s->rpl_left[y0 >> s->ps.sps->log2_ctb_size] :
-+ rpl;
-+
-+ bsf_v |= bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep),
-+ (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep),
-+ (mvf_left_stride(s, x0, x0 - 1) << log2_trafo_size) >> (log2_min_pu_size + log2_rep),
-+ rpl, rpl_left,
-+ mvf_curr, mvf_ptr(s, lc, x0, y0, x0 - 1, y0));
-+ }
-+
-+ vbs_set(s, x0, y0, bsf_mask, bsf_v);
-+ }
-+
-+ if (has_x_split && !off_boundary(lc->cu.x_split, 3))
-+ {
-+ const HEVCRpiMvField *const mvf = mvf_stash_ptr(s, lc, lc->cu.x_split, y0);
-+ const unsigned int log2_rep = has_y_split ? 1 : 0;
-+
-+ vbs_set(s, lc->cu.x_split, y0, bsf_mask,
-+ bsf_mv(s, 1 << log2_rep, trafo_size >> (2 + log2_rep),
-+ (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep),
-+ (MVF_STASH_WIDTH_PU << log2_trafo_size) >> (log2_min_pu_size + log2_rep),
-+ rpl, rpl,
-+ mvf, mvf - 1)); // Compare against the stash entry directly left of the split
-+ }
-+ }
-+}
-+
-+#undef LUMA
-+#undef CB
-+#undef CR
-+// Unsigned saturating subtract: a - b, clamped to 0 instead of wrapping when a < b
-+static inline unsigned int ussub(const unsigned int a, const unsigned int b)
-+{
-+ return a < b ? 0 : a - b;
-+}
-+// Non-zero when (x >> xshl) has no bits set above bit 5, i.e. the shifted value is below 64
-+static inline int cache_boundry(const AVFrame * const frame, const unsigned int x)
-+{
-+ return ((x >> av_rpi_sand_frame_xshl(frame)) & ~63) == 0; // NOTE(review): the name suggests an alignment test ("& 63"); as written this masks with ~63 - confirm intent
-+}
-+// Deblock, SAO-filter and cache-flush the area given by bounds; returns 0, or ib / INT_MAX once a flush reached the right picture edge
-+int ff_hevc_rpi_hls_filter_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int eot)
-+{
-+ const int ctb_size = (1 << s->ps.sps->log2_ctb_size);
-+ int x, y;
-+
-+ const unsigned int br = FFMIN(bounds.x + bounds.w, s->ps.sps->width); // Right / bottom of bounds clipped to the picture
-+ const unsigned int bb = FFMIN(bounds.y + bounds.h, s->ps.sps->height);
-+
-+ const int x_end = (br >= s->ps.sps->width);
-+ const int y_end = (bb >= s->ps.sps->height);
-+
-+ // Deblock may not touch the edges of the bound as they are still needed
-+ // for Intra pred
-+ //
-+ // Deblock is disabled with a per-slice flag
-+ // Given that bounds may cover multiple slices & we deblock outside bounds
-+ // anyway we can't avoid deblock using that flag - about the only thing we
-+ // could do is have a "no deblock seen yet" flag but it doesn't really
-+ // seem worth the effort
-+
-+ deblock_y_blk(s, bounds, x_end, y_end);
-+ deblock_uv_blk(s, bounds, x_end, y_end);
-+
-+ // SAO needs
-+ // (a) CTB alignment
-+ // (b) Valid pixels all the way around the CTB in particular it needs the DR pixel
-+ {
-+ const unsigned int xo = bounds.x - ((bounds.x - 16) & ~(ctb_size - 1)); // Distance back to the CTB boundary at or before (bounds.x - 16)
-+ const unsigned int yo = bounds.y - ((bounds.y - 16) & ~(ctb_size - 1));
-+ const unsigned int yt = ussub(bounds.y, yo);
-+ const unsigned int yb = y_end ? bb : ussub(bb, yo);
-+ const unsigned int xl = ussub(bounds.x, xo);
-+ const unsigned int xr = x_end ? br : ussub(br, xo);
-+
-+ if (s->ps.sps->sao_enabled)
-+ {
-+ for (y = yt; y < yb; y += ctb_size) {
-+ for (x = xl; x < xr; x += ctb_size) {
-+ sao_filter_CTB(s, x, y);
-+ }
-+ }
-+ }
-+
-+ // Cache invalidate
-+ y = 0; // Reused from here on as the return value
-+ if (xr != 0 && yb != 0)
-+ {
-+ const unsigned int llen =
-+ (av_rpi_sand_frame_stride1(s->frame) >> av_rpi_sand_frame_xshl(s->frame));
-+ const unsigned int mask = ~(llen - 1); // Rounds down to a multiple of llen (assumes llen is a power of 2 - TODO confirm)
-+ const unsigned int il = (xl == 0) ? 0 : (xl - 1) & mask;
-+ const unsigned int ir = x_end || !cache_boundry(s->frame, br) ? br : (xr - 1) & mask;
-+ const unsigned int it = ussub(yt, 1);
-+ const unsigned int ib = y_end ? bb : yb - 1;
-+
-+ if (il < ir) {
-+ rpi_cache_buf_t cbuf;
-+ rpi_cache_flush_env_t * const rfe = rpi_cache_flush_init(&cbuf);
-+ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
-+ il, it, ir - il, ib - it,
-+ ctx_vshift(s, 1), 1, 1);
-+
-+ // If we have to commit the right hand tile boundary due to
-+ // cache boundary considerations then at EoTile we must commit
-+ // that boundary to bottom of tile (bounds)
-+ if (ib != bb && ir == br && eot) {
-+ rpi_cache_flush_add_frame_block(rfe, s->frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE,
-+ br - 1, ib, 1, bb - ib,
-+ ctx_vshift(s, 1), 1, 1);
-+ }
-+
-+ rpi_cache_flush_finish(rfe);
-+
-+ if (x_end)
-+ y = y_end ? INT_MAX : ib;
-+
-+// printf("Flush: %4d,%4d -> %4d,%4d: signal: %d\n", il, it, ir, ib, y - 1);
-+ }
-+ }
-+ }
-+
-+ return y;
-+}
-+
-diff --git a/libavcodec/rpi_hevc_mv.h b/libavcodec/rpi_hevc_mv.h
-new file mode 100644
-index 0000000000..6b36f5e737
---- /dev/null
-+++ b/libavcodec/rpi_hevc_mv.h
-@@ -0,0 +1,71 @@
-+#ifndef AVCODEC_RPI_HEVC_MV_H
-+#define AVCODEC_RPI_HEVC_MV_H
-+
-+#include "config.h"
-+
-+typedef int32_t MvXY; // Packed motion vector: x in the low 16 bits, y in the high 16 bits (see MV_X / MV_Y)
-+
-+typedef struct HEVCRpiMvField {
-+ MvXY xy[2]; // One packed MV per reference list, same index as ref_idx
-+ int8_t ref_idx[2];
-+ int8_t pred_flag;
-+ int8_t dummy; // To 12 bytes
-+} HEVCRpiMvField;
-+// Accessors for a packed MvXY: MV_X is the sign-extended low 16 bits, MV_Y the high 16 bits
-+// MV_XY packs a pair; every argument is parenthesised so any expression may be passed safely
-+#define MV_X(xy) (((xy) << 16) >> 16)
-+#define MV_Y(xy) ((xy) >> 16)
-+#define MV_XY(x, y) (((x) & 0xffff) | ((y) << 16))
-+
-+#if ARCH_ARM
-+#include "arm/rpi_hevc_mv_arm.h"
-+#endif
-+// Generic component-wise add of two packed MVs; skipped when mvxy_add is already defined as a macro
-+#ifndef mvxy_add
-+static inline MvXY mvxy_add(const MvXY a, const MvXY b)
-+{
-+ return MV_XY(MV_X(a) + MV_X(b), MV_Y(a) + MV_Y(b)); // x sum is truncated to 16 bits by MV_XY
-+}
-+#endif
-+
-+// Generic scaling of both components of a packed MV by (approximately) tb / td in fixed point
-+#ifndef mv_scale_xy
-+static inline MvXY mv_scale_xy(const MvXY src, int td, int tb)
-+{
-+ int tx, scale_factor;
-+
-+ td = td == 0 ? 1 : av_clip_int8(td); // Avoid a divide by zero below
-+ tb = av_clip_int8(tb);
-+ tx = (0x4000 + (abs(td) >> 1)) / td; // Rounded 2^14 / td
-+ scale_factor = av_clip_intp2((tb * tx + 32) >> 6, 12); // ~ 256 * tb / td
-+ return MV_XY(
-+ av_clip_int16((scale_factor * MV_X(src) + 127 +
-+ (scale_factor * MV_X(src) < 0)) >> 8),
-+ av_clip_int16((scale_factor * MV_Y(src) + 127 +
-+ (scale_factor * MV_Y(src) < 0)) >> 8));
-+}
-+#endif
-+
-+// 8.3.1 states that the bitstream may not contain poc diffs that do not
-+// fit in 16 bits, so given that we don't care about the high bits we only
-+// store the low 16 + LT & Inter flags
-+
-+#define COL_POC_INTRA 0 // No flag bits set
-+#define COL_POC_INTER (1 << 16)
-+#define COL_POC_LT (1 << 17)
-+#define COL_POC_DIFF(x,y) ((int16_t)((x) - (y))) // Difference truncated to 16 bits and sign-extended
-+#define COL_POC_MAKE_INTER(lt,poc) (COL_POC_INTER | ((lt) ? COL_POC_LT : 0) | ((poc) & 0xffff))
-+#define COL_POC_IS_LT(x) (((x) & COL_POC_LT) != 0)
-+// Storage for collocated MVs: a poc word in the COL_POC_* encoding plus a packed MV
-+typedef struct ColMv_s {
-+ int32_t poc; // COL_POC_* flags with the low 16 bits of the POC
-+ int32_t xy; // Packed MV in the same layout as MvXY
-+} ColMv;
-+
-+typedef struct ColMvField_s {
-+ ColMv L[2]; // One entry per reference list
-+} ColMvField;
-+
-+
-+
-+#endif // AVCODEC_RPI_HEVC_MV_H
-diff --git a/libavcodec/rpi_hevc_mvs.c b/libavcodec/rpi_hevc_mvs.c
-new file mode 100644
-index 0000000000..221755fb6e
---- /dev/null
-+++ b/libavcodec/rpi_hevc_mvs.c
-@@ -0,0 +1,486 @@
-+/*
-+ * HEVC video decoder
-+ *
-+ * Copyright (C) 2012 - 2013 Guillaume Martres
-+ * Copyright (C) 2013 Anand Meher Kotra
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "hevc.h"
-+#include "rpi_hevcdec.h"
-+// Non-zero when (xN, yN) and (xP, yP) agree in every bit at or above plevel, i.e. share a (1 << plevel) aligned square
-+static av_always_inline int
-+is_eq_mer(const unsigned int plevel,
-+ const unsigned int xN, const unsigned int yN,
-+ const unsigned int xP, const unsigned int yP)
-+{
-+ return (((xN ^ xP) | (yN ^ yP)) >> plevel) == 0; // XOR exposes differing bits; the shift discards those below plevel
-+}
-+
-+// Check whether a and b carry the same prediction: equal pred_flag and, for each
-+// list that pred_flag marks as used, equal ref_idx and equal packed mv
-+static av_always_inline int compare_mv_ref_idx(const HEVCRpiMvField * const a, const HEVCRpiMvField * const b)
-+{
-+ return a->pred_flag == b->pred_flag &&
-+ ((a->pred_flag & PF_L0) == 0 || (a->ref_idx[0] == b->ref_idx[0] && a->xy[0] == b->xy[0])) &&
-+ ((a->pred_flag & PF_L1) == 0 || (a->ref_idx[1] == b->ref_idx[1] && a->xy[1] == b->xy[1]));
-+}
-+
-+/*
-+ * 8.5.3.1.7 temporal luma motion vector prediction: sets *mvLXCol (0 if none) and returns non-zero when a collocated MV was used
-+ */
-+static int temporal_luma_motion_vector(const HEVCRpiContext * const s,
-+ const HEVCRpiLocalContext * const lc, const int x0, const int y0,
-+ const int nPbW, const int nPbH, const int refIdxLx,
-+ MvXY * const mvLXCol, const int X)
-+{
-+ int x, y;
-+ const ColMv * cmv = NULL;
-+
-+ HEVCRpiFrame * const col_ref = s->ref->collocated_ref;
-+ const RefPicList * const refPicList = s->refPicList + X;
-+ const int cur_lt = refPicList->isLongTerm[refIdxLx];
-+
-+ *mvLXCol = 0;
-+ // Unlikely but we might have a col_ref IDR frame!
-+ if (col_ref->col_mvf == NULL)
-+ return 0;
-+
-+ ff_hevc_rpi_progress_wait_mv(s, lc->jb0, col_ref, y0 + nPbH);
-+
-+ //bottom right collocated motion vector
-+ x = x0 + nPbW;
-+ y = y0 + nPbH;
-+
-+ if ((y0 >> s->ps.sps->log2_ctb_size) == (y >> s->ps.sps->log2_ctb_size) && // Candidate must stay in the same CTB row
-+ y < s->ps.sps->height &&
-+ x < s->ps.sps->width)
-+ {
-+ const ColMvField * const col = col_ref->col_mvf + (x >> 4) +
-+ (y >> 4) * s->col_mvf_stride; // col_mvf holds one entry per 16x16 pels
-+
-+ if (col->L[0].poc != COL_POC_INTRA &&
-+ (col->L[1].poc == COL_POC_INTRA ||
-+ (s->no_backward_pred_flag ? s->sh.collocated_list == L1 : X == 0)))
-+ {
-+ cmv = col->L + 0;
-+ }
-+ else if (col->L[1].poc != COL_POC_INTRA)
-+ {
-+ cmv = col->L + 1;
-+ }
-+ }
-+
-+ // derive center collocated motion vector
-+ if (cmv == NULL || COL_POC_IS_LT(cmv->poc) != cur_lt)
-+ {
-+ cmv = NULL;
-+ x = x0 + (nPbW >> 1);
-+ y = y0 + (nPbH >> 1);
-+
-+ {
-+ const ColMvField * const col = col_ref->col_mvf + (x >> 4) +
-+ (y >> 4) * s->col_mvf_stride;
-+
-+ if (col->L[0].poc != COL_POC_INTRA && // Same list selection as for the bottom right candidate
-+ (col->L[1].poc == COL_POC_INTRA ||
-+ (s->no_backward_pred_flag ? s->sh.collocated_list == L1 : X == 0)))
-+ {
-+ cmv = col->L + 0;
-+ }
-+ else if (col->L[1].poc != COL_POC_INTRA)
-+ {
-+ cmv = col->L + 1;
-+ }
-+ }
-+ }
-+
-+ if (cmv == NULL || cur_lt != COL_POC_IS_LT(cmv->poc))
-+ return 0;
-+
-+ {
-+ const int col_poc = col_ref->poc;
-+ const int ref_poc = refPicList->list[refIdxLx];
-+
-+ *mvLXCol = (cur_lt ||
-+ cmv->poc == col_poc ||
-+ COL_POC_DIFF(col_poc, cmv->poc) == s->poc - ref_poc) ? // Equal POC distances: no scaling needed
-+ cmv->xy :
-+ mv_scale_xy(cmv->xy, COL_POC_DIFF(col_poc, cmv->poc), s->poc - ref_poc);
-+ }
-+
-+ return cmv != NULL; // cmv was checked non-NULL above, so this is always 1 here
-+}
-+// NULL-tolerant compare_mv_ref_idx(): only b may be NULL, a is always dereferenced
-+static inline int mvf_eq(const HEVCRpiMvField * const a, const HEVCRpiMvField * const b)
-+{
-+ return b != NULL && compare_mv_ref_idx(a, b);
-+}
-+
-+
-+
-+/*
-+ * 8.5.3.1.2 Derivation process for spatial merging candidates: returns candidate number merge_idx, either a neighbour's field or *mvf_t
-+ */
-+static inline const HEVCRpiMvField *
-+derive_spatial_merge_candidates(
-+ const HEVCRpiContext * const s,
-+ const HEVCRpiLocalContext * const lc,
-+ const unsigned int x0, const unsigned int y0,
-+ const unsigned int nPbW, const unsigned int nPbH,
-+ const unsigned int avail,
-+ const unsigned int part_idx,
-+ const unsigned int merge_idx,
-+ HEVCRpiMvField * const mvf_t)
-+{
-+ const unsigned int parts_a1 = (1 << PART_Nx2N) | (1 << PART_nLx2N) | (1 << PART_nRx2N); // Bitmaps indexed by part_mode
-+ const unsigned int parts_b1 = (1 << PART_2NxN) | (1<< PART_2NxnU) | (1 << PART_2NxnD);
-+
-+ const HEVCRpiMvField * mvf_a1 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 + nPbH - 1);
-+ const HEVCRpiMvField * mvf_a0 = mvf_a1 + mvf_left_stride(s, x0, x0 - 1);
-+ const HEVCRpiMvField * mvf_b1 = mvf_ptr(s, lc, x0, y0, x0 + nPbW - 1, y0 - 1);
-+ const HEVCRpiMvField * mvf_b0 = mvf_b1 + 1;
-+ const unsigned int plevel = s->ps.pps->log2_parallel_merge_level;
-+ const unsigned int part_mode = lc->cu.part_mode;
-+
-+ const HEVCRpiMvField * perm[4]; // Candidates found so far, kept for the combined bi-pred stage
-+ unsigned int nb_merge_cand = 0;
-+
-+ // singleMCLFlag => part_idx == 0 so no need to test for it
-+ if ((avail & AVAIL_L) == 0 ||
-+ (part_idx == 1 && // && binds tighter than ||: (part_idx == 1 && part-mode test) || is_eq_mer(...)
-+ ((parts_a1 >> part_mode) & 1) != 0 ||
-+ is_eq_mer(plevel, x0 - 1, y0 + nPbH - 1, x0, y0)) ||
-+ mvf_a1->pred_flag == PF_INTRA)
-+ {
-+ mvf_a1 = NULL;
-+ }
-+ else
-+ {
-+ if (merge_idx == nb_merge_cand)
-+ return mvf_a1;
-+ perm[nb_merge_cand++] = mvf_a1;
-+ }
-+
-+ if ((avail & AVAIL_U) == 0 ||
-+ (part_idx == 1 && // Same grouping as for a1 above
-+ ((parts_b1 >> part_mode) & 1) != 0 ||
-+ is_eq_mer(plevel, x0 + nPbW - 1, y0 - 1, x0, y0)) ||
-+ mvf_b1->pred_flag == PF_INTRA)
-+ {
-+ mvf_b1 = NULL;
-+ }
-+ else if (!mvf_eq(mvf_b1, mvf_a1))
-+ {
-+ if (merge_idx == nb_merge_cand)
-+ return mvf_b1;
-+ perm[nb_merge_cand++] = mvf_b1;
-+ }
-+
-+ // above right spatial merge candidate
-+ // Never need mvf_b0 again so don't bother zeroing if navail
-+ if ((avail & AVAIL_UR) != 0 &&
-+ !is_eq_mer(plevel, x0 + nPbW, y0 - 1, x0, y0) &&
-+ mvf_b0->pred_flag != PF_INTRA &&
-+ !mvf_eq(mvf_b0, mvf_b1))
-+ {
-+ if (merge_idx == nb_merge_cand)
-+ return mvf_b0;
-+ perm[nb_merge_cand++] = mvf_b0;
-+ }
-+
-+ // left bottom spatial merge candidate
-+ // Never need mvf_a0 again so don't bother zeroing if navail
-+ if ((avail & AVAIL_DL) != 0 &&
-+ !is_eq_mer(plevel, x0 - 1, y0 + nPbH, x0, y0) &&
-+ mvf_a0->pred_flag != PF_INTRA &&
-+ !mvf_eq(mvf_a0, mvf_a1))
-+ {
-+ if (merge_idx == nb_merge_cand)
-+ return mvf_a0;
-+ perm[nb_merge_cand++] = mvf_a0;
-+ }
-+
-+ // above left spatial merge candidate
-+ if (nb_merge_cand != 4 &&
-+ (avail & AVAIL_UL) != 0 &&
-+ !is_eq_mer(plevel, x0 - 1, y0 - 1, x0, y0))
-+ {
-+ const HEVCRpiMvField * mvf_b2 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 - 1); // UL
-+
-+ if (mvf_b2->pred_flag != PF_INTRA &&
-+ !mvf_eq(mvf_b2, mvf_a1) &&
-+ !mvf_eq(mvf_b2, mvf_b1))
-+ {
-+ if (merge_idx == nb_merge_cand)
-+ return mvf_b2;
-+ perm[nb_merge_cand++] = mvf_b2;
-+ }
-+ }
-+
-+ // temporal motion vector candidate
-+ if (s->sh.slice_temporal_mvp_enabled_flag)
-+ {
-+ static const HEVCRpiMvField mvf_z = {{0}};
-+
-+ *mvf_t = mvf_z;
-+
-+ if (temporal_luma_motion_vector(s, lc, x0, y0, nPbW, nPbH,
-+ 0, mvf_t->xy + 0, 0))
-+ mvf_t->pred_flag = PF_L0;
-+
-+ if (s->sh.slice_type == HEVC_SLICE_B &&
-+ temporal_luma_motion_vector(s, lc, x0, y0, nPbW, nPbH,
-+ 0, mvf_t->xy + 1, 1))
-+ mvf_t->pred_flag |= PF_L1;
-+
-+ if (mvf_t->pred_flag != 0)
-+ {
-+ if (merge_idx == nb_merge_cand)
-+ return mvf_t;
-+ perm[nb_merge_cand++] = mvf_t; // NOTE(review): perm[] has 4 slots; with 4 spatial candidates this store is only avoided if merge_idx <= 4 - confirm callers guarantee that
-+ }
-+ }
-+
-+ // combined bi-predictive merge candidates (applies for B slices)
-+ if (s->sh.slice_type == HEVC_SLICE_B && nb_merge_cand > 1)
-+ {
-+ unsigned int comb_idx = 0;
-+ const unsigned int cand_count = nb_merge_cand * (nb_merge_cand - 1); // Number of ordered pairs of distinct candidates
-+ const RefPicList * const refPicList = s->refPicList;
-+
-+ for (comb_idx = 0; comb_idx < cand_count; comb_idx++)
-+ {
-+ static const uint8_t l0_l1_cand_idx[12][2] = {
-+ { 0, 1, },
-+ { 1, 0, },
-+ { 0, 2, },
-+ { 2, 0, },
-+ { 1, 2, },
-+ { 2, 1, },
-+ { 0, 3, },
-+ { 3, 0, },
-+ { 1, 3, },
-+ { 3, 1, },
-+ { 2, 3, },
-+ { 3, 2, },
-+ };
-+
-+ const unsigned int l0_cand_idx = l0_l1_cand_idx[comb_idx][0];
-+ const unsigned int l1_cand_idx = l0_l1_cand_idx[comb_idx][1];
-+ const HEVCRpiMvField * const mvf_c0 = perm[l0_cand_idx];
-+ const HEVCRpiMvField * const mvf_c1 = perm[l1_cand_idx];
-+
-+ if ((mvf_c0->pred_flag & PF_L0) != 0 &&
-+ (mvf_c1->pred_flag & PF_L1) != 0 &&
-+ (refPicList[0].list[mvf_c0->ref_idx[0]] != refPicList[1].list[mvf_c1->ref_idx[1]] ||
-+ mvf_c0->xy[0] != mvf_c1->xy[1]))
-+ {
-+ if (merge_idx == nb_merge_cand++)
-+ {
-+ // Need to be a bit careful as we will construct mvf_t and we
-+ // may already be using that as one of our candidates
-+ // so build & copy rather than build in place
-+ const HEVCRpiMvField mvf_m = {
-+ .xy = {
-+ mvf_c0->xy[0],
-+ mvf_c1->xy[1]},
-+ .ref_idx = {
-+ mvf_c0->ref_idx[0],
-+ mvf_c1->ref_idx[1]},
-+ .pred_flag = PF_BI
-+ };
-+ *mvf_t = mvf_m;
-+ return mvf_t;
-+ }
-+ }
-+ }
-+ }
-+
-+ // "append" Zero motion vector candidates
-+ {
-+ const unsigned int nb_refs = (s->sh.slice_type == HEVC_SLICE_B) ?
-+ FFMIN(s->sh.nb_refs[0], s->sh.nb_refs[1]) : s->sh.nb_refs[0];
-+ const unsigned int zero_idx = merge_idx - nb_merge_cand; // Position within the run of zero candidates
-+
-+ const HEVCRpiMvField mvf_m = {
-+ .xy = {0, 0},
-+ .ref_idx = {
-+ zero_idx < nb_refs ? zero_idx : 0,
-+ (s->sh.slice_type == HEVC_SLICE_B && zero_idx < nb_refs) ? zero_idx : 0},
-+ .pred_flag = (s->sh.slice_type == HEVC_SLICE_B) ? PF_BI : PF_L0
-+ };
-+
-+ *mvf_t = mvf_m;
-+ return mvf_t;
-+ }
-+}
-+
-+
-+// 8.5.3.1.1 Derivation process of luma Mvs for merge mode: writes the selected merge candidate to *mv
-+void ff_hevc_rpi_luma_mv_merge_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int nPbW,
-+ int nPbH, int log2_cb_size, int part_idx,
-+ int merge_idx, HEVCRpiMvField * const mv)
-+{
-+ const HEVCRpiMvField * mvf_m = (s->ps.pps->log2_parallel_merge_level > 2 && log2_cb_size == 3) ? // 8x8 CU: derive once for the whole CU with part_idx 0
-+ derive_spatial_merge_candidates(s, lc, lc->cu.x, lc->cu.y, 8, 8,
-+ ff_hevc_rpi_tb_avail_flags(s, lc, lc->cu.x, lc->cu.y, 8, 8),
-+ 0, merge_idx, mv) :
-+ derive_spatial_merge_candidates(s, lc, x0, y0, nPbW, nPbH,
-+ ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, nPbW, nPbH),
-+ part_idx, merge_idx, mv);
-+
-+ if (mvf_m != mv) // The candidate may already have been built in *mv (passed as mvf_t)
-+ *mv = *mvf_m;
-+
-+ if (mv->pred_flag == PF_BI && (nPbW + nPbH) == 12) // Bi-pred is reduced to L0 when width + height is 12 (presumably 8x4 / 4x8 blocks)
-+ mv->pred_flag = PF_L0;
-+}
-+
-+// Returns the MV of mvf (list pfi0 tried first, then pfi1) whose reference has POC poc0, or NULL if none / mvf is NULL
-+static av_always_inline const MvXY *
-+mvf_same_poc(const RefPicList * const rpl, const unsigned int pfi0, const unsigned int pfi1, const int poc0, const HEVCRpiMvField * const mvf)
-+{
-+ if (mvf != NULL)
-+ {
-+ if (((mvf->pred_flag >> pfi0) & 1) != 0 && rpl[pfi0].list[mvf->ref_idx[pfi0]] == poc0) // Bit pfi0 of pred_flag: list in use
-+ return mvf->xy + pfi0;
-+ if (((mvf->pred_flag >> pfi1) & 1) != 0 && rpl[pfi1].list[mvf->ref_idx[pfi1]] == poc0)
-+ return mvf->xy + pfi1;
-+ }
-+ return NULL;
-+}
-+// As mvf_same_poc but accepts any reference with long-term status islt0, scaling the MV into *mv_t when the POC differs
-+static av_always_inline const MvXY *
-+mvf_other_poc(const RefPicList * const rpl, const unsigned int pfi0, const unsigned int pfi1,
-+ const int islt0, const int poc0, const int poc_cur,
-+ MvXY * const mv_t, const HEVCRpiMvField * const mvf)
-+{
-+ if (mvf != NULL)
-+ {
-+ if (((mvf->pred_flag >> pfi0) & 1) != 0 && rpl[pfi0].isLongTerm[mvf->ref_idx[pfi0]] == islt0)
-+ {
-+ const int poc1 = rpl[pfi0].list[mvf->ref_idx[pfi0]];
-+ if (islt0 || poc1 == poc0) { // Long-term or identical POC: use the MV unscaled
-+ return mvf->xy + pfi0;
-+ }
-+ *mv_t = mv_scale_xy(mvf->xy[pfi0], poc_cur - poc1, poc_cur - poc0);
-+ return mv_t;
-+ }
-+ if (((mvf->pred_flag >> pfi1) & 1) != 0 && rpl[pfi1].isLongTerm[mvf->ref_idx[pfi1]] == islt0)
-+ {
-+ const int poc1 = rpl[pfi1].list[mvf->ref_idx[pfi1]];
-+ if (islt0 || poc1 == poc0) {
-+ return mvf->xy + pfi1;
-+ }
-+ *mv_t = mv_scale_xy(mvf->xy[pfi1], poc_cur - poc1, poc_cur - poc0);
-+ return mv_t;
-+ }
-+ }
-+ return NULL;
-+}
-+
-+/**
-+ * Derive the luma motion vector predictor for reference list LX and store
-+ * it in mv->xy[LX] (which is reset to 0 first).
-+ *
-+ * Two spatial candidates are built: A from the left-side neighbours (a0, a1)
-+ * and B from the above-side neighbours (b0, b1, b2). A neighbour is ignored
-+ * when its AVAIL_* bit in avail is clear or it is intra coded.
-+ * mvp_lx_flag == 0 selects candidate A, mvp_lx_flag != 0 selects candidate B.
-+ * If A is wanted but missing, or B is wanted but missing while A exists,
-+ * temporal_luma_motion_vector() is called provided the slice enables
-+ * temporal MVP. In every other case the result stays 0.
-+ *
-+ * @param x0, y0        position of the prediction block
-+ * @param nPbW, nPbH    size of the prediction block
-+ * @param avail         bitmask of AVAIL_* neighbour availability flags
-+ * @param mv            in: ref_idx[LX] selects the target reference;
-+ *                      out: xy[LX] receives the predictor
-+ * @param mvp_lx_flag   which candidate to use (0 = first, non-zero = second)
-+ * @param LX            reference list being predicted (0 or 1)
-+ */
-+void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
-+ const unsigned int x0, const unsigned int y0,
-+ const unsigned int nPbW, const unsigned int nPbH,
-+ const unsigned int avail,
-+ HEVCRpiMvField * const mv,
-+ const unsigned int mvp_lx_flag, const unsigned int LX)
-+{
-+ // pfi0 is the list being predicted, pfi1 the other one
-+ const unsigned int pfi0 = LX;
-+ const unsigned int pfi1 = LX == 0 ? 1 : 0;
-+ const RefPicList * const rpl = s->refPicList;
-+ // Reference list entry and long-term flag of the target reference picture
-+ const int poc0 = rpl[LX].list[mv->ref_idx[LX]];
-+ const int poc_cur = s->poc;
-+ const int islt0 = rpl[LX].isLongTerm[mv->ref_idx[LX]];
-+
-+ // Neighbour pointers are computed unconditionally but each one is only
-+ // dereferenced after its AVAIL_* bit has been tested below
-+ const HEVCRpiMvField * mvf_a1 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 + nPbH - 1);
-+ const HEVCRpiMvField * mvf_a0 = mvf_a1 + mvf_left_stride(s, x0, x0 - 1);
-+ const HEVCRpiMvField * mvf_b2 = mvf_ptr(s, lc, x0, y0, x0 - 1, y0 - 1); // UL
-+ const HEVCRpiMvField * mvf_b1 = mvf_ptr(s, lc, x0, y0, x0 + nPbW - 1, y0 - 1);
-+ const HEVCRpiMvField * mvf_b0 = mvf_b1 + 1;
-+ const MvXY * mva = NULL;
-+ const MvXY * mvb;
-+ MvXY * const mv_rv = mv->xy + LX;
-+ // Scratch storage for scaled vectors returned by mvf_other_poc()
-+ MvXY mvt_a, mvt_b;
-+
-+ *mv_rv = 0;
-+
-+ // a0: discard if unusable, otherwise take an unscaled match straight away
-+ // when the first candidate is the one requested
-+ if ((avail & AVAIL_DL) == 0 || mvf_a0->pred_flag == PF_INTRA)
-+ mvf_a0 = NULL;
-+ else if ((mva = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_a0)) != NULL && mvp_lx_flag == 0)
-+ goto use_mva;
-+
-+ if ((avail & AVAIL_L) == 0 || mvf_a1->pred_flag == PF_INTRA)
-+ mvf_a1 = NULL;
-+
-+ // Remaining search order for A: a1 same ref, a0 scaled, a1 scaled
-+ if (mva == NULL &&
-+ (mva = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_a1)) == NULL &&
-+ (mva = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_a, mvf_a0)) == NULL)
-+ mva = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_a, mvf_a1);
-+
-+ if (mvp_lx_flag == 0 && mva != NULL)
-+ goto use_mva;
-+
-+ if ((avail & AVAIL_UR) == 0 || mvf_b0->pred_flag == PF_INTRA)
-+ mvf_b0 = NULL;
-+ if ((avail & AVAIL_U) == 0 || mvf_b1->pred_flag == PF_INTRA)
-+ mvf_b1 = NULL;
-+ if ((avail & AVAIL_UL) == 0 || mvf_b2->pred_flag == PF_INTRA)
-+ mvf_b2 = NULL;
-+
-+ // Candidate B without scaling: b0, then b1, then b2
-+ if ((mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b0)) == NULL &&
-+ (mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b1)) == NULL)
-+ mvb = mvf_same_poc(rpl, pfi0, pfi1, poc0, mvf_b2);
-+
-+ // With no usable left neighbour the unscaled B result takes the place of A
-+ // and B is derived again, this time allowing scaled vectors
-+ if (mvf_a0 == NULL && mvf_a1 == NULL) {
-+ mva = mvb;
-+ if (mvp_lx_flag == 0 && mva != NULL)
-+ goto use_mva;
-+
-+ if ((mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b0)) == NULL &&
-+ (mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b1)) == NULL)
-+ mvb = mvf_other_poc(rpl, pfi0, pfi1, islt0, poc0, poc_cur, &mvt_b, mvf_b2);
-+ }
-+
-+ // If there is still no A, B (possibly NULL) is promoted to first candidate
-+ if (mva == NULL) {
-+ mva = mvb;
-+ mvb = NULL;
-+ }
-+
-+ // mva cannot be NULL here when mvb is non-NULL (see the promotion above)
-+ if (mvb != NULL && *mva == *mvb) // If A == B then ignore B
-+ mvb = NULL;
-+
-+ if (mvp_lx_flag == 0 && mva != NULL) {
-+ goto use_mva;
-+ }
-+ else if (mvp_lx_flag != 0 && mvb != NULL) {
-+ *mv_rv = *mvb;
-+ }
-+ // Temporal fallback: wanted A but have none, or wanted B and only A exists
-+ else if (s->sh.slice_temporal_mvp_enabled_flag && ((mvp_lx_flag == 0 && mva == NULL) || (mvp_lx_flag != 0 && mva != NULL))) {
-+ temporal_luma_motion_vector(s, lc, x0, y0, nPbW,
-+ nPbH, mv->ref_idx[LX],
-+ mv_rv, LX);
-+ }
-+ return;
-+
-+use_mva:
-+ *mv_rv = *mva;
-+ return;
-+}
-+
-diff --git a/libavcodec/rpi_hevc_parse.c b/libavcodec/rpi_hevc_parse.c
-new file mode 100644
-index 0000000000..04f9231acc
---- /dev/null
-+++ b/libavcodec/rpi_hevc_parse.c
-@@ -0,0 +1,142 @@
-+/*
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "bytestream.h"
-+#include "h2645_parse.h"
-+#include "hevc.h"
-+#include "rpi_hevc_parse.h"
-+
-+/*
-+ * Split buf into NAL units and decode every VPS, SPS, PPS and SEI found;
-+ * all other NAL types are logged at verbose level and skipped.
-+ * Processing stops at the first failing NAL unit.
-+ *
-+ * The error code is only reported to the caller when err_recognition has
-+ * AV_EF_EXPLODE set; otherwise 0 is returned even after a failure.
-+ */
-+static int hevc_decode_nal_units(const uint8_t *buf, int buf_size, HEVCRpiParamSets *ps,
-+                                 HEVCSEIContext *sei, int is_nalff, int nal_length_size,
-+                                 int err_recognition, int apply_defdispwin, void *logctx)
-+{
-+    H2645Packet pkt = { 0 };
-+    int idx;
-+    int ret = ff_h2645_packet_split(&pkt, buf, buf_size, logctx, is_nalff, nal_length_size, AV_CODEC_ID_HEVC, 1);
-+
-+    /* the loop ends as soon as any step has produced a negative result */
-+    for (idx = 0; ret >= 0 && idx < pkt.nb_nals; idx++) {
-+        H2645NAL * const nal = &pkt.nals[idx];
-+
-+        switch (nal->type) {
-+        case HEVC_NAL_VPS:
-+            ret = ff_hevc_rpi_decode_nal_vps(&nal->gb, logctx, ps);
-+            break;
-+        case HEVC_NAL_SPS:
-+            ret = ff_hevc_rpi_decode_nal_sps(&nal->gb, logctx, ps, apply_defdispwin);
-+            break;
-+        case HEVC_NAL_PPS:
-+            ret = ff_hevc_rpi_decode_nal_pps(&nal->gb, logctx, ps);
-+            break;
-+        case HEVC_NAL_SEI_PREFIX:
-+        case HEVC_NAL_SEI_SUFFIX:
-+            ret = ff_hevc_rpi_decode_nal_sei(&nal->gb, logctx, sei, ps, nal->type);
-+            break;
-+        default:
-+            /* neither a parameter set nor SEI: nothing to do */
-+            av_log(logctx, AV_LOG_VERBOSE, "Ignoring NAL type %d in extradata\n", nal->type);
-+            break;
-+        }
-+    }
-+
-+    ff_h2645_packet_uninit(&pkt);
-+
-+    return (err_recognition & AV_EF_EXPLODE) ? ret : 0;
-+}
-+
-+/*
-+ * Decode the parameter sets and SEI carried in codec extradata.
-+ *
-+ * Extradata longer than 3 bytes that does not begin with 00 00 00 or
-+ * 00 00 01 is treated as hvcC: *is_nalff is set to 1, the NAL arrays are
-+ * decoded one unit at a time and *nal_length_size finally receives the
-+ * length-field size read from the header. Anything else is passed to the
-+ * NAL splitter as-is with *is_nalff set to 0.
-+ *
-+ * Returns a negative AVERROR on failure, otherwise the (non-negative)
-+ * result of the last NAL decode call, or 0 if none was made.
-+ */
-+int ff_hevc_rpi_decode_extradata(const uint8_t *data, int size, HEVCRpiParamSets *ps,
-+                                 HEVCSEIContext *sei, int *is_nalff, int *nal_length_size,
-+                                 int err_recognition, int apply_defdispwin, void *logctx)
-+{
-+    GetByteContext gb;
-+    int ret = 0;
-+    int arr, n, num_arrays, final_len_size;
-+
-+    bytestream2_init(&gb, data, size);
-+
-+    if (size <= 3 || (!data[0] && !data[1] && data[2] <= 1)) {
-+        /* not hvcC: hand the whole buffer to the NAL splitter */
-+        *is_nalff = 0;
-+        return hevc_decode_nal_units(data, size, ps, sei, *is_nalff, *nal_length_size,
-+                                     err_recognition, apply_defdispwin, logctx);
-+    }
-+
-+    /* hvcC. The first byte (configurationVersion) is deliberately not
-+     * required to be 1, so that version 0 data is accepted as well. */
-+    *is_nalff = 1;
-+
-+    bytestream2_skip(&gb, 21);
-+    final_len_size = (bytestream2_get_byte(&gb) & 3) + 1;
-+    num_arrays     = bytestream2_get_byte(&gb);
-+
-+    /* Inside hvcC every NAL unit is preceded by a 2-byte length, so parse
-+     * with that size for now regardless of what the header announced. */
-+    *nal_length_size = 2;
-+
-+    for (arr = 0; arr < num_arrays; arr++) {
-+        const int type = bytestream2_get_byte(&gb) & 0x3f;
-+        const int cnt  = bytestream2_get_be16(&gb);
-+
-+        for (n = 0; n < cnt; n++) {
-+            /* payload length plus the 2 bytes of the length field itself */
-+            const int nalsize = bytestream2_peek_be16(&gb) + 2;
-+
-+            if (bytestream2_get_bytes_left(&gb) < nalsize) {
-+                av_log(logctx, AV_LOG_ERROR,
-+                       "Invalid NAL unit size in extradata.\n");
-+                return AVERROR_INVALIDDATA;
-+            }
-+
-+            ret = hevc_decode_nal_units(gb.buffer, nalsize, ps, sei, *is_nalff,
-+                                        *nal_length_size, err_recognition, apply_defdispwin,
-+                                        logctx);
-+            if (ret < 0) {
-+                av_log(logctx, AV_LOG_ERROR,
-+                       "Decoding nal unit %d %d from hvcC failed\n",
-+                       type, arr);
-+                return ret;
-+            }
-+            bytestream2_skip(&gb, nalsize);
-+        }
-+    }
-+
-+    /* From here on the caller must use the length size from the header */
-+    *nal_length_size = final_len_size;
-+
-+    return ret;
-+}
-diff --git a/libavcodec/rpi_hevc_parse.h b/libavcodec/rpi_hevc_parse.h
-new file mode 100644
-index 0000000000..4b4d032a16
---- /dev/null
-+++ b/libavcodec/rpi_hevc_parse.h
-@@ -0,0 +1,36 @@
-+/*
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+/**
-+ * @file
-+ * H.265 parser code
-+ */
-+
-+#ifndef AVCODEC_RPI_HEVC_PARSE_H
-+#define AVCODEC_RPI_HEVC_PARSE_H
-+
-+#include <stdint.h>
-+
-+#include "rpi_hevc_ps.h"
-+#include "rpi_hevc_sei.h"
-+
-+/**
-+ * Parse codec extradata and decode the VPS/SPS/PPS and SEI NAL units in it.
-+ *
-+ * @param data             extradata buffer
-+ * @param size             size of data in bytes
-+ * @param ps               parameter-set store handed to the NAL decoders
-+ * @param sei              SEI context handed to the SEI decoder
-+ * @param is_nalff         out: 1 if the extradata was parsed as hvcC, else 0
-+ * @param nal_length_size  for hvcC: set to the NAL length field size (1..4)
-+ *                         read from the header once parsing succeeds;
-+ *                         otherwise only read and passed to the NAL splitter
-+ * @param err_recognition  NAL decode errors are only propagated when this
-+ *                         contains AV_EF_EXPLODE
-+ * @param apply_defdispwin forwarded to the SPS decoder
-+ * @param logctx           context used for av_log()
-+ * @return >= 0 on success, a negative AVERROR code on failure
-+ */
-+int ff_hevc_rpi_decode_extradata(const uint8_t *data, int size, HEVCRpiParamSets *ps,
-+ HEVCSEIContext *sei, int *is_nalff, int *nal_length_size,
-+ int err_recognition, int apply_defdispwin, void *logctx);
-+
-+#endif /* AVCODEC_RPI_HEVC_PARSE_H */
-diff --git a/libavcodec/rpi_hevc_ps.c b/libavcodec/rpi_hevc_ps.c
-new file mode 100644
-index 0000000000..891e3a900c
---- /dev/null
-+++ b/libavcodec/rpi_hevc_ps.c
-@@ -0,0 +1,1936 @@
-+/*
-+ * HEVC Parameter Set decoding
-+ *
-+ * Copyright (C) 2012 - 2013 Guillaume Martres
-+ * Copyright (C) 2012 - 2013 Mickael Raulet
-+ * Copyright (C) 2012 - 2013 Gildas Cocherel
-+ * Copyright (C) 2013 Vittorio Giovara
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "libavutil/imgutils.h"
-+#include "golomb.h"
-+#include "rpi_hevc_data.h"
-+#include "rpi_hevc_ps.h"
-+#include "rpi_hevcdec.h"
-+
-+// 64-entry (8x8) default scaling list for intra blocks.
-+// NOTE(review): presumably used when a scaling list is signalled without
-+// explicit coefficients; the use site is not in this part of the file - confirm.
-+static const uint8_t default_scaling_list_intra[] = {
-+ 16, 16, 16, 16, 17, 18, 21, 24,
-+ 16, 16, 16, 16, 17, 19, 22, 25,
-+ 16, 16, 17, 18, 20, 22, 25, 29,
-+ 16, 16, 18, 21, 24, 27, 31, 36,
-+ 17, 17, 20, 24, 30, 35, 41, 47,
-+ 18, 19, 22, 27, 35, 44, 54, 65,
-+ 21, 22, 25, 31, 41, 54, 70, 88,
-+ 24, 25, 29, 36, 47, 65, 88, 115
-+};
-+
-+// 64-entry (8x8) default scaling list for inter blocks (see note above).
-+static const uint8_t default_scaling_list_inter[] = {
-+ 16, 16, 16, 16, 17, 18, 20, 24,
-+ 16, 16, 16, 17, 18, 20, 24, 25,
-+ 16, 16, 17, 18, 20, 24, 25, 28,
-+ 16, 17, 18, 20, 24, 25, 28, 33,
-+ 17, 18, 20, 24, 25, 28, 33, 41,
-+ 18, 20, 24, 25, 28, 33, 41, 54,
-+ 20, 24, 25, 28, 33, 41, 54, 71,
-+ 24, 25, 28, 33, 41, 54, 71, 91
-+};
-+
-+// Sample aspect ratios selected by the 8-bit SAR index read in decode_vui().
-+// Indices past the end of this table are handled there: 255 means an explicit
-+// num/den pair follows in the bitstream, anything else is reported as unknown.
-+static const AVRational vui_sar[] = {
-+ { 0, 1 },
-+ { 1, 1 },
-+ { 12, 11 },
-+ { 10, 11 },
-+ { 16, 11 },
-+ { 40, 33 },
-+ { 24, 11 },
-+ { 20, 11 },
-+ { 32, 11 },
-+ { 80, 33 },
-+ { 18, 11 },
-+ { 15, 11 },
-+ { 64, 33 },
-+ { 160, 99 },
-+ { 4, 3 },
-+ { 3, 2 },
-+ { 2, 1 },
-+};
-+
-+
-+// pps_cb_qp_offset: -12,+12
-+// slice_cb_qp_offset: -12,+12 also
-+// "The value of pps_cb_qp_offset + slice_cb_qp_offset shall be in the range of -12 to +12, inclusive."
-+// cr_qp_offset_list[n]: -12,+12
-+// So worst case total offset: -24,+24
-+
-+// T(n): pack a QP as ((48+n)/6 - 10) in the bits above bit 2 and (48+n)%6 in
-+//       the low three bits.
-+// C(B,n): clamp n before packing - values with B*6+n < 0 become -B*6 and
-+//       values above 51 become 51.
-+// M(B,n): C() applied to -n, used to spell out the negative-QP padding.
-+#define T(n) ((((48+(n))/6-10)<<3) | (48+(n))%6)
-+#define C(B,n) T(B*6+(n) < 0 ? -B*6 : (n) > 51 ? 51 : (n))
-+#define M(B,n) C(B,(-n))
-+
-+// Sizeof the QP_START_BLOCK
-+// (QP_START expands to 24 + 48 = 72 entries, matching 8*6 + 12*2)
-+#define QP_OFFSET_0 (8*6 + 12*2)
-+#define QP_START(B) \
-+ M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\
-+ M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\
-+ M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\
-+ M(B,48), M(B,48), M(B,48), M(B,48), M(B,48), M(B,48),\
-+\
-+ M(B,48), M(B,47), M(B,46), M(B,45), M(B,44), M(B,43),\
-+ M(B,42), M(B,41), M(B,40), M(B,39), M(B,38), M(B,37),\
-+ M(B,36), M(B,35), M(B,34), M(B,33), M(B,32), M(B,31),\
-+ M(B,30), M(B,29), M(B,28), M(B,27), M(B,26), M(B,25),\
-+ M(B,24), M(B,23), M(B,22), M(B,21), M(B,20), M(B,19),\
-+ M(B,18), M(B,17), M(B,16), M(B,15), M(B,14), M(B,13),\
-+ M(B,12), M(B,11), M(B,10), M(B, 9), M(B, 8), M(B, 7),\
-+ M(B, 6), M(B, 5), M(B, 4), M(B, 3), M(B, 2), M(B, 1)
-+// Upper padding: 18 entries saturated at QP 51
-+#define QP_END(B) \
-+ C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51),\
-+ C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51),\
-+ C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51)
-+
-+// T1(B): one table row whose 52 in-range entries (plus 6 more up to 51) follow
-+// a non-linear mapping - note the repeated values from 29 upwards.
-+// NOTE(review): this looks like the HEVC luma-to-chroma QP mapping - confirm.
-+#define T1(B)\
-+{\
-+ QP_START(B),\
-+ C(B, 0), C(B, 1), C(B, 2), C(B, 3), C(B, 4), C(B, 5), C(B, 6), C(B, 7), C(B, 8), C(B, 9),\
-+ C(B,10), C(B,11), C(B,12), C(B,13), C(B,14), C(B,15), C(B,16), C(B,17), C(B,18), C(B,19),\
-+ C(B,20), C(B,21), C(B,22), C(B,23), C(B,24), C(B,25), C(B,26), C(B,27), C(B,28), C(B,29),\
-+ C(B,29), C(B,30), C(B,31), C(B,32), C(B,33), C(B,33), C(B,34), C(B,34), C(B,35), C(B,35),\
-+ C(B,36), C(B,36), C(B,37), C(B,37), C(B,38), C(B,39), C(B,40), C(B,41), C(B,42), C(B,43),\
-+ C(B,44), C(B,45),\
-+ C(B,46), C(B,47), C(B,48), C(B,49), C(B,50), C(B,51),\
-+ QP_END(B)\
-+}
-+// T0(B): same layout as T1 but the in-range part is the identity mapping
-+// 0..51, followed by six entries saturated at 51.
-+#define T0(B)\
-+{\
-+ QP_START(B),\
-+ C(B, 0), C(B, 1), C(B, 2), C(B, 3), C(B, 4), C(B, 5), C(B, 6), C(B, 7), C(B, 8), C(B, 9),\
-+ C(B,10), C(B,11), C(B,12), C(B,13), C(B,14), C(B,15), C(B,16), C(B,17), C(B,18), C(B,19),\
-+ C(B,20), C(B,21), C(B,22), C(B,23), C(B,24), C(B,25), C(B,26), C(B,27), C(B,28), C(B,29),\
-+ C(B,30), C(B,31), C(B,32), C(B,33), C(B,34), C(B,35), C(B,36), C(B,37), C(B,38), C(B,39),\
-+ C(B,40), C(B,41), C(B,42), C(B,43), C(B,44), C(B,45), C(B,46), C(B,47), C(B,48), C(B,49),\
-+ C(B,50), C(B,51),\
-+ C(B,51), C(B,51), C(B,51), C(B,51), C(B,51), C(B,51),\
-+ QP_END(B)\
-+}
-+
-+// Row length: 72 (QP_START) + 52 + 6 + 18 (QP_END) entries
-+#define QP_TABLE_SIZE (QP_OFFSET_0 + 52 + 12*2)
-+
-+// One row per value of B from 0 to 7, each entry packed by T()
-+static const int8_t qp_c_bd_0[8][QP_TABLE_SIZE] = {T0(0),T0(1),T0(2),T0(3),T0(4),T0(5),T0(6),T0(7)};
-+static const int8_t qp_c_bd_1[8][QP_TABLE_SIZE] = {T1(0),T1(1),T1(2),T1(3),T1(4),T1(5),T1(6),T1(7)};
-+
-+#undef T
-+#undef C
-+#undef QP_END
-+
-+// For the deblock tables C() is redefined as a plain clamp to 0..51 with no
-+// packing, so every M() entry of QP_START evaluates to 0, and QP_END shrinks
-+// to six entries.
-+#define C(B,n) ((n)<0?0:(n)>51?51:(n))
-+// We do need a lot of -ve padding to cope with high bit depths that give -ve qps
-+#define QP_DBLK_OFFSET_0 QP_OFFSET_0
-+#define QP_END(B)\
-+ 51, 51, 51, 51, 51, 51
-+
-+// These don't need all the padding we have here (12 top/bottom would be enough)
-+static const uint8_t qp_c_dblk_0[] = T0(0);
-+static const uint8_t qp_c_dblk_1[] = T1(0);
-+
-+#undef T
-+#undef M
-+#undef C
-+#undef QP_END
-+#undef QP_START
-+
-+
-+// Release PPS slot <id>; if it held the active PPS, clear s->pps first.
-+static void remove_pps(HEVCRpiParamSets * const s, const int id)
-+{
-+    AVBufferRef ** const slot = &s->pps_list[id];
-+
-+    if (*slot != NULL && s->pps == (const HEVCRpiPPS*)(*slot)->data)
-+        s->pps = NULL;
-+
-+    av_buffer_unref(slot);
-+}
-+
-+// Release SPS slot <id>. If the slot is occupied, the active SPS pointer is
-+// cleared when it refers to this SPS, and every PPS whose sps_id equals id is
-+// removed as well.
-+static void remove_sps(HEVCRpiParamSets * const s, const int id)
-+{
-+    if (s->sps_list[id] != NULL) {
-+        int n;
-+
-+        if ((const HEVCRpiSPS*)s->sps_list[id]->data == s->sps)
-+            s->sps = NULL;
-+
-+        /* a PPS cannot outlive the SPS it refers to */
-+        for (n = 0; n < FF_ARRAY_ELEMS(s->pps_list); n++) {
-+            if (s->pps_list[n] == NULL)
-+                continue;
-+            if (((HEVCRpiPPS*)s->pps_list[n]->data)->sps_id == id)
-+                remove_pps(s, n);
-+        }
-+
-+        av_assert0(!(s->sps_list[id] && s->sps == (HEVCRpiSPS*)s->sps_list[id]->data));
-+    }
-+
-+    av_buffer_unref(&s->sps_list[id]);
-+}
-+
-+// Release VPS slot <id>. If the slot is occupied, the active VPS pointer is
-+// cleared when it refers to this VPS, and every SPS whose vps_id equals id is
-+// removed through remove_sps().
-+static void remove_vps(HEVCRpiParamSets * const s, const int id)
-+{
-+    if (s->vps_list[id] != NULL) {
-+        int n;
-+
-+        if ((const HEVCRpiVPS*)s->vps_list[id]->data == s->vps)
-+            s->vps = NULL;
-+
-+        for (n = 0; n < FF_ARRAY_ELEMS(s->sps_list); n++) {
-+            if (s->sps_list[n] == NULL)
-+                continue;
-+            if (((HEVCRpiSPS*)s->sps_list[n]->data)->vps_id == id)
-+                remove_sps(s, n);
-+        }
-+    }
-+
-+    av_buffer_unref(&s->vps_list[id]);
-+}
-+
-+/**
-+ * Parse one short-term reference picture set from the bitstream into *rps.
-+ *
-+ * The set is either predicted from a previously decoded set in sps->st_rps
-+ * (inter RPS prediction) or coded explicitly as lists of negative and
-+ * positive POC deltas.
-+ *
-+ * @param gb              bit reader positioned at the RPS syntax
-+ * @param avctx           logging context
-+ * @param rps             destination; delta_poc[], used[], num_delta_pocs and
-+ *                        num_negative_pics are filled in
-+ * @param sps             SPS supplying st_rps[] and nb_st_rps
-+ * @param is_slice_header non-zero when parsing from a slice header, in which
-+ *                        case the reference set is selected by a coded
-+ *                        delta_idx rather than being the preceding array entry
-+ * @return 0 on success, AVERROR_INVALIDDATA on out-of-range syntax elements
-+ */
-+int ff_hevc_rpi_decode_short_term_rps(GetBitContext * const gb, AVCodecContext * const avctx,
-+ ShortTermRPS * const rps, const HEVCRpiSPS * const sps, const int is_slice_header)
-+{
-+ uint8_t rps_predict = 0;
-+ int delta_poc;
-+ int k0 = 0;
-+ int k1 = 0;
-+ int k = 0;
-+ int i;
-+
-+ // The prediction flag is only coded when rps is not the first entry of
-+ // sps->st_rps and the SPS already holds at least one set
-+ if (rps != sps->st_rps && sps->nb_st_rps)
-+ rps_predict = get_bits1(gb);
-+
-+ if (rps_predict) {
-+ const ShortTermRPS *rps_ridx;
-+ int delta_rps;
-+ unsigned abs_delta_rps;
-+ uint8_t use_delta_flag = 0;
-+ uint8_t delta_rps_sign;
-+
-+ if (is_slice_header) {
-+ // Reference set is counted back from the end of the SPS array
-+ unsigned int delta_idx = get_ue_golomb_long(gb) + 1;
-+ if (delta_idx > sps->nb_st_rps) {
-+ av_log(avctx, AV_LOG_ERROR,
-+ "Invalid value of delta_idx in slice header RPS: %d > %d.\n",
-+ delta_idx, sps->nb_st_rps);
-+ return AVERROR_INVALIDDATA;
-+ }
-+ rps_ridx = &sps->st_rps[sps->nb_st_rps - delta_idx];
-+ rps->rps_idx_num_delta_pocs = rps_ridx->num_delta_pocs;
-+ } else
-+ // Inside the SPS the reference is the entry just before rps
-+ rps_ridx = &sps->st_rps[rps - sps->st_rps - 1];
-+
-+ delta_rps_sign = get_bits1(gb);
-+ abs_delta_rps = get_ue_golomb_long(gb) + 1;
-+ // "< 1" catches the unsigned wrap of the +1 above
-+ if (abs_delta_rps < 1 || abs_delta_rps > 32768) {
-+ av_log(avctx, AV_LOG_ERROR,
-+ "Invalid value of abs_delta_rps: %d\n",
-+ abs_delta_rps);
-+ return AVERROR_INVALIDDATA;
-+ }
-+ // Sign bit set means a negative delta
-+ delta_rps = (1 - (delta_rps_sign << 1)) * abs_delta_rps;
-+ // One iteration per entry of the reference set plus a final one
-+ // (i == num_delta_pocs) that contributes delta_rps on its own
-+ for (i = 0; i <= rps_ridx->num_delta_pocs; i++) {
-+ int used = rps->used[k] = get_bits1(gb);
-+
-+ if (!used)
-+ use_delta_flag = get_bits1(gb);
-+
-+ if (used || use_delta_flag) {
-+ if (i < rps_ridx->num_delta_pocs)
-+ delta_poc = delta_rps + rps_ridx->delta_poc[i];
-+ else
-+ delta_poc = delta_rps;
-+ rps->delta_poc[k] = delta_poc;
-+ if (delta_poc < 0)
-+ k0++;
-+ else
-+ k1++;
-+ k++;
-+ }
-+ }
-+
-+ // k is the number of entries kept; it must fit the used[] array
-+ if (k >= FF_ARRAY_ELEMS(rps->used)) {
-+ av_log(avctx, AV_LOG_ERROR,
-+ "Invalid num_delta_pocs: %d\n", k);
-+ return AVERROR_INVALIDDATA;
-+ }
-+
-+ rps->num_delta_pocs = k;
-+ rps->num_negative_pics = k0;
-+ // sort in increasing order (smallest first)
-+ if (rps->num_delta_pocs != 0) {
-+ int used, tmp;
-+ // Each entry i is moved down past every larger predecessor,
-+ // carrying its used flag along with it
-+ for (i = 1; i < rps->num_delta_pocs; i++) {
-+ delta_poc = rps->delta_poc[i];
-+ used = rps->used[i];
-+ for (k = i - 1; k >= 0; k--) {
-+ tmp = rps->delta_poc[k];
-+ if (delta_poc < tmp) {
-+ rps->delta_poc[k + 1] = tmp;
-+ rps->used[k + 1] = rps->used[k];
-+ rps->delta_poc[k] = delta_poc;
-+ rps->used[k] = used;
-+ }
-+ }
-+ }
-+ }
-+ // Reverse the leading num_negative_pics entries (needs at least two)
-+ if ((rps->num_negative_pics >> 1) != 0) {
-+ int used;
-+ k = rps->num_negative_pics - 1;
-+ // flip the negative values to largest first
-+ for (i = 0; i < rps->num_negative_pics >> 1; i++) {
-+ delta_poc = rps->delta_poc[i];
-+ used = rps->used[i];
-+ rps->delta_poc[i] = rps->delta_poc[k];
-+ rps->used[i] = rps->used[k];
-+ rps->delta_poc[k] = delta_poc;
-+ rps->used[k] = used;
-+ k--;
-+ }
-+ }
-+ } else {
-+ // Explicit coding: counts first, then one (delta, used) pair per picture
-+ unsigned int prev, nb_positive_pics;
-+ rps->num_negative_pics = get_ue_golomb_long(gb);
-+ nb_positive_pics = get_ue_golomb_long(gb);
-+
-+ if (rps->num_negative_pics >= HEVC_MAX_REFS ||
-+ nb_positive_pics >= HEVC_MAX_REFS) {
-+ av_log(avctx, AV_LOG_ERROR, "Too many refs in a short term RPS.\n");
-+ return AVERROR_INVALIDDATA;
-+ }
-+
-+ rps->num_delta_pocs = rps->num_negative_pics + nb_positive_pics;
-+ if (rps->num_delta_pocs) {
-+ // Negative pictures: each coded delta is subtracted from the
-+ // running total, so stored values move further below zero
-+ prev = 0;
-+ for (i = 0; i < rps->num_negative_pics; i++) {
-+ delta_poc = get_ue_golomb_long(gb) + 1;
-+ if (delta_poc < 1 || delta_poc > 32768) {
-+ av_log(avctx, AV_LOG_ERROR,
-+ "Invalid value of delta_poc: %d\n",
-+ delta_poc);
-+ return AVERROR_INVALIDDATA;
-+ }
-+ prev -= delta_poc;
-+ rps->delta_poc[i] = prev;
-+ rps->used[i] = get_bits1(gb);
-+ }
-+ // Positive pictures: deltas accumulate upwards from zero and are
-+ // stored after the negative entries
-+ prev = 0;
-+ for (i = 0; i < nb_positive_pics; i++) {
-+ delta_poc = get_ue_golomb_long(gb) + 1;
-+ if (delta_poc < 1 || delta_poc > 32768) {
-+ av_log(avctx, AV_LOG_ERROR,
-+ "Invalid value of delta_poc: %d\n",
-+ delta_poc);
-+ return AVERROR_INVALIDDATA;
-+ }
-+ prev += delta_poc;
-+ rps->delta_poc[rps->num_negative_pics + i] = prev;
-+ rps->used[rps->num_negative_pics + i] = get_bits1(gb);
-+ }
-+ }
-+ }
-+ return 0;
-+}
-+
-+
-+/*
-+ * Read one profile/tier block (everything except level_idc) into *ptl.
-+ *
-+ * Returns -1 without consuming anything when fewer than the 88 bits the
-+ * block occupies are left, 0 otherwise. The recognised profile is logged at
-+ * debug level, an unrecognised one at warning level.
-+ */
-+static int decode_profile_tier_level(GetBitContext * const gb, AVCodecContext * const avctx,
-+ PTLCommon * const ptl)
-+{
-+ int i;
-+
-+ // 2+1+5 header bits, 32 compatibility flags, 4 source/constraint flags
-+ // and the reserved bits skipped at the end
-+ if (get_bits_left(gb) < 2+1+5 + 32 + 4 + 16 + 16 + 12)
-+ return -1;
-+
-+ ptl->profile_space = get_bits(gb, 2);
-+ ptl->tier_flag = get_bits1(gb);
-+ ptl->profile_idc = get_bits(gb, 5);
-+ if (ptl->profile_idc == FF_PROFILE_HEVC_MAIN)
-+ av_log(avctx, AV_LOG_DEBUG, "Main profile bitstream\n");
-+ else if (ptl->profile_idc == FF_PROFILE_HEVC_MAIN_10)
-+ av_log(avctx, AV_LOG_DEBUG, "Main 10 profile bitstream\n");
-+ else if (ptl->profile_idc == FF_PROFILE_HEVC_MAIN_STILL_PICTURE)
-+ av_log(avctx, AV_LOG_DEBUG, "Main Still Picture profile bitstream\n");
-+ else if (ptl->profile_idc == FF_PROFILE_HEVC_REXT)
-+ av_log(avctx, AV_LOG_DEBUG, "Range Extension profile bitstream\n");
-+ else
-+ av_log(avctx, AV_LOG_WARNING, "Unknown HEVC profile: %d\n", ptl->profile_idc);
-+
-+ for (i = 0; i < 32; i++) {
-+ ptl->profile_compatibility_flag[i] = get_bits1(gb);
-+
-+ // With profile_idc coded as 0, fall back to the index of the first
-+ // set compatibility flag (index 0 itself is not considered)
-+ if (ptl->profile_idc == 0 && i > 0 && ptl->profile_compatibility_flag[i])
-+ ptl->profile_idc = i;
-+ }
-+ ptl->progressive_source_flag = get_bits1(gb);
-+ ptl->interlaced_source_flag = get_bits1(gb);
-+ ptl->non_packed_constraint_flag = get_bits1(gb);
-+ ptl->frame_only_constraint_flag = get_bits1(gb);
-+
-+ skip_bits(gb, 16); // XXX_reserved_zero_44bits[0..15]
-+ skip_bits(gb, 16); // XXX_reserved_zero_44bits[16..31]
-+ skip_bits(gb, 12); // XXX_reserved_zero_44bits[32..43]
-+
-+ return 0;
-+}
-+
-+/*
-+ * Parse a complete profile_tier_level structure: the general profile/tier
-+ * block and level, followed by the optional per-sub-layer blocks and levels.
-+ *
-+ * max_num_sub_layers counts the sub-layers including the base one, so
-+ * max_num_sub_layers - 1 sets of sub-layer flags are read.
-+ * Returns 0 on success, -1 if the data runs out.
-+ */
-+static int parse_ptl(GetBitContext * const gb, AVCodecContext * const avctx,
-+ PTL * const ptl, const int max_num_sub_layers)
-+{
-+ int i;
-+ // After the general block we need 8 bits for level_idc, plus 16 bits of
-+ // present flags and padding whenever there is more than one sub-layer
-+ if (decode_profile_tier_level(gb, avctx, &ptl->general_ptl) < 0 ||
-+ get_bits_left(gb) < 8 + (8*2 * (max_num_sub_layers - 1 > 0))) {
-+ av_log(avctx, AV_LOG_ERROR, "PTL information too short\n");
-+ return -1;
-+ }
-+
-+ ptl->general_ptl.level_idc = get_bits(gb, 8);
-+
-+ for (i = 0; i < max_num_sub_layers - 1; i++) {
-+ ptl->sub_layer_profile_present_flag[i] = get_bits1(gb);
-+ ptl->sub_layer_level_present_flag[i] = get_bits1(gb);
-+ }
-+
-+ // The flag pairs are padded out to eight pairs in total
-+ if (max_num_sub_layers - 1> 0)
-+ for (i = max_num_sub_layers - 1; i < 8; i++)
-+ skip_bits(gb, 2); // reserved_zero_2bits[i]
-+ for (i = 0; i < max_num_sub_layers - 1; i++) {
-+ if (ptl->sub_layer_profile_present_flag[i] &&
-+ decode_profile_tier_level(gb, avctx, &ptl->sub_layer_ptl[i]) < 0) {
-+ av_log(avctx, AV_LOG_ERROR,
-+ "PTL information for sublayer %i too short\n", i);
-+ return -1;
-+ }
-+ if (ptl->sub_layer_level_present_flag[i]) {
-+ if (get_bits_left(gb) < 8) {
-+ av_log(avctx, AV_LOG_ERROR,
-+ "Not enough data for sublayer %i level_idc\n", i);
-+ return -1;
-+ } else
-+ ptl->sub_layer_ptl[i].level_idc = get_bits(gb, 8);
-+ }
-+ }
-+
-+ return 0;
-+}
-+
-+/*
-+ * Skip over the sub-layer HRD parameters of nb_cpb CPB entries.
-+ * Every value is read and discarded; nothing is stored.
-+ */
-+static void decode_sublayer_hrd(GetBitContext * const gb, const unsigned int nb_cpb,
-+                                const int subpic_params_present)
-+{
-+    unsigned int remaining;
-+
-+    for (remaining = nb_cpb; remaining != 0; remaining--) {
-+        get_ue_golomb_long(gb); // bit_rate_value_minus1
-+        get_ue_golomb_long(gb); // cpb_size_value_minus1
-+
-+        // the two sub-picture values are only coded when signalled
-+        if (subpic_params_present) {
-+            get_ue_golomb_long(gb); // cpb_size_du_value_minus1
-+            get_ue_golomb_long(gb); // bit_rate_du_value_minus1
-+        }
-+
-+        skip_bits1(gb); // cbr_flag
-+    }
-+}
-+
-+/*
-+ * Walk over an hrd_parameters structure without storing any of it; the
-+ * only purpose is to advance the bit reader past it.
-+ *
-+ * common_inf_present: non-zero if the common information block (NAL/VCL
-+ *                     presence flags and field lengths) is coded
-+ * max_sublayers:      number of per-sub-layer entries to read
-+ *
-+ * Returns 0 on success, AVERROR_INVALIDDATA if a CPB count is out of range.
-+ */
-+static int decode_hrd(GetBitContext * const gb, const int common_inf_present,
-+ const int max_sublayers)
-+{
-+ // These stay 0 when the common block is absent, so no sub-layer HRD
-+ // parameters are read in that case
-+ int nal_params_present = 0, vcl_params_present = 0;
-+ int subpic_params_present = 0;
-+ int i;
-+
-+ if (common_inf_present) {
-+ nal_params_present = get_bits1(gb);
-+ vcl_params_present = get_bits1(gb);
-+
-+ if (nal_params_present || vcl_params_present) {
-+ subpic_params_present = get_bits1(gb);
-+
-+ if (subpic_params_present) {
-+ skip_bits(gb, 8); // tick_divisor_minus2
-+ skip_bits(gb, 5); // du_cpb_removal_delay_increment_length_minus1
-+ skip_bits(gb, 1); // sub_pic_cpb_params_in_pic_timing_sei_flag
-+ skip_bits(gb, 5); // dpb_output_delay_du_length_minus1
-+ }
-+
-+ skip_bits(gb, 4); // bit_rate_scale
-+ skip_bits(gb, 4); // cpb_size_scale
-+
-+ if (subpic_params_present)
-+ skip_bits(gb, 4); // cpb_size_du_scale
-+
-+ skip_bits(gb, 5); // initial_cpb_removal_delay_length_minus1
-+ skip_bits(gb, 5); // au_cpb_removal_delay_length_minus1
-+ skip_bits(gb, 5); // dpb_output_delay_length_minus1
-+ }
-+ }
-+
-+ for (i = 0; i < max_sublayers; i++) {
-+ int low_delay = 0;
-+ unsigned int nb_cpb = 1;
-+ // A second fixed-rate flag is only coded when the first one is 0
-+ int fixed_rate = get_bits1(gb);
-+
-+ if (!fixed_rate)
-+ fixed_rate = get_bits1(gb);
-+
-+ if (fixed_rate)
-+ get_ue_golomb_long(gb); // elemental_duration_in_tc_minus1
-+ else
-+ low_delay = get_bits1(gb);
-+
-+ // The CPB count is only coded without low delay; otherwise it stays 1
-+ if (!low_delay) {
-+ nb_cpb = get_ue_golomb_long(gb) + 1;
-+ if (nb_cpb < 1 || nb_cpb > 32) {
-+ av_log(NULL, AV_LOG_ERROR, "nb_cpb %d invalid\n", nb_cpb);
-+ return AVERROR_INVALIDDATA;
-+ }
-+ }
-+
-+ if (nal_params_present)
-+ decode_sublayer_hrd(gb, nb_cpb, subpic_params_present);
-+ if (vcl_params_present)
-+ decode_sublayer_hrd(gb, nb_cpb, subpic_params_present);
-+ }
-+ return 0;
-+}
-+
-+/**
-+ * Decode a video parameter set NAL unit and store it in ps->vps_list.
-+ *
-+ * A fresh zeroed buffer is filled from the bitstream. If a VPS with the
-+ * same id already exists and is byte-identical the new buffer is dropped;
-+ * otherwise the old VPS (and, through remove_vps(), every SPS referring to
-+ * it) is removed and the new buffer takes over the slot.
-+ *
-+ * @param gb    bit reader positioned at the start of the VPS payload
-+ * @param avctx logging context; err_recognition is also consulted
-+ * @param ps    parameter-set store to update
-+ * @return 0 on success, AVERROR(ENOMEM) if the buffer cannot be allocated,
-+ *         AVERROR_INVALIDDATA on invalid syntax
-+ */
-+int ff_hevc_rpi_decode_nal_vps(GetBitContext * const gb, AVCodecContext * const avctx,
-+ HEVCRpiParamSets * const ps)
-+{
-+ int i,j;
-+ int vps_id = 0;
-+ ptrdiff_t nal_size;
-+ HEVCRpiVPS *vps;
-+ AVBufferRef *vps_buf = av_buffer_allocz(sizeof(*vps));
-+
-+ if (!vps_buf)
-+ return AVERROR(ENOMEM);
-+ vps = (HEVCRpiVPS*)vps_buf->data;
-+
-+ av_log(avctx, AV_LOG_DEBUG, "Decoding VPS\n");
-+
-+ // Keep a raw copy of the NAL payload inside the VPS, truncated to the
-+ // size of vps->data if it is larger
-+ nal_size = gb->buffer_end - gb->buffer;
-+ if (nal_size > sizeof(vps->data)) {
-+ av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized VPS "
-+ "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n",
-+ nal_size, sizeof(vps->data));
-+ vps->data_size = sizeof(vps->data);
-+ } else {
-+ vps->data_size = nal_size;
-+ }
-+ memcpy(vps->data, gb->buffer, vps->data_size);
-+
-+ vps_id = get_bits(gb, 4);
-+ if (vps_id >= HEVC_MAX_VPS_COUNT) {
-+ av_log(avctx, AV_LOG_ERROR, "VPS id out of range: %d\n", vps_id);
-+ goto err;
-+ }
-+
-+ if (get_bits(gb, 2) != 3) { // vps_reserved_three_2bits
-+ av_log(avctx, AV_LOG_ERROR, "vps_reserved_three_2bits is not three\n");
-+ goto err;
-+ }
-+
-+ // Both counts are coded minus one
-+ vps->vps_max_layers = get_bits(gb, 6) + 1;
-+ vps->vps_max_sub_layers = get_bits(gb, 3) + 1;
-+ vps->vps_temporal_id_nesting_flag = get_bits1(gb);
-+
-+ if (get_bits(gb, 16) != 0xffff) { // vps_reserved_ffff_16bits
-+ av_log(avctx, AV_LOG_ERROR, "vps_reserved_ffff_16bits is not 0xffff\n");
-+ goto err;
-+ }
-+
-+ if (vps->vps_max_sub_layers > HEVC_MAX_SUB_LAYERS) {
-+ av_log(avctx, AV_LOG_ERROR, "vps_max_sub_layers out of range: %d\n",
-+ vps->vps_max_sub_layers);
-+ goto err;
-+ }
-+
-+ if (parse_ptl(gb, avctx, &vps->ptl, vps->vps_max_sub_layers) < 0)
-+ goto err;
-+
-+ vps->vps_sub_layer_ordering_info_present_flag = get_bits1(gb);
-+
-+ // Without the flag only the entry for the highest sub-layer is coded
-+ i = vps->vps_sub_layer_ordering_info_present_flag ? 0 : vps->vps_max_sub_layers - 1;
-+ for (; i < vps->vps_max_sub_layers; i++) {
-+ vps->vps_max_dec_pic_buffering[i] = get_ue_golomb_long(gb) + 1;
-+ vps->vps_num_reorder_pics[i] = get_ue_golomb_long(gb);
-+ vps->vps_max_latency_increase[i] = get_ue_golomb_long(gb) - 1;
-+
-+ if (vps->vps_max_dec_pic_buffering[i] > HEVC_MAX_DPB_SIZE || !vps->vps_max_dec_pic_buffering[i]) {
-+ av_log(avctx, AV_LOG_ERROR, "vps_max_dec_pic_buffering_minus1 out of range: %d\n",
-+ vps->vps_max_dec_pic_buffering[i] - 1);
-+ goto err;
-+ }
-+ // An excessive reorder count is only fatal with AV_EF_EXPLODE
-+ if (vps->vps_num_reorder_pics[i] > vps->vps_max_dec_pic_buffering[i] - 1) {
-+ av_log(avctx, AV_LOG_WARNING, "vps_max_num_reorder_pics out of range: %d\n",
-+ vps->vps_num_reorder_pics[i]);
-+ if (avctx->err_recognition & AV_EF_EXPLODE)
-+ goto err;
-+ }
-+ }
-+
-+ vps->vps_max_layer_id = get_bits(gb, 6);
-+ vps->vps_num_layer_sets = get_ue_golomb_long(gb) + 1;
-+ // Besides the range check, make sure enough bits remain for all the
-+ // layer_id_included_flag bits skipped below
-+ if (vps->vps_num_layer_sets < 1 || vps->vps_num_layer_sets > 1024 ||
-+ (vps->vps_num_layer_sets - 1LL) * (vps->vps_max_layer_id + 1LL) > get_bits_left(gb)) {
-+ av_log(avctx, AV_LOG_ERROR, "too many layer_id_included_flags\n");
-+ goto err;
-+ }
-+
-+ for (i = 1; i < vps->vps_num_layer_sets; i++)
-+ for (j = 0; j <= vps->vps_max_layer_id; j++)
-+ skip_bits(gb, 1); // layer_id_included_flag[i][j]
-+
-+ vps->vps_timing_info_present_flag = get_bits1(gb);
-+ if (vps->vps_timing_info_present_flag) {
-+ vps->vps_num_units_in_tick = get_bits_long(gb, 32);
-+ vps->vps_time_scale = get_bits_long(gb, 32);
-+ vps->vps_poc_proportional_to_timing_flag = get_bits1(gb);
-+ if (vps->vps_poc_proportional_to_timing_flag)
-+ vps->vps_num_ticks_poc_diff_one = get_ue_golomb_long(gb) + 1;
-+ vps->vps_num_hrd_parameters = get_ue_golomb_long(gb);
-+ if (vps->vps_num_hrd_parameters > (unsigned)vps->vps_num_layer_sets) {
-+ av_log(avctx, AV_LOG_ERROR,
-+ "vps_num_hrd_parameters %d is invalid\n", vps->vps_num_hrd_parameters);
-+ goto err;
-+ }
-+ for (i = 0; i < vps->vps_num_hrd_parameters; i++) {
-+ // The first HRD entry always carries the common information
-+ int common_inf_present = 1;
-+
-+ get_ue_golomb_long(gb); // hrd_layer_set_idx
-+ if (i)
-+ common_inf_present = get_bits1(gb);
-+ // NOTE(review): the return value is ignored, so an HRD error
-+ // does not reject the VPS - confirm this is intentional
-+ decode_hrd(gb, common_inf_present, vps->vps_max_sub_layers);
-+ }
-+ }
-+ get_bits1(gb); /* vps_extension_flag */
-+
-+ // An overread is only fatal when it would replace an existing VPS
-+ if (get_bits_left(gb) < 0) {
-+ av_log(avctx, AV_LOG_ERROR,
-+ "Overread VPS by %d bits\n", -get_bits_left(gb));
-+ if (ps->vps_list[vps_id])
-+ goto err;
-+ }
-+
-+ // Keep the stored VPS when the new one is identical, so that dependent
-+ // SPS entries are not removed needlessly
-+ if (ps->vps_list[vps_id] &&
-+ !memcmp(ps->vps_list[vps_id]->data, vps_buf->data, vps_buf->size)) {
-+ av_buffer_unref(&vps_buf);
-+ } else {
-+ remove_vps(ps, vps_id);
-+ ps->vps_list[vps_id] = vps_buf;
-+ }
-+
-+ return 0;
-+
-+err:
-+ av_buffer_unref(&vps_buf);
-+ return AVERROR_INVALIDDATA;
-+}
-+
-+static void decode_vui(GetBitContext * const gb, AVCodecContext * const avctx,
-+ const int apply_defdispwin, HEVCRpiSPS * const sps)
-+{
-+ VUI backup_vui, * const vui = &sps->vui;
-+ GetBitContext backup;
-+ int sar_present, alt = 0;
-+
-+ av_log(avctx, AV_LOG_DEBUG, "Decoding VUI\n");
-+
-+ sar_present = get_bits1(gb);
-+ if (sar_present) {
-+ uint8_t sar_idx = get_bits(gb, 8);
-+ if (sar_idx < FF_ARRAY_ELEMS(vui_sar))
-+ vui->sar = vui_sar[sar_idx];
-+ else if (sar_idx == 255) {
-+ vui->sar.num = get_bits(gb, 16);
-+ vui->sar.den = get_bits(gb, 16);
-+ } else
-+ av_log(avctx, AV_LOG_WARNING,
-+ "Unknown SAR index: %u.\n", sar_idx);
-+ }
-+
-+ vui->overscan_info_present_flag = get_bits1(gb);
-+ if (vui->overscan_info_present_flag)
-+ vui->overscan_appropriate_flag = get_bits1(gb);
-+
-+ vui->video_signal_type_present_flag = get_bits1(gb);
-+ if (vui->video_signal_type_present_flag) {
-+ vui->video_format = get_bits(gb, 3);
-+ vui->video_full_range_flag = get_bits1(gb);
-+ vui->colour_description_present_flag = get_bits1(gb);
-+ if (vui->video_full_range_flag && sps->pix_fmt == AV_PIX_FMT_YUV420P)
-+ sps->pix_fmt = AV_PIX_FMT_YUVJ420P;
-+ if (vui->colour_description_present_flag) {
-+ vui->colour_primaries = get_bits(gb, 8);
-+ vui->transfer_characteristic = get_bits(gb, 8);
-+ vui->matrix_coeffs = get_bits(gb, 8);
-+
-+ // Set invalid values to "unspecified"
-+ if (!av_color_primaries_name(vui->colour_primaries))
-+ vui->colour_primaries = AVCOL_PRI_UNSPECIFIED;
-+ if (!av_color_transfer_name(vui->transfer_characteristic))
-+ vui->transfer_characteristic = AVCOL_TRC_UNSPECIFIED;
-+ if (!av_color_space_name(vui->matrix_coeffs))
-+ vui->matrix_coeffs = AVCOL_SPC_UNSPECIFIED;
-+ if (vui->matrix_coeffs == AVCOL_SPC_RGB) {
-+ switch (sps->pix_fmt) {
-+ case AV_PIX_FMT_YUV444P:
-+ sps->pix_fmt = AV_PIX_FMT_GBRP;
-+ break;
-+ case AV_PIX_FMT_YUV444P10:
-+ sps->pix_fmt = AV_PIX_FMT_GBRP10;
-+ break;
-+ case AV_PIX_FMT_YUV444P12:
-+ sps->pix_fmt = AV_PIX_FMT_GBRP12;
-+ break;
-+ }
-+ }
-+ }
-+ }
-+
-+ vui->chroma_loc_info_present_flag = get_bits1(gb);
-+ if (vui->chroma_loc_info_present_flag) {
-+ vui->chroma_sample_loc_type_top_field = get_ue_golomb_long(gb);
-+ vui->chroma_sample_loc_type_bottom_field = get_ue_golomb_long(gb);
-+ }
-+
-+ vui->neutra_chroma_indication_flag = get_bits1(gb);
-+ vui->field_seq_flag = get_bits1(gb);
-+ vui->frame_field_info_present_flag = get_bits1(gb);
-+
-+ // Backup context in case an alternate header is detected
-+ memcpy(&backup, gb, sizeof(backup));
-+ memcpy(&backup_vui, vui, sizeof(backup_vui));
-+ if (get_bits_left(gb) >= 68 && show_bits_long(gb, 21) == 0x100000) {
-+ vui->default_display_window_flag = 0;
-+ av_log(avctx, AV_LOG_WARNING, "Invalid default display window\n");
-+ } else
-+ vui->default_display_window_flag = get_bits1(gb);
-+
-+ if (vui->default_display_window_flag) {
-+ int vert_mult = 1 + (sps->chroma_format_idc < 2);
-+ int horiz_mult = 1 + (sps->chroma_format_idc < 3);
-+ vui->def_disp_win.left_offset = get_ue_golomb_long(gb) * horiz_mult;
-+ vui->def_disp_win.right_offset = get_ue_golomb_long(gb) * horiz_mult;
-+ vui->def_disp_win.top_offset = get_ue_golomb_long(gb) * vert_mult;
-+ vui->def_disp_win.bottom_offset = get_ue_golomb_long(gb) * vert_mult;
-+
-+ if (apply_defdispwin &&
-+ avctx->flags2 & AV_CODEC_FLAG2_IGNORE_CROP) {
-+ av_log(avctx, AV_LOG_DEBUG,
-+ "discarding vui default display window, "
-+ "original values are l:%u r:%u t:%u b:%u\n",
-+ vui->def_disp_win.left_offset,
-+ vui->def_disp_win.right_offset,
-+ vui->def_disp_win.top_offset,
-+ vui->def_disp_win.bottom_offset);
-+
-+ vui->def_disp_win.left_offset =
-+ vui->def_disp_win.right_offset =
-+ vui->def_disp_win.top_offset =
-+ vui->def_disp_win.bottom_offset = 0;
-+ }
-+ }
-+
-+timing_info:
-+ vui->vui_timing_info_present_flag = get_bits1(gb);
-+
-+ if (vui->vui_timing_info_present_flag) {
-+ if( get_bits_left(gb) < 66 && !alt) {
-+ // The alternate syntax seem to have timing info located
-+ // at where def_disp_win is normally located
-+ av_log(avctx, AV_LOG_WARNING,
-+ "Strange VUI timing information, retrying...\n");
-+ memcpy(vui, &backup_vui, sizeof(backup_vui));
-+ memcpy(gb, &backup, sizeof(backup));
-+ alt = 1;
-+ goto timing_info;
-+ }
-+ vui->vui_num_units_in_tick = get_bits_long(gb, 32);
-+ vui->vui_time_scale = get_bits_long(gb, 32);
-+ if (alt) {
-+ av_log(avctx, AV_LOG_INFO, "Retry got %"PRIu32"/%"PRIu32" fps\n",
-+ vui->vui_time_scale, vui->vui_num_units_in_tick);
-+ }
-+ vui->vui_poc_proportional_to_timing_flag = get_bits1(gb);
-+ if (vui->vui_poc_proportional_to_timing_flag)
-+ vui->vui_num_ticks_poc_diff_one_minus1 = get_ue_golomb_long(gb);
-+ vui->vui_hrd_parameters_present_flag = get_bits1(gb);
-+ if (vui->vui_hrd_parameters_present_flag)
-+ decode_hrd(gb, 1, sps->max_sub_layers);
-+ }
-+
-+ vui->bitstream_restriction_flag = get_bits1(gb);
-+ if (vui->bitstream_restriction_flag) {
-+ if (get_bits_left(gb) < 8 && !alt) {
-+ av_log(avctx, AV_LOG_WARNING,
-+ "Strange VUI bitstream restriction information, retrying"
-+ " from timing information...\n");
-+ memcpy(vui, &backup_vui, sizeof(backup_vui));
-+ memcpy(gb, &backup, sizeof(backup));
-+ alt = 1;
-+ goto timing_info;
-+ }
-+ vui->tiles_fixed_structure_flag = get_bits1(gb);
-+ vui->motion_vectors_over_pic_boundaries_flag = get_bits1(gb);
-+ vui->restricted_ref_pic_lists_flag = get_bits1(gb);
-+ vui->min_spatial_segmentation_idc = get_ue_golomb_long(gb);
-+ vui->max_bytes_per_pic_denom = get_ue_golomb_long(gb);
-+ vui->max_bits_per_min_cu_denom = get_ue_golomb_long(gb);
-+ vui->log2_max_mv_length_horizontal = get_ue_golomb_long(gb);
-+ vui->log2_max_mv_length_vertical = get_ue_golomb_long(gb);
-+ }
-+
-+ if (get_bits_left(gb) < 1 && !alt) {
-+ // XXX: Alternate syntax when sps_range_extension_flag != 0?
-+ av_log(avctx, AV_LOG_WARNING,
-+ "Overread in VUI, retrying from timing information...\n");
-+ memcpy(vui, &backup_vui, sizeof(backup_vui));
-+ memcpy(gb, &backup, sizeof(backup));
-+ alt = 1;
-+ goto timing_info;
-+ }
-+}
-+
-+static void set_default_scaling_list_data(ScalingList * const sl)
-+{
-+ int matrixId;
-+
-+ for (matrixId = 0; matrixId < 6; matrixId++) {
-+ // 4x4 default is 16
-+ memset(sl->sl[0][matrixId], 16, 16);
-+ sl->sl_dc[0][matrixId] = 16; // default for 16x16
-+ sl->sl_dc[1][matrixId] = 16; // default for 32x32
-+ }
-+
-+ memcpy(sl->sl[1][0], default_scaling_list_intra, 64);
-+ memcpy(sl->sl[1][1], default_scaling_list_intra, 64);
-+ memcpy(sl->sl[1][2], default_scaling_list_intra, 64);
-+
-+ memcpy(sl->sl[1][3], default_scaling_list_inter, 64);
-+ memcpy(sl->sl[1][4], default_scaling_list_inter, 64);
-+ memcpy(sl->sl[1][5], default_scaling_list_inter, 64);
-+
-+ memcpy(sl->sl[2][0], default_scaling_list_intra, 64);
-+ memcpy(sl->sl[2][1], default_scaling_list_intra, 64);
-+ memcpy(sl->sl[2][2], default_scaling_list_intra, 64);
-+
-+ memcpy(sl->sl[2][3], default_scaling_list_inter, 64);
-+ memcpy(sl->sl[2][4], default_scaling_list_inter, 64);
-+ memcpy(sl->sl[2][5], default_scaling_list_inter, 64);
-+
-+ memcpy(sl->sl[3][0], default_scaling_list_intra, 64);
-+ memcpy(sl->sl[3][1], default_scaling_list_intra, 64);
-+ memcpy(sl->sl[3][2], default_scaling_list_intra, 64);
-+
-+ memcpy(sl->sl[3][3], default_scaling_list_inter, 64);
-+ memcpy(sl->sl[3][4], default_scaling_list_inter, 64);
-+ memcpy(sl->sl[3][5], default_scaling_list_inter, 64);
-+}
-+
-+static int scaling_list_data(GetBitContext * const gb, AVCodecContext * const avctx, ScalingList * const sl,
-+ const HEVCRpiSPS * const sps)
-+{
-+ uint8_t scaling_list_pred_mode_flag;
-+ int32_t scaling_list_dc_coef[2][6];
-+ int size_id, matrix_id, pos;
-+ int i;
-+
-+ for (size_id = 0; size_id < 4; size_id++)
-+ for (matrix_id = 0; matrix_id < 6; matrix_id += ((size_id == 3) ? 3 : 1)) {
-+ scaling_list_pred_mode_flag = get_bits1(gb);
-+ if (!scaling_list_pred_mode_flag) {
-+ unsigned int delta = get_ue_golomb_long(gb);
-+ /* Only need to handle non-zero delta. Zero means default,
-+ * which should already be in the arrays. */
-+ if (delta) {
-+ // Copy from previous array.
-+ delta *= (size_id == 3) ? 3 : 1;
-+ if (matrix_id < delta) {
-+ av_log(avctx, AV_LOG_ERROR,
-+ "Invalid delta in scaling list data: %d.\n", delta);
-+ return AVERROR_INVALIDDATA;
-+ }
-+
-+ memcpy(sl->sl[size_id][matrix_id],
-+ sl->sl[size_id][matrix_id - delta],
-+ size_id > 0 ? 64 : 16);
-+ if (size_id > 1)
-+ sl->sl_dc[size_id - 2][matrix_id] = sl->sl_dc[size_id - 2][matrix_id - delta];
-+ }
-+ } else {
-+ int next_coef, coef_num;
-+ int32_t scaling_list_delta_coef;
-+
-+ next_coef = 8;
-+ coef_num = FFMIN(64, 1 << (4 + (size_id << 1)));
-+ if (size_id > 1) {
-+ scaling_list_dc_coef[size_id - 2][matrix_id] = get_se_golomb(gb) + 8;
-+ next_coef = scaling_list_dc_coef[size_id - 2][matrix_id];
-+ sl->sl_dc[size_id - 2][matrix_id] = next_coef;
-+ }
-+ for (i = 0; i < coef_num; i++) {
-+ if (size_id == 0)
-+ pos = 4 * ff_hevc_rpi_diag_scan4x4_y[i] +
-+ ff_hevc_rpi_diag_scan4x4_x[i];
-+ else
-+ pos = 8 * ff_hevc_rpi_diag_scan8x8_y[i] +
-+ ff_hevc_rpi_diag_scan8x8_x[i];
-+
-+ scaling_list_delta_coef = get_se_golomb(gb);
-+ next_coef = (next_coef + 256U + scaling_list_delta_coef) % 256;
-+ sl->sl[size_id][matrix_id][pos] = next_coef;
-+ }
-+ }
-+ }
-+
-+ if (sps->chroma_format_idc == 3) {
-+ for (i = 0; i < 64; i++) {
-+ sl->sl[3][1][i] = sl->sl[2][1][i];
-+ sl->sl[3][2][i] = sl->sl[2][2][i];
-+ sl->sl[3][4][i] = sl->sl[2][4][i];
-+ sl->sl[3][5][i] = sl->sl[2][5][i];
-+ }
-+ sl->sl_dc[1][1] = sl->sl_dc[0][1];
-+ sl->sl_dc[1][2] = sl->sl_dc[0][2];
-+ sl->sl_dc[1][4] = sl->sl_dc[0][4];
-+ sl->sl_dc[1][5] = sl->sl_dc[0][5];
-+ }
-+
-+
-+ return 0;
-+}
-+
-+static int map_pixel_format(HEVCRpiSPS * const sps)
-+{
-+ const int cfmt = sps->chroma_format_idc;
-+
-+ sps->pix_fmt = AV_PIX_FMT_NONE;
-+ switch (sps->bit_depth) {
-+ case 8:
-+ if (cfmt == 1)
-+ sps->pix_fmt = AV_PIX_FMT_SAND128;
-+ break;
-+ case 10:
-+ if (cfmt == 1)
-+ sps->pix_fmt = AV_PIX_FMT_SAND64_10;
-+ break;
-+ default:
-+ break;
-+ }
-+
-+ sps->hshift[0] = sps->vshift[0] = 0;
-+ sps->hshift[2] = sps->hshift[1] = cfmt > 2 ? 0 : 1; // 1 unless 4:4:4
-+ sps->vshift[2] = sps->vshift[1] = cfmt > 1 ? 0 : 1; // 1 unless 4:4:4 or 4:2:2
-+
-+ sps->pixel_shift = sps->bit_depth > 8 ? 1 : 0;
-+
-+ return 0;
-+}
-+
-+static int ff_hevc_rpi_parse_sps(HEVCRpiSPS * const sps, GetBitContext * const gb, unsigned int * const sps_id,
-+ const int apply_defdispwin, AVBufferRef * const * const vps_list, AVCodecContext * const avctx)
-+{
-+ HEVCRpiWindow *ow;
-+ int ret = 0;
-+ int log2_diff_max_min_transform_block_size;
-+ int bit_depth_chroma, start, vui_present, sublayer_ordering_info;
-+ int i;
-+
-+ // Coded parameters
-+
-+ sps->vps_id = get_bits(gb, 4);
-+ if (sps->vps_id >= HEVC_MAX_VPS_COUNT) {
-+ av_log(avctx, AV_LOG_ERROR, "VPS id out of range: %d\n", sps->vps_id);
-+ return AVERROR_INVALIDDATA;
-+ }
-+
-+ if (vps_list && !vps_list[sps->vps_id]) {
-+ av_log(avctx, AV_LOG_ERROR, "VPS %d does not exist\n",
-+ sps->vps_id);
-+ return AVERROR_INVALIDDATA;
-+ }
-+
-+ sps->max_sub_layers = get_bits(gb, 3) + 1;
-+ if (sps->max_sub_layers > HEVC_MAX_SUB_LAYERS) {
-+ av_log(avctx, AV_LOG_ERROR, "sps_max_sub_layers out of range: %d\n",
-+ sps->max_sub_layers);
-+ return AVERROR_INVALIDDATA;
-+ }
-+
-+ sps->temporal_id_nesting_flag = get_bits(gb, 1);
-+
-+ if ((ret = parse_ptl(gb, avctx, &sps->ptl, sps->max_sub_layers)) < 0)
-+ return ret;
-+
-+ *sps_id = get_ue_golomb_long(gb);
-+ if (*sps_id >= HEVC_MAX_SPS_COUNT) {
-+ av_log(avctx, AV_LOG_ERROR, "SPS id out of range: %d\n", *sps_id);
-+ return AVERROR_INVALIDDATA;
-+ }
-+
-+ sps->chroma_format_idc = get_ue_golomb_long(gb);
-+ if (sps->chroma_format_idc > 3U) {
-+ av_log(avctx, AV_LOG_ERROR, "chroma_format_idc %d is invalid\n", sps->chroma_format_idc);
-+ return AVERROR_INVALIDDATA;
-+ }
-+
-+ if (sps->chroma_format_idc == 3)
-+ sps->separate_colour_plane_flag = get_bits1(gb);
-+
-+ if (sps->separate_colour_plane_flag)
-+ sps->chroma_format_idc = 0;
-+
-+ sps->width = get_ue_golomb_long(gb);
-+ sps->height = get_ue_golomb_long(gb);
-+ if ((ret = av_image_check_size(sps->width,
-+ sps->height, 0, avctx)) < 0)
-+ return ret;
-+
-+ if (get_bits1(gb)) { // pic_conformance_flag
-+ int vert_mult = 1 + (sps->chroma_format_idc < 2);
-+ int horiz_mult = 1 + (sps->chroma_format_idc < 3);
-+ sps->pic_conf_win.left_offset = get_ue_golomb_long(gb) * horiz_mult;
-+ sps->pic_conf_win.right_offset = get_ue_golomb_long(gb) * horiz_mult;
-+ sps->pic_conf_win.top_offset = get_ue_golomb_long(gb) * vert_mult;
-+ sps->pic_conf_win.bottom_offset = get_ue_golomb_long(gb) * vert_mult;
-+
-+ if (avctx->flags2 & AV_CODEC_FLAG2_IGNORE_CROP) {
-+ av_log(avctx, AV_LOG_DEBUG,
-+ "discarding sps conformance window, "
-+ "original values are l:%u r:%u t:%u b:%u\n",
-+ sps->pic_conf_win.left_offset,
-+ sps->pic_conf_win.right_offset,
-+ sps->pic_conf_win.top_offset,
-+ sps->pic_conf_win.bottom_offset);
-+
-+ sps->pic_conf_win.left_offset =
-+ sps->pic_conf_win.right_offset =
-+ sps->pic_conf_win.top_offset =
-+ sps->pic_conf_win.bottom_offset = 0;
-+ }
-+ sps->output_window = sps->pic_conf_win;
-+ }
-+
-+ sps->bit_depth = get_ue_golomb_long(gb) + 8;
-+ bit_depth_chroma = get_ue_golomb_long(gb) + 8;
-+ if (sps->chroma_format_idc && bit_depth_chroma != sps->bit_depth) {
-+ av_log(avctx, AV_LOG_ERROR,
-+ "Luma bit depth (%d) is different from chroma bit depth (%d), "
-+ "this is unsupported.\n",
-+ sps->bit_depth, bit_depth_chroma);
-+ return AVERROR_INVALIDDATA;
-+ }
-+
-+ ret = map_pixel_format(sps);
-+ if (ret < 0)
-+ return ret;
-+
-+ sps->log2_max_poc_lsb = get_ue_golomb_long(gb) + 4;
-+ if (sps->log2_max_poc_lsb > 16) {
-+ av_log(avctx, AV_LOG_ERROR, "log2_max_pic_order_cnt_lsb_minus4 out range: %d\n",
-+ sps->log2_max_poc_lsb - 4);
-+ return AVERROR_INVALIDDATA;
-+ }
-+
-+ sublayer_ordering_info = get_bits1(gb);
-+ start = sublayer_ordering_info ? 0 : sps->max_sub_layers - 1;
-+ for (i = start; i < sps->max_sub_layers; i++) {
-+ sps->temporal_layer[i].max_dec_pic_buffering = get_ue_golomb_long(gb) + 1;
-+ sps->temporal_layer[i].num_reorder_pics = get_ue_golomb_long(gb);
-+ sps->temporal_layer[i].max_latency_increase = get_ue_golomb_long(gb) - 1;
-+ if (sps->temporal_layer[i].max_dec_pic_buffering > (unsigned)HEVC_MAX_DPB_SIZE) {
-+ av_log(avctx, AV_LOG_ERROR, "sps_max_dec_pic_buffering_minus1 out of range: %d\n",
-+ sps->temporal_layer[i].max_dec_pic_buffering - 1U);
-+ return AVERROR_INVALIDDATA;
-+ }
-+ if (sps->temporal_layer[i].num_reorder_pics > sps->temporal_layer[i].max_dec_pic_buffering - 1) {
-+ av_log(avctx, AV_LOG_WARNING, "sps_max_num_reorder_pics out of range: %d\n",
-+ sps->temporal_layer[i].num_reorder_pics);
-+ if (avctx->err_recognition & AV_EF_EXPLODE ||
-+ sps->temporal_layer[i].num_reorder_pics > HEVC_MAX_DPB_SIZE - 1) {
-+ return AVERROR_INVALIDDATA;
-+ }
-+ sps->temporal_layer[i].max_dec_pic_buffering = sps->temporal_layer[i].num_reorder_pics + 1;
-+ }
-+ }
-+
-+ if (!sublayer_ordering_info) {
-+ for (i = 0; i < start; i++) {
-+ sps->temporal_layer[i].max_dec_pic_buffering = sps->temporal_layer[start].max_dec_pic_buffering;
-+ sps->temporal_layer[i].num_reorder_pics = sps->temporal_layer[start].num_reorder_pics;
-+ sps->temporal_layer[i].max_latency_increase = sps->temporal_layer[start].max_latency_increase;
-+ }
-+ }
-+
-+ sps->log2_min_cb_size = get_ue_golomb_long(gb) + 3;
-+ sps->log2_diff_max_min_coding_block_size = get_ue_golomb_long(gb);
-+ sps->log2_min_tb_size = get_ue_golomb_long(gb) + 2;
-+ log2_diff_max_min_transform_block_size = get_ue_golomb_long(gb);
-+ sps->log2_max_trafo_size = log2_diff_max_min_transform_block_size +
-+ sps->log2_min_tb_size;
-+
-+ if (sps->log2_min_cb_size < 3 || sps->log2_min_cb_size > 30) {
-+ av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_min_cb_size", sps->log2_min_cb_size);
-+ return AVERROR_INVALIDDATA;
-+ }
-+
-+ if (sps->log2_diff_max_min_coding_block_size > 30) {
-+ av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_diff_max_min_coding_block_size", sps->log2_diff_max_min_coding_block_size);
-+ return AVERROR_INVALIDDATA;
-+ }
-+
-+ if (sps->log2_min_tb_size >= sps->log2_min_cb_size || sps->log2_min_tb_size < 2) {
-+ av_log(avctx, AV_LOG_ERROR, "Invalid value for log2_min_tb_size");
-+ return AVERROR_INVALIDDATA;
-+ }
-+
-+ if (log2_diff_max_min_transform_block_size < 0 || log2_diff_max_min_transform_block_size > 30) {
-+ av_log(avctx, AV_LOG_ERROR, "Invalid value %d for log2_diff_max_min_transform_block_size", log2_diff_max_min_transform_block_size);
-+ return AVERROR_INVALIDDATA;
-+ }
-+
-+ {
-+ const unsigned int CtbLog2SizeY = sps->log2_min_cb_size + sps->log2_diff_max_min_coding_block_size;
-+ // Not a bitstream limitation, but all profiles
-+ if (CtbLog2SizeY < 4 || CtbLog2SizeY > HEVC_MAX_LOG2_CTB_SIZE) {
-+ av_log(avctx, AV_LOG_ERROR, "Invalid value %d for CtbLog2SizeY", CtbLog2SizeY);
-+ return AVERROR_INVALIDDATA;
-+ }
-+
-+ if (sps->log2_max_trafo_size > FFMIN(5, CtbLog2SizeY)) {
-+ av_log(avctx, AV_LOG_ERROR, "Invalid value %d for MaxTbLog2SizeY", sps->log2_max_trafo_size);
-+ return AVERROR_INVALIDDATA;
-+ }
-+
-+ // Inferred parameters
-+ sps->log2_ctb_size = CtbLog2SizeY;
-+// sps->log2_min_pu_size = sps->log2_min_cb_size - 1;
-+ }
-+
-+ sps->max_transform_hierarchy_depth_inter = get_ue_golomb_long(gb);
-+ sps->max_transform_hierarchy_depth_intra = get_ue_golomb_long(gb);
-+
-+ sps->scaling_list_enable_flag = get_bits1(gb);
-+ if (sps->scaling_list_enable_flag) {
-+ set_default_scaling_list_data(&sps->scaling_list);
-+
-+ if (get_bits1(gb)) {
-+ ret = scaling_list_data(gb, avctx, &sps->scaling_list, sps);
-+ if (ret < 0)
-+ return ret;
-+ }
-+ }
-+
-+ sps->amp_enabled_flag = get_bits1(gb);
-+ sps->sao_enabled = get_bits1(gb);
-+
-+ // Set pcm defaults (0) so we don't have to test _enabled when we
-+ // want to use them
-+ memset(&sps->pcm, 0, sizeof(sps->pcm));
-+
-+ if (get_bits1(gb)) // pcm_enabled_flag
-+ {
-+ const unsigned int limit_max_pcm = FFMIN(5,
-+ sps->log2_min_cb_size + sps->log2_diff_max_min_coding_block_size);
-+ sps->pcm.bit_depth = get_bits(gb, 4) + 1;
-+ sps->pcm.bit_depth_chroma = get_bits(gb, 4) + 1;
-+ sps->pcm.log2_min_pcm_cb_size = get_ue_golomb_long(gb) + 3;
-+ sps->pcm.log2_max_pcm_cb_size = sps->pcm.log2_min_pcm_cb_size +
-+ get_ue_golomb_long(gb);
-+ if (FFMAX(sps->pcm.bit_depth, sps->pcm.bit_depth_chroma) > sps->bit_depth) {
-+ av_log(avctx, AV_LOG_ERROR,
-+ "PCM bit depth (%d, %d) is greater than normal bit depth (%d)\n",
-+ sps->pcm.bit_depth, sps->pcm.bit_depth_chroma, sps->bit_depth);
-+ return AVERROR_INVALIDDATA;
-+ }
-+ if (sps->pcm.log2_min_pcm_cb_size < sps->log2_min_cb_size ||
-+ sps->pcm.log2_max_pcm_cb_size > limit_max_pcm) {
-+ av_log(avctx, AV_LOG_ERROR, "Bad PCM CB min/max size (%d->%d)",
-+ sps->pcm.log2_min_pcm_cb_size, sps->pcm.log2_max_pcm_cb_size);
-+ return AVERROR_INVALIDDATA;
-+ }
-+
-+ sps->pcm.loop_filter_disable_flag = get_bits1(gb);
-+ }
-+
-+ // Could be based on min_pcm_cb_size but much easier logic if we just stick
-+ // with 8 (and costs us little)
-+ sps->pcm_width = (sps->width + 63) >> 6; // 8 for min size, 8 bits per byte - round up
-+ sps->pcm_height = (sps->height + 7) >> 3;
-+
-+ sps->nb_st_rps = get_ue_golomb_long(gb);
-+ if (sps->nb_st_rps > HEVC_MAX_SHORT_TERM_REF_PIC_SETS) {
-+ av_log(avctx, AV_LOG_ERROR, "Too many short term RPS: %d.\n",
-+ sps->nb_st_rps);
-+ return AVERROR_INVALIDDATA;
-+ }
-+ for (i = 0; i < sps->nb_st_rps; i++) {
-+ if ((ret = ff_hevc_rpi_decode_short_term_rps(gb, avctx, &sps->st_rps[i],
-+ sps, 0)) < 0)
-+ return ret;
-+ }
-+
-+ sps->long_term_ref_pics_present_flag = get_bits1(gb);
-+ if (sps->long_term_ref_pics_present_flag) {
-+ sps->num_long_term_ref_pics_sps = get_ue_golomb_long(gb);
-+ if (sps->num_long_term_ref_pics_sps > HEVC_MAX_LONG_TERM_REF_PICS) {
-+ av_log(avctx, AV_LOG_ERROR, "num_long_term_ref_pics_sps %d is out of range.\n",
-+ sps->num_long_term_ref_pics_sps);
-+ return AVERROR_INVALIDDATA;
-+ }
-+ for (i = 0; i < sps->num_long_term_ref_pics_sps; i++) {
-+ sps->lt_ref_pic_poc_lsb_sps[i] = get_bits(gb, sps->log2_max_poc_lsb);
-+ sps->used_by_curr_pic_lt_sps_flag[i] = get_bits1(gb);
-+ }
-+ }
-+
-+ sps->sps_temporal_mvp_enabled_flag = get_bits1(gb);
-+ sps->sps_strong_intra_smoothing_enable_flag = get_bits1(gb);
-+ sps->vui.sar = (AVRational){0, 1};
-+ vui_present = get_bits1(gb);
-+ if (vui_present)
-+ decode_vui(gb, avctx, apply_defdispwin, sps);
-+
-+ if (get_bits1(gb)) { // sps_extension_flag
-+ int sps_extension_flag[1];
-+ for (i = 0; i < 1; i++)
-+ sps_extension_flag[i] = get_bits1(gb);
-+ skip_bits(gb, 7); //sps_extension_7bits = get_bits(gb, 7);
-+ if (sps_extension_flag[0]) {
-+ int extended_precision_processing_flag;
-+ int cabac_bypass_alignment_enabled_flag;
-+
-+ sps->transform_skip_rotation_enabled_flag = get_bits1(gb);
-+ sps->transform_skip_context_enabled_flag = get_bits1(gb);
-+ sps->implicit_rdpcm_enabled_flag = get_bits1(gb);
-+
-+ sps->explicit_rdpcm_enabled_flag = get_bits1(gb);
-+
-+ extended_precision_processing_flag = get_bits1(gb);
-+ if (extended_precision_processing_flag)
-+ av_log(avctx, AV_LOG_WARNING,
-+ "extended_precision_processing_flag not yet implemented\n");
-+
-+ sps->intra_smoothing_disabled_flag = get_bits1(gb);
-+ sps->high_precision_offsets_enabled_flag = get_bits1(gb);
-+ sps->persistent_rice_adaptation_enabled_flag = get_bits1(gb);
-+
-+ cabac_bypass_alignment_enabled_flag = get_bits1(gb);
-+ if (cabac_bypass_alignment_enabled_flag)
-+ av_log(avctx, AV_LOG_WARNING,
-+ "cabac_bypass_alignment_enabled_flag not yet implemented\n");
-+ }
-+ }
-+ if (apply_defdispwin) {
-+ sps->output_window.left_offset += sps->vui.def_disp_win.left_offset;
-+ sps->output_window.right_offset += sps->vui.def_disp_win.right_offset;
-+ sps->output_window.top_offset += sps->vui.def_disp_win.top_offset;
-+ sps->output_window.bottom_offset += sps->vui.def_disp_win.bottom_offset;
-+ }
-+
-+ ow = &sps->output_window;
-+ if (ow->left_offset >= INT_MAX - ow->right_offset ||
-+ ow->top_offset >= INT_MAX - ow->bottom_offset ||
-+ ow->left_offset + ow->right_offset >= sps->width ||
-+ ow->top_offset + ow->bottom_offset >= sps->height) {
-+ av_log(avctx, AV_LOG_WARNING, "Invalid cropping offsets: %u/%u/%u/%u\n",
-+ ow->left_offset, ow->right_offset, ow->top_offset, ow->bottom_offset);
-+ if (avctx->err_recognition & AV_EF_EXPLODE) {
-+ return AVERROR_INVALIDDATA;
-+ }
-+ av_log(avctx, AV_LOG_WARNING,
-+ "Displaying the whole video surface.\n");
-+ memset(ow, 0, sizeof(*ow));
-+ memset(&sps->pic_conf_win, 0, sizeof(sps->pic_conf_win));
-+ }
-+
-+ // Inferred parameters
-+
-+ sps->ctb_width = (sps->width + (1 << sps->log2_ctb_size) - 1) >> sps->log2_ctb_size;
-+ sps->ctb_height = (sps->height + (1 << sps->log2_ctb_size) - 1) >> sps->log2_ctb_size;
-+ sps->ctb_size = sps->ctb_width * sps->ctb_height;
-+
-+ sps->min_cb_width = sps->width >> sps->log2_min_cb_size;
-+ sps->min_cb_height = sps->height >> sps->log2_min_cb_size;
-+ sps->min_tb_width = sps->width >> sps->log2_min_tb_size;
-+ sps->min_tb_height = sps->height >> sps->log2_min_tb_size;
-+ sps->min_pu_width = sps->width >> LOG2_MIN_PU_SIZE;
-+ sps->min_pu_height = sps->height >> LOG2_MIN_PU_SIZE;
-+ sps->tb_mask = (1 << (sps->log2_ctb_size - sps->log2_min_tb_size)) - 1;
-+
-+ sps->qp_bd_offset = 6 * (sps->bit_depth - 8);
-+ sps->wp_offset_half_range = (1U << (sps->high_precision_offsets_enabled_flag ? sps->bit_depth - 1 : 7));
-+
-+ if (av_mod_uintp2(sps->width, sps->log2_min_cb_size) ||
-+ av_mod_uintp2(sps->height, sps->log2_min_cb_size)) {
-+ av_log(avctx, AV_LOG_ERROR, "Invalid coded frame dimensions.\n");
-+ return AVERROR_INVALIDDATA;
-+ }
-+
-+ if (sps->max_transform_hierarchy_depth_inter > sps->log2_ctb_size - sps->log2_min_tb_size) {
-+ av_log(avctx, AV_LOG_ERROR, "max_transform_hierarchy_depth_inter out of range: %d\n",
-+ sps->max_transform_hierarchy_depth_inter);
-+ return AVERROR_INVALIDDATA;
-+ }
-+ if (sps->max_transform_hierarchy_depth_intra > sps->log2_ctb_size - sps->log2_min_tb_size) {
-+ av_log(avctx, AV_LOG_ERROR, "max_transform_hierarchy_depth_intra out of range: %d\n",
-+ sps->max_transform_hierarchy_depth_intra);
-+ return AVERROR_INVALIDDATA;
-+ }
-+ if (sps->log2_max_trafo_size > FFMIN(sps->log2_ctb_size, 5)) {
-+ av_log(avctx, AV_LOG_ERROR,
-+ "max transform block size out of range: %d\n",
-+ sps->log2_max_trafo_size);
-+ return AVERROR_INVALIDDATA;
-+ }
-+
-+ if (get_bits_left(gb) < 0) {
-+ av_log(avctx, AV_LOG_ERROR,
-+ "Overread SPS by %d bits\n", -get_bits_left(gb));
-+ return AVERROR_INVALIDDATA;
-+ }
-+
-+ return 0;
-+}
-+
-+int ff_hevc_rpi_decode_nal_sps(GetBitContext *gb, AVCodecContext *avctx,
-+ HEVCRpiParamSets *ps, int apply_defdispwin)
-+{
-+ HEVCRpiSPS *sps;
-+ AVBufferRef *sps_buf = av_buffer_allocz(sizeof(*sps));
-+ unsigned int sps_id;
-+ int ret;
-+ ptrdiff_t nal_size;
-+
-+ if (!sps_buf)
-+ return AVERROR(ENOMEM);
-+ sps = (HEVCRpiSPS*)sps_buf->data;
-+
-+ av_log(avctx, AV_LOG_DEBUG, "Decoding SPS\n");
-+
-+ nal_size = gb->buffer_end - gb->buffer;
-+ if (nal_size > sizeof(sps->data)) {
-+ av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized SPS "
-+ "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n",
-+ nal_size, sizeof(sps->data));
-+ sps->data_size = sizeof(sps->data);
-+ } else {
-+ sps->data_size = nal_size;
-+ }
-+ memcpy(sps->data, gb->buffer, sps->data_size);
-+
-+ ret = ff_hevc_rpi_parse_sps(sps, gb, &sps_id,
-+ apply_defdispwin,
-+ ps->vps_list, avctx);
-+ if (ret < 0) {
-+ av_buffer_unref(&sps_buf);
-+ return ret;
-+ }
-+
-+ if (avctx->debug & FF_DEBUG_BITSTREAM) {
-+ av_log(avctx, AV_LOG_DEBUG,
-+ "Parsed SPS: id %d; coded wxh: %dx%d; "
-+ "cropped wxh: %dx%d; pix_fmt: %s.\n",
-+ sps_id, sps->width, sps->height,
-+ sps->width - (sps->output_window.left_offset + sps->output_window.right_offset),
-+ sps->height - (sps->output_window.top_offset + sps->output_window.bottom_offset),
-+ av_get_pix_fmt_name(sps->pix_fmt));
-+ }
-+
-+ /* check if this is a repeat of an already parsed SPS, then keep the
-+ * original one.
-+ * otherwise drop all PPSes that depend on it */
-+ if (ps->sps_list[sps_id] &&
-+ !memcmp(ps->sps_list[sps_id]->data, sps_buf->data, sps_buf->size)) {
-+ av_buffer_unref(&sps_buf);
-+ } else {
-+ remove_sps(ps, sps_id);
-+ ps->sps_list[sps_id] = sps_buf;
-+ }
-+
-+ return 0;
-+}
-+
-+static void hevc_pps_free(void *opaque, uint8_t *data)
-+{
-+ HEVCRpiPPS *pps = (HEVCRpiPPS*)data;
-+
-+ av_freep(&pps->column_width);
-+ av_freep(&pps->row_height);
-+ av_freep(&pps->col_bd);
-+ av_freep(&pps->row_bd);
-+ av_freep(&pps->col_idxX);
-+ av_freep(&pps->ctb_addr_rs_to_ts);
-+ av_freep(&pps->ctb_addr_ts_to_rs);
-+ av_freep(&pps->tile_pos_ts);
-+ av_freep(&pps->tile_size);
-+ av_freep(&pps->tile_id);
-+ av_freep(&pps->ctb_ts_flags);
-+
-+ av_freep(&pps);
-+}
-+
-+static int get_offset_list(GetBitContext * const gb, AVCodecContext * const avctx, unsigned int n_minus_1, int8_t * offsets)
-+{
-+ do
-+ {
-+ const int offset = get_se_golomb_long(gb);
-+ if (offset < -12 || offset > 12) {
-+ av_log(avctx, AV_LOG_ERROR, "qp_offset_list[]: %d out of range\n", offset);
-+ return AVERROR_INVALIDDATA;
-+ }
-+ *offsets++ = offset;
-+ } while (n_minus_1-- != 0);
-+ return 0;
-+}
-+
-+static int pps_range_extensions(GetBitContext * const gb, AVCodecContext * const avctx,
-+ HEVCRpiPPS * const pps, const HEVCRpiSPS * const sps)
-+{
-+ if (pps->transform_skip_enabled_flag) {
-+ pps->log2_max_transform_skip_block_size = get_ue_golomb_long(gb) + 2;
-+ }
-+ pps->cross_component_prediction_enabled_flag = get_bits1(gb);
-+ if (pps->cross_component_prediction_enabled_flag &&
-+ (sps->chroma_format_idc != 3 || sps->separate_colour_plane_flag))
-+ {
-+ av_log(avctx, AV_LOG_ERROR, "cross_component_prediction_enabled but chroma_format_idc != 3\n");
-+ return AVERROR_INVALIDDATA;
-+ }
-+ pps->chroma_qp_offset_list_enabled_flag = get_bits1(gb);
-+ if (pps->chroma_qp_offset_list_enabled_flag) {
-+ int err;
-+
-+ pps->diff_cu_chroma_qp_offset_depth = get_ue_golomb_long(gb);
-+ pps->chroma_qp_offset_list_len_minus1 = get_ue_golomb_long(gb);
-+ if (pps->chroma_qp_offset_list_len_minus1 > 5) {
-+ av_log(avctx, AV_LOG_ERROR,
-+ "chroma_qp_offset_list_len_minus1 shall be in the range [0, 5].\n");
-+ return AVERROR_INVALIDDATA;
-+ }
-+ av_log(avctx, AV_LOG_WARNING, "cb_qp_offset_list not tested yet.\n");
-+
-+ if ((err = get_offset_list(gb, avctx, pps->chroma_qp_offset_list_len_minus1, pps->cb_qp_offset_list)) != 0 ||
-+ (err = get_offset_list(gb, avctx, pps->chroma_qp_offset_list_len_minus1, pps->cr_qp_offset_list)) != 0)
-+ return err;
-+ }
-+
-+ {
-+ const unsigned int max_offset = sps->bit_depth > 10 ? sps->bit_depth - 10 : 0;
-+
-+ pps->log2_sao_offset_scale_luma = get_ue_golomb_long(gb);
-+ if (pps->log2_sao_offset_scale_luma > max_offset) {
-+ av_log(avctx, AV_LOG_ERROR, "log2_sao_offset_scale_luma invalid");
-+ return AVERROR_INVALIDDATA;
-+ }
-+ pps->log2_sao_offset_scale_chroma = get_ue_golomb_long(gb);
-+ if (pps->log2_sao_offset_scale_chroma > max_offset) {
-+ av_log(avctx, AV_LOG_ERROR, "log2_sao_offset_scale_chroma invalid");
-+ return AVERROR_INVALIDDATA;
-+ }
-+ }
-+
-+ return(0);
-+}
-+
-+static inline int setup_pps(AVCodecContext * const avctx,
-+ HEVCRpiPPS * const pps, const HEVCRpiSPS * const sps)
-+{
-+ int pic_area_in_ctbs;
-+ int i, j, x, y, ctb_addr_rs, tile_id;
-+
-+ // Inferred parameters
-+
-+ // qp_y -> qp_u/qp_v tables
-+ // The tables have at least -24,+24 overrun after adding offset here
-+ // which should allow for clipless offseting
-+
-+ pps->qp_dblk_x[0] = qp_c_dblk_0 + QP_DBLK_OFFSET_0; // No offset for luma, but may be useful for general code
-+ pps->qp_bd_x[0] = qp_c_bd_0[sps->bit_depth - 8] + QP_OFFSET_0;
-+
-+ if (sps->chroma_format_idc == 1) {
-+ pps->qp_dblk_x[1] = qp_c_dblk_1 + pps->cb_qp_offset + QP_DBLK_OFFSET_0;
-+ pps->qp_bd_x[1] = qp_c_bd_1[sps->bit_depth - 8] + pps->cb_qp_offset + QP_OFFSET_0;
-+ pps->qp_dblk_x[2] = qp_c_dblk_1 + pps->cr_qp_offset + QP_DBLK_OFFSET_0;
-+ pps->qp_bd_x[2] = qp_c_bd_1[sps->bit_depth - 8] + pps->cr_qp_offset + QP_OFFSET_0;
-+ }
-+ else
-+ {
-+ pps->qp_dblk_x[1] = qp_c_dblk_0 + pps->cb_qp_offset + QP_DBLK_OFFSET_0;
-+ pps->qp_bd_x[1] = qp_c_bd_0[sps->bit_depth - 8] + pps->cb_qp_offset + QP_OFFSET_0;
-+ pps->qp_dblk_x[2] = qp_c_dblk_0 + pps->cr_qp_offset + QP_DBLK_OFFSET_0;
-+ pps->qp_bd_x[2] = qp_c_bd_0[sps->bit_depth - 8] + pps->cr_qp_offset + QP_OFFSET_0;
-+ }
-+
-+ pps->col_bd = av_malloc_array(pps->num_tile_columns + 1, sizeof(*pps->col_bd));
-+ pps->row_bd = av_malloc_array(pps->num_tile_rows + 1, sizeof(*pps->row_bd));
-+ pps->col_idxX = av_malloc_array(sps->ctb_width, sizeof(*pps->col_idxX));
-+ if (!pps->col_bd || !pps->row_bd || !pps->col_idxX)
-+ return AVERROR(ENOMEM);
-+
-+ if (pps->uniform_spacing_flag) {
-+ if (!pps->column_width) {
-+ pps->column_width = av_malloc_array(pps->num_tile_columns, sizeof(*pps->column_width));
-+ pps->row_height = av_malloc_array(pps->num_tile_rows, sizeof(*pps->row_height));
-+ }
-+ if (!pps->column_width || !pps->row_height)
-+ return AVERROR(ENOMEM);
-+
-+ for (i = 0; i < pps->num_tile_columns; i++) {
-+ pps->column_width[i] = ((i + 1) * sps->ctb_width) / pps->num_tile_columns -
-+ (i * sps->ctb_width) / pps->num_tile_columns;
-+ }
-+
-+ for (i = 0; i < pps->num_tile_rows; i++) {
-+ pps->row_height[i] = ((i + 1) * sps->ctb_height) / pps->num_tile_rows -
-+ (i * sps->ctb_height) / pps->num_tile_rows;
-+ }
-+ }
-+
-+ {
-+ const unsigned int td_mask = 63 >> (sps->log2_ctb_size + sps->pixel_shift);
-+ pps->col_bd[0] = 0;
-+ pps->tile_wpp_inter_disable = 0;
-+ for (i = 0; i < pps->num_tile_columns; i++)
-+ {
-+ pps->col_bd[i + 1] = pps->col_bd[i] + pps->column_width[i];
-+
-+ // Avoid trying tile parallel if the columns don't fall on cache boundries
-+ // (this causes too much pain syncing flushes with the QPU)
-+ // Ignore the final (RHS of pic) tile boundry
-+ if ((pps->col_bd[i] & td_mask) != 0) {
-+ pps->tile_wpp_inter_disable = 1;
-+ }
-+ }
-+
-+ // If we can start the next row before finishing the first line of
-+ // this one then we must wait at the end of the tile
-+ // * if this happens a lot then there are better but more complicated
-+ // conditions that we could apply
-+ if (pps->tile_wpp_inter_disable) {
-+ for (i = 0; i < pps->num_tile_rows; i++)
-+ {
-+ if (pps->row_height[i] <= RPI_MAX_JOBS) {
-+ pps->tile_wpp_inter_disable = 2;
-+ break;
-+ }
-+ }
-+ }
-+ }
-+
-+ pps->row_bd[0] = 0;
-+ for (i = 0; i < pps->num_tile_rows; i++)
-+ pps->row_bd[i + 1] = pps->row_bd[i] + pps->row_height[i];
-+
-+ for (i = 0, j = 0; i < sps->ctb_width; i++) {
-+ if (i >= pps->col_bd[j + 1])
-+ j++;
-+ pps->col_idxX[i] = j;
-+ }
-+
-+ /**
-+ * 6.5
-+ */
-+ pic_area_in_ctbs = sps->ctb_size;
-+
-+ pps->ctb_addr_rs_to_ts = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->ctb_addr_rs_to_ts));
-+ pps->ctb_addr_ts_to_rs = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->ctb_addr_ts_to_rs));
-+ pps->tile_id = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->tile_id));
-+ pps->tile_size = av_malloc_array(pps->num_tile_columns * pps->num_tile_rows, sizeof(*pps->tile_size));
-+ pps->tile_pos_ts = av_malloc_array(pps->num_tile_columns * pps->num_tile_rows, sizeof(*pps->tile_pos_ts));
-+ pps->ctb_ts_flags = av_malloc_array(pic_area_in_ctbs, sizeof(*pps->ctb_ts_flags));
-+ if (!pps->ctb_addr_rs_to_ts || !pps->ctb_addr_ts_to_rs ||
-+ !pps->tile_id || pps->tile_pos_ts == NULL || pps->tile_size == NULL) {
-+ return AVERROR(ENOMEM);
-+ }
-+
-+ memset(pps->ctb_ts_flags, 0, pic_area_in_ctbs * sizeof(*pps->ctb_ts_flags));
-+
-+ for (ctb_addr_rs = 0; ctb_addr_rs < pic_area_in_ctbs; ctb_addr_rs++) {
-+ int tb_x = ctb_addr_rs % sps->ctb_width;
-+ int tb_y = ctb_addr_rs / sps->ctb_width;
-+ int tile_x = 0;
-+ int tile_y = 0;
-+ int val = 0;
-+
-+ for (i = 0; i < pps->num_tile_columns; i++) {
-+ if (tb_x < pps->col_bd[i + 1]) {
-+ tile_x = i;
-+ break;
-+ }
-+ }
-+
-+ for (i = 0; i < pps->num_tile_rows; i++) {
-+ if (tb_y < pps->row_bd[i + 1]) {
-+ tile_y = i;
-+ break;
-+ }
-+ }
-+
-+ for (i = 0; i < tile_x; i++)
-+ val += pps->row_height[tile_y] * pps->column_width[i];
-+ for (i = 0; i < tile_y; i++)
-+ val += sps->ctb_width * pps->row_height[i];
-+
-+ val += (tb_y - pps->row_bd[tile_y]) * pps->column_width[tile_x] +
-+ tb_x - pps->col_bd[tile_x];
-+
-+ pps->ctb_addr_rs_to_ts[ctb_addr_rs] = val;
-+ pps->ctb_addr_ts_to_rs[val] = ctb_addr_rs;
-+ }
-+
-+ {
-+ uint8_t * pflags = pps->ctb_ts_flags;
-+ uint16_t * ptid = pps->tile_id;
-+
-+ for (j = 0, tile_id = 0; j < pps->num_tile_rows; j++)
-+ {
-+ for (i = 0; i < pps->num_tile_columns; i++, tile_id++)
-+ {
-+ const unsigned int tile_w = pps->column_width[i];
-+
-+ pflags[0] |= CTB_TS_FLAGS_CIREQ;
-+
-+ for (x = 0; x != tile_w; ++x) {
-+ pflags[x] |= CTB_TS_FLAGS_TOT;
-+ }
-+
-+ for (y = pps->row_bd[j]; y < pps->row_bd[j + 1]; y++)
-+ {
-+ pflags[0] |= CTB_TS_FLAGS_SOTL;
-+
-+ if (pps->entropy_coding_sync_enabled_flag)
-+ {
-+ if (pps->column_width[i] != 1)
-+ pflags[1] |= CTB_TS_FLAGS_CSAVE;
-+ else
-+ pflags[0] |= CTB_TS_FLAGS_CIREQ;
-+
-+ if ((pflags[0] & CTB_TS_FLAGS_CIREQ) == 0)
-+ pflags[0] |= CTB_TS_FLAGS_CLOAD;
-+ }
-+
-+ for (x = 0; x != tile_w; ++x)
-+ *ptid++ = tile_id;
-+
-+ pflags += tile_w;
-+ pflags[-1] |= CTB_TS_FLAGS_EOTL;
-+ if (i + 1 == pps->num_tile_columns)
-+ pflags[-1] |= CTB_TS_FLAGS_EOL;
-+ }
-+
-+ pflags[-1] |= CTB_TS_FLAGS_EOT;
-+ }
-+ }
-+ }
-+
-+ {
-+ unsigned int ts = 0;
-+ for (j = 0; j < pps->num_tile_rows; j++)
-+ for (i = 0; i < pps->num_tile_columns; i++)
-+ {
-+ const unsigned int size = pps->column_width[i] * pps->row_height[j];
-+ pps->tile_size[j * pps->num_tile_columns + i] = size;
-+ pps->tile_pos_ts[j * pps->num_tile_columns + i] = ts;
-+ ts += size;
-+ }
-+ }
-+
-+ return 0;
-+}
-+
-+/**
-+ * Parse a PPS NAL unit and install it in the parameter-set store.
-+ *
-+ * A fresh HEVCRpiPPS is allocated and wrapped in an AVBufferRef, the raw NAL
-+ * payload is copied into pps->data (truncated to its capacity), the syntax
-+ * elements are read and range-checked against the referenced SPS, and the
-+ * derived tables are built by setup_pps(). Only when all of that succeeds is
-+ * any previous PPS with the same id removed and the new one stored in
-+ * ps->pps_list[pps_id].
-+ *
-+ * @param gb    bit reader positioned at the start of the PPS payload
-+ * @param avctx codec context, used for logging
-+ * @param ps    parameter-set store; the SPS named by the PPS must already exist
-+ * @return 0 on success, a negative AVERROR code on failure (the partially
-+ *         built PPS is released)
-+ */
-+int ff_hevc_rpi_decode_nal_pps(GetBitContext * const gb, AVCodecContext * const avctx,
-+                               HEVCRpiParamSets * const ps)
-+{
-+    const HEVCRpiSPS *sps = NULL;
-+    int i, ret = 0;
-+    unsigned int pps_id = 0;
-+    ptrdiff_t nal_size;
-+    unsigned log2_parallel_merge_level_minus2;
-+
-+    AVBufferRef *pps_buf;
-+    HEVCRpiPPS *pps = av_mallocz(sizeof(*pps));
-+
-+    if (!pps)
-+        return AVERROR(ENOMEM);
-+
-+    // From here on the PPS is owned by pps_buf; hevc_pps_free releases it
-+    pps_buf = av_buffer_create((uint8_t *)pps, sizeof(*pps),
-+                               hevc_pps_free, NULL, 0);
-+    if (!pps_buf) {
-+        av_freep(&pps);
-+        return AVERROR(ENOMEM);
-+    }
-+
-+    av_log(avctx, AV_LOG_DEBUG, "Decoding PPS\n");
-+
-+    // Keep a copy of the raw NAL payload, clipped to the storage available
-+    nal_size = gb->buffer_end - gb->buffer;
-+    if (nal_size > sizeof(pps->data)) {
-+        av_log(avctx, AV_LOG_WARNING, "Truncating likely oversized PPS "
-+               "(%"PTRDIFF_SPECIFIER" > %"SIZE_SPECIFIER")\n",
-+               nal_size, sizeof(pps->data));
-+        pps->data_size = sizeof(pps->data);
-+    } else {
-+        pps->data_size = nal_size;
-+    }
-+    memcpy(pps->data, gb->buffer, pps->data_size);
-+
-+    // Default values
-+    pps->loop_filter_across_tiles_enabled_flag = 1;
-+    pps->num_tile_columns                      = 1;
-+    pps->num_tile_rows                         = 1;
-+    pps->uniform_spacing_flag                  = 1;
-+    pps->disable_dbf                           = 0;
-+    pps->beta_offset                           = 0;
-+    pps->tc_offset                             = 0;
-+    pps->log2_max_transform_skip_block_size    = 2;
-+
-+    // Coded parameters
-+    pps_id = get_ue_golomb_long(gb);
-+    if (pps_id >= HEVC_MAX_PPS_COUNT) {
-+        // pps_id is unsigned: %u keeps large out-of-range ids readable
-+        av_log(avctx, AV_LOG_ERROR, "PPS id out of range: %u\n", pps_id);
-+        ret = AVERROR_INVALIDDATA;
-+        goto err;
-+    }
-+    pps->sps_id = get_ue_golomb_long(gb);
-+    if (pps->sps_id >= HEVC_MAX_SPS_COUNT) {
-+        av_log(avctx, AV_LOG_ERROR, "SPS id out of range: %u\n", pps->sps_id);
-+        ret = AVERROR_INVALIDDATA;
-+        goto err;
-+    }
-+    if (!ps->sps_list[pps->sps_id]) {
-+        av_log(avctx, AV_LOG_ERROR, "SPS %u does not exist.\n", pps->sps_id);
-+        ret = AVERROR_INVALIDDATA;
-+        goto err;
-+    }
-+    sps = (HEVCRpiSPS *)ps->sps_list[pps->sps_id]->data;
-+
-+    pps->dependent_slice_segments_enabled_flag = get_bits1(gb);
-+    pps->output_flag_present_flag              = get_bits1(gb);
-+    pps->num_extra_slice_header_bits           = get_bits(gb, 3);
-+
-+    pps->sign_data_hiding_flag = get_bits1(gb);
-+
-+    pps->cabac_init_present_flag = get_bits1(gb);
-+
-+    pps->num_ref_idx_l0_default_active = get_ue_golomb_long(gb) + 1;
-+    if (pps->num_ref_idx_l0_default_active < 1 || pps->num_ref_idx_l0_default_active > 15) {
-+        av_log(avctx, AV_LOG_ERROR, "pps->num_ref_idx_l0_default_active invalid\n");
-+        ret = AVERROR_INVALIDDATA;
-+        goto err;
-+    }
-+    pps->num_ref_idx_l1_default_active = get_ue_golomb_long(gb) + 1;
-+    if (pps->num_ref_idx_l1_default_active < 1 || pps->num_ref_idx_l1_default_active > 15) {
-+        av_log(avctx, AV_LOG_ERROR, "pps->num_ref_idx_l1_default_active invalid\n");
-+        ret = AVERROR_INVALIDDATA;
-+        goto err;
-+    }
-+
-+    pps->pic_init_qp_minus26 = get_se_golomb(gb);
-+    if (pps->pic_init_qp_minus26 > 25 || pps->pic_init_qp_minus26 < -(26 + sps->qp_bd_offset)) {
-+        av_log(avctx, AV_LOG_ERROR,
-+               "init_qp_minus26 %d is outside the valid range "
-+               "[%d, %d].\n",
-+               pps->pic_init_qp_minus26,
-+               -(26 + sps->qp_bd_offset), 25);
-+        ret = AVERROR_INVALIDDATA;
-+        goto err;
-+    }
-+
-+    pps->constrained_intra_pred_flag = get_bits1(gb);
-+    pps->transform_skip_enabled_flag = get_bits1(gb);
-+
-+    pps->cu_qp_delta_enabled_flag = get_bits1(gb);
-+    // Without cu_qp_delta the QP-delta granularity is the whole CTB
-+    pps->log2_min_cu_qp_delta_size = sps->log2_ctb_size;
-+    if (pps->cu_qp_delta_enabled_flag)
-+    {
-+        const unsigned int diff_cu_qp_delta_depth = get_ue_golomb_long(gb);
-+
-+        if (diff_cu_qp_delta_depth > sps->log2_diff_max_min_coding_block_size) {
-+            av_log(avctx, AV_LOG_ERROR, "diff_cu_qp_delta_depth %u is invalid\n",
-+                   diff_cu_qp_delta_depth);
-+            ret = AVERROR_INVALIDDATA;
-+            goto err;
-+        }
-+
-+        pps->log2_min_cu_qp_delta_size = sps->log2_ctb_size - diff_cu_qp_delta_depth;
-+    }
-+
-+    pps->cb_qp_offset = get_se_golomb(gb);
-+    if (pps->cb_qp_offset < -12 || pps->cb_qp_offset > 12) {
-+        av_log(avctx, AV_LOG_ERROR, "pps_cb_qp_offset out of range: %d\n",
-+               pps->cb_qp_offset);
-+        ret = AVERROR_INVALIDDATA;
-+        goto err;
-+    }
-+    pps->cr_qp_offset = get_se_golomb(gb);
-+    if (pps->cr_qp_offset < -12 || pps->cr_qp_offset > 12) {
-+        av_log(avctx, AV_LOG_ERROR, "pps_cr_qp_offset out of range: %d\n",
-+               pps->cr_qp_offset);
-+        ret = AVERROR_INVALIDDATA;
-+        goto err;
-+    }
-+    pps->pic_slice_level_chroma_qp_offsets_present_flag = get_bits1(gb);
-+
-+    pps->weighted_pred_flag   = get_bits1(gb);
-+    pps->weighted_bipred_flag = get_bits1(gb);
-+
-+    pps->transquant_bypass_enable_flag    = get_bits1(gb);
-+    pps->tiles_enabled_flag               = get_bits1(gb);
-+    pps->entropy_coding_sync_enabled_flag = get_bits1(gb);
-+
-+    if (pps->tiles_enabled_flag) {
-+        pps->num_tile_columns = get_ue_golomb_long(gb) + 1;
-+        pps->num_tile_rows    = get_ue_golomb_long(gb) + 1;
-+        if (pps->num_tile_columns <= 0 ||
-+            pps->num_tile_columns >= sps->width) {
-+            av_log(avctx, AV_LOG_ERROR, "num_tile_columns_minus1 out of range: %d\n",
-+                   pps->num_tile_columns - 1);
-+            ret = AVERROR_INVALIDDATA;
-+            goto err;
-+        }
-+        if (pps->num_tile_rows <= 0 ||
-+            pps->num_tile_rows >= sps->height) {
-+            av_log(avctx, AV_LOG_ERROR, "num_tile_rows_minus1 out of range: %d\n",
-+                   pps->num_tile_rows - 1);
-+            ret = AVERROR_INVALIDDATA;
-+            goto err;
-+        }
-+
-+        pps->column_width = av_malloc_array(pps->num_tile_columns, sizeof(*pps->column_width));
-+        pps->row_height   = av_malloc_array(pps->num_tile_rows,    sizeof(*pps->row_height));
-+        if (!pps->column_width || !pps->row_height) {
-+            ret = AVERROR(ENOMEM);
-+            goto err;
-+        }
-+
-+        pps->uniform_spacing_flag = get_bits1(gb);
-+        if (!pps->uniform_spacing_flag) {
-+            uint64_t sum = 0;
-+            for (i = 0; i < pps->num_tile_columns - 1; i++) {
-+                const unsigned int w = get_ue_golomb_long(gb) + 1;
-+
-+                // column_width[] is uint16_t: check the full-width value
-+                // before storing it, otherwise an oversized coded width
-+                // could truncate (even to 0) and slip past the sum check.
-+                if (w == 0 || w > UINT16_MAX || w >= (unsigned int)sps->ctb_width) {
-+                    av_log(avctx, AV_LOG_ERROR, "Invalid tile widths.\n");
-+                    ret = AVERROR_INVALIDDATA;
-+                    goto err;
-+                }
-+                pps->column_width[i] = w;
-+                sum                 += w;
-+            }
-+            if (sum >= sps->ctb_width) {
-+                av_log(avctx, AV_LOG_ERROR, "Invalid tile widths.\n");
-+                ret = AVERROR_INVALIDDATA;
-+                goto err;
-+            }
-+            // The last column takes whatever width is left over
-+            pps->column_width[pps->num_tile_columns - 1] = sps->ctb_width - sum;
-+
-+            sum = 0;
-+            for (i = 0; i < pps->num_tile_rows - 1; i++) {
-+                const unsigned int h = get_ue_golomb_long(gb) + 1;
-+
-+                // Same truncation hazard as for the column widths above
-+                if (h == 0 || h > UINT16_MAX || h >= (unsigned int)sps->ctb_height) {
-+                    av_log(avctx, AV_LOG_ERROR, "Invalid tile heights.\n");
-+                    ret = AVERROR_INVALIDDATA;
-+                    goto err;
-+                }
-+                pps->row_height[i] = h;
-+                sum               += h;
-+            }
-+            if (sum >= sps->ctb_height) {
-+                av_log(avctx, AV_LOG_ERROR, "Invalid tile heights.\n");
-+                ret = AVERROR_INVALIDDATA;
-+                goto err;
-+            }
-+            // The last row takes whatever height is left over
-+            pps->row_height[pps->num_tile_rows - 1] = sps->ctb_height - sum;
-+        }
-+        pps->loop_filter_across_tiles_enabled_flag = get_bits1(gb);
-+    }
-+
-+    pps->seq_loop_filter_across_slices_enabled_flag = get_bits1(gb);
-+
-+    pps->deblocking_filter_control_present_flag = get_bits1(gb);
-+    if (pps->deblocking_filter_control_present_flag) {
-+        pps->deblocking_filter_override_enabled_flag = get_bits1(gb);
-+        pps->disable_dbf                             = get_bits1(gb);
-+        if (!pps->disable_dbf) {
-+            int beta_offset_div2 = get_se_golomb(gb);
-+            int tc_offset_div2   = get_se_golomb(gb);
-+            if (beta_offset_div2 < -6 || beta_offset_div2 > 6) {
-+                av_log(avctx, AV_LOG_ERROR, "pps_beta_offset_div2 out of range: %d\n",
-+                       beta_offset_div2);
-+                ret = AVERROR_INVALIDDATA;
-+                goto err;
-+            }
-+            if (tc_offset_div2 < -6 || tc_offset_div2 > 6) {
-+                av_log(avctx, AV_LOG_ERROR, "pps_tc_offset_div2 out of range: %d\n",
-+                       tc_offset_div2);
-+                ret = AVERROR_INVALIDDATA;
-+                goto err;
-+            }
-+            pps->beta_offset = 2 * beta_offset_div2;
-+            pps->tc_offset   = 2 * tc_offset_div2;
-+        }
-+    }
-+
-+    pps->scaling_list_data_present_flag = get_bits1(gb);
-+    if (pps->scaling_list_data_present_flag) {
-+        set_default_scaling_list_data(&pps->scaling_list);
-+        ret = scaling_list_data(gb, avctx, &pps->scaling_list, sps);
-+        if (ret < 0)
-+            goto err;
-+    }
-+    pps->lists_modification_present_flag = get_bits1(gb);
-+    log2_parallel_merge_level_minus2     = get_ue_golomb_long(gb);
-+    if (log2_parallel_merge_level_minus2 > sps->log2_ctb_size) {
-+        av_log(avctx, AV_LOG_ERROR, "log2_parallel_merge_level_minus2 out of range: %u\n",
-+               log2_parallel_merge_level_minus2);
-+        ret = AVERROR_INVALIDDATA;
-+        goto err;
-+    }
-+    pps->log2_parallel_merge_level = log2_parallel_merge_level_minus2 + 2;
-+
-+    pps->slice_header_extension_present_flag = get_bits1(gb);
-+
-+    if (get_bits1(gb)) { // pps_extension_present_flag
-+        int pps_range_extensions_flag = get_bits1(gb);
-+        /* int pps_extension_7bits = */ get_bits(gb, 7);
-+        if (sps->ptl.general_ptl.profile_idc == FF_PROFILE_HEVC_REXT && pps_range_extensions_flag) {
-+            if ((ret = pps_range_extensions(gb, avctx, pps, sps)) < 0)
-+                goto err;
-+        }
-+    }
-+
-+    // Build the derived tile / CTB address tables
-+    ret = setup_pps(avctx, pps, sps);
-+    if (ret < 0)
-+        goto err;
-+
-+    if (get_bits_left(gb) < 0) {
-+        av_log(avctx, AV_LOG_ERROR,
-+               "Overread PPS by %d bits\n", -get_bits_left(gb));
-+        ret = AVERROR_INVALIDDATA;
-+        goto err;
-+    }
-+
-+    // Replace any previous PPS with this id only once parsing has succeeded
-+    remove_pps(ps, pps_id);
-+    ps->pps_list[pps_id] = pps_buf;
-+
-+    return 0;
-+
-+err:
-+    av_buffer_unref(&pps_buf);
-+    return ret;
-+}
-+
-+/**
-+ * Derive the full picture order count from its coded LSBs.
-+ *
-+ * @param sps           active SPS; supplies log2_max_poc_lsb
-+ * @param pocTid0       reference POC used to reconstruct the MSB part
-+ * @param poc_lsb       POC LSBs of the current picture
-+ * @param nal_unit_type NAL type; for BLA pictures the MSB part is zero
-+ * @return POC MSB + poc_lsb
-+ */
-+int ff_hevc_rpi_compute_poc(const HEVCRpiSPS *sps, int pocTid0, int poc_lsb, int nal_unit_type)
-+{
-+    const int lsb_range  = 1 << sps->log2_max_poc_lsb;
-+    const int half_range = lsb_range / 2;
-+    const int ref_lsb    = pocTid0 % lsb_range;
-+    int msb              = pocTid0 - ref_lsb;
-+
-+    // For BLA picture types, POCmsb is set to 0.
-+    switch (nal_unit_type) {
-+    case HEVC_NAL_BLA_W_LP:
-+    case HEVC_NAL_BLA_W_RADL:
-+    case HEVC_NAL_BLA_N_LP:
-+        return poc_lsb;
-+    default:
-+        break;
-+    }
-+
-+    if (poc_lsb < ref_lsb) {
-+        // LSBs wrapped forwards past the top of their range
-+        if (ref_lsb - poc_lsb >= half_range)
-+            msb += lsb_range;
-+    } else if (poc_lsb > ref_lsb) {
-+        // LSBs wrapped backwards past zero
-+        if (poc_lsb - ref_lsb > half_range)
-+            msb -= lsb_range;
-+    }
-+
-+    return msb + poc_lsb;
-+}
-diff --git a/libavcodec/rpi_hevc_ps.h b/libavcodec/rpi_hevc_ps.h
-new file mode 100644
-index 0000000000..712464a075
---- /dev/null
-+++ b/libavcodec/rpi_hevc_ps.h
-@@ -0,0 +1,447 @@
-+/*
-+ * HEVC parameter set parsing
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#ifndef AVCODEC_RPI_HEVC_PS_H
-+#define AVCODEC_RPI_HEVC_PS_H
-+
-+#include <stdint.h>
-+
-+#include "libavutil/buffer.h"
-+#include "libavutil/pixfmt.h"
-+#include "libavutil/rational.h"
-+
-+#include "avcodec.h"
-+#include "get_bits.h"
-+#include "hevc.h"
-+
-+typedef struct ShortTermRPS { // the type ff_hevc_rpi_decode_short_term_rps() takes as its rps argument
-+ unsigned int num_negative_pics;
-+ int num_delta_pocs; // presumably the number of valid entries in delta_poc[]/used[] - TODO confirm
-+ int rps_idx_num_delta_pocs;
-+ int32_t delta_poc[32];
-+ uint8_t used[32]; // presumably one flag per delta_poc[] entry - TODO confirm
-+} ShortTermRPS;
-+
-+typedef struct LongTermRPS { // embedded in RpiSliceHeader as long_term_rps
-+ int poc[32];
-+ uint8_t used[32];
-+ uint8_t nb_refs; // presumably the number of valid entries in poc[]/used[] - TODO confirm
-+} LongTermRPS;
-+
-+typedef struct RpiSliceHeader { // slice-header state; rpi_hevc_refs.c reads it through s->sh
-+ unsigned int pps_id;
-+
-+ /// address (in raster order) of the first block in the current slice segment
-+ unsigned int slice_segment_addr;
-+ /// address (in raster order) of the first block in the current slice
-+ unsigned int slice_addr;
-+
-+ enum HEVCSliceType slice_type;
-+
-+ int pic_order_cnt_lsb;
-+
-+ uint8_t first_slice_in_pic_flag;
-+ uint8_t dependent_slice_segment_flag;
-+ uint8_t pic_output_flag; // when set, ff_hevc_rpi_set_new_ref() marks the new frame HEVC_FRAME_FLAG_OUTPUT
-+ uint8_t colour_plane_id;
-+
-+ /// RPS coded in the slice header itself is stored here (presumably in slice_rps below)
-+ int short_term_ref_pic_set_sps_flag;
-+ int short_term_ref_pic_set_size;
-+ ShortTermRPS slice_rps;
-+ const ShortTermRPS *short_term_rps;
-+ int long_term_ref_pic_set_size;
-+ LongTermRPS long_term_rps;
-+ unsigned int list_entry_lx[2][32]; // per-list reorder indices, applied by ff_hevc_rpi_slice_rpl() when rpl_modification_flag[list] is set
-+
-+ uint8_t rpl_modification_flag[2];
-+ uint8_t no_output_of_prior_pics_flag; // checked by ff_hevc_rpi_output_frame() together with no_rasl_output_flag
-+ uint8_t slice_temporal_mvp_enabled_flag;
-+
-+ unsigned int nb_refs[2]; // number of entries ff_hevc_rpi_slice_rpl() builds for L0 / L1
-+
-+ uint8_t slice_sample_adaptive_offset_flag[3];
-+ uint8_t mvd_l1_zero_flag;
-+
-+ uint8_t cabac_init_flag;
-+ uint8_t disable_deblocking_filter_flag; ///< slice_header_disable_deblocking_filter_flag
-+ uint8_t slice_loop_filter_across_slices_enabled_flag;
-+ uint8_t collocated_list; // which list (0/1) collocated_ref_idx indexes, see ff_hevc_rpi_slice_rpl()
-+
-+ uint8_t no_dblk_boundary_flags;
-+
-+ unsigned int collocated_ref_idx;
-+
-+ int slice_qp_delta;
-+ int slice_cb_qp_offset; // -12, +12
-+ int slice_cr_qp_offset; // -12, +12
-+
-+ uint8_t cu_chroma_qp_offset_enabled_flag;
-+
-+ int beta_offset; ///< beta_offset_div2 * 2
-+ int tc_offset; ///< tc_offset_div2 * 2
-+
-+ unsigned int max_num_merge_cand; ///< 5 - 5_minus_max_num_merge_cand
-+
-+ unsigned *entry_point_offset;
-+ int * offset;
-+ int * size;
-+ int num_entry_point_offsets;
-+ int offsets_allocated;
-+
-+ uint8_t offload_wpp;
-+ uint8_t offload_tiles;
-+
-+ int8_t slice_qp;
-+
-+ uint8_t luma_log2_weight_denom;
-+ uint8_t chroma_log2_weight_denom;
-+
-+ int16_t luma_weight_l0[16]; // -128, +255
-+ int16_t luma_offset_l0[16];
-+ int16_t chroma_weight_l0[16][2];
-+ int16_t chroma_offset_l0[16][2];
-+
-+ int16_t luma_weight_l1[16];
-+ int16_t luma_offset_l1[16];
-+ int16_t chroma_weight_l1[16][2];
-+ int16_t chroma_offset_l1[16][2];
-+
-+} RpiSliceHeader;
-+
-+typedef struct HEVCRpiWindow { // four edge offsets; the SPS output_window is copied to AVFrame crop_* by ff_hevc_rpi_set_new_ref()
-+ uint16_t left_offset;
-+ uint16_t right_offset;
-+ uint16_t top_offset;
-+ uint16_t bottom_offset;
-+} HEVCRpiWindow;
-+
-+typedef struct VUI { // embedded in HEVCRpiSPS as vui; presumably filled by the SPS parser - TODO confirm
-+ AVRational sar;
-+
-+ int overscan_info_present_flag;
-+ int overscan_appropriate_flag;
-+
-+ int video_signal_type_present_flag;
-+ int video_format;
-+ int video_full_range_flag;
-+ int colour_description_present_flag;
-+ uint8_t colour_primaries;
-+ uint8_t transfer_characteristic;
-+ uint8_t matrix_coeffs;
-+
-+ int chroma_loc_info_present_flag;
-+ int chroma_sample_loc_type_top_field;
-+ int chroma_sample_loc_type_bottom_field;
-+ int neutra_chroma_indication_flag; // (sic) spelling is part of the field name; presumably "neutral" - TODO confirm
-+
-+ int field_seq_flag;
-+ int frame_field_info_present_flag;
-+
-+ int default_display_window_flag;
-+ HEVCRpiWindow def_disp_win;
-+
-+ int vui_timing_info_present_flag;
-+ uint32_t vui_num_units_in_tick;
-+ uint32_t vui_time_scale;
-+ int vui_poc_proportional_to_timing_flag;
-+ int vui_num_ticks_poc_diff_one_minus1;
-+ int vui_hrd_parameters_present_flag;
-+
-+ int bitstream_restriction_flag;
-+ int tiles_fixed_structure_flag;
-+ int motion_vectors_over_pic_boundaries_flag;
-+ int restricted_ref_pic_lists_flag;
-+ int min_spatial_segmentation_idc;
-+ int max_bytes_per_pic_denom;
-+ int max_bits_per_min_cu_denom;
-+ int log2_max_mv_length_horizontal;
-+ int log2_max_mv_length_vertical;
-+} VUI;
-+
-+typedef struct PTLCommon { // one profile/tier/level record; used for both the general and the per-sub-layer entries of PTL
-+ uint8_t profile_space;
-+ uint8_t tier_flag;
-+ uint8_t profile_idc; // the PPS parser compares general_ptl.profile_idc with FF_PROFILE_HEVC_REXT
-+ uint8_t profile_compatibility_flag[32];
-+ uint8_t level_idc;
-+ uint8_t progressive_source_flag;
-+ uint8_t interlaced_source_flag;
-+ uint8_t non_packed_constraint_flag;
-+ uint8_t frame_only_constraint_flag;
-+} PTLCommon;
-+
-+typedef struct PTL { // profile/tier/level data; stored in both HEVCRpiVPS and HEVCRpiSPS as ptl
-+ PTLCommon general_ptl;
-+ PTLCommon sub_layer_ptl[HEVC_MAX_SUB_LAYERS];
-+
-+ uint8_t sub_layer_profile_present_flag[HEVC_MAX_SUB_LAYERS];
-+ uint8_t sub_layer_level_present_flag[HEVC_MAX_SUB_LAYERS];
-+} PTL;
-+
-+typedef struct HEVCRpiVPS { // video parameter set; referenced from HEVCRpiParamSets.vps / vps_list
-+ uint8_t vps_temporal_id_nesting_flag;
-+ int vps_max_layers;
-+ int vps_max_sub_layers; ///< vps_max_temporal_layers_minus1 + 1
-+
-+ PTL ptl;
-+ int vps_sub_layer_ordering_info_present_flag;
-+ unsigned int vps_max_dec_pic_buffering[HEVC_MAX_SUB_LAYERS];
-+ unsigned int vps_num_reorder_pics[HEVC_MAX_SUB_LAYERS];
-+ unsigned int vps_max_latency_increase[HEVC_MAX_SUB_LAYERS];
-+ int vps_max_layer_id;
-+ int vps_num_layer_sets; ///< vps_num_layer_sets_minus1 + 1
-+ uint8_t vps_timing_info_present_flag;
-+ uint32_t vps_num_units_in_tick;
-+ uint32_t vps_time_scale;
-+ uint8_t vps_poc_proportional_to_timing_flag;
-+ int vps_num_ticks_poc_diff_one; ///< vps_num_ticks_poc_diff_one_minus1 + 1
-+ int vps_num_hrd_parameters;
-+
-+ uint8_t data[4096]; // presumably a copy of the raw NAL payload, as for HEVCRpiPPS.data - TODO confirm
-+ int data_size; // presumably the number of valid bytes in data[] - TODO confirm
-+} HEVCRpiVPS;
-+
-+typedef struct ScalingList { // stored in both HEVCRpiSPS and HEVCRpiPPS as scaling_list
-+ /* This is a little wasteful, since sizeID 0 only needs 8 coeffs,
-+ * and size ID 3 only has 2 arrays, not 6. */
-+ uint8_t sl[4][6][64]; // presumably indexed [sizeID][matrix][coefficient], going by the note above - TODO confirm
-+ uint8_t sl_dc[2][6];
-+} ScalingList;
-+
-+typedef struct HEVCRpiSPS { // sequence parameter set; sps_list[] buffers are cast to this type by the PPS parser
-+ unsigned vps_id;
-+ uint8_t chroma_format_idc;
-+ uint8_t separate_colour_plane_flag;
-+
-+ HEVCRpiWindow output_window; // copied into AVFrame crop_left/right/top/bottom by ff_hevc_rpi_set_new_ref()
-+
-+ HEVCRpiWindow pic_conf_win;
-+
-+ uint16_t wp_offset_half_range; // WpOffsetHalfRange
-+
-+ uint8_t bit_depth;
-+
-+// int bit_depth_chroma; // We only support lum_bit_depth = chroma_bit_depth
-+ uint8_t pixel_shift;
-+ enum AVPixelFormat pix_fmt;
-+
-+ unsigned int log2_max_poc_lsb; // ff_hevc_rpi_compute_poc() uses 1 << log2_max_poc_lsb as the POC LSB range
-+
-+ int max_sub_layers;
-+ struct {
-+ int max_dec_pic_buffering; // compared with the DPB fill level in ff_hevc_rpi_bump_frame()
-+ int num_reorder_pics; // output is held back until more than this many frames are pending, see ff_hevc_rpi_output_frame()
-+ int max_latency_increase;
-+ } temporal_layer[HEVC_MAX_SUB_LAYERS];
-+ uint8_t temporal_id_nesting_flag;
-+
-+ uint8_t scaling_list_enable_flag;
-+ ScalingList scaling_list;
-+
-+ unsigned int nb_st_rps;
-+ ShortTermRPS st_rps[HEVC_MAX_SHORT_TERM_REF_PIC_SETS];
-+
-+ uint8_t amp_enabled_flag;
-+ uint8_t sao_enabled;
-+
-+ uint8_t long_term_ref_pics_present_flag;
-+ uint16_t lt_ref_pic_poc_lsb_sps[HEVC_MAX_LONG_TERM_REF_PICS];
-+ uint8_t used_by_curr_pic_lt_sps_flag[HEVC_MAX_LONG_TERM_REF_PICS];
-+ uint8_t num_long_term_ref_pics_sps;
-+
-+ struct {
-+ uint8_t bit_depth;
-+ uint8_t bit_depth_chroma;
-+ uint8_t log2_min_pcm_cb_size;
-+ uint8_t log2_max_pcm_cb_size;
-+ uint8_t loop_filter_disable_flag;
-+ } pcm;
-+ uint8_t sps_temporal_mvp_enabled_flag;
-+ uint8_t sps_strong_intra_smoothing_enable_flag;
-+
-+ unsigned int log2_min_cb_size; // 3..6
-+ unsigned int log2_diff_max_min_coding_block_size; // upper bound for the PPS diff_cu_qp_delta_depth
-+ unsigned int log2_min_tb_size; // 2..5
-+ unsigned int log2_max_trafo_size;
-+ unsigned int log2_ctb_size; // 4..6
-+// unsigned int log2_min_pu_size; // 2..5 (min_cb_size - 1)
-+#define LOG2_MIN_PU_SIZE 2
-+#define LOG2_MIN_CU_SIZE 3
-+
-+ int max_transform_hierarchy_depth_inter;
-+ int max_transform_hierarchy_depth_intra;
-+
-+ int transform_skip_rotation_enabled_flag;
-+ int transform_skip_context_enabled_flag;
-+ int implicit_rdpcm_enabled_flag;
-+ int explicit_rdpcm_enabled_flag;
-+ int intra_smoothing_disabled_flag;
-+ int high_precision_offsets_enabled_flag;
-+ int persistent_rice_adaptation_enabled_flag;
-+
-+ /// coded frame dimension in various units
-+ int width;
-+ int height;
-+ int ctb_width; // picture width in CTBs: explicit PPS tile column widths must sum to less than this
-+ int ctb_height; // picture height in CTBs: explicit PPS tile row heights must sum to less than this
-+ int ctb_size; // Pic size in CTBs not size of a CTB
-+ int min_cb_width;
-+ int min_cb_height;
-+ int min_tb_width;
-+ int min_tb_height;
-+ int min_pu_width;
-+ int min_pu_height;
-+ int pcm_width;
-+ int pcm_height;
-+ int tb_mask;
-+
-+ int hshift[3];
-+ int vshift[3];
-+
-+ int qp_bd_offset; // lower bound of the PPS pic_init_qp_minus26 is -(26 + qp_bd_offset)
-+
-+ uint8_t data[4096]; // presumably a copy of the raw NAL payload, as for HEVCRpiPPS.data - TODO confirm
-+ int data_size; // presumably the number of valid bytes in data[] - TODO confirm
-+
-+ VUI vui;
-+ PTL ptl;
-+} HEVCRpiSPS;
-+
-+#define CTB_TS_FLAGS_SOTL (1U << 0) // First CTB of a tile line (start of tile line)
-+#define CTB_TS_FLAGS_EOTL (1U << 1) // Last CTB of a tile line
-+#define CTB_TS_FLAGS_EOL (1U << 2) // Last CTB of a complete line
-+#define CTB_TS_FLAGS_EOT (1U << 3) // Last CTB of a tile
-+#define CTB_TS_FLAGS_CSAVE (1U << 4) // Set on the 2nd CTB of a tile line when entropy_coding_sync is on and the tile is wider than 1 CTB (presumably "save CABAC state" - TODO confirm)
-+#define CTB_TS_FLAGS_CIREQ (1U << 5) // Cabac init request
-+#define CTB_TS_FLAGS_TOT (1U << 6) // CTB on top row of a tile
-+#define CTB_TS_FLAGS_CLOAD (1U << 7) // Set on the 1st CTB of a tile line when entropy_coding_sync is on and CIREQ is not set (presumably "load saved CABAC state" - TODO confirm)
-+
-+typedef struct HEVCRpiPPS { // picture parameter set; allocated, filled and stored by ff_hevc_rpi_decode_nal_pps()
-+ unsigned int sps_id; ///< seq_parameter_set_id
-+
-+ uint8_t sign_data_hiding_flag;
-+
-+ uint8_t cabac_init_present_flag;
-+
-+ int num_ref_idx_l0_default_active; ///< num_ref_idx_l0_default_active_minus1 + 1
-+ int num_ref_idx_l1_default_active; ///< num_ref_idx_l1_default_active_minus1 + 1
-+ int pic_init_qp_minus26;
-+
-+ uint8_t constrained_intra_pred_flag;
-+ uint8_t transform_skip_enabled_flag;
-+
-+ uint8_t cu_qp_delta_enabled_flag;
-+ uint8_t log2_min_cu_qp_delta_size; // log2_ctb_size - diff_cu_qp_delta_depth; log2_ctb_size when cu_qp_delta is disabled
-+ int cb_qp_offset; // -12..12
-+ int cr_qp_offset; // -12..12
-+ const uint8_t * qp_dblk_x[3];
-+ const int8_t * qp_bd_x[3];
-+
-+ uint8_t pic_slice_level_chroma_qp_offsets_present_flag;
-+ uint8_t weighted_pred_flag;
-+ uint8_t weighted_bipred_flag;
-+ uint8_t output_flag_present_flag;
-+ uint8_t transquant_bypass_enable_flag;
-+
-+ uint8_t dependent_slice_segments_enabled_flag;
-+ uint8_t tiles_enabled_flag;
-+ uint8_t entropy_coding_sync_enabled_flag;
-+
-+ uint8_t tile_wpp_inter_disable;
-+ int num_tile_columns; ///< num_tile_columns_minus1 + 1
-+ int num_tile_rows; ///< num_tile_rows_minus1 + 1
-+ uint8_t uniform_spacing_flag;
-+ uint8_t loop_filter_across_tiles_enabled_flag;
-+
-+ uint8_t seq_loop_filter_across_slices_enabled_flag;
-+
-+ uint8_t deblocking_filter_control_present_flag;
-+ uint8_t deblocking_filter_override_enabled_flag;
-+ uint8_t disable_dbf;
-+ int beta_offset; ///< beta_offset_div2 * 2
-+ int tc_offset; ///< tc_offset_div2 * 2
-+
-+ uint8_t scaling_list_data_present_flag;
-+ ScalingList scaling_list;
-+
-+ uint8_t lists_modification_present_flag;
-+ int log2_parallel_merge_level; ///< log2_parallel_merge_level_minus2 + 2
-+ int num_extra_slice_header_bits;
-+ uint8_t slice_header_extension_present_flag;
-+ uint8_t log2_max_transform_skip_block_size;
-+ uint8_t cross_component_prediction_enabled_flag;
-+ uint8_t chroma_qp_offset_list_enabled_flag;
-+ uint8_t diff_cu_chroma_qp_offset_depth;
-+ uint8_t chroma_qp_offset_list_len_minus1;
-+ int8_t cb_qp_offset_list[6];
-+ int8_t cr_qp_offset_list[6];
-+ uint8_t log2_sao_offset_scale_luma;
-+ uint8_t log2_sao_offset_scale_chroma;
-+
-+ // Inferred parameters
-+ uint16_t *column_width; ///< ColumnWidth
-+ uint16_t *row_height; ///< RowHeight
-+ uint16_t *col_bd; ///< ColBd
-+ uint16_t *row_bd; ///< RowBd
-+ uint16_t *col_idxX; // tile column index for each CTB column (one entry per sps->ctb_width)
-+
-+ // We can limit these to uint16_t given our other size limits
-+ uint16_t *ctb_addr_rs_to_ts; ///< CtbAddrRSToTS
-+ uint16_t *ctb_addr_ts_to_rs; ///< CtbAddrTSToRS
-+ uint16_t *tile_id; ///< TileId
-+ uint16_t *tile_pos_ts; ///< tile-scan address of the first CTB of each tile (running sum of tile_size[])
-+ uint16_t *tile_size; ///< TileSize: column_width * row_height, in CTBs
-+ uint8_t * ctb_ts_flags; // CTB_TS_FLAGS_* bits, one byte per CTB
-+
-+ uint8_t data[4096]; // copy of the raw PPS NAL payload, truncated to this size
-+ int data_size; // number of valid bytes in data[]
-+} HEVCRpiPPS;
-+
-+typedef struct HEVCRpiParamSets { // parameter-set store passed to the ff_hevc_rpi_decode_nal_*() parsers
-+ /* currently active parameter sets */
-+ const HEVCRpiVPS *vps;
-+ const HEVCRpiSPS *sps;
-+ const HEVCRpiPPS *pps;
-+
-+ AVBufferRef *vps_list[HEVC_MAX_VPS_COUNT];
-+ AVBufferRef *sps_list[HEVC_MAX_SPS_COUNT]; // entry->data is read as an HEVCRpiSPS; NULL when that id has not been seen
-+ AVBufferRef *pps_list[HEVC_MAX_PPS_COUNT]; // indexed by pps_id; each buffer wraps an HEVCRpiPPS
-+} HEVCRpiParamSets;
-+
-+int ff_hevc_rpi_decode_nal_vps(GetBitContext *gb, AVCodecContext *avctx,
-+ HEVCRpiParamSets *ps); // VPS parser; presumably 0 on success / negative AVERROR like the PPS parser - TODO confirm
-+int ff_hevc_rpi_decode_nal_sps(GetBitContext *gb, AVCodecContext *avctx,
-+ HEVCRpiParamSets *ps, int apply_defdispwin); // SPS parser; return convention presumably as above - TODO confirm
-+int ff_hevc_rpi_decode_nal_pps(GetBitContext *gb, AVCodecContext *avctx,
-+ HEVCRpiParamSets *ps); // parses a PPS into ps->pps_list[]; 0 on success, negative AVERROR on failure
-+
-+int ff_hevc_rpi_decode_short_term_rps(GetBitContext *gb, AVCodecContext *avctx,
-+ ShortTermRPS *rps, const HEVCRpiSPS *sps, int is_slice_header);
-+
-+int ff_hevc_rpi_encode_nal_vps(HEVCRpiVPS *vps, unsigned int id,
-+ uint8_t *buf, int buf_size);
-+
-+/**
-+ * Compute POC of the current frame and return it.
-+ *
-+ * The MSB part is reconstructed from pocTid0 and sps->log2_max_poc_lsb and
-+ * added to poc_lsb; for BLA NAL unit types the MSB part is zero.
-+ */
-+int ff_hevc_rpi_compute_poc(const HEVCRpiSPS *sps, int pocTid0, int poc_lsb, int nal_unit_type);
-+
-+#endif /* AVCODEC_RPI_HEVC_PS_H */
-diff --git a/libavcodec/rpi_hevc_refs.c b/libavcodec/rpi_hevc_refs.c
-new file mode 100644
-index 0000000000..8cc5796cf0
---- /dev/null
-+++ b/libavcodec/rpi_hevc_refs.c
-@@ -0,0 +1,485 @@
-+/*
-+ * HEVC video decoder
-+ *
-+ * Copyright (C) 2012 - 2013 Guillaume Martres
-+ * Copyright (C) 2012 - 2013 Gildas Cocherel
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "libavutil/avassert.h"
-+#include "libavutil/pixdesc.h"
-+#include "libavutil/rpi_sand_fns.h"
-+#include "internal.h"
-+#include "thread.h"
-+#include "hevc.h"
-+#include "rpi_hevcdec.h"
-+
-+/**
-+ * Clear usage flags on a DPB entry and release its buffers once none remain.
-+ *
-+ * @param s     decoder context (its avctx is used to release the frame buffer)
-+ * @param frame DPB entry to update
-+ * @param flags HEVC_FRAME_FLAG_* bits to clear
-+ */
-+void ff_hevc_rpi_unref_frame(HEVCRpiContext *s, HEVCRpiFrame *frame, int flags)
-+{
-+    /* frame->frame can be NULL if context init failed */
-+    if (frame->frame == NULL || frame->frame->buf[0] == NULL)
-+        return;
-+
-+    frame->flags &= ~flags;
-+    if (frame->flags)
-+        return; /* still in use for some other purpose */
-+
-+    ff_thread_release_buffer(s->avctx, &frame->tf);
-+
-+    av_buffer_unref(&frame->col_mvf_buf); // OK if already NULL
-+    frame->col_mvf = NULL;
-+
-+    frame->collocated_ref = NULL;
-+}
-+
-+/**
-+ * Drop the short-term and long-term reference flags from every DPB entry.
-+ */
-+void ff_hevc_rpi_clear_refs(HEVCRpiContext *s)
-+{
-+    const int ref_mask = HEVC_FRAME_FLAG_SHORT_REF | HEVC_FRAME_FLAG_LONG_REF;
-+    HEVCRpiFrame *slot = s->DPB;
-+    HEVCRpiFrame * const end = s->DPB + FF_ARRAY_ELEMS(s->DPB);
-+
-+    while (slot != end) {
-+        ff_hevc_rpi_unref_frame(s, slot, ref_mask);
-+        slot++;
-+    }
-+}
-+
-+/**
-+ * Clear every usage flag on every DPB entry, releasing all held frames.
-+ */
-+void ff_hevc_rpi_flush_dpb(HEVCRpiContext *s)
-+{
-+    HEVCRpiFrame *slot = s->DPB;
-+    HEVCRpiFrame * const end = s->DPB + FF_ARRAY_ELEMS(s->DPB);
-+
-+    while (slot != end) {
-+        ff_hevc_rpi_unref_frame(s, slot, ~0);
-+        slot++;
-+    }
-+}
-+
-+/**
-+ * Take the first unused DPB entry and attach fresh buffers to it.
-+ *
-+ * A collocated-MV buffer is fetched from s->col_mvf_pool only when the
-+ * picture is used for reference and is not an IRAP. The interlacing fields
-+ * of the AVFrame are set from the picture-timing SEI.
-+ *
-+ * @return the prepared DPB entry, or NULL if the DPB is full or an
-+ *         allocation failed
-+ */
-+static HEVCRpiFrame *alloc_frame(HEVCRpiContext * const s)
-+{
-+    HEVCRpiFrame *slot = NULL;
-+    int n;
-+
-+    /* An entry is free when its AVFrame holds no buffer */
-+    for (n = 0; n < FF_ARRAY_ELEMS(s->DPB); n++) {
-+        if (!s->DPB[n].frame->buf[0]) {
-+            slot = &s->DPB[n];
-+            break;
-+        }
-+    }
-+
-+    if (slot == NULL) {
-+        av_log(s->avctx, AV_LOG_ERROR, "Error allocating frame, DPB full.\n");
-+        return NULL;
-+    }
-+
-+    if (ff_thread_get_buffer(s->avctx, &slot->tf, AV_GET_BUFFER_FLAG_REF) < 0)
-+        return NULL;
-+
-+    slot->col_mvf     = NULL;
-+    slot->col_mvf_buf = NULL;
-+    if (s->used_for_ref && !s->is_irap)
-+    {
-+        slot->col_mvf_buf = av_buffer_pool_get(s->col_mvf_pool);
-+        if (slot->col_mvf_buf == NULL) {
-+            /* Undo the frame buffer allocation made above */
-+            ff_hevc_rpi_unref_frame(s, slot, ~0);
-+            return NULL;
-+        }
-+        slot->col_mvf = (ColMvField *)slot->col_mvf_buf->data;
-+    }
-+
-+    slot->frame->top_field_first  = s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD;
-+    slot->frame->interlaced_frame = (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_TOP_FIELD) ||
-+                                    (s->sei.picture_timing.picture_struct == AV_PICTURE_STRUCTURE_BOTTOM_FIELD);
-+
-+    return slot;
-+}
-+
-+/**
-+ * Allocate a DPB entry for the picture being decoded and make it current.
-+ *
-+ * @param s     decoder context; s->ref is set to the new entry
-+ * @param frame receives the AVFrame of the new entry
-+ * @param poc   picture order count of the new picture
-+ * @return 0 on success, AVERROR_INVALIDDATA if a frame with this POC already
-+ *         exists in the current sequence, AVERROR(ENOMEM) if no entry could
-+ *         be allocated
-+ */
-+int ff_hevc_rpi_set_new_ref(HEVCRpiContext *s, AVFrame **frame, int poc)
-+{
-+    const HEVCRpiWindow *win;
-+    HEVCRpiFrame *ref;
-+    int n;
-+
-+    /* check that this POC doesn't already exist */
-+    for (n = 0; n < FF_ARRAY_ELEMS(s->DPB); n++) {
-+        const HEVCRpiFrame *held = &s->DPB[n];
-+
-+        if (!held->frame->buf[0])
-+            continue;
-+        if (held->sequence != s->seq_decode || held->poc != poc)
-+            continue;
-+
-+        av_log(s->avctx, AV_LOG_ERROR, "Duplicate POC in a sequence: %d.\n",
-+               poc);
-+        return AVERROR_INVALIDDATA;
-+    }
-+
-+    ref = alloc_frame(s);
-+    if (ref == NULL)
-+        return AVERROR(ENOMEM);
-+
-+    *frame = ref->frame;
-+    s->ref = ref;
-+
-+    /* Every new picture starts as a short-term reference */
-+    ref->flags = s->sh.pic_output_flag
-+               ? (HEVC_FRAME_FLAG_OUTPUT | HEVC_FRAME_FLAG_SHORT_REF)
-+               : HEVC_FRAME_FLAG_SHORT_REF;
-+
-+    ref->poc      = poc;
-+    ref->sequence = s->seq_decode;
-+
-+    win = &s->ps.sps->output_window;
-+    ref->frame->crop_left   = win->left_offset;
-+    ref->frame->crop_right  = win->right_offset;
-+    ref->frame->crop_top    = win->top_offset;
-+    ref->frame->crop_bottom = win->bottom_offset;
-+
-+    return 0;
-+}
-+
-+/**
-+ * Hand the next frame due for output, if any, to the caller.
-+ *
-+ * Among the DPB entries flagged for output in the current output sequence,
-+ * the one with the lowest POC is chosen. Unless flushing, nothing is output
-+ * while the number of pending frames does not exceed the SPS reorder depth.
-+ * When the current output sequence has nothing left and decoding has moved
-+ * on, the output sequence counter is advanced and the search repeats.
-+ *
-+ * @param s     decoder context
-+ * @param out   frame that receives a new reference to the output picture
-+ * @param flush non-zero to ignore the reorder-depth hold-back
-+ * @return 1 if a frame was output, 0 if not, negative AVERROR if referencing
-+ *         the frame failed
-+ */
-+int ff_hevc_rpi_output_frame(HEVCRpiContext *s, AVFrame *out, int flush)
-+{
-+    for (;;) {
-+        int pending    = 0;
-+        int lowest_poc = INT_MAX;
-+        int lowest_idx = 0;
-+        int n;
-+
-+        if (s->sh.no_output_of_prior_pics_flag == 1 && s->no_rasl_output_flag == 1) {
-+            /* Discard not-yet-bumped prior pictures instead of showing them */
-+            for (n = 0; n < FF_ARRAY_ELEMS(s->DPB); n++) {
-+                HEVCRpiFrame *cand = &s->DPB[n];
-+
-+                if (cand->flags & HEVC_FRAME_FLAG_BUMPING)
-+                    continue;
-+                if (cand->poc == s->poc || cand->sequence != s->seq_output)
-+                    continue;
-+                ff_hevc_rpi_unref_frame(s, cand, HEVC_FRAME_FLAG_OUTPUT);
-+            }
-+        }
-+
-+        for (n = 0; n < FF_ARRAY_ELEMS(s->DPB); n++) {
-+            const HEVCRpiFrame *cand = &s->DPB[n];
-+
-+            if (!(cand->flags & HEVC_FRAME_FLAG_OUTPUT) ||
-+                cand->sequence != s->seq_output)
-+                continue;
-+
-+            pending++;
-+            if (pending == 1 || cand->poc < lowest_poc) {
-+                lowest_poc = cand->poc;
-+                lowest_idx = n;
-+            }
-+        }
-+
-+        /* wait for more frames before output */
-+        if (!flush && s->seq_output == s->seq_decode && s->ps.sps &&
-+            pending <= s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].num_reorder_pics)
-+            return 0;
-+
-+        if (pending) {
-+            HEVCRpiFrame *chosen = &s->DPB[lowest_idx];
-+            int release = HEVC_FRAME_FLAG_OUTPUT;
-+            int err;
-+
-+            if (chosen->frame->format == AV_PIX_FMT_VIDEOTOOLBOX && chosen->frame->buf[0]->size == 1)
-+                return 0;
-+
-+            err = av_frame_ref(out, chosen->frame);
-+            if (chosen->flags & HEVC_FRAME_FLAG_BUMPING)
-+                release |= HEVC_FRAME_FLAG_BUMPING;
-+            /* The output flag is dropped even if taking the reference failed */
-+            ff_hevc_rpi_unref_frame(s, chosen, release);
-+            if (err < 0)
-+                return err;
-+            av_log(s->avctx, AV_LOG_DEBUG,
-+                   "Output frame with POC %d.\n", chosen->poc);
-+            return 1;
-+        }
-+
-+        if (s->seq_output == s->seq_decode)
-+            return 0;
-+
-+        /* Nothing left in this sequence: move on to the next one */
-+        s->seq_output = (s->seq_output + 1) & 0xff;
-+    }
-+}
-+
-+/**
-+ * Mark frames for bumping when the DPB has reached the SPS buffering limit.
-+ *
-+ * Entries of the current output sequence other than the current picture are
-+ * counted. If the count reaches max_dec_pic_buffering, the lowest POC among
-+ * entries whose only flag is OUTPUT is found, and every output-flagged entry
-+ * of the sequence with a POC not above it gets HEVC_FRAME_FLAG_BUMPING.
-+ */
-+void ff_hevc_rpi_bump_frame(HEVCRpiContext *s)
-+{
-+    int in_use     = 0;
-+    int oldest_poc = INT_MAX;
-+    int n;
-+
-+    for (n = 0; n < FF_ARRAY_ELEMS(s->DPB); n++) {
-+        const HEVCRpiFrame *held = &s->DPB[n];
-+
-+        if (held->flags && held->sequence == s->seq_output && held->poc != s->poc)
-+            in_use++;
-+    }
-+
-+    if (!s->ps.sps ||
-+        in_use < s->ps.sps->temporal_layer[s->ps.sps->max_sub_layers - 1].max_dec_pic_buffering)
-+        return;
-+
-+    /* Only frames that are waiting for output and nothing else qualify */
-+    for (n = 0; n < FF_ARRAY_ELEMS(s->DPB); n++) {
-+        const HEVCRpiFrame *held = &s->DPB[n];
-+
-+        if (held->flags == HEVC_FRAME_FLAG_OUTPUT &&
-+            held->sequence == s->seq_output &&
-+            held->poc != s->poc &&
-+            held->poc < oldest_poc)
-+            oldest_poc = held->poc;
-+    }
-+
-+    for (n = 0; n < FF_ARRAY_ELEMS(s->DPB); n++) {
-+        HEVCRpiFrame *held = &s->DPB[n];
-+
-+        if ((held->flags & HEVC_FRAME_FLAG_OUTPUT) &&
-+            held->sequence == s->seq_output &&
-+            held->poc <= oldest_poc)
-+            held->flags |= HEVC_FRAME_FLAG_BUMPING;
-+    }
-+}
-+
-+/**
-+ * Point s->refPicList at the reference picture lists of the current slice.
-+ *
-+ * @return 0 on success, AVERROR_INVALIDDATA if s->slice_idx is outside
-+ *         s->rpl_tab
-+ */
-+static int init_slice_rpl(HEVCRpiContext *s)
-+{
-+    if (s->slice_idx < s->rpl_tab_size) {
-+        s->refPicList = s->rpl_tab[s->slice_idx].refPicList + 0;
-+        return 0;
-+    }
-+
-+    return AVERROR_INVALIDDATA;
-+}
-+
-+/**
-+ * Build the reference picture list(s) of the current slice.
-+ *
-+ * For each list (L0, plus L1 for B slices) the candidate sets in s->rps are
-+ * concatenated, repeating as needed, until sh->nb_refs[list] entries exist.
-+ * The result is either reordered through sh->list_entry_lx[] or copied as is.
-+ * The collocated reference of the current frame is updated when it lies in
-+ * the list just built.
-+ *
-+ * @return 0 on success, AVERROR_INVALIDDATA on an invalid slice index, an
-+ *         empty frame RPS or an out-of-range reorder index
-+ */
-+int ff_hevc_rpi_slice_rpl(HEVCRpiContext *s)
-+{
-+    RpiSliceHeader *sh = &s->sh;
-+
-+    const uint8_t nb_list = sh->slice_type == HEVC_SLICE_B ? 2 : 1;
-+    uint8_t list_idx;
-+    int i, j;
-+    int ret = init_slice_rpl(s);
-+
-+    if (ret < 0)
-+        return ret;
-+
-+    if (!(s->rps[ST_CURR_BEF].nb_refs + s->rps[ST_CURR_AFT].nb_refs +
-+          s->rps[LT_CURR].nb_refs)) {
-+        av_log(s->avctx, AV_LOG_ERROR, "Zero refs in the frame RPS.\n");
-+        return AVERROR_INVALIDDATA;
-+    }
-+
-+    for (list_idx = 0; list_idx < nb_list; list_idx++) {
-+        RefPicList rpl_tmp = { { 0 } };
-+        RefPicList *rpl    = &s->refPicList[list_idx];
-+        int cand_lists[3];
-+
-+        /* The order of the elements is
-+         * ST_CURR_BEF - ST_CURR_AFT - LT_CURR for the L0 and
-+         * ST_CURR_AFT - ST_CURR_BEF - LT_CURR for the L1 */
-+        if (list_idx) {
-+            cand_lists[0] = ST_CURR_AFT;
-+            cand_lists[1] = ST_CURR_BEF;
-+        } else {
-+            cand_lists[0] = ST_CURR_BEF;
-+            cand_lists[1] = ST_CURR_AFT;
-+        }
-+        cand_lists[2] = LT_CURR;
-+
-+        /* concatenate the candidate lists for the current frame */
-+        while (rpl_tmp.nb_refs < sh->nb_refs[list_idx]) {
-+            for (i = 0; i < FF_ARRAY_ELEMS(cand_lists); i++) {
-+                RefPicList *rps = &s->rps[cand_lists[i]];
-+
-+                for (j = 0; j < rps->nb_refs && rpl_tmp.nb_refs < HEVC_MAX_REFS; j++) {
-+                    rpl_tmp.list[rpl_tmp.nb_refs]       = rps->list[j];
-+                    rpl_tmp.ref[rpl_tmp.nb_refs]        = rps->ref[j];
-+                    /* only the third candidate set holds long-term refs */
-+                    rpl_tmp.isLongTerm[rpl_tmp.nb_refs] = i == 2;
-+                    rpl_tmp.nb_refs++;
-+                }
-+            }
-+        }
-+
-+        if (!sh->rpl_modification_flag[list_idx]) {
-+            /* no reordering: take the concatenated list, clipped to size */
-+            memcpy(rpl, &rpl_tmp, sizeof(*rpl));
-+            rpl->nb_refs = FFMIN(rpl->nb_refs, sh->nb_refs[list_idx]);
-+        } else {
-+            /* reorder the references through the coded index table */
-+            for (i = 0; i < sh->nb_refs[list_idx]; i++) {
-+                int idx = sh->list_entry_lx[list_idx][i];
-+
-+                if (idx >= rpl_tmp.nb_refs) {
-+                    av_log(s->avctx, AV_LOG_ERROR, "Invalid reference index.\n");
-+                    return AVERROR_INVALIDDATA;
-+                }
-+
-+                rpl->list[i]       = rpl_tmp.list[idx];
-+                rpl->ref[i]        = rpl_tmp.ref[idx];
-+                rpl->isLongTerm[i] = rpl_tmp.isLongTerm[idx];
-+                rpl->nb_refs++;
-+            }
-+        }
-+
-+        if (sh->collocated_list == list_idx &&
-+            sh->collocated_ref_idx < rpl->nb_refs)
-+            s->ref->collocated_ref = rpl->ref[sh->collocated_ref_idx];
-+    }
-+
-+    return 0;
-+}
-+
-+static HEVCRpiFrame *find_ref_idx(HEVCRpiContext *s, int poc)
-+{
-+ int i;
-+ int LtMask = (1 << s->ps.sps->log2_max_poc_lsb) - 1;
-+
-+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
-+ HEVCRpiFrame *ref = &s->DPB[i];
-+ if (ref->frame->buf[0] && (ref->sequence == s->seq_decode)) {
-+ if ((ref->poc & LtMask) == poc)
-+ return ref;
-+ }
-+ }
-+
-+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
-+ HEVCRpiFrame *ref = &s->DPB[i];
-+ if (ref->frame->buf[0] && ref->sequence == s->seq_decode) {
-+ if (ref->poc == poc || (ref->poc & LtMask) == poc)
-+ return ref;
-+ }
-+ }
-+
-+ if (s->nal_unit_type != HEVC_NAL_CRA_NUT && !IS_BLA(s))
-+ av_log(s->avctx, AV_LOG_ERROR,
-+ "Could not find ref with POC %d\n", poc);
-+ return NULL;
-+}
-+
-+static void mark_ref(HEVCRpiFrame *frame, int flag)
-+{
-+ frame->flags &= ~(HEVC_FRAME_FLAG_LONG_REF | HEVC_FRAME_FLAG_SHORT_REF);
-+ frame->flags |= flag;
-+}
-+
-+static HEVCRpiFrame *generate_missing_ref(HEVCRpiContext *s, int poc)
-+{
-+ HEVCRpiFrame *frame;
-+ int i, x, y;
-+
-+ frame = alloc_frame(s);
-+ if (!frame)
-+ return NULL;
-+
-+ if (!s->ps.sps->pixel_shift) {
-+ for (i = 0; frame->frame->buf[i]; i++)
-+ memset(frame->frame->buf[i]->data, 1 << (s->ps.sps->bit_depth - 1),
-+ frame->frame->buf[i]->size);
-+ } else {
-+ for (i = 0; frame->frame->data[i]; i++)
-+ for (y = 0; y < (s->ps.sps->height >> s->ps.sps->vshift[i]); y++)
-+ for (x = 0; x < (s->ps.sps->width >> s->ps.sps->hshift[i]); x++) {
-+ AV_WN16(frame->frame->data[i] + y * frame_stride1(frame->frame, 1) + 2 * x,
-+ 1 << (s->ps.sps->bit_depth - 1));
-+ }
-+ }
-+
-+ frame->poc = poc;
-+ frame->sequence = s->seq_decode;
-+ frame->flags = 0;
-+
-+ ff_hevc_rpi_progress_set_all_done(frame);
-+
-+ return frame;
-+}
-+
-+/* add a reference with the given poc to the list and mark it as used in DPB */
-+static int add_candidate_ref(HEVCRpiContext *s, RefPicList *list,
-+ int poc, int ref_flag)
-+{
-+ HEVCRpiFrame *ref = find_ref_idx(s, poc);
-+
-+ if (ref == s->ref || list->nb_refs >= HEVC_MAX_REFS)
-+ return AVERROR_INVALIDDATA;
-+
-+ if (!ref) {
-+ ref = generate_missing_ref(s, poc);
-+ if (!ref)
-+ return AVERROR(ENOMEM);
-+ }
-+
-+ list->list[list->nb_refs] = ref->poc;
-+ list->ref[list->nb_refs] = ref;
-+ list->nb_refs++;
-+
-+ mark_ref(ref, ref_flag);
-+ return 0;
-+}
-+
-+int ff_hevc_rpi_frame_rps(HEVCRpiContext *s)
-+{
-+ const ShortTermRPS *short_rps = s->sh.short_term_rps;
-+ const LongTermRPS *long_rps = &s->sh.long_term_rps;
-+ RefPicList *rps = s->rps;
-+ int i, ret = 0;
-+
-+ if (!short_rps) {
-+ rps[0].nb_refs = rps[1].nb_refs = 0;
-+ return 0;
-+ }
-+
-+ /* clear the reference flags on all frames except the current one */
-+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
-+ HEVCRpiFrame *frame = &s->DPB[i];
-+
-+ if (frame == s->ref)
-+ continue;
-+
-+ mark_ref(frame, 0);
-+ }
-+
-+ for (i = 0; i < NB_RPS_TYPE; i++)
-+ rps[i].nb_refs = 0;
-+
-+ /* add the short refs */
-+ for (i = 0; i < short_rps->num_delta_pocs; i++) {
-+ int poc = s->poc + short_rps->delta_poc[i];
-+ int list;
-+
-+ if (!short_rps->used[i])
-+ list = ST_FOLL;
-+ else if (i < short_rps->num_negative_pics)
-+ list = ST_CURR_BEF;
-+ else
-+ list = ST_CURR_AFT;
-+
-+ ret = add_candidate_ref(s, &rps[list], poc, HEVC_FRAME_FLAG_SHORT_REF);
-+ if (ret < 0)
-+ goto fail;
-+ }
-+
-+ /* add the long refs */
-+ for (i = 0; i < long_rps->nb_refs; i++) {
-+ int poc = long_rps->poc[i];
-+ int list = long_rps->used[i] ? LT_CURR : LT_FOLL;
-+
-+ ret = add_candidate_ref(s, &rps[list], poc, HEVC_FRAME_FLAG_LONG_REF);
-+ if (ret < 0)
-+ goto fail;
-+ }
-+
-+fail:
-+ /* release any frames that are now unused */
-+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++)
-+ ff_hevc_rpi_unref_frame(s, &s->DPB[i], 0);
-+
-+ return ret;
-+}
-+
-+int ff_hevc_rpi_frame_nb_refs(HEVCRpiContext *s)
-+{
-+ int ret = 0;
-+ int i;
-+ const ShortTermRPS *rps = s->sh.short_term_rps;
-+ LongTermRPS *long_rps = &s->sh.long_term_rps;
-+
-+ if (rps) {
-+ for (i = 0; i < rps->num_negative_pics; i++)
-+ ret += !!rps->used[i];
-+ for (; i < rps->num_delta_pocs; i++)
-+ ret += !!rps->used[i];
-+ }
-+
-+ if (long_rps) {
-+ for (i = 0; i < long_rps->nb_refs; i++)
-+ ret += !!long_rps->used[i];
-+ }
-+ return ret;
-+}
-diff --git a/libavcodec/rpi_hevc_sei.c b/libavcodec/rpi_hevc_sei.c
-new file mode 100644
-index 0000000000..cd8149d58e
---- /dev/null
-+++ b/libavcodec/rpi_hevc_sei.c
-@@ -0,0 +1,368 @@
-+/*
-+ * HEVC Supplementary Enhancement Information messages
-+ *
-+ * Copyright (C) 2012 - 2013 Guillaume Martres
-+ * Copyright (C) 2012 - 2013 Gildas Cocherel
-+ * Copyright (C) 2013 Vittorio Giovara
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "golomb.h"
-+#include "rpi_hevc_ps.h"
-+#include "rpi_hevc_sei.h"
-+
-+static int decode_nal_sei_decoded_picture_hash(HEVCSEIPictureHash *s, GetBitContext *gb)
-+{
-+ int cIdx, i;
-+ uint8_t hash_type;
-+ //uint16_t picture_crc;
-+ //uint32_t picture_checksum;
-+ hash_type = get_bits(gb, 8);
-+
-+ for (cIdx = 0; cIdx < 3/*((s->sps->chroma_format_idc == 0) ? 1 : 3)*/; cIdx++) {
-+ if (hash_type == 0) {
-+ s->is_md5 = 1;
-+ for (i = 0; i < 16; i++)
-+ s->md5[cIdx][i] = get_bits(gb, 8);
-+ } else if (hash_type == 1) {
-+ // picture_crc = get_bits(gb, 16);
-+ skip_bits(gb, 16);
-+ } else if (hash_type == 2) {
-+ // picture_checksum = get_bits_long(gb, 32);
-+ skip_bits(gb, 32);
-+ }
-+ }
-+ return 0;
-+}
-+
-+static int decode_nal_sei_mastering_display_info(HEVCSEIMasteringDisplay *s, GetBitContext *gb)
-+{
-+ int i;
-+ // Mastering primaries
-+ for (i = 0; i < 3; i++) {
-+ s->display_primaries[i][0] = get_bits(gb, 16);
-+ s->display_primaries[i][1] = get_bits(gb, 16);
-+ }
-+ // White point (x, y)
-+ s->white_point[0] = get_bits(gb, 16);
-+ s->white_point[1] = get_bits(gb, 16);
-+
-+ // Max and min luminance of mastering display
-+ s->max_luminance = get_bits_long(gb, 32);
-+ s->min_luminance = get_bits_long(gb, 32);
-+
-+ // As this SEI message comes before the first frame that references it,
-+ // initialize the flag to 2 and decrement on IRAP access unit so it
-+ // persists for the coded video sequence (e.g., between two IRAPs)
-+ s->present = 2;
-+ return 0;
-+}
-+
-+static int decode_nal_sei_content_light_info(HEVCSEIContentLight *s, GetBitContext *gb)
-+{
-+ // Max and average light levels
-+ s->max_content_light_level = get_bits_long(gb, 16);
-+ s->max_pic_average_light_level = get_bits_long(gb, 16);
-+ // As this SEI message comes before the first frame that references it,
-+ // initialize the flag to 2 and decrement on IRAP access unit so it
-+ // persists for the coded video sequence (e.g., between two IRAPs)
-+ s->present = 2;
-+ return 0;
-+}
-+
-+static int decode_nal_sei_frame_packing_arrangement(HEVCSEIFramePacking *s, GetBitContext *gb)
-+{
-+ get_ue_golomb_long(gb); // frame_packing_arrangement_id
-+ s->present = !get_bits1(gb);
-+
-+ if (s->present) {
-+ s->arrangement_type = get_bits(gb, 7);
-+ s->quincunx_subsampling = get_bits1(gb);
-+ s->content_interpretation_type = get_bits(gb, 6);
-+
-+ // spatial_flipping_flag, frame0_flipped_flag, field_views_flag
-+ skip_bits(gb, 3);
-+ s->current_frame_is_frame0_flag = get_bits1(gb);
-+ // frame0_self_contained_flag, frame1_self_contained_flag
-+ skip_bits(gb, 2);
-+
-+ if (!s->quincunx_subsampling && s->arrangement_type != 5)
-+ skip_bits(gb, 16); // frame[01]_grid_position_[xy]
-+ skip_bits(gb, 8); // frame_packing_arrangement_reserved_byte
-+ skip_bits1(gb); // frame_packing_arrangement_persistence_flag
-+ }
-+ skip_bits1(gb); // upsampled_aspect_ratio_flag
-+ return 0;
-+}
-+
-+static int decode_nal_sei_display_orientation(HEVCSEIDisplayOrientation *s, GetBitContext *gb)
-+{
-+ s->present = !get_bits1(gb);
-+
-+ if (s->present) {
-+ s->hflip = get_bits1(gb); // hor_flip
-+ s->vflip = get_bits1(gb); // ver_flip
-+
-+ s->anticlockwise_rotation = get_bits(gb, 16);
-+ skip_bits1(gb); // display_orientation_persistence_flag
-+ }
-+
-+ return 0;
-+}
-+
-+static int decode_nal_sei_pic_timing(HEVCSEIContext *s, GetBitContext *gb, const HEVCRpiParamSets *ps,
-+ void *logctx, int size)
-+{
-+ HEVCSEIPictureTiming *h = &s->picture_timing;
-+ HEVCRpiSPS *sps;
-+
-+ if (!ps->sps_list[s->active_seq_parameter_set_id])
-+ return(AVERROR(ENOMEM));
-+ sps = (HEVCRpiSPS*)ps->sps_list[s->active_seq_parameter_set_id]->data;
-+
-+ if (sps->vui.frame_field_info_present_flag) {
-+ int pic_struct = get_bits(gb, 4);
-+ h->picture_struct = AV_PICTURE_STRUCTURE_UNKNOWN;
-+ if (pic_struct == 2 || pic_struct == 10 || pic_struct == 12) {
-+ av_log(logctx, AV_LOG_DEBUG, "BOTTOM Field\n");
-+ h->picture_struct = AV_PICTURE_STRUCTURE_BOTTOM_FIELD;
-+ } else if (pic_struct == 1 || pic_struct == 9 || pic_struct == 11) {
-+ av_log(logctx, AV_LOG_DEBUG, "TOP Field\n");
-+ h->picture_struct = AV_PICTURE_STRUCTURE_TOP_FIELD;
-+ }
-+ get_bits(gb, 2); // source_scan_type
-+ get_bits(gb, 1); // duplicate_flag
-+ skip_bits1(gb);
-+ size--;
-+ }
-+ skip_bits_long(gb, 8 * size);
-+
-+ return 0;
-+}
-+
-+static int decode_registered_user_data_closed_caption(HEVCSEIA53Caption *s, GetBitContext *gb,
-+ int size)
-+{
-+ int flag;
-+ int user_data_type_code;
-+ int cc_count;
-+
-+ if (size < 3)
-+ return AVERROR(EINVAL);
-+
-+ user_data_type_code = get_bits(gb, 8);
-+ if (user_data_type_code == 0x3) {
-+ skip_bits(gb, 1); // reserved
-+
-+ flag = get_bits(gb, 1); // process_cc_data_flag
-+ if (flag) {
-+ skip_bits(gb, 1);
-+ cc_count = get_bits(gb, 5);
-+ skip_bits(gb, 8); // reserved
-+ size -= 2;
-+
-+ if (cc_count && size >= cc_count * 3) {
-+ const uint64_t new_size = (s->a53_caption_size + cc_count
-+ * UINT64_C(3));
-+ int i, ret;
-+
-+ if (new_size > INT_MAX)
-+ return AVERROR(EINVAL);
-+
-+ /* Allow merging of the cc data from two fields. */
-+ ret = av_reallocp(&s->a53_caption, new_size);
-+ if (ret < 0)
-+ return ret;
-+
-+ for (i = 0; i < cc_count; i++) {
-+ s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8);
-+ s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8);
-+ s->a53_caption[s->a53_caption_size++] = get_bits(gb, 8);
-+ }
-+ skip_bits(gb, 8); // marker_bits
-+ }
-+ }
-+ } else {
-+ int i;
-+ for (i = 0; i < size - 1; i++)
-+ skip_bits(gb, 8);
-+ }
-+
-+ return 0;
-+}
-+
-+static int decode_nal_sei_user_data_registered_itu_t_t35(HEVCSEIContext *s, GetBitContext *gb,
-+ int size)
-+{
-+ uint32_t country_code;
-+ uint32_t user_identifier;
-+
-+ if (size < 7)
-+ return AVERROR(EINVAL);
-+ size -= 7;
-+
-+ country_code = get_bits(gb, 8);
-+ if (country_code == 0xFF) {
-+ skip_bits(gb, 8);
-+ size--;
-+ }
-+
-+ skip_bits(gb, 8);
-+ skip_bits(gb, 8);
-+
-+ user_identifier = get_bits_long(gb, 32);
-+
-+ switch (user_identifier) {
-+ case MKBETAG('G', 'A', '9', '4'):
-+ return decode_registered_user_data_closed_caption(&s->a53_caption, gb, size);
-+ default:
-+ skip_bits_long(gb, size * 8);
-+ break;
-+ }
-+ return 0;
-+}
-+
-+static int decode_nal_sei_active_parameter_sets(HEVCSEIContext *s, GetBitContext *gb, void *logctx)
-+{
-+ int num_sps_ids_minus1;
-+ int i;
-+ unsigned active_seq_parameter_set_id;
-+
-+ get_bits(gb, 4); // active_video_parameter_set_id
-+ get_bits(gb, 1); // self_contained_cvs_flag
-+ get_bits(gb, 1); // num_sps_ids_minus1
-+ num_sps_ids_minus1 = get_ue_golomb_long(gb); // num_sps_ids_minus1
-+
-+ if (num_sps_ids_minus1 < 0 || num_sps_ids_minus1 > 15) {
-+ av_log(logctx, AV_LOG_ERROR, "num_sps_ids_minus1 %d invalid\n", num_sps_ids_minus1);
-+ return AVERROR_INVALIDDATA;
-+ }
-+
-+ active_seq_parameter_set_id = get_ue_golomb_long(gb);
-+ if (active_seq_parameter_set_id >= HEVC_MAX_SPS_COUNT) {
-+ av_log(logctx, AV_LOG_ERROR, "active_parameter_set_id %d invalid\n", active_seq_parameter_set_id);
-+ return AVERROR_INVALIDDATA;
-+ }
-+ s->active_seq_parameter_set_id = active_seq_parameter_set_id;
-+
-+ for (i = 1; i <= num_sps_ids_minus1; i++)
-+ get_ue_golomb_long(gb); // active_seq_parameter_set_id[i]
-+
-+ return 0;
-+}
-+
-+static int decode_nal_sei_alternative_transfer(HEVCSEIAlternativeTransfer *s, GetBitContext *gb)
-+{
-+ s->present = 1;
-+ s->preferred_transfer_characteristics = get_bits(gb, 8);
-+ return 0;
-+}
-+
-+static int decode_nal_sei_prefix(GetBitContext *gb, void *logctx, HEVCSEIContext *s, const HEVCRpiParamSets *ps,
-+ int type, int size)
-+{
-+ switch (type) {
-+ case 256: // Mismatched value from HM 8.1
-+ return decode_nal_sei_decoded_picture_hash(&s->picture_hash, gb);
-+ case HEVC_SEI_TYPE_FRAME_PACKING:
-+ return decode_nal_sei_frame_packing_arrangement(&s->frame_packing, gb);
-+ case HEVC_SEI_TYPE_DISPLAY_ORIENTATION:
-+ return decode_nal_sei_display_orientation(&s->display_orientation, gb);
-+ case HEVC_SEI_TYPE_PICTURE_TIMING:
-+ return decode_nal_sei_pic_timing(s, gb, ps, logctx, size);
-+ case HEVC_SEI_TYPE_MASTERING_DISPLAY_INFO:
-+ return decode_nal_sei_mastering_display_info(&s->mastering_display, gb);
-+ case HEVC_SEI_TYPE_CONTENT_LIGHT_LEVEL_INFO:
-+ return decode_nal_sei_content_light_info(&s->content_light, gb);
-+ case HEVC_SEI_TYPE_ACTIVE_PARAMETER_SETS:
-+ return decode_nal_sei_active_parameter_sets(s, gb, logctx);
-+ case HEVC_SEI_TYPE_USER_DATA_REGISTERED_ITU_T_T35:
-+ return decode_nal_sei_user_data_registered_itu_t_t35(s, gb, size);
-+ case HEVC_SEI_TYPE_ALTERNATIVE_TRANSFER_CHARACTERISTICS:
-+ return decode_nal_sei_alternative_transfer(&s->alternative_transfer, gb);
-+ default:
-+ av_log(logctx, AV_LOG_DEBUG, "Skipped PREFIX SEI %d\n", type);
-+ skip_bits_long(gb, 8 * size);
-+ return 0;
-+ }
-+}
-+
-+static int decode_nal_sei_suffix(GetBitContext *gb, void *logctx, HEVCSEIContext *s,
-+ int type, int size)
-+{
-+ switch (type) {
-+ case HEVC_SEI_TYPE_DECODED_PICTURE_HASH:
-+ return decode_nal_sei_decoded_picture_hash(&s->picture_hash, gb);
-+ default:
-+ av_log(logctx, AV_LOG_DEBUG, "Skipped SUFFIX SEI %d\n", type);
-+ skip_bits_long(gb, 8 * size);
-+ return 0;
-+ }
-+}
-+
-+static int decode_nal_sei_message(GetBitContext * const gb, void * const logctx, HEVCSEIContext * const s,
-+ const HEVCRpiParamSets * const ps, const int nal_unit_type)
-+{
-+ int payload_type = 0;
-+ int payload_size = 0;
-+ int byte = 0xFF;
-+ av_log(logctx, AV_LOG_DEBUG, "Decoding SEI\n");
-+
-+ while (byte == 0xFF) {
-+ if (get_bits_left(gb) < 16 || payload_type > INT_MAX - 255)
-+ return AVERROR_INVALIDDATA;
-+ byte = get_bits(gb, 8);
-+ payload_type += byte;
-+ }
-+ byte = 0xFF;
-+ while (byte == 0xFF) {
-+ if (get_bits_left(gb) < 8 + 8LL*payload_size)
-+ return AVERROR_INVALIDDATA;
-+ byte = get_bits(gb, 8);
-+ payload_size += byte;
-+ }
-+ if (nal_unit_type == HEVC_NAL_SEI_PREFIX) {
-+ return decode_nal_sei_prefix(gb, logctx, s, ps, payload_type, payload_size);
-+ } else { /* nal_unit_type == NAL_SEI_SUFFIX */
-+ return decode_nal_sei_suffix(gb, logctx, s, payload_type, payload_size);
-+ }
-+}
-+
-+static int more_rbsp_data(GetBitContext *gb)
-+{
-+ return get_bits_left(gb) > 0 && show_bits(gb, 8) != 0x80;
-+}
-+
-+int ff_hevc_rpi_decode_nal_sei(GetBitContext *gb, void *logctx, HEVCSEIContext *s,
-+ const HEVCRpiParamSets *ps, int type)
-+{
-+ int ret;
-+
-+ do {
-+ ret = decode_nal_sei_message(gb, logctx, s, ps, type);
-+ if (ret < 0)
-+ return ret;
-+ } while (more_rbsp_data(gb));
-+ return 1;
-+}
-+
-+void ff_hevc_rpi_reset_sei(HEVCSEIContext *s)
-+{
-+ s->a53_caption.a53_caption_size = 0;
-+ av_freep(&s->a53_caption.a53_caption);
-+}
-diff --git a/libavcodec/rpi_hevc_sei.h b/libavcodec/rpi_hevc_sei.h
-new file mode 100644
-index 0000000000..d4ac348df9
---- /dev/null
-+++ b/libavcodec/rpi_hevc_sei.h
-@@ -0,0 +1,135 @@
-+/*
-+ * HEVC Supplementary Enhancement Information messages
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#ifndef AVCODEC_RPI_HEVC_SEI_H
-+#define AVCODEC_RPI_HEVC_SEI_H
-+
-+#include <stdint.h>
-+
-+#include "libavutil/md5.h"
-+
-+#include "get_bits.h"
-+
-+/**
-+ * SEI message types
-+ */
-+typedef enum {
-+ HEVC_SEI_TYPE_BUFFERING_PERIOD = 0,
-+ HEVC_SEI_TYPE_PICTURE_TIMING = 1,
-+ HEVC_SEI_TYPE_PAN_SCAN_RECT = 2,
-+ HEVC_SEI_TYPE_FILLER_PAYLOAD = 3,
-+ HEVC_SEI_TYPE_USER_DATA_REGISTERED_ITU_T_T35 = 4,
-+ HEVC_SEI_TYPE_USER_DATA_UNREGISTERED = 5,
-+ HEVC_SEI_TYPE_RECOVERY_POINT = 6,
-+ HEVC_SEI_TYPE_SCENE_INFO = 9,
-+ HEVC_SEI_TYPE_FULL_FRAME_SNAPSHOT = 15,
-+ HEVC_SEI_TYPE_PROGRESSIVE_REFINEMENT_SEGMENT_START = 16,
-+ HEVC_SEI_TYPE_PROGRESSIVE_REFINEMENT_SEGMENT_END = 17,
-+ HEVC_SEI_TYPE_FILM_GRAIN_CHARACTERISTICS = 19,
-+ HEVC_SEI_TYPE_POST_FILTER_HINT = 22,
-+ HEVC_SEI_TYPE_TONE_MAPPING_INFO = 23,
-+ HEVC_SEI_TYPE_FRAME_PACKING = 45,
-+ HEVC_SEI_TYPE_DISPLAY_ORIENTATION = 47,
-+ HEVC_SEI_TYPE_SOP_DESCRIPTION = 128,
-+ HEVC_SEI_TYPE_ACTIVE_PARAMETER_SETS = 129,
-+ HEVC_SEI_TYPE_DECODING_UNIT_INFO = 130,
-+ HEVC_SEI_TYPE_TEMPORAL_LEVEL0_INDEX = 131,
-+ HEVC_SEI_TYPE_DECODED_PICTURE_HASH = 132,
-+ HEVC_SEI_TYPE_SCALABLE_NESTING = 133,
-+ HEVC_SEI_TYPE_REGION_REFRESH_INFO = 134,
-+ HEVC_SEI_TYPE_MASTERING_DISPLAY_INFO = 137,
-+ HEVC_SEI_TYPE_CONTENT_LIGHT_LEVEL_INFO = 144,
-+ HEVC_SEI_TYPE_ALTERNATIVE_TRANSFER_CHARACTERISTICS = 147,
-+} HEVC_SEI_Type;
-+
-+typedef struct HEVCSEIPictureHash {
-+ uint8_t md5[3][16];
-+ uint8_t is_md5;
-+} HEVCSEIPictureHash;
-+
-+typedef struct HEVCSEIFramePacking {
-+ int present;
-+ int arrangement_type;
-+ int content_interpretation_type;
-+ int quincunx_subsampling;
-+ int current_frame_is_frame0_flag;
-+} HEVCSEIFramePacking;
-+
-+typedef struct HEVCSEIDisplayOrientation {
-+ int present;
-+ int anticlockwise_rotation;
-+ int hflip, vflip;
-+} HEVCSEIDisplayOrientation;
-+
-+typedef struct HEVCSEIPictureTiming {
-+ int picture_struct;
-+} HEVCSEIPictureTiming;
-+
-+typedef struct HEVCSEIA53Caption {
-+ int a53_caption_size;
-+ uint8_t *a53_caption;
-+} HEVCSEIA53Caption;
-+
-+typedef struct HEVCSEIMasteringDisplay {
-+ int present;
-+ uint16_t display_primaries[3][2];
-+ uint16_t white_point[2];
-+ uint32_t max_luminance;
-+ uint32_t min_luminance;
-+} HEVCSEIMasteringDisplay;
-+
-+typedef struct HEVCSEIContentLight {
-+ int present;
-+ uint16_t max_content_light_level;
-+ uint16_t max_pic_average_light_level;
-+} HEVCSEIContentLight;
-+
-+typedef struct HEVCSEIAlternativeTransfer {
-+ int present;
-+ int preferred_transfer_characteristics;
-+} HEVCSEIAlternativeTransfer;
-+
-+typedef struct HEVCSEIContext {
-+ HEVCSEIPictureHash picture_hash;
-+ HEVCSEIFramePacking frame_packing;
-+ HEVCSEIDisplayOrientation display_orientation;
-+ HEVCSEIPictureTiming picture_timing;
-+ HEVCSEIA53Caption a53_caption;
-+ HEVCSEIMasteringDisplay mastering_display;
-+ HEVCSEIContentLight content_light;
-+ int active_seq_parameter_set_id;
-+ HEVCSEIAlternativeTransfer alternative_transfer;
-+} HEVCSEIContext;
-+
-+struct HEVCRpiParamSets;
-+
-+int ff_hevc_rpi_decode_nal_sei(GetBitContext *gb, void *logctx, HEVCSEIContext *s,
-+ const struct HEVCRpiParamSets *ps, int type);
-+
-+/**
-+ * Reset SEI values that are stored on the Context.
-+ * e.g. Caption data that was extracted during NAL
-+ * parsing.
-+ *
-+ * @param s HEVCRpiContext.
-+ */
-+void ff_hevc_rpi_reset_sei(HEVCSEIContext *s);
-+
-+#endif /* AVCODEC_RPI_HEVC_SEI_H */
-diff --git a/libavcodec/rpi_hevc_shader.c b/libavcodec/rpi_hevc_shader.c
-new file mode 100644
-index 0000000000..23b49a99ae
---- /dev/null
-+++ b/libavcodec/rpi_hevc_shader.c
-@@ -0,0 +1,1537 @@
-+#include "rpi_hevc_shader.h"
-+
-+#ifdef _MSC_VER
-+ #include <stdint.h>
-+ /* cast through uintptr_t to avoid warnings */
-+ #define POINTER_TO_UINT(X) ((unsigned int)(uintptr_t)(X))
-+#else
-+ #define POINTER_TO_UINT(X) ((unsigned int)(X))
-+#endif
-+
-+#ifdef __cplusplus
-+extern "C" { /* the types are probably wrong... */
-+#endif
-+#ifdef __cplusplus
-+}
-+#endif
-+
-+#ifdef _MSC_VER
-+__declspec(align(8))
-+#elif defined(__GNUC__)
-+__attribute__((aligned(8)))
-+#endif
-+unsigned int ff_hevc_rpi_shader[] = {
-+// ::mc_setup_c_q0
-+// ::mc_start
-+/* [0x00000000] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
-+// ::mc_setup_c_qn
-+/* [0x00000008] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif
-+/* [0x00000010] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
-+/* [0x00000018] */ 0x9181e1f6, 0xd00250d8, // shl rb_ef, r0, i_shift30 ; mov ra_base, unif
-+/* [0x00000020] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1
-+/* [0x00000028] */ 0x119c11c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift
-+/* [0x00000030] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1
-+/* [0x00000038] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100
-+/* [0x00000040] */ 0x000000ff, 0xe0021627, // mov rb_pmask, v_pmask
-+/* [0x00000048] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
-+/* [0x00000050] */ 0x00004000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
-+/* [0x00000058] */ 0x4000000e, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
-+/* [0x00000060] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif
-+/* [0x00000068] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
-+/* [0x00000070] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-+/* [0x00000078] */ 0x0c9d03c0, 0x10021667, // add rb_dma1_base, r1, rb_pitch
-+/* [0x00000080] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num
-+/* [0x00000088] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5
-+/* [0x00000090] */ 0x0c9a7180, 0x100210a7, // add rb_elem_x, r0, elem_num
-+/* [0x00000098] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
-+/* [0x000000a0] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x
-+/* [0x000000a8] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a
-+/* [0x000000b0] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
-+/* [0x000000b8] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
-+/* [0x000000c0] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
-+/* [0x000000c8] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
-+/* [0x000000d0] */ 0x149e7040, 0x10020867, // and r1, r0, r1
-+/* [0x000000d8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x000000e0] */ 0x8c827076, 0x10025800, // add r0, r0, r1 ; mov ra0, unif
-+/* [0x000000e8] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0
-+/* [0x000000f0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+/* [0x000000f8] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2
-+/* [0x00000100] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+/* [0x00000108] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3
-+/* [0x00000110] */ 0x159e7040, 0x10020827, // or r0, r0, r1
-+/* [0x00000118] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+/* [0x00000120] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
-+/* [0x00000128] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-+/* [0x00000130] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-+/* [0x00000138] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
-+/* [0x00000140] */ 0x11001dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
-+/* [0x00000148] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a
-+/* [0x00000150] */ 0x938001f6, 0xd002480f, // max r0, r0, 0 ; mov rb_base2, unif
-+/* [0x00000158] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
-+/* [0x00000160] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
-+/* [0x00000168] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
-+/* [0x00000170] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
-+/* [0x00000178] */ 0x949c307f, 0xd0024863, // and r1, r0, r1 ; mov r3, PREREAD
-+/* [0x00000180] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x00000188] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2
-+/* [0x00000190] */ 0x8c44fe36, 0x140253e0, // add rb_base2, rb_base2, r0 ; mov r0, ra_y
-+// :1
-+/* [0x00000198] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
-+/* [0x000001a0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
-+/* [0x000001a8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
-+/* [0x000001b0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
-+/* [0x000001b8] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0
-+/* [0x000001c0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
-+/* [0x000001c8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
-+/* [0x000001d0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
-+/* [0x000001d8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
-+/* [0x000001e0] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2
-+/* [0x000001e8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x000001f0] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0
-+/* [0x000001f8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000200] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0
-+/* [0x00000208] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0
-+/* [0x00000210] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0
-+// ::mc_filter_c_p
-+/* [0x00000218] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
-+/* [0x00000220] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
-+/* [0x00000228] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0
-+/* [0x00000230] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif
-+/* [0x00000238] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif
-+/* [0x00000240] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
-+/* [0x00000248] */ 0x9209a1f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
-+/* [0x00000250] */ 0x119c31c0, 0xd0220567, // shl vrx_xshift_next, r0, 3
-+/* [0x00000258] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
-+/* [0x00000260] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
-+/* [0x00000268] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x00000270] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif
-+/* [0x00000278] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height
-+/* [0x00000280] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
-+/* [0x00000288] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
-+/* [0x00000290] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif
-+/* [0x00000298] */ 0x918073f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif
-+/* [0x000002a0] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val
-+/* [0x000002a8] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c
-+/* [0x000002b0] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
-+/* [0x000002b8] */ 0x5158c3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0
-+/* [0x000002c0] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add
-+/* [0x000002c8] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4
-+/* [0x000002d0] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif
-+// :1
-+/* [0x000002d8] */ 0x8d151bf6, 0xa00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu0
-+/* [0x000002e0] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next
-+/* [0x000002e8] */ 0x8e4485f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
-+/* [0x000002f0] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next
-+/* [0x000002f8] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15
-+/* [0x00000300] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
-+/* [0x00000308] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2
-+/* [0x00000310] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch
-+/* [0x00000318] */ 0x8c618cc7, 0x10024e20, // add vr_txs, vra_base, r3 ; v8min r0, r0, rb_pmask
-+/* [0x00000320] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0
-+/* [0x00000328] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1
-+/* [0x00000330] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0
-+/* [0x00000338] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
-+/* [0x00000340] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
-+/* [0x00000348] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
-+/* [0x00000350] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x00000358] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10
-+/* [0x00000360] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b
-+/* [0x00000368] */ 0x8f1c05f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7
-+/* [0x00000370] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a
-+/* [0x00000378] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11
-+/* [0x00000380] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
-+/* [0x00000388] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height
-+/* [0x00000390] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0
-+/* [0x00000398] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add
-+/* [0x000003a0] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3
-+/* [0x000003a8] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x000003b0] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
-+/* [0x000003b8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
-+/* [0x000003c0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
-+/* [0x000003c8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
-+/* [0x000003d0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x000003d8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
-+/* [0x000003e0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
-+/* [0x000003e8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
-+/* [0x000003f0] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b
-+/* [0x000003f8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x00000400] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
-+/* [0x00000408] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
-+// ::mc_filter_c_p_l1
-+/* [0x00000410] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
-+/* [0x00000418] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
-+/* [0x00000420] */ 0xf1081dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0
-+/* [0x00000428] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif
-+/* [0x00000430] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif
-+/* [0x00000438] */ 0x939c117f, 0x10125815, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
-+/* [0x00000440] */ 0x9209a1f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
-+/* [0x00000448] */ 0x119c31c0, 0xd0021067, // shl vrx_xshift_next, r0, 3
-+/* [0x00000450] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
-+/* [0x00000458] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
-+/* [0x00000460] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x00000468] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif
-+/* [0x00000470] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0 ; mov r1, ra_height
-+/* [0x00000478] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
-+/* [0x00000480] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
-+/* [0x00000488] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif
-+/* [0x00000490] */ 0x918073f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif
-+/* [0x00000498] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val
-+/* [0x000004a0] */ 0x910d01f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c
-+/* [0x000004a8] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
-+/* [0x000004b0] */ 0x5158c3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0
-+/* [0x000004b8] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add
-+/* [0x000004c0] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4
-+/* [0x000004c8] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif
-+// :1
-+/* [0x000004d0] */ 0x8d151bf6, 0xb00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu1
-+/* [0x000004d8] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next
-+/* [0x000004e0] */ 0x8e4485f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
-+/* [0x000004e8] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next
-+/* [0x000004f0] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15
-+/* [0x000004f8] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
-+/* [0x00000500] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2
-+/* [0x00000508] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch
-+/* [0x00000510] */ 0x8c5cfec6, 0x12024f20, // add vr_txs, vra_base, r3 ; v8min r0, r0, ra_pmax
-+/* [0x00000518] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0
-+/* [0x00000520] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1
-+/* [0x00000528] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0
-+/* [0x00000530] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
-+/* [0x00000538] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
-+/* [0x00000540] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
-+/* [0x00000548] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x00000550] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10
-+/* [0x00000558] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b
-+/* [0x00000560] */ 0x8f1c05f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7
-+/* [0x00000568] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a
-+/* [0x00000570] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11
-+/* [0x00000578] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
-+/* [0x00000580] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height
-+/* [0x00000588] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0
-+/* [0x00000590] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add
-+/* [0x00000598] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3
-+/* [0x000005a0] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x000005a8] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
-+/* [0x000005b0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
-+/* [0x000005b8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
-+/* [0x000005c0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
-+/* [0x000005c8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x000005d0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
-+/* [0x000005d8] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
-+/* [0x000005e0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
-+/* [0x000005e8] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b
-+/* [0x000005f0] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x000005f8] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
-+/* [0x00000600] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
-+// ::mc_filter_c_b
-+/* [0x00000608] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
-+/* [0x00000610] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
-+/* [0x00000618] */ 0xf1081dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1
-+/* [0x00000620] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a
-+/* [0x00000628] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif
-+/* [0x00000630] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
-+/* [0x00000638] */ 0x9281a1f6, 0x10025800, // min r0, r0, rb_max_x ; mov ra0, unif
-+/* [0x00000640] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
-+/* [0x00000648] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif
-+/* [0x00000650] */ 0x54402077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
-+/* [0x00000658] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x00000660] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height
-+/* [0x00000668] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next
-+/* [0x00000670] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
-+/* [0x00000678] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
-+/* [0x00000680] */ 0x8c59f3f6, 0xd4139496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0
-+/* [0x00000688] */ 0x918073f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif
-+/* [0x00000690] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif
-+/* [0x00000698] */ 0x910d01f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a
-+/* [0x000006a0] */ 0x8c0db1f6, 0x140246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b
-+/* [0x000006a8] */ 0x918011f6, 0xd0025801, // shl r0, r0, v_x_shift ; mov ra1, unif
-+/* [0x000006b0] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif
-+/* [0x000006b8] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif
-+/* [0x000006c0] */ 0x939de17f, 0x10025809, // max r0, r0, r5 ; mov ra9, rb_max_y
-+/* [0x000006c8] */ 0x9265a1f6, 0x14024822, // min r0, r0, rb_max_x ; mov r2, ra_kmul_add
-+/* [0x000006d0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
-+/* [0x000006d8] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif
-+/* [0x000006e0] */ 0x949dc07f, 0xd0024865, // and r1, r0, r1 ; mov r5rep, -4
-+/* [0x000006e8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x000006f0] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif
-+/* [0x000006f8] */ 0x8c667636, 0x140254e0, // add rb_base2_next, r3, r0 ; mov r0, ra_fir_off_val
-+/* [0x00000700] */ 0x4c5a7c86, 0x121245a1, // add ra_wt_mul_l0, ra_wt_mul_l0, r2 ; mul24 r1, r0, ra_wt_mul_l0
-+/* [0x00000708] */ 0x4c4a7c86, 0x121244a0, // add ra_wt_mul_l1, ra_wt_mul_l1, r2 ; mul24 r0, r0, ra_wt_mul_l1
-+/* [0x00000710] */ 0x8c4a7076, 0x14024821, // add r0, r0, r1 ; mov r1, ra_wt_off_l1
-+/* [0x00000718] */ 0x910cd3f6, 0xde02484b, // shl r1, r1, i_wt_den_p6 ; mov rb11, ra3.8d
-+/* [0x00000720] */ 0x8d827236, 0x1002531e, // sub rb_wt_off, r1, r0 ; mov ra_link, unif
-+/* [0x00000728] */ 0x95080ff6, 0x1e024287, // mov ra10, rb_xshift2 ; mov rb7, ra2.8d
-+// :1
-+/* [0x00000730] */ 0x0d9d1bc0, 0xa00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu0
-+/* [0x00000738] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, ra_xshift ; mov.ifz rb_base2, rb_base2_next
-+/* [0x00000740] */ 0x8e4c85f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next
-+/* [0x00000748] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next
-+/* [0x00000750] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y
-+/* [0x00000758] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15
-+/* [0x00000760] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
-+/* [0x00000768] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
-+/* [0x00000770] */ 0x8c618cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask
-+/* [0x00000778] */ 0x540183f0, 0x18024862, // and r1, r1, rb_pmask ; mul24 r2, ra0.8a, r0
-+/* [0x00000780] */ 0x4d01feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra0.8d, r1
-+/* [0x00000788] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0
-+/* [0x00000790] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
-+/* [0x00000798] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
-+/* [0x000007a0] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
-+/* [0x000007a8] */ 0x4c0854fe, 0xb8025804, // add r0, r2, r3 ; mul24 ra4, rb5, ra2.8a ; ldtmu1
-+/* [0x000007b0] */ 0x8e2869bf, 0x10024885, // shr r2, r4, ra10 ; mov rb5, rb6
-+/* [0x000007b8] */ 0x8e4485f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2
-+/* [0x000007c0] */ 0x8e1c01f6, 0xd00241c6, // shr ra7, r0, v_bit_depth - 8 ; mov rb6, ra7
-+/* [0x000007c8] */ 0x8c531789, 0xda124460, // add ra_y2, r3, ra_k1 ; mov r0, r1 << 15
-+/* [0x000007d0] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
-+/* [0x000007d8] */ 0x925de7ce, 0x120248e1, // min r3, r3, rb_max_y ; v8min r1, r1, ra_pmax
-+/* [0x000007e0] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
-+/* [0x000007e8] */ 0x8c5cfec6, 0x12024f20, // add t1s, rb_base2, r3 ; v8min r0, r0, ra_pmax
-+/* [0x000007f0] */ 0x4c041bf0, 0xd8025962, // add r5rep, r5, 1 ; mul24 r2, ra1.8a, r0
-+/* [0x000007f8] */ 0x4d05feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra1.8d, r1
-+/* [0x00000800] */ 0x4d07e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8b << 2, r0 << 2 @ "mul_used", 0
-+/* [0x00000808] */ 0x40074031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 12, r1 << 12 @ "mul_used", 0
-+/* [0x00000810] */ 0x4c07c6b0, 0xdc0248a3, // add r2, r3, r2 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0
-+/* [0x00000818] */ 0x4c072b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
-+/* [0x00000820] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x00000828] */ 0x4c0c94fe, 0x180248a0, // add r2, r2, r3 ; mul24 r0, rb9, ra3.8a
-+/* [0x00000830] */ 0x550caffe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra3.8b
-+/* [0x00000838] */ 0x8e2c05f6, 0xd00242ca, // shr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
-+/* [0x00000840] */ 0x4d08523e, 0x1a0248a1, // sub r2, r1, r0 ; mul24 r1, rb5, ra2.8b
-+/* [0x00000848] */ 0x8d112bf6, 0x100269e0, // sub.setf -, r5, rb_lcount ; mov r0, ra4
-+/* [0x00000850] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
-+/* [0x00000858] */ 0x4c1c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb7
-+/* [0x00000860] */ 0x4d0ca23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra3.8c
-+/* [0x00000868] */ 0x4c2cb437, 0x100248a0, // add r2, r2, r0 ; mul24 r0, ra11, rb11
-+/* [0x00000870] */ 0x0d9e7400, 0x100208a7, // sub r2, r2, r0
-+/* [0x00000878] */ 0x0e9c63c0, 0xd0020867, // shr r1, r1, 6
-+/* [0x00000880] */ 0x4e5865ce, 0xd20248a0, // shr r2, r2, 6 ; mul24 r0, r1, ra_wt_mul_l0
-+/* [0x00000888] */ 0x4c4a7456, 0x120248a1, // add r2, r2, r1 ; mul24 r1, r2, ra_wt_mul_l1
-+/* [0x00000890] */ 0x4c667216, 0x14024862, // add r1, r1, r0 ; mul24 r2, r2, ra_kmul_add
-+/* [0x00000898] */ 0x8d5e72b6, 0x1c024863, // sub r1, r1, r2 ; mov r3, ra_blk_height
-+/* [0x000008a0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
-+/* [0x000008a8] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x000008b0] */ 0x0f667380, 0x18020867, // asr r1, r1, ra_wt_den_p7
-+/* [0x000008b8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
-+/* [0x000008c0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
-+/* [0x000008c8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
-+/* [0x000008d0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x000008d8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
-+/* [0x000008e0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
-+/* [0x000008e8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
-+/* [0x000008f0] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b
-+/* [0x000008f8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x00000900] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
-+/* [0x00000908] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
-+// ::mc_sync_q0
-+/* [0x00000910] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000918] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000920] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000928] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000930] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000938] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000940] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000948] */ 0x00000001, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00000950] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i)
-+// ::mc_sync_q1
-+/* [0x00000958] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000960] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000968] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000970] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00000978] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000980] */ 0x00000002, 0xe80009e7, // mov dst, srel(i)
-+// ::mc_sync_q2
-+/* [0x00000988] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000990] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000998] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x000009a0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
-+/* [0x000009a8] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x000009b0] */ 0x00000003, 0xe80009e7, // mov dst, srel(i)
-+// ::mc_sync_q3
-+/* [0x000009b8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x000009c0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x000009c8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x000009d0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
-+/* [0x000009d8] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x000009e0] */ 0x009e7000, 0x100009e7, // nop
-+// ::mc_sync_q4
-+/* [0x000009e8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x000009f0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x000009f8] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000a00] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000a08] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000a10] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000a18] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000a20] */ 0x00000005, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00000a28] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i)
-+// ::mc_sync_q5
-+/* [0x00000a30] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000a38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000a40] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000a48] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00000a50] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000a58] */ 0x00000006, 0xe80009e7, // mov dst, srel(i)
-+// ::mc_sync_q6
-+/* [0x00000a60] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000a68] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000a70] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000a78] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00000a80] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000a88] */ 0x00000007, 0xe80009e7, // mov dst, srel(i)
-+// ::mc_sync_q7
-+/* [0x00000a90] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000a98] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000aa0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000aa8] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00000ab0] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000ab8] */ 0x009e7000, 0x100009e7, // nop
-+// ::mc_sync_q8
-+/* [0x00000ac0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000ac8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000ad0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000ad8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000ae0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000ae8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000af0] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000af8] */ 0x00000009, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00000b00] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
-+// ::mc_sync_q9
-+/* [0x00000b08] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000b10] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000b18] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000b20] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00000b28] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000b30] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i)
-+// ::mc_sync_q10
-+/* [0x00000b38] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000b40] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000b48] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000b50] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00000b58] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000b60] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i)
-+// ::mc_sync_q11
-+/* [0x00000b68] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000b70] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000b78] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000b80] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00000b88] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000b90] */ 0x009e7000, 0x100009e7, // nop
-+// ::mc_exit_c_qn
-+// ::mc_exit_y_qn
-+/* [0x00000b98] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1
-+// :1
-+/* [0x00000ba0] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
-+/* [0x00000ba8] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
-+/* [0x00000bb0] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
-+/* [0x00000bb8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
-+/* [0x00000bc0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000bc8] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
-+/* [0x00000bd0] */ 0x009e7000, 0x100009e7, // nop
-+/* [0x00000bd8] */ 0x009e7000, 0x100009e7, // nop
-+// ::mc_exit_c_q0
-+// ::mc_exit_y_q0
-+/* [0x00000be0] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1
-+// :1
-+/* [0x00000be8] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
-+/* [0x00000bf0] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
-+/* [0x00000bf8] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
-+/* [0x00000c00] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
-+/* [0x00000c08] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00000c10] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00000c18] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
-+/* [0x00000c20] */ 0x00000001, 0xe00209a7, // mov interrupt, 1
-+/* [0x00000c28] */ 0x009e7000, 0x100009e7, // nop
-+// ::mc_setup_y_q0
-+/* [0x00000c30] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
-+// ::mc_setup_y_qn
-+/* [0x00000c38] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif
-+/* [0x00000c40] */ 0x15827d80, 0x10020267, // mov ra9, unif
-+/* [0x00000c48] */ 0x15827d80, 0x10020067, // mov ra1, unif
-+/* [0x00000c50] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
-+/* [0x00000c58] */ 0x9181e1f6, 0xd00250cb, // shl rb_ef, r0, i_shift30 ; mov ra11, unif
-+/* [0x00000c60] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100
-+/* [0x00000c68] */ 0x000000ff, 0xe0021627, // mov rb_pmask, v_pmask
-+/* [0x00000c70] */ 0x001000ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
-+/* [0x00000c78] */ 0x00004000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
-+/* [0x00000c80] */ 0x4000000e, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
-+/* [0x00000c88] */ 0x050b0a00, 0xe0021567, // mov rb_y_coeffs_2, 0x050b0a00
-+/* [0x00000c90] */ 0x11283a40, 0xe00215a7, // mov rb_y_coeffs_3, 0x11283a40
-+/* [0x00000c98] */ 0x0a0b0500, 0xe00215e7, // mov rb_y_coeffs_5, 0x0a0b0500
-+/* [0x00000ca0] */ 0x15827d80, 0x100200e7, // mov ra3, unif
-+/* [0x00000ca8] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif
-+/* [0x00000cb0] */ 0x0d0c1dc0, 0xd40216a7, // sub rb_max_x, ra3.16b, 1
-+/* [0x00000cb8] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1
-+/* [0x00000cc0] */ 0x959a0dbf, 0x100248d0, // mov r3, elem_num ; mov rb_pitch, unif
-+/* [0x00000cc8] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-+/* [0x00000cd0] */ 0x159d03c0, 0x10021667, // or rb_dma1_base, r1, rb_pitch
-+/* [0x00000cd8] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3
-+/* [0x00000ce0] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
-+/* [0x00000ce8] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
-+/* [0x00000cf0] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
-+/* [0x00000cf8] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2
-+/* [0x00000d00] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch
-+/* [0x00000d08] */ 0x149e7080, 0x10020867, // and r1, r0, r2
-+/* [0x00000d10] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x00000d18] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000d20] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0
-+/* [0x00000d28] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
-+/* [0x00000d30] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
-+/* [0x00000d38] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
-+/* [0x00000d40] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
-+/* [0x00000d48] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
-+/* [0x00000d50] */ 0x149e7080, 0x10020867, // and r1, r0, r2
-+/* [0x00000d58] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x00000d60] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000d68] */ 0x0c2e7c00, 0x100213e7, // add rb_base2, ra11, r0
-+/* [0x00000d70] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a
-+/* [0x00000d78] */ 0x95043ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a
-+// :1
-+/* [0x00000d80] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
-+/* [0x00000d88] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
-+/* [0x00000d90] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
-+/* [0x00000d98] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
-+/* [0x00000da0] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0
-+/* [0x00000da8] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
-+/* [0x00000db0] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
-+/* [0x00000db8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
-+/* [0x00000dc0] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
-+/* [0x00000dc8] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2
-+/* [0x00000dd0] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+/* [0x00000dd8] */ 0x0f9c25c0, 0xd0020867, // asr r1, r2, 2
-+/* [0x00000de0] */ 0x119c63c0, 0xd0020867, // shl r1, r1, 6
-+/* [0x00000de8] */ 0x149c35c0, 0xd0020827, // and r0, r2, 3
-+/* [0x00000df0] */ 0x159e7040, 0x10020827, // or r0, r0, r1
-+/* [0x00000df8] */ 0x00004800, 0xe0020867, // mov r1, vpm_setup(0, 4, h8p(0, 0))
-+/* [0x00000e00] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
-+/* [0x00000e08] */ 0x80004004, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0))
-+/* [0x00000e10] */ 0x119c51c0, 0xd0020827, // shl r0, r0, 5
-+/* [0x00000e18] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
-+/* [0x00000e20] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00000e28] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0
-+/* [0x00000e30] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000e38] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0
-+/* [0x00000e40] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0
-+/* [0x00000e48] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0
-+// :per_block_setup_8
-+/* [0x00000e50] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
-+/* [0x00000e58] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
-+/* [0x00000e60] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
-+/* [0x00000e68] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
-+/* [0x00000e70] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif
-+/* [0x00000e78] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a
-+/* [0x00000e80] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x00000e88] */ 0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif
-+/* [0x00000e90] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0
-+/* [0x00000e98] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
-+/* [0x00000ea0] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a
-+/* [0x00000ea8] */ 0x9281a1f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif
-+/* [0x00000eb0] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
-+/* [0x00000eb8] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif
-+/* [0x00000ec0] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2 ; mov vw_setup, rb_vpm_init
-+/* [0x00000ec8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x00000ed0] */ 0x4c401077, 0xd4024821, // add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul
-+/* [0x00000ed8] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0
-+/* [0x00000ee0] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
-+/* [0x00000ee8] */ 0x8c5dc1c6, 0xdc025460, // add rb_i_tmu, r0, (7-8) - PREREAD ; v8min r0, r0, ra_blk_height
-+/* [0x00000ef0] */ 0x0c9df1c0, 0xd00214a7, // add rb_lcount, r0, (7-8)
-+/* [0x00000ef8] */ 0x916471f6, 0xd4024823, // shl r0, r0, v_dma_h_shift ; mov r3, ra_kmul_add
-+/* [0x00000f00] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00000f08] */ 0x916501f6, 0xd4024822, // shl r0, r0, v_dma_wh_shift ; mov r2, ra_fir_off_val
-+/* [0x00000f10] */ 0x8c81b1f6, 0x100246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, unif
-+/* [0x00000f18] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif
-+/* [0x00000f20] */ 0x915031f6, 0xde024205, // shl ra8, r0, 3 ; mov rb5, ra_k255
-+/* [0x00000f28] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400
-+/* [0x00000f30] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d
-+/* [0x00000f38] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c
-+/* [0x00000f40] */ 0x10215f80, 0x1e6200a7, // ror ra2.8c, rb_y_coeffs_2, ra8.8d
-+/* [0x00000f48] */ 0x10215f80, 0x1c620027, // ror ra0.8c, rb_y_coeffs_2, ra8.8c
-+/* [0x00000f50] */ 0x00010100, 0xe0020867, // mov r1,0x00010100
-+/* [0x00000f58] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif
-+/* [0x00000f60] */ 0x90205387, 0x1c424004, // ror ra0.8a, r1, ra8.8c ; v8min rb4, r0, rb5
-+/* [0x00000f68] */ 0x914883f6, 0xd0031856, // shl r1, r1, 8 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1
-+/* [0x00000f70] */ 0x902203bf, 0x1e02581c, // ror r0, r1, ra8.8d ; mov ra_dest, unif
-+/* [0x00000f78] */ 0x90205387, 0x1c72404b, // ror ra1.8d, r1, ra8.8c ; v8min rb11, r0, rb5
-+/* [0x00000f80] */ 0x10216f80, 0x1e7200a7, // ror ra2.8d, rb_y_coeffs_3, ra8.8d
-+/* [0x00000f88] */ 0x10216f80, 0x1c720027, // ror ra0.8d, rb_y_coeffs_3, ra8.8c
-+/* [0x00000f90] */ 0x10217f80, 0x1e5200e7, // ror ra3.8b, rb_y_coeffs_5, ra8.8d
-+/* [0x00000f98] */ 0x10217f80, 0x1c520067, // ror ra1.8b, rb_y_coeffs_5, ra8.8c
-+/* [0x00000fa0] */ 0x04040100, 0xe0020867, // mov r1,0x04040100
-+/* [0x00000fa8] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d
-+/* [0x00000fb0] */ 0x902183bf, 0xdc624065, // ror ra1.8c, r1, ra8.8c ; mov r5rep, -8
-+/* [0x00000fb8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00000fc0] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
-+/* [0x00000fc8] */ 0x902203bf, 0x1e02581e, // ror r0, r1, ra8.8d ; mov ra_link, unif
-+/* [0x00000fd0] */ 0x90205387, 0x1c424048, // ror ra1.8a, r1, ra8.8c ; v8min rb8, r0, rb5
-+// ::mc_filter_y_pxx
-+/* [0x00000fd8] */ 0xfffffe58, 0xf0f807a7, // brr ra_link, r:per_block_setup_8
-+/* [0x00000fe0] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
-+/* [0x00000fe8] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2
-+/* [0x00000ff0] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
-+/* [0x00000ff8] */ 0x1158cdc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p5
-+/* [0x00001000] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0
-+/* [0x00001008] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4
-+// :1
-+/* [0x00001010] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef
-+/* [0x00001018] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0
-+/* [0x00001020] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1
-+/* [0x00001028] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0
-+/* [0x00001030] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6
-+/* [0x00001038] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7
-+/* [0x00001040] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1
-+/* [0x00001048] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8
-+/* [0x00001050] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax
-+/* [0x00001058] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch
-+/* [0x00001060] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9
-+/* [0x00001068] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0
-+/* [0x00001070] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0
-+/* [0x00001078] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0
-+/* [0x00001080] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0
-+/* [0x00001088] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
-+/* [0x00001090] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
-+/* [0x00001098] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
-+/* [0x000010a0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
-+/* [0x000010a8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
-+/* [0x000010b0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
-+/* [0x000010b8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
-+/* [0x000010c0] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
-+/* [0x000010c8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
-+/* [0x000010d0] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
-+/* [0x000010d8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
-+/* [0x000010e0] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
-+/* [0x000010e8] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x000010f0] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b
-+/* [0x000010f8] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b
-+/* [0x00001100] */ 0x8f2c05f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
-+/* [0x00001108] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
-+/* [0x00001110] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d
-+/* [0x00001118] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8
-+/* [0x00001120] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c
-+/* [0x00001128] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11
-+/* [0x00001130] */ 0x8d5d1bf6, 0x1c0269e3, // sub.setf -, r5, rb_i_tmu ; mov r3, ra_blk_height
-+/* [0x00001138] */ 0x8d1133bf, 0x1002884f, // sub r1, r1, ra4 ; mov.ifz rb_base2, rb_base2_next
-+/* [0x00001140] */ 0x8d6a7236, 0x10029858, // sub r1, r1, r0 ; mov.ifz ra_base, ra_base_next
-+/* [0x00001148] */ 0x8f4c63f6, 0xd0029851, // asr r1, r1, 6 ; mov.ifz ra_y_y2, ra_y_y2_next
-+/* [0x00001150] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0
-+/* [0x00001158] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add
-+/* [0x00001160] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3
-+/* [0x00001168] */ 0xfffffe88, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x00001170] */ 0x0f9cd3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
-+/* [0x00001178] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
-+/* [0x00001180] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
-+/* [0x00001188] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
-+/* [0x00001190] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x00001198] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
-+/* [0x000011a0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
-+/* [0x000011a8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
-+/* [0x000011b0] */ 0xfffffe40, 0xf0f809e7, // brr -, r:1b
-+/* [0x000011b8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x000011c0] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
-+/* [0x000011c8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
-+// ::mc_filter_y_bxx
-+/* [0x000011d0] */ 0xfffffc60, 0xf0f807a7, // brr ra_link, r:per_block_setup_8
-+/* [0x000011d8] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
-+/* [0x000011e0] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2
-+/* [0x000011e8] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
-+/* [0x000011f0] */ 0x1158ddc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p6
-+/* [0x000011f8] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0
-+/* [0x00001200] */ 0x4d4a7216, 0x12024860, // sub r1, r1, r0 ; mul24 r0, r2, ra_wt_mul_l1
-+/* [0x00001208] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4
-+// :1
-+/* [0x00001210] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef
-+/* [0x00001218] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0
-+/* [0x00001220] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1
-+/* [0x00001228] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0
-+/* [0x00001230] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6
-+/* [0x00001238] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7
-+/* [0x00001240] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1
-+/* [0x00001248] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8
-+/* [0x00001250] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax
-+/* [0x00001258] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch
-+/* [0x00001260] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9
-+/* [0x00001268] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0
-+/* [0x00001270] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0
-+/* [0x00001278] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0
-+/* [0x00001280] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0
-+/* [0x00001288] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
-+/* [0x00001290] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
-+/* [0x00001298] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
-+/* [0x000012a0] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
-+/* [0x000012a8] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
-+/* [0x000012b0] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
-+/* [0x000012b8] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
-+/* [0x000012c0] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
-+/* [0x000012c8] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
-+/* [0x000012d0] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
-+/* [0x000012d8] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
-+/* [0x000012e0] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
-+/* [0x000012e8] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x000012f0] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b
-+/* [0x000012f8] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b
-+/* [0x00001300] */ 0x8f2c05f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
-+/* [0x00001308] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
-+/* [0x00001310] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d
-+/* [0x00001318] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8
-+/* [0x00001320] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c
-+/* [0x00001328] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11
-+/* [0x00001330] */ 0x0d127380, 0x10020867, // sub r1, r1, ra4
-+/* [0x00001338] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off
-+/* [0x00001340] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-+/* [0x00001348] */ 0x4d591bce, 0x120269e0, // sub.setf -, r5, rb_i_tmu ; mul24 r0, r1, ra_wt_mul_l0
-+/* [0x00001350] */ 0x55653fce, 0x140453e1, // mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add
-+/* [0x00001358] */ 0x8d4e7076, 0x10029851, // sub r1, r0, r1 ; mov.ifz ra_y_y2, ra_y_y2_next
-+/* [0x00001360] */ 0x8d692bf6, 0x1002b9d8, // sub.setf -, r5, rb_lcount ; mov.ifz ra_base, ra_base_next
-+/* [0x00001368] */ 0x8c9f8289, 0xd0024860, // add r1, r1, r2 ; mov r0, r1 << 8
-+/* [0x00001370] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height
-+/* [0x00001378] */ 0xfffffe78, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x00001380] */ 0x4f65039f, 0x18024862, // asr r1, r1, ra_wt_den_p7 ; mul24 r2, r3, rb_pitch
-+/* [0x00001388] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
-+/* [0x00001390] */ 0xf34003f3, 0xd2024c20, // max vpm, r1, 0 ; v8subs r0, ra_height, r3
-+/* [0x00001398] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
-+/* [0x000013a0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x000013a8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
-+/* [0x000013b0] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
-+/* [0x000013b8] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
-+/* [0x000013c0] */ 0xfffffe30, 0xf0f809e7, // brr -, r:1b
-+/* [0x000013c8] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x000013d0] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
-+/* [0x000013d8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
-+// ::mc_filter_y_p00
-+/* [0x000013e0] */ 0x959a0ff6, 0x10024020, // mov ra0, unif ; mov r0, elem_num
-+/* [0x000013e8] */ 0xf5567dad, 0x14124565, // mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5
-+/* [0x000013f0] */ 0x8c020c3f, 0x1402581a, // add r0, ra0.16b, r0 ; mov ra_base_next, unif
-+/* [0x000013f8] */ 0x93027176, 0x12225813, // max r0, r0, r5 ; mov ra_y_next, ra0.16a
-+/* [0x00001400] */ 0x9281a1f6, 0x10025810, // min r0, r0, rb_max_x ; mov ra_width_height, unif
-+/* [0x00001408] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
-+/* [0x00001410] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
-+/* [0x00001418] */ 0x8d810bf6, 0x10025896, // sub r2, r5, rb_pitch ; mov ra_wt_off_mul_l0, unif
-+/* [0x00001420] */ 0x149e7080, 0x10020867, // and r1, r0, r2
-+/* [0x00001428] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x00001430] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif
-+/* [0x00001438] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init
-+/* [0x00001440] */ 0x11400dc0, 0xd4020867, // shl r1, ra_width, v_x_shift
-+/* [0x00001448] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
-+/* [0x00001450] */ 0x8d5c31c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height
-+/* [0x00001458] */ 0x919c71c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0
-+/* [0x00001460] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00001468] */ 0x1158edc0, 0xd4021327, // shl rb_wt_off, ra_wt_off_l0, DENOM + 7
-+/* [0x00001470] */ 0x918101f6, 0xd002581e, // shl r0, r0, v_dma_wh_shift ; mov ra_link, unif
-+/* [0x00001478] */ 0x0c9db1c0, 0x100206e7, // add ra_dma0, r0, rb_dma0_base
-+// :1
-+/* [0x00001480] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1
-+/* [0x00001488] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0
-+/* [0x00001490] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
-+/* [0x00001498] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
-+/* [0x000014a0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
-+/* [0x000014a8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
-+/* [0x000014b0] */ 0x8c618c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask
-+/* [0x000014b8] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0
-+/* [0x000014c0] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height
-+/* [0x000014c8] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
-+/* [0x000014d0] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x000014d8] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, DENOM + 8
-+/* [0x000014e0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
-+/* [0x000014e8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
-+/* [0x000014f0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
-+/* [0x000014f8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x00001500] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
-+/* [0x00001508] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
-+/* [0x00001510] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
-+/* [0x00001518] */ 0xffffff48, 0xf0f809e7, // brr -, r:1b
-+/* [0x00001520] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x00001528] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
-+/* [0x00001530] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
-+// ::mc_filter_y_b00
-+/* [0x00001538] */ 0xfffff8f8, 0xf0f807a7, // brr ra_link, r:per_block_setup_8
-+/* [0x00001540] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
-+/* [0x00001548] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2
-+/* [0x00001550] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
-+/* [0x00001558] */ 0x00000001, 0xe00208a7, // mov r2, 1
-+/* [0x00001560] */ 0x8c591eb6, 0x10025461, // add rb_i_tmu, rb_i_tmu, r2 ; mov r1, ra_wt_off_mul_l0
-+/* [0x00001568] */ 0xf158fded, 0xd4025325, // shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5
-+/* [0x00001570] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8
-+// :1
-+/* [0x00001578] */ 0x0d9d1bc0, 0xb00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu1
-+/* [0x00001580] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
-+/* [0x00001588] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
-+/* [0x00001590] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
-+/* [0x00001598] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
-+/* [0x000015a0] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
-+/* [0x000015a8] */ 0x8c613cbf, 0x10028e0f, // add t0s, ra_base, r2 ; mov.ifz rb_base2, rb_base2_next
-+/* [0x000015b0] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0
-+/* [0x000015b8] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
-+/* [0x000015c0] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
-+/* [0x000015c8] */ 0x8c5cfe86, 0x12024f20, // add t1s, rb_base2, r2 ; v8min r0, r0, ra_pmax
-+/* [0x000015d0] */ 0x545983c6, 0x12024860, // and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0
-+/* [0x000015d8] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1
-+/* [0x000015e0] */ 0xcc52706e, 0x1a024865, // add r1, r0, r1 ; v8adds r5rep, r5, ra_k1
-+/* [0x000015e8] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height
-+/* [0x000015f0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
-+/* [0x000015f8] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x00001600] */ 0x0f9d03c0, 0xd0020867, // asr r1, r1, (DENOM + 9) - 32
-+/* [0x00001608] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
-+/* [0x00001610] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
-+/* [0x00001618] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
-+/* [0x00001620] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x00001628] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
-+/* [0x00001630] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
-+/* [0x00001638] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
-+/* [0x00001640] */ 0xffffff18, 0xf0f809e7, // brr -, r:1b
-+/* [0x00001648] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x00001650] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
-+/* [0x00001658] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
-+// ::mc_setup_c10_q0
-+/* [0x00001660] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
-+// ::mc_setup_c10_qn
-+/* [0x00001668] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif
-+/* [0x00001670] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
-+/* [0x00001678] */ 0x9181e1f6, 0xd00250d8, // shl rb_ef, r0, i_shift30 ; mov ra_base, unif
-+/* [0x00001680] */ 0x0d801dc0, 0xd0020827, // sub r0, unif, 1
-+/* [0x00001688] */ 0x119c21c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift
-+/* [0x00001690] */ 0x0d801dc0, 0xd00217a7, // sub rb_max_y, unif, 1
-+/* [0x00001698] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100
-+/* [0x000016a0] */ 0x0000ffff, 0xe0021627, // mov rb_pmask, v_pmask
-+/* [0x000016a8] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
-+/* [0x000016b0] */ 0x00010000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
-+/* [0x000016b8] */ 0x4000000c, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
-+/* [0x000016c0] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif
-+/* [0x000016c8] */ 0x15827d80, 0x10021427, // mov rb_pitch, unif
-+/* [0x000016d0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-+/* [0x000016d8] */ 0x0c9d03c0, 0x10021667, // add rb_dma1_base, r1, rb_pitch
-+/* [0x000016e0] */ 0x14981f80, 0xd0020827, // and r0, 1, elem_num
-+/* [0x000016e8] */ 0x409c5007, 0xd00049e0, // nop ; mul24 r0, r0, 5
-+/* [0x000016f0] */ 0x0c9a7180, 0x10020827, // add r0, r0, elem_num
-+/* [0x000016f8] */ 0x0c9e7000, 0x100210a7, // add rb_elem_x, r0, r0
-+/* [0x00001700] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
-+/* [0x00001708] */ 0x0c9c21c0, 0x10020827, // add r0, r0, rb_elem_x
-+/* [0x00001710] */ 0x930001f6, 0xd2225811, // max r0, r0, 0 ; mov ra_y, ra0.16a
-+/* [0x00001718] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
-+/* [0x00001720] */ 0x00000000, 0xe0224541, // mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0
-+/* [0x00001728] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
-+/* [0x00001730] */ 0x149e7040, 0x10020867, // and r1, r0, r1
-+/* [0x00001738] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x00001740] */ 0x8c827076, 0x10025800, // add r0, r0, r1 ; mov ra0, unif
-+/* [0x00001748] */ 0x0c627c00, 0x10020627, // add ra_base, ra_base, r0
-+/* [0x00001750] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+/* [0x00001758] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1
-+/* [0x00001760] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4
-+/* [0x00001768] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1
-+/* [0x00001770] */ 0x159e7040, 0x10020827, // or r0, r0, r1
-+/* [0x00001778] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
-+/* [0x00001780] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
-+/* [0x00001788] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0))
-+/* [0x00001790] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6
-+/* [0x00001798] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
-+/* [0x000017a0] */ 0x11002dc0, 0xd4020827, // shl r0, ra0.16b, v_x_shift
-+/* [0x000017a8] */ 0x8c0021f6, 0x12125811, // add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a
-+/* [0x000017b0] */ 0x938001f6, 0xd002480f, // max r0, r0, 0 ; mov rb_base2, unif
-+/* [0x000017b8] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
-+/* [0x000017c0] */ 0x0d510dc0, 0x18020867, // sub r1, ra_k0, rb_pitch
-+/* [0x000017c8] */ 0x949c307f, 0xd0024863, // and r1, r0, r1 ; mov r3, PREREAD
-+/* [0x000017d0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x000017d8] */ 0x8c467076, 0x12024822, // add r0, r0, r1 ; mov r2, ra_y2
-+/* [0x000017e0] */ 0x8c44fe36, 0x140253e0, // add rb_base2, rb_base2, r0 ; mov r0, ra_y
-+// :1
-+/* [0x000017e8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
-+/* [0x000017f0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
-+/* [0x000017f8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
-+/* [0x00001800] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
-+/* [0x00001808] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0
-+/* [0x00001810] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
-+/* [0x00001818] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
-+/* [0x00001820] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
-+/* [0x00001828] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
-+/* [0x00001830] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2
-+/* [0x00001838] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00001840] */ 0x00000000, 0xe0024104, // mov ra4, 0 ; mov rb4, 0
-+/* [0x00001848] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00001850] */ 0x00000000, 0xe0024145, // mov ra5, 0 ; mov rb5, 0
-+/* [0x00001858] */ 0x00000000, 0xe0024186, // mov ra6, 0 ; mov rb6, 0
-+/* [0x00001860] */ 0x00000000, 0xe00241c7, // mov ra7, 0 ; mov rb7, 0
-+// ::mc_filter_c10_p
-+/* [0x00001868] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
-+/* [0x00001870] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
-+/* [0x00001878] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0
-+/* [0x00001880] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif
-+/* [0x00001888] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif
-+/* [0x00001890] */ 0x93567176, 0x14024800, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
-+/* [0x00001898] */ 0x9209a1f6, 0x12225813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
-+/* [0x000018a0] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
-+/* [0x000018a8] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x000018b0] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif
-+/* [0x000018b8] */ 0x8c427636, 0x120246a1, // add vrx_base_next, r3, r0 ; mov r1, ra_height
-+/* [0x000018c0] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
-+/* [0x000018c8] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
-+/* [0x000018d0] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif
-+/* [0x000018d8] */ 0x918083f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif
-+/* [0x000018e0] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val
-+/* [0x000018e8] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c
-+/* [0x000018f0] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
-+/* [0x000018f8] */ 0x5158a3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0
-+/* [0x00001900] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add
-+/* [0x00001908] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4
-+/* [0x00001910] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif
-+// :1
-+/* [0x00001918] */ 0x8d151bf6, 0xa00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu0
-+/* [0x00001920] */ 0x8e4c09f6, 0x140288a3, // shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next
-+/* [0x00001928] */ 0x8e4505f6, 0xd402c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
-+/* [0x00001930] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next
-+/* [0x00001938] */ 0x8c531789, 0xda224460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15
-+/* [0x00001940] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
-+/* [0x00001948] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2
-+/* [0x00001950] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch
-+/* [0x00001958] */ 0x8c618cc7, 0x10024e20, // add vr_txs, vra_base, r3 ; v8min r0, r0, rb_pmask
-+/* [0x00001960] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0
-+/* [0x00001968] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1
-+/* [0x00001970] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0
-+/* [0x00001978] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
-+/* [0x00001980] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
-+/* [0x00001988] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
-+/* [0x00001990] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x00001998] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10
-+/* [0x000019a0] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b
-+/* [0x000019a8] */ 0x8f1c25f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7
-+/* [0x000019b0] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a
-+/* [0x000019b8] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11
-+/* [0x000019c0] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
-+/* [0x000019c8] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height
-+/* [0x000019d0] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0
-+/* [0x000019d8] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add
-+/* [0x000019e0] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3
-+/* [0x000019e8] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x000019f0] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
-+/* [0x000019f8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
-+/* [0x00001a00] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
-+/* [0x00001a08] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
-+/* [0x00001a10] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x00001a18] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
-+/* [0x00001a20] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
-+/* [0x00001a28] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
-+/* [0x00001a30] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b
-+/* [0x00001a38] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x00001a40] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
-+/* [0x00001a48] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
-+// ::mc_filter_c10_p_l1
-+/* [0x00001a50] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
-+/* [0x00001a58] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
-+/* [0x00001a60] */ 0xf1082dc0, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0
-+/* [0x00001a68] */ 0x8c8021f6, 0x10025810, // add r0, r0, rb_elem_x ; mov ra_width_height, unif
-+/* [0x00001a70] */ 0x8d810bf6, 0x10025840, // sub r1, r5, rb_pitch ; mov ra0, unif
-+/* [0x00001a78] */ 0x939c117f, 0x10125815, // max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
-+/* [0x00001a80] */ 0x9209a1f6, 0x12125813, // min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
-+/* [0x00001a88] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
-+/* [0x00001a90] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x00001a98] */ 0x8c827076, 0x10025803, // add r0, r0, r1 ; mov ra3, unif
-+/* [0x00001aa0] */ 0x8c427636, 0x120254e1, // add vrx_base_next, r3, r0 ; mov r1, ra_height
-+/* [0x00001aa8] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
-+/* [0x00001ab0] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
-+/* [0x00001ab8] */ 0x8c81f3f6, 0xd0039496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif
-+/* [0x00001ac0] */ 0x918083f6, 0xd002581c, // shl r0, r1, v_dma_h_shift ; mov ra_dest, unif
-+/* [0x00001ac8] */ 0x8c6670b6, 0x14024822, // add r0, r0, r2 ; mov r2, ra_fir_off_val
-+/* [0x00001ad0] */ 0x910cf1f6, 0xdc02480a, // shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c
-+/* [0x00001ad8] */ 0x8c59b1f6, 0x140246e1, // add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0
-+/* [0x00001ae0] */ 0x5158a3d6, 0xd2024860, // shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0
-+/* [0x00001ae8] */ 0x8d667236, 0x14025320, // sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add
-+/* [0x00001af0] */ 0x8c59cc3f, 0xd21245a5, // add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4
-+/* [0x00001af8] */ 0x950e0dbf, 0x1e0252de, // mov rb11, ra3.8d ; mov ra_link, unif
-+// :1
-+/* [0x00001b00] */ 0x8d151bf6, 0xb00269c4, // sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu1
-+/* [0x00001b08] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next
-+/* [0x00001b10] */ 0x8e4505f6, 0xd202c863, // shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
-+/* [0x00001b18] */ 0x8c4c3ff6, 0x1202a9e3, // add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next
-+/* [0x00001b20] */ 0x8c531789, 0xda124460, // add vra_y, r3, ra_k1 ; mov r0, r1 << 15
-+/* [0x00001b28] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
-+/* [0x00001b30] */ 0x929de7d2, 0x1003c8e0, // min r3, r3, rb_max_y ; mov.ifnc r0, r2
-+/* [0x00001b38] */ 0x545d039f, 0x12024863, // and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch
-+/* [0x00001b40] */ 0x8c5cfec6, 0x12024f20, // add vr_txs, vra_base, r3 ; v8min r0, r0, ra_pmax
-+/* [0x00001b48] */ 0x4c001bf0, 0xd8025963, // add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0
-+/* [0x00001b50] */ 0x4d01fef1, 0x1e0248a3, // sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1
-+/* [0x00001b58] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0
-+/* [0x00001b60] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
-+/* [0x00001b68] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
-+/* [0x00001b70] */ 0x4c032b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
-+/* [0x00001b78] */ 0xffffff68, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x00001b80] */ 0x4c1ca4f7, 0x100248a0, // add r2, r2, r3 ; mul24 r0, ra7, rb10
-+/* [0x00001b88] */ 0x550c6ffe, 0x1a024161, // mov ra5, rb6 ; mul24 r1, rb6, ra3.8b
-+/* [0x00001b90] */ 0x8f1c25f6, 0xd00241c6, // asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7
-+/* [0x00001b98] */ 0x4c0c423e, 0x18024860, // add r1, r1, r0 ; mul24 r0, rb4, ra3.8a
-+/* [0x00001ba0] */ 0x4d1cb237, 0x10024860, // sub r1, r1, r0 ; mul24 r0, ra7, rb11
-+/* [0x00001ba8] */ 0x0d9e7200, 0x10020867, // sub r1, r1, r0
-+/* [0x00001bb0] */ 0x8f5c63f6, 0xdc024863, // asr r1, r1, 6 ; mov r3, ra_blk_height
-+/* [0x00001bb8] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0
-+/* [0x00001bc0] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add
-+/* [0x00001bc8] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3
-+/* [0x00001bd0] */ 0xffffff10, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x00001bd8] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
-+/* [0x00001be0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
-+/* [0x00001be8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
-+/* [0x00001bf0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
-+/* [0x00001bf8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x00001c00] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
-+/* [0x00001c08] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
-+/* [0x00001c10] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
-+/* [0x00001c18] */ 0xfffffec8, 0xf0f809e7, // brr -, r:1b
-+/* [0x00001c20] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x00001c28] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
-+/* [0x00001c30] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
-+// ::mc_filter_c10_b
-+/* [0x00001c38] */ 0x9581cff6, 0x10025c42, // mov vw_setup, rb_vpm_init ; mov ra2, unif
-+/* [0x00001c40] */ 0x8c803ff6, 0x100269e3, // add.setf -, rb_ef, rb_ef ; mov r3, unif
-+/* [0x00001c48] */ 0xf1082dc9, 0xd4024825, // shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1
-+/* [0x00001c50] */ 0x8c0821f6, 0x12225813, // add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a
-+/* [0x00001c58] */ 0x8d810bf6, 0x10025850, // sub r1, r5, rb_pitch ; mov ra_width_height, unif
-+/* [0x00001c60] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
-+/* [0x00001c68] */ 0x9281a1f6, 0x10025800, // min r0, r0, rb_max_x ; mov ra0, unif
-+/* [0x00001c70] */ 0x9481c1f6, 0xd0025802, // and r0, r0, -4 ; mov ra2, unif
-+/* [0x00001c78] */ 0x54404077, 0xd4024862, // and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul
-+/* [0x00001c80] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x00001c88] */ 0x8c427076, 0x12024821, // add r0, r0, r1 ; mov r1, ra_height
-+/* [0x00001c90] */ 0x8c9c163f, 0x10024680, // add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next
-+/* [0x00001c98] */ 0x8d819eb6, 0x10025756, // sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif
-+/* [0x00001ca0] */ 0x8c5dc3ce, 0xdc025461, // add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
-+/* [0x00001ca8] */ 0x8c59f3f6, 0xd4139496, // add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0
-+/* [0x00001cb0] */ 0x918083f6, 0xd0025803, // shl r0, r1, v_dma_h_shift ; mov ra3, unif
-+/* [0x00001cb8] */ 0x8c8270b6, 0x10024823, // add r0, r0, r2 ; mov r3, unif
-+/* [0x00001cc0] */ 0x910cf1f6, 0xd2125813, // shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a
-+/* [0x00001cc8] */ 0x8c0db1f6, 0x140246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b
-+/* [0x00001cd0] */ 0x918021f6, 0xd0025801, // shl r0, r0, v_x_shift ; mov ra1, unif
-+/* [0x00001cd8] */ 0x8c8021f6, 0x10025803, // add r0, r0, rb_elem_x ; mov ra3, unif
-+/* [0x00001ce0] */ 0x8d810bf6, 0x10025852, // sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif
-+/* [0x00001ce8] */ 0x939de17f, 0x10025809, // max r0, r0, r5 ; mov ra9, rb_max_y
-+/* [0x00001cf0] */ 0x9265a1f6, 0x14024822, // min r0, r0, rb_max_x ; mov r2, ra_kmul_add
-+/* [0x00001cf8] */ 0x9481c1f6, 0xd0039812, // and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif
-+/* [0x00001d00] */ 0x949dc07f, 0xd0024865, // and r1, r0, r1 ; mov r5rep, -4
-+/* [0x00001d08] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x00001d10] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif
-+/* [0x00001d18] */ 0x8c667636, 0x140254e0, // add rb_base2_next, r3, r0 ; mov r0, ra_fir_off_val
-+/* [0x00001d20] */ 0x4c5a7c86, 0x121245a1, // add ra_wt_mul_l0, ra_wt_mul_l0, r2 ; mul24 r1, r0, ra_wt_mul_l0
-+/* [0x00001d28] */ 0x4c4a7c86, 0x121244a0, // add ra_wt_mul_l1, ra_wt_mul_l1, r2 ; mul24 r0, r0, ra_wt_mul_l1
-+/* [0x00001d30] */ 0x8c4a7076, 0x14024821, // add r0, r0, r1 ; mov r1, ra_wt_off_l1
-+/* [0x00001d38] */ 0x910cb3f6, 0xde02484b, // shl r1, r1, i_wt_den_p6 ; mov rb11, ra3.8d
-+/* [0x00001d40] */ 0x8d827236, 0x1002531e, // sub rb_wt_off, r1, r0 ; mov ra_link, unif
-+/* [0x00001d48] */ 0x95080ff6, 0x1e024287, // mov ra10, rb_xshift2 ; mov rb7, ra2.8d
-+// :1
-+/* [0x00001d50] */ 0x0d9d1bc0, 0xa00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu0
-+/* [0x00001d58] */ 0x8e5539bf, 0x1202888f, // shr r2, r4, ra_xshift ; mov.ifz rb_base2, rb_base2_next
-+/* [0x00001d60] */ 0x8e4d05f6, 0xd0029851, // shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next
-+/* [0x00001d68] */ 0x8c683ff6, 0x1002b9d8, // add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next
-+/* [0x00001d70] */ 0x8c441fb6, 0xd4224463, // add ra_y, 1, ra_y ; mov r3, ra_y
-+/* [0x00001d78] */ 0x93531789, 0xd80248e0, // max r3, r3, ra_k0 ; mov r0, r1 << 15
-+/* [0x00001d80] */ 0x9227f792, 0xd003c8e1, // min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
-+/* [0x00001d88] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
-+/* [0x00001d90] */ 0x8c618cc7, 0x10024e20, // add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask
-+/* [0x00001d98] */ 0x540183f0, 0x18024862, // and r1, r1, rb_pmask ; mul24 r2, ra0.8a, r0
-+/* [0x00001da0] */ 0x4d01feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra0.8d, r1
-+/* [0x00001da8] */ 0x4d03e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0
-+/* [0x00001db0] */ 0x40034031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
-+/* [0x00001db8] */ 0x4c03c4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
-+/* [0x00001dc0] */ 0x40032031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
-+/* [0x00001dc8] */ 0x4c0854fe, 0xb8025804, // add r0, r2, r3 ; mul24 ra4, rb5, ra2.8a ; ldtmu1
-+/* [0x00001dd0] */ 0x8e2869bf, 0x10024885, // shr r2, r4, ra10 ; mov rb5, rb6
-+/* [0x00001dd8] */ 0x8e4505f6, 0xd2024863, // shr r1, r2, v_v_shift ; mov r3, ra_y2
-+/* [0x00001de0] */ 0x8e1c21f6, 0xd00241c6, // shr ra7, r0, v_bit_depth - 8 ; mov rb6, ra7
-+/* [0x00001de8] */ 0x8c531789, 0xda124460, // add ra_y2, r3, ra_k1 ; mov r0, r1 << 15
-+/* [0x00001df0] */ 0x9353f792, 0xd803c8e1, // max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
-+/* [0x00001df8] */ 0x925de7ce, 0x120248e1, // min r3, r3, rb_max_y ; v8min r1, r1, ra_pmax
-+/* [0x00001e00] */ 0x559d049f, 0x100e4823, // mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
-+/* [0x00001e08] */ 0x8c5cfec6, 0x12024f20, // add t1s, rb_base2, r3 ; v8min r0, r0, ra_pmax
-+/* [0x00001e10] */ 0x4c041bf0, 0xd8025962, // add r5rep, r5, 1 ; mul24 r2, ra1.8a, r0
-+/* [0x00001e18] */ 0x4d05feb1, 0x1e0248a3, // sub r2, rb_fir_off_h, r2 ; mul24 r3, ra1.8d, r1
-+/* [0x00001e20] */ 0x4d07e4f0, 0xda0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8b << 2, r0 << 2 @ "mul_used", 0
-+/* [0x00001e28] */ 0x40074031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 12, r1 << 12 @ "mul_used", 0
-+/* [0x00001e30] */ 0x4c07c6b0, 0xdc0248a3, // add r2, r3, r2 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0
-+/* [0x00001e38] */ 0x4c072b71, 0xdc0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
-+/* [0x00001e40] */ 0xfffffef0, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x00001e48] */ 0x4c0c94fe, 0x180248a0, // add r2, r2, r3 ; mul24 r0, rb9, ra3.8a
-+/* [0x00001e50] */ 0x550caffe, 0x1a025261, // mov rb9, rb10 ; mul24 r1, rb10, ra3.8b
-+/* [0x00001e58] */ 0x8e2c25f6, 0xd00242ca, // shr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
-+/* [0x00001e60] */ 0x4d08523e, 0x1a0248a1, // sub r2, r1, r0 ; mul24 r1, rb5, ra2.8b
-+/* [0x00001e68] */ 0x8d112bf6, 0x100269e0, // sub.setf -, r5, rb_lcount ; mov r0, ra4
-+/* [0x00001e70] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
-+/* [0x00001e78] */ 0x4c1c7237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra7, rb7
-+/* [0x00001e80] */ 0x4d0ca23e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb10, ra3.8c
-+/* [0x00001e88] */ 0x4c2cb437, 0x100248a0, // add r2, r2, r0 ; mul24 r0, ra11, rb11
-+/* [0x00001e90] */ 0x0d9e7400, 0x100208a7, // sub r2, r2, r0
-+/* [0x00001e98] */ 0x0e9c63c0, 0xd0020867, // shr r1, r1, 6
-+/* [0x00001ea0] */ 0x4e5865ce, 0xd20248a0, // shr r2, r2, 6 ; mul24 r0, r1, ra_wt_mul_l0
-+/* [0x00001ea8] */ 0x4c4a7456, 0x120248a1, // add r2, r2, r1 ; mul24 r1, r2, ra_wt_mul_l1
-+/* [0x00001eb0] */ 0x4c667216, 0x14024862, // add r1, r1, r0 ; mul24 r2, r2, ra_kmul_add
-+/* [0x00001eb8] */ 0x8d5e72b6, 0x1c024863, // sub r1, r1, r2 ; mov r3, ra_blk_height
-+/* [0x00001ec0] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
-+/* [0x00001ec8] */ 0xfffffe68, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x00001ed0] */ 0x0f667380, 0x18020867, // asr r1, r1, ra_wt_den_p7
-+/* [0x00001ed8] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
-+/* [0x00001ee0] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
-+/* [0x00001ee8] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
-+/* [0x00001ef0] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x00001ef8] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
-+/* [0x00001f00] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
-+/* [0x00001f08] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
-+/* [0x00001f10] */ 0xfffffe20, 0xf0f809e7, // brr -, r:1b
-+/* [0x00001f18] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x00001f20] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
-+/* [0x00001f28] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
-+// ::mc_sync10_q0
-+/* [0x00001f30] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00001f38] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00001f40] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00001f48] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00001f50] */ 0x00000010, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00001f58] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00001f60] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00001f68] */ 0x00000001, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00001f70] */ 0x0000000d, 0xe80009e7, // mov dst, srel(i)
-+// ::mc_sync10_q1
-+/* [0x00001f78] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00001f80] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00001f88] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00001f90] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00001f98] */ 0x00000011, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00001fa0] */ 0x00000002, 0xe80009e7, // mov dst, srel(i)
-+// ::mc_sync10_q2
-+/* [0x00001fa8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00001fb0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00001fb8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00001fc0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00001fc8] */ 0x00000012, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00001fd0] */ 0x00000003, 0xe80009e7, // mov dst, srel(i)
-+// ::mc_sync10_q3
-+/* [0x00001fd8] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00001fe0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00001fe8] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00001ff0] */ 0x00000000, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00001ff8] */ 0x00000013, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00002000] */ 0x009e7000, 0x100009e7, // nop
-+// ::mc_sync10_q4
-+/* [0x00002008] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00002010] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00002018] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00002020] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00002028] */ 0x00000014, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00002030] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00002038] */ 0x0000001d, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00002040] */ 0x00000005, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00002048] */ 0x0000000e, 0xe80009e7, // mov dst, srel(i)
-+// ::mc_sync10_q5
-+/* [0x00002050] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00002058] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00002060] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00002068] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00002070] */ 0x00000015, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00002078] */ 0x00000006, 0xe80009e7, // mov dst, srel(i)
-+// ::mc_sync10_q6
-+/* [0x00002080] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00002088] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00002090] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00002098] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
-+/* [0x000020a0] */ 0x00000016, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x000020a8] */ 0x00000007, 0xe80009e7, // mov dst, srel(i)
-+// ::mc_sync10_q7
-+/* [0x000020b0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x000020b8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x000020c0] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x000020c8] */ 0x00000004, 0xe80009e7, // mov dst, srel(i)
-+/* [0x000020d0] */ 0x00000017, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x000020d8] */ 0x009e7000, 0x100009e7, // nop
-+// ::mc_sync10_q8
-+/* [0x000020e0] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x000020e8] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x000020f0] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x000020f8] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00002100] */ 0x00000018, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00002108] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00002110] */ 0x0000001e, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00002118] */ 0x00000009, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00002120] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
-+// ::mc_sync10_q9
-+/* [0x00002128] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00002130] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00002138] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00002140] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00002148] */ 0x00000019, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00002150] */ 0x0000000a, 0xe80009e7, // mov dst, srel(i)
-+// ::mc_sync10_q10
-+/* [0x00002158] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00002160] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00002168] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00002170] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
-+/* [0x00002178] */ 0x0000001a, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x00002180] */ 0x0000000b, 0xe80009e7, // mov dst, srel(i)
-+// ::mc_sync10_q11
-+/* [0x00002188] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00002190] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00002198] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x000021a0] */ 0x00000008, 0xe80009e7, // mov dst, srel(i)
-+/* [0x000021a8] */ 0x0000001b, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x000021b0] */ 0x009e7000, 0x100009e7, // nop
-+// ::mc_exit_c10_q0
-+// ::mc_exit_y10_q0
-+/* [0x000021b8] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1
-+// :1
-+/* [0x000021c0] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
-+/* [0x000021c8] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
-+/* [0x000021d0] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
-+/* [0x000021d8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
-+/* [0x000021e0] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x000021e8] */ 0x0000001c, 0xe80009e7, // mov dst, sacq(i)
-+/* [0x000021f0] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
-+/* [0x000021f8] */ 0x00000001, 0xe00209a7, // mov interrupt, 1
-+/* [0x00002200] */ 0x009e7000, 0x100009e7, // nop
-+// ::mc_exit_c10_qn
-+// ::mc_exit_y10_qn
-+/* [0x00002208] */ 0x00000002, 0xe00228e7, // mov.setf r3, PREREAD - 1
-+// :1
-+/* [0x00002210] */ 0xffffffe0, 0xf03809e7, // brr.anynz -, r:1b
-+/* [0x00002218] */ 0x009e7000, 0xa00009e7, // nop ; nop ; ldtmu0
-+/* [0x00002220] */ 0x009e7000, 0xb00009e7, // nop ; nop ; ldtmu1
-+/* [0x00002228] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
-+/* [0x00002230] */ 0x159f2fc0, 0x100009e7, // mov -, vw_wait
-+/* [0x00002238] */ 0x009e7000, 0x300009e7, // nop ; nop ; thrend
-+/* [0x00002240] */ 0x009e7000, 0x100009e7, // nop
-+/* [0x00002248] */ 0x009e7000, 0x100009e7, // nop
-+// ::mc_setup_y10_q0
-+/* [0x00002250] */ 0x0000000c, 0xe80009e7, // mov dst, srel(i)
-+// ::mc_setup_y10_qn
-+/* [0x00002258] */ 0x95801ff6, 0xd0025900, // mov tmurs, 1 ; mov ra0, unif
-+/* [0x00002260] */ 0x15827d80, 0x10020267, // mov ra9, unif
-+/* [0x00002268] */ 0x15827d80, 0x10020067, // mov ra1, unif
-+/* [0x00002270] */ 0xaaaaff00, 0xe6020827, // mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
-+/* [0x00002278] */ 0x9181e1f6, 0xd00250cb, // shl rb_ef, r0, i_shift30 ; mov ra11, unif
-+/* [0x00002280] */ 0xff800100, 0xe0020527, // mov ra_kff800100, 0xff800100
-+/* [0x00002288] */ 0x0000ffff, 0xe0021627, // mov rb_pmask, v_pmask
-+/* [0x00002290] */ 0x000803ff, 0xe00205e7, // mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
-+/* [0x00002298] */ 0x00010000, 0xe00217e7, // mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
-+/* [0x000022a0] */ 0x4000000c, 0xe0020667, // mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
-+/* [0x000022a8] */ 0x050b0a00, 0xe0021567, // mov rb_y_coeffs_2, 0x050b0a00
-+/* [0x000022b0] */ 0x11283a40, 0xe00215a7, // mov rb_y_coeffs_3, 0x11283a40
-+/* [0x000022b8] */ 0x0a0b0500, 0xe00215e7, // mov rb_y_coeffs_5, 0x0a0b0500
-+/* [0x000022c0] */ 0x15827d80, 0x100200e7, // mov ra3, unif
-+/* [0x000022c8] */ 0x95803ff6, 0x10024754, // mov ra_ef, rb_ef ; mov rb_xpitch, unif
-+/* [0x000022d0] */ 0x0d0c1dc0, 0xd4020827, // sub r0, ra3.16b, 1
-+/* [0x000022d8] */ 0x119c11c0, 0xd00216a7, // shl rb_max_x, r0, v_x_shift
-+/* [0x000022e0] */ 0x0d0c1dc0, 0xd20217a7, // sub rb_max_y, ra3.16a, 1
-+/* [0x000022e8] */ 0x959a0dbf, 0x100248d0, // mov r3, elem_num ; mov rb_pitch, unif
-+/* [0x000022f0] */ 0xc0000000, 0xe0020867, // mov r1, vdw_setup_1(0)
-+/* [0x000022f8] */ 0x159d03c0, 0x10021667, // or rb_dma1_base, r1, rb_pitch
-+/* [0x00002300] */ 0x0c027cc0, 0x14020827, // add r0, ra0.16b, r3
-+/* [0x00002308] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
-+/* [0x00002310] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
-+/* [0x00002318] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
-+/* [0x00002320] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
-+/* [0x00002328] */ 0xf49dc1d2, 0xd0024822, // and r0, r0, -4 ; v8subs r2, r2, r2
-+/* [0x00002330] */ 0x0d9d05c0, 0x100208a7, // sub r2, r2, rb_pitch
-+/* [0x00002338] */ 0x149e7080, 0x10020867, // and r1, r0, r2
-+/* [0x00002340] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x00002348] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00002350] */ 0x0c267c00, 0x10020627, // add ra_base, ra9, r0
-+/* [0x00002358] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
-+/* [0x00002360] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
-+/* [0x00002368] */ 0x139c01c0, 0xd0020827, // max r0, r0, 0
-+/* [0x00002370] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
-+/* [0x00002378] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
-+/* [0x00002380] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
-+/* [0x00002388] */ 0x149e7080, 0x10020867, // and r1, r0, r2
-+/* [0x00002390] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x00002398] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x000023a0] */ 0x0c2e7c00, 0x100213e7, // add rb_base2, ra11, r0
-+/* [0x000023a8] */ 0x80027036, 0x120049e0, // nop ; mov r0, ra0.16a
-+/* [0x000023b0] */ 0x95043ff6, 0xd20248e2, // mov r3, PREREAD ; mov r2, ra1.16a
-+// :1
-+/* [0x000023b8] */ 0x0d9c17c0, 0xd00228e7, // sub.setf r3, r3, 1
-+/* [0x000023c0] */ 0x139c01c0, 0xd0020867, // max r1, r0, 0
-+/* [0x000023c8] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
-+/* [0x000023d0] */ 0x4c51018f, 0x1a024821, // add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
-+/* [0x000023d8] */ 0x8c627c40, 0x10225e11, // add t0s, ra_base, r1 ; mov ra_y, r0
-+/* [0x000023e0] */ 0x139c05c0, 0xd0020867, // max r1, r2, 0
-+/* [0x000023e8] */ 0xffffffb0, 0xf03809e7, // brr.anynz -, r:1b
-+/* [0x000023f0] */ 0x129de3c0, 0x10020867, // min r1, r1, rb_max_y
-+/* [0x000023f8] */ 0x4c51058f, 0x1a0248a1, // add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
-+/* [0x00002400] */ 0x8c9cfe52, 0x10125f11, // add t1s, rb_base2, r1 ; mov ra_y2, r2
-+/* [0x00002408] */ 0x159e6fc0, 0x100208a7, // mov r2, qpu_num
-+/* [0x00002410] */ 0x0f9c15c0, 0xd0020867, // asr r1, r2, 1
-+/* [0x00002418] */ 0x119c43c0, 0xd0020867, // shl r1, r1, 4
-+/* [0x00002420] */ 0x149c15c0, 0xd0020827, // and r0, r2, 1
-+/* [0x00002428] */ 0x159e7040, 0x10020827, // or r0, r0, r1
-+/* [0x00002430] */ 0x00002900, 0xe0020867, // mov r1, vpm_setup(0, 2, h16p(0, 0))
-+/* [0x00002438] */ 0x0c9e7040, 0x10021727, // add r_vpm, r0, r1
-+/* [0x00002440] */ 0x80004002, 0xe0020867, // mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0))
-+/* [0x00002448] */ 0x119c61c0, 0xd0020827, // shl r0, r0, 6
-+/* [0x00002450] */ 0x0c9e7040, 0x100216e7, // add r_dma, r0, r1
-+/* [0x00002458] */ 0x15827d80, 0x100207a7, // mov ra_link, unif
-+/* [0x00002460] */ 0x00000000, 0xe0024208, // mov ra8, 0 ; mov rb8, 0
-+/* [0x00002468] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00002470] */ 0x00000000, 0xe0024249, // mov ra9, 0 ; mov rb9, 0
-+/* [0x00002478] */ 0x00000000, 0xe002428a, // mov ra10, 0 ; mov rb10, 0
-+/* [0x00002480] */ 0x00000000, 0xe00242cb, // mov ra11, 0 ; mov rb11, 0
-+// :per_block_setup_10
-+/* [0x00002488] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
-+/* [0x00002490] */ 0x93567176, 0x14125815, // max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
-+/* [0x00002498] */ 0x129da1c0, 0x10020827, // min r0, r0, rb_max_x
-+/* [0x000024a0] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
-+/* [0x000024a8] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
-+/* [0x000024b0] */ 0x8d810bf6, 0x1002589a, // sub r2, r5, rb_pitch ; mov ra_base_next, unif
-+/* [0x000024b8] */ 0x940270b6, 0x12225853, // and r1, r0, r2 ; mov ra_y_next, ra0.16a
-+/* [0x000024c0] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x000024c8] */ 0x8c827076, 0x10025801, // add r0, r0, r1 ; mov ra1, unif
-+/* [0x000024d0] */ 0x0c6a7c00, 0x100206a7, // add ra_base_next, ra_base_next, r0
-+/* [0x000024d8] */ 0x0c067cc0, 0x14020827, // add r0, ra1.16b, r3
-+/* [0x000024e0] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
-+/* [0x000024e8] */ 0x93067176, 0x12125813, // max r0, r0, r5 ; mov ra_y2_next, ra1.16a
-+/* [0x000024f0] */ 0x9281a1f6, 0x10024813, // min r0, r0, rb_max_x ; mov rb_base2_next, unif
-+/* [0x000024f8] */ 0x119c31c0, 0xd0021067, // shl rb_xshift2_next, r0, 3
-+/* [0x00002500] */ 0x9481c1f6, 0xd0025810, // and r0, r0, -4 ; mov ra_width_height, unif
-+/* [0x00002508] */ 0x949dc0bf, 0x10024871, // and r1, r0, r2 ; mov vw_setup, rb_vpm_init
-+/* [0x00002510] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x00002518] */ 0x4c402077, 0xd4024821, // add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul
-+/* [0x00002520] */ 0x0c9d3e00, 0x100214e7, // add rb_base2_next, rb_base2_next, r0
-+/* [0x00002528] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
-+/* [0x00002530] */ 0x8c5dc1c6, 0xdc025460, // add rb_i_tmu, r0, (7-8) - PREREAD ; v8min r0, r0, ra_blk_height
-+/* [0x00002538] */ 0x0c9df1c0, 0xd00214a7, // add rb_lcount, r0, (7-8)
-+/* [0x00002540] */ 0x916481f6, 0xd4024823, // shl r0, r0, v_dma_h_shift ; mov r3, ra_kmul_add
-+/* [0x00002548] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x00002550] */ 0x9164f1f6, 0xd4024822, // shl r0, r0, v_dma_wh_shift ; mov r2, ra_fir_off_val
-+/* [0x00002558] */ 0x8c81b1f6, 0x100246e0, // add ra_dma0, r0, rb_dma0_base ; mov r0, unif
-+/* [0x00002560] */ 0x918101f6, 0xd00a5816, // shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif
-+/* [0x00002568] */ 0x915031f6, 0xde024205, // shl ra8, r0, 3 ; mov rb5, ra_k255
-+/* [0x00002570] */ 0x01040400, 0xe0020867, // mov r1, 0x01040400
-+/* [0x00002578] */ 0x10227380, 0x1e5200a7, // ror ra2.8b, r1, ra8.8d
-+/* [0x00002580] */ 0x10227380, 0x1c520027, // ror ra0.8b, r1, ra8.8c
-+/* [0x00002588] */ 0x10215f80, 0x1e6200a7, // ror ra2.8c, rb_y_coeffs_2, ra8.8d
-+/* [0x00002590] */ 0x10215f80, 0x1c620027, // ror ra0.8c, rb_y_coeffs_2, ra8.8c
-+/* [0x00002598] */ 0x00010100, 0xe0020867, // mov r1,0x00010100
-+/* [0x000025a0] */ 0x902203bf, 0x1e025812, // ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif
-+/* [0x000025a8] */ 0x90205387, 0x1c424004, // ror ra0.8a, r1, ra8.8c ; v8min rb4, r0, rb5
-+/* [0x000025b0] */ 0x914883f6, 0xd0031856, // shl r1, r1, 8 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1
-+/* [0x000025b8] */ 0x902203bf, 0x1e02581c, // ror r0, r1, ra8.8d ; mov ra_dest, unif
-+/* [0x000025c0] */ 0x90205387, 0x1c72404b, // ror ra1.8d, r1, ra8.8c ; v8min rb11, r0, rb5
-+/* [0x000025c8] */ 0x10216f80, 0x1e7200a7, // ror ra2.8d, rb_y_coeffs_3, ra8.8d
-+/* [0x000025d0] */ 0x10216f80, 0x1c720027, // ror ra0.8d, rb_y_coeffs_3, ra8.8c
-+/* [0x000025d8] */ 0x10217f80, 0x1e5200e7, // ror ra3.8b, rb_y_coeffs_5, ra8.8d
-+/* [0x000025e0] */ 0x10217f80, 0x1c520067, // ror ra1.8b, rb_y_coeffs_5, ra8.8c
-+/* [0x000025e8] */ 0x04040100, 0xe0020867, // mov r1,0x04040100
-+/* [0x000025f0] */ 0x10227380, 0x1e6200e7, // ror ra3.8c, r1, ra8.8d
-+/* [0x000025f8] */ 0x902183bf, 0xdc624065, // ror ra1.8c, r1, ra8.8c ; mov r5rep, -8
-+/* [0x00002600] */ 0x00000000, 0xf0f7c9e7, // bra -, ra_link
-+/* [0x00002608] */ 0x3a281100, 0xe0020867, // mov r1,0x3a281100
-+/* [0x00002610] */ 0x902203bf, 0x1e02581e, // ror r0, r1, ra8.8d ; mov ra_link, unif
-+/* [0x00002618] */ 0x90205387, 0x1c424048, // ror ra1.8a, r1, ra8.8c ; v8min rb8, r0, rb5
-+// ::mc_filter_y10_pxx
-+/* [0x00002620] */ 0xfffffe48, 0xf0f807a7, // brr ra_link, r:per_block_setup_10
-+/* [0x00002628] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
-+/* [0x00002630] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2
-+/* [0x00002638] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
-+/* [0x00002640] */ 0x1158adc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p5
-+/* [0x00002648] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0
-+/* [0x00002650] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4
-+// :1
-+/* [0x00002658] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef
-+/* [0x00002660] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0
-+/* [0x00002668] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1
-+/* [0x00002670] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0
-+/* [0x00002678] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6
-+/* [0x00002680] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7
-+/* [0x00002688] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1
-+/* [0x00002690] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8
-+/* [0x00002698] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax
-+/* [0x000026a0] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch
-+/* [0x000026a8] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9
-+/* [0x000026b0] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0
-+/* [0x000026b8] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0
-+/* [0x000026c0] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0
-+/* [0x000026c8] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0
-+/* [0x000026d0] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
-+/* [0x000026d8] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
-+/* [0x000026e0] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
-+/* [0x000026e8] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
-+/* [0x000026f0] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
-+/* [0x000026f8] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
-+/* [0x00002700] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
-+/* [0x00002708] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
-+/* [0x00002710] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
-+/* [0x00002718] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
-+/* [0x00002720] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
-+/* [0x00002728] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
-+/* [0x00002730] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x00002738] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b
-+/* [0x00002740] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b
-+/* [0x00002748] */ 0x8f2c25f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
-+/* [0x00002750] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
-+/* [0x00002758] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d
-+/* [0x00002760] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8
-+/* [0x00002768] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c
-+/* [0x00002770] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11
-+/* [0x00002778] */ 0x8d5d1bf6, 0x1c0269e3, // sub.setf -, r5, rb_i_tmu ; mov r3, ra_blk_height
-+/* [0x00002780] */ 0x8d1133bf, 0x1002884f, // sub r1, r1, ra4 ; mov.ifz rb_base2, rb_base2_next
-+/* [0x00002788] */ 0x8d6a7236, 0x10029858, // sub r1, r1, r0 ; mov.ifz ra_base, ra_base_next
-+/* [0x00002790] */ 0x8f4c63f6, 0xd0029851, // asr r1, r1, 6 ; mov.ifz ra_y_y2, ra_y_y2_next
-+/* [0x00002798] */ 0x4d592bce, 0x120269e0, // sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0
-+/* [0x000027a0] */ 0x4c64c1ce, 0x14024821, // add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add
-+/* [0x000027a8] */ 0xed427073, 0x12024860, // sub r1, r0, r1 ; v8subs r0, ra_height, r3
-+/* [0x000027b0] */ 0xfffffe88, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x000027b8] */ 0x0f9cb3c0, 0xd0020867, // asr r1, r1, i_wt_den_p6
-+/* [0x000027c0] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
-+/* [0x000027c8] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
-+/* [0x000027d0] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
-+/* [0x000027d8] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x000027e0] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
-+/* [0x000027e8] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
-+/* [0x000027f0] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
-+/* [0x000027f8] */ 0xfffffe40, 0xf0f809e7, // brr -, r:1b
-+/* [0x00002800] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x00002808] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
-+/* [0x00002810] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
-+// ::mc_filter_y10_p00
-+/* [0x00002818] */ 0x959a0ff6, 0x10024020, // mov ra0, unif ; mov r0, elem_num
-+/* [0x00002820] */ 0xf5567dad, 0x14124565, // mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5
-+/* [0x00002828] */ 0x8c020c3f, 0x1402581a, // add r0, ra0.16b, r0 ; mov ra_base_next, unif
-+/* [0x00002830] */ 0x119c11c0, 0xd0020827, // shl r0, r0, v_x_shift
-+/* [0x00002838] */ 0x93027176, 0x12225813, // max r0, r0, r5 ; mov ra_y_next, ra0.16a
-+/* [0x00002840] */ 0x9281a1f6, 0x10025810, // min r0, r0, rb_max_x ; mov ra_width_height, unif
-+/* [0x00002848] */ 0x119c31c0, 0xd0220567, // shl ra_xshift_next, r0, 3
-+/* [0x00002850] */ 0x149dc1c0, 0xd0020827, // and r0, r0, -4
-+/* [0x00002858] */ 0x8d810bf6, 0x10025896, // sub r2, r5, rb_pitch ; mov ra_wt_off_mul_l0, unif
-+/* [0x00002860] */ 0x149e7080, 0x10020867, // and r1, r0, r2
-+/* [0x00002868] */ 0x569d404f, 0x10024821, // xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+/* [0x00002870] */ 0x8c827076, 0x1002581c, // add r0, r0, r1 ; mov ra_dest, unif
-+/* [0x00002878] */ 0x8c69cc3f, 0x100246b1, // add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init
-+/* [0x00002880] */ 0x11401dc0, 0xd4020867, // shl r1, ra_width, v_x_shift
-+/* [0x00002888] */ 0x8d419e76, 0x12025760, // sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
-+/* [0x00002890] */ 0x8d5c31c6, 0xdc025460, // sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height
-+/* [0x00002898] */ 0x919c81c0, 0xd0024812, // shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0
-+/* [0x000028a0] */ 0x0c9e7040, 0x10020827, // add r0, r0, r1
-+/* [0x000028a8] */ 0x1158edc0, 0xd4021327, // shl rb_wt_off, ra_wt_off_l0, DENOM + 7
-+/* [0x000028b0] */ 0x9180f1f6, 0xd002581e, // shl r0, r0, v_dma_wh_shift ; mov ra_link, unif
-+/* [0x000028b8] */ 0x0c9db1c0, 0x100206e7, // add ra_dma0, r0, rb_dma0_base
-+// :1
-+/* [0x000028c0] */ 0xcd511bee, 0x1a0269e5, // sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1
-+/* [0x000028c8] */ 0x804e7036, 0xa42099d1, // nop ; mov.ifz ra_y, ra_y_next ; ldtmu0
-+/* [0x000028d0] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
-+/* [0x000028d8] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
-+/* [0x000028e0] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
-+/* [0x000028e8] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
-+/* [0x000028f0] */ 0x8c618c87, 0x10024e20, // add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask
-+/* [0x000028f8] */ 0x4d592bc6, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0
-+/* [0x00002900] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height
-+/* [0x00002908] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
-+/* [0x00002910] */ 0xffffff90, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x00002918] */ 0x0f9cf3c0, 0xd0020867, // asr r1, r1, DENOM + 8
-+/* [0x00002920] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
-+/* [0x00002928] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
-+/* [0x00002930] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
-+/* [0x00002938] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x00002940] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
-+/* [0x00002948] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
-+/* [0x00002950] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
-+/* [0x00002958] */ 0xffffff48, 0xf0f809e7, // brr -, r:1b
-+/* [0x00002960] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x00002968] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
-+/* [0x00002970] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
-+// ::mc_filter_y10_bxx
-+/* [0x00002978] */ 0xfffffaf0, 0xf0f807a7, // brr ra_link, r:per_block_setup_10
-+/* [0x00002980] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
-+/* [0x00002988] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2
-+/* [0x00002990] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
-+/* [0x00002998] */ 0x1158bdc0, 0xd4020867, // shl r1, ra_wt_off_l0, i_wt_den_p6
-+/* [0x000029a0] */ 0x4c5a7cd6, 0x121245a0, // add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0
-+/* [0x000029a8] */ 0x4d4a7216, 0x12024860, // sub r1, r1, r0 ; mul24 r0, r2, ra_wt_mul_l1
-+/* [0x000029b0] */ 0x8d9c423f, 0x1042531d, // sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4
-+// :1
-+/* [0x000029b8] */ 0x4c745dbe, 0x100279c4, // add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef
-+/* [0x000029c0] */ 0x93440dff, 0xd40248a1, // max r2, ra_y, 0 ; mov r1, 0
-+/* [0x000029c8] */ 0x9251e5f6, 0x1a0248a3, // min r2, r2, rb_max_y ; mov r3, ra_k1
-+/* [0x000029d0] */ 0x4c450cd7, 0xa4224462, // add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0
-+/* [0x000029d8] */ 0x8c606cbf, 0x10024e05, // add t0s, ra_base, r2 ; mov rb5, rb6
-+/* [0x000029e0] */ 0x8e5479bf, 0x12024806, // shr r0, r4, ra_xshift ; mov rb6, rb7
-+/* [0x000029e8] */ 0x93458c47, 0xb20248a0, // max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1
-+/* [0x000029f0] */ 0x8e2009f6, 0x10024847, // shr r1, r4, rb_xshift2 ; mov rb7, ra8
-+/* [0x000029f8] */ 0x925de5ce, 0x120248a1, // min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax
-+/* [0x00002a00] */ 0x4c450cd7, 0x12124462, // add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch
-+/* [0x00002a08] */ 0x8c24feb6, 0x10025f08, // add t1s, rb_base2, r2 ; mov ra8, ra9
-+/* [0x00002a10] */ 0x4c038af1, 0xd8025962, // add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0
-+/* [0x00002a18] */ 0x5501fff0, 0x180348e2, // mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0
-+/* [0x00002a20] */ 0x4d03f6b0, 0xda0248a3, // sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0
-+/* [0x00002a28] */ 0x40037031, 0xda0109e3, // nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0
-+/* [0x00002a30] */ 0x4c03e4f0, 0xdc0248a3, // add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
-+/* [0x00002a38] */ 0x40036031, 0xdc0109e3, // nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
-+/* [0x00002a40] */ 0x4d03d4f0, 0xde0248a3, // sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
-+/* [0x00002a48] */ 0x40035031, 0xde0109e3, // nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
-+/* [0x00002a50] */ 0x4c07c4f0, 0xd80248a3, // add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
-+/* [0x00002a58] */ 0x40074031, 0xd80109e3, // nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
-+/* [0x00002a60] */ 0x4c07b4f0, 0xda0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
-+/* [0x00002a68] */ 0x40073031, 0xda0109e3, // nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
-+/* [0x00002a70] */ 0x4d07a4f0, 0xdc0248a3, // sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
-+/* [0x00002a78] */ 0x40072031, 0xdc0109e3, // nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
-+/* [0x00002a80] */ 0x4c0794f0, 0xde0248a3, // add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
-+/* [0x00002a88] */ 0x4c071b71, 0xde0329e3, // add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
-+/* [0x00002a90] */ 0xffffff08, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x00002a98] */ 0x4d0854fe, 0x1a0248a1, // sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b
-+/* [0x00002aa0] */ 0x550caffe, 0x1a024260, // mov ra9, rb10 ; mul24 r0, rb10, ra3.8b
-+/* [0x00002aa8] */ 0x8f2c25f6, 0xd00242ca, // asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
-+/* [0x00002ab0] */ 0x4d08623e, 0x1c024860, // sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
-+/* [0x00002ab8] */ 0x4d08723e, 0x1e024860, // sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d
-+/* [0x00002ac0] */ 0x4c208237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra8, rb8
-+/* [0x00002ac8] */ 0x4c0ca23e, 0x1c024860, // add r1, r1, r0 ; mul24 r0, rb10, ra3.8c
-+/* [0x00002ad0] */ 0x4c2cb237, 0x10024860, // add r1, r1, r0 ; mul24 r0, ra11, rb11
-+/* [0x00002ad8] */ 0x0d127380, 0x10020867, // sub r1, r1, ra4
-+/* [0x00002ae0] */ 0x8d9cc23f, 0x10024862, // sub r1, r1, r0 ; mov r2, rb_wt_off
-+/* [0x00002ae8] */ 0x0f9c63c0, 0xd0020867, // asr r1, r1, 6
-+/* [0x00002af0] */ 0x4d591bce, 0x120269e0, // sub.setf -, r5, rb_i_tmu ; mul24 r0, r1, ra_wt_mul_l0
-+/* [0x00002af8] */ 0x55653fce, 0x140453e1, // mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add
-+/* [0x00002b00] */ 0x8d4e7076, 0x10029851, // sub r1, r0, r1 ; mov.ifz ra_y_y2, ra_y_y2_next
-+/* [0x00002b08] */ 0x8d692bf6, 0x1002b9d8, // sub.setf -, r5, rb_lcount ; mov.ifz ra_base, ra_base_next
-+/* [0x00002b10] */ 0x8c9f8289, 0xd0024860, // add r1, r1, r2 ; mov r0, r1 << 8
-+/* [0x00002b18] */ 0x8c5e7236, 0x1c024863, // add r1, r1, r0 ; mov r3, ra_blk_height
-+/* [0x00002b20] */ 0xfffffe78, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x00002b28] */ 0x4f65039f, 0x18024862, // asr r1, r1, ra_wt_den_p7 ; mul24 r2, r3, rb_pitch
-+/* [0x00002b30] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
-+/* [0x00002b38] */ 0xf34003f3, 0xd2024c20, // max vpm, r1, 0 ; v8subs r0, ra_height, r3
-+/* [0x00002b40] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
-+/* [0x00002b48] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x00002b50] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
-+/* [0x00002b58] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
-+/* [0x00002b60] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
-+/* [0x00002b68] */ 0xfffffe30, 0xf0f809e7, // brr -, r:1b
-+/* [0x00002b70] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x00002b78] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
-+/* [0x00002b80] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
-+// ::mc_filter_y10_b00
-+/* [0x00002b88] */ 0xfffff8e0, 0xf0f807a7, // brr ra_link, r:per_block_setup_10
-+/* [0x00002b90] */ 0x959a0ff6, 0x10024023, // mov ra0, unif ; mov r3, elem_num
-+/* [0x00002b98] */ 0xec9c3fd2, 0x100269e5, // add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2
-+/* [0x00002ba0] */ 0x8c001cff, 0x14024800, // add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
-+/* [0x00002ba8] */ 0x00000001, 0xe00208a7, // mov r2, 1
-+/* [0x00002bb0] */ 0x8c591eb6, 0x10025461, // add rb_i_tmu, rb_i_tmu, r2 ; mov r1, ra_wt_off_mul_l0
-+/* [0x00002bb8] */ 0xf158fded, 0xd4025325, // shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5
-+/* [0x00002bc0] */ 0x809f8009, 0xd000d9d6, // nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8
-+// :1
-+/* [0x00002bc8] */ 0x0d9d1bc0, 0xb00229e7, // sub.setf -, r5, rb_i_tmu ; nop ; ldtmu1
-+/* [0x00002bd0] */ 0x8e4c09f6, 0xa0029851, // shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
-+/* [0x00002bd8] */ 0x8e5509bf, 0x12024823, // shr r0, r4, ra_xshift ; mov r3, rb_pitch
-+/* [0x00002be0] */ 0x13440dc0, 0xd40208a7, // max r2, ra_y, 0
-+/* [0x00002be8] */ 0x9269e5f6, 0x10029898, // min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
-+/* [0x00002bf0] */ 0x4c441dd3, 0xd4224462, // add ra_y, ra_y, 1 ; mul24 r2, r2, r3
-+/* [0x00002bf8] */ 0x8c613cbf, 0x10028e0f, // add t0s, ra_base, r2 ; mov.ifz rb_base2, rb_base2_next
-+/* [0x00002c00] */ 0x13440dc0, 0xd20208a7, // max r2, ra_y2, 0
-+/* [0x00002c08] */ 0x129de5c0, 0x100208a7, // min r2, r2, rb_max_y
-+/* [0x00002c10] */ 0x4c441dd3, 0xd2124462, // add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
-+/* [0x00002c18] */ 0x8c5cfe86, 0x12024f20, // add t1s, rb_base2, r2 ; v8min r0, r0, ra_pmax
-+/* [0x00002c20] */ 0x545983c6, 0x12024860, // and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0
-+/* [0x00002c28] */ 0x4d492bce, 0x120269e1, // sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1
-+/* [0x00002c30] */ 0xcc52706e, 0x1a024865, // add r1, r0, r1 ; v8adds r5rep, r5, ra_k1
-+/* [0x00002c38] */ 0x915c83f6, 0xdc024863, // shl r1, r1, 8 ; mov r3, ra_blk_height
-+/* [0x00002c40] */ 0xec40c3f3, 0x12024860, // add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
-+/* [0x00002c48] */ 0xffffff60, 0xf06809e7, // brr.anyn -, r:1b
-+/* [0x00002c50] */ 0x0f9d03c0, 0xd0020867, // asr r1, r1, (DENOM + 9) - 32
-+/* [0x00002c58] */ 0x925f23bf, 0x12020867, // min r1, r1, ra_pmax ; mov -, vw_wait
-+/* [0x00002c60] */ 0x5351039f, 0x18024c22, // max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
-+/* [0x00002c68] */ 0x956e7036, 0x10126431, // mov.setf ra_height, r0 ; mov vw_setup, ra_dma0
-+/* [0x00002c70] */ 0x00000000, 0xf027c9e7, // bra.anyz -, ra_link
-+/* [0x00002c78] */ 0x929dd0ff, 0x10024831, // min r0, r0, r3 ; mov vw_setup, rb_dma1
-+/* [0x00002c80] */ 0x8d7270f6, 0x10024872, // sub r1, r0, r3 ; mov vw_addr, ra_dest
-+/* [0x00002c88] */ 0x119d73c0, 0xd0020867, // shl r1, r1, i_shift23
-+/* [0x00002c90] */ 0xffffff18, 0xf0f809e7, // brr -, r:1b
-+/* [0x00002c98] */ 0x0c9d2e00, 0x100214a7, // add rb_lcount, rb_lcount, r0
-+/* [0x00002ca0] */ 0x0c6e7c40, 0x100206e7, // add ra_dma0, ra_dma0, r1
-+/* [0x00002ca8] */ 0x8c71ccbf, 0x10024731, // add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init
-+// ::mc_end
-+};
-+#ifdef __HIGHC__
-+#pragma Align_to(8, ff_hevc_rpi_shader)
-+#endif
-diff --git a/libavcodec/rpi_hevc_shader.h b/libavcodec/rpi_hevc_shader.h
-new file mode 100644
-index 0000000000..79651c9b6c
---- /dev/null
-+++ b/libavcodec/rpi_hevc_shader.h
-@@ -0,0 +1,63 @@
-+#ifndef rpi_hevc_shader_H
-+#define rpi_hevc_shader_H
-+
-+extern unsigned int ff_hevc_rpi_shader[];
-+
-+#define mc_setup_c_q0 (ff_hevc_rpi_shader + 0)
-+#define mc_start (ff_hevc_rpi_shader + 0)
-+#define mc_setup_c_qn (ff_hevc_rpi_shader + 2)
-+#define mc_filter_c_p (ff_hevc_rpi_shader + 134)
-+#define mc_filter_c_p_l1 (ff_hevc_rpi_shader + 260)
-+#define mc_filter_c_b (ff_hevc_rpi_shader + 386)
-+#define mc_sync_q0 (ff_hevc_rpi_shader + 580)
-+#define mc_sync_q1 (ff_hevc_rpi_shader + 598)
-+#define mc_sync_q2 (ff_hevc_rpi_shader + 610)
-+#define mc_sync_q3 (ff_hevc_rpi_shader + 622)
-+#define mc_sync_q4 (ff_hevc_rpi_shader + 634)
-+#define mc_sync_q5 (ff_hevc_rpi_shader + 652)
-+#define mc_sync_q6 (ff_hevc_rpi_shader + 664)
-+#define mc_sync_q7 (ff_hevc_rpi_shader + 676)
-+#define mc_sync_q8 (ff_hevc_rpi_shader + 688)
-+#define mc_sync_q9 (ff_hevc_rpi_shader + 706)
-+#define mc_sync_q10 (ff_hevc_rpi_shader + 718)
-+#define mc_sync_q11 (ff_hevc_rpi_shader + 730)
-+#define mc_exit_c_qn (ff_hevc_rpi_shader + 742)
-+#define mc_exit_y_qn (ff_hevc_rpi_shader + 742)
-+#define mc_exit_c_q0 (ff_hevc_rpi_shader + 760)
-+#define mc_exit_y_q0 (ff_hevc_rpi_shader + 760)
-+#define mc_setup_y_q0 (ff_hevc_rpi_shader + 780)
-+#define mc_setup_y_qn (ff_hevc_rpi_shader + 782)
-+#define mc_filter_y_pxx (ff_hevc_rpi_shader + 1014)
-+#define mc_filter_y_bxx (ff_hevc_rpi_shader + 1140)
-+#define mc_filter_y_p00 (ff_hevc_rpi_shader + 1272)
-+#define mc_filter_y_b00 (ff_hevc_rpi_shader + 1358)
-+#define mc_setup_c10_q0 (ff_hevc_rpi_shader + 1432)
-+#define mc_setup_c10_qn (ff_hevc_rpi_shader + 1434)
-+#define mc_filter_c10_p (ff_hevc_rpi_shader + 1562)
-+#define mc_filter_c10_p_l1 (ff_hevc_rpi_shader + 1684)
-+#define mc_filter_c10_b (ff_hevc_rpi_shader + 1806)
-+#define mc_sync10_q0 (ff_hevc_rpi_shader + 1996)
-+#define mc_sync10_q1 (ff_hevc_rpi_shader + 2014)
-+#define mc_sync10_q2 (ff_hevc_rpi_shader + 2026)
-+#define mc_sync10_q3 (ff_hevc_rpi_shader + 2038)
-+#define mc_sync10_q4 (ff_hevc_rpi_shader + 2050)
-+#define mc_sync10_q5 (ff_hevc_rpi_shader + 2068)
-+#define mc_sync10_q6 (ff_hevc_rpi_shader + 2080)
-+#define mc_sync10_q7 (ff_hevc_rpi_shader + 2092)
-+#define mc_sync10_q8 (ff_hevc_rpi_shader + 2104)
-+#define mc_sync10_q9 (ff_hevc_rpi_shader + 2122)
-+#define mc_sync10_q10 (ff_hevc_rpi_shader + 2134)
-+#define mc_sync10_q11 (ff_hevc_rpi_shader + 2146)
-+#define mc_exit_c10_q0 (ff_hevc_rpi_shader + 2158)
-+#define mc_exit_y10_q0 (ff_hevc_rpi_shader + 2158)
-+#define mc_exit_c10_qn (ff_hevc_rpi_shader + 2178)
-+#define mc_exit_y10_qn (ff_hevc_rpi_shader + 2178)
-+#define mc_setup_y10_q0 (ff_hevc_rpi_shader + 2196)
-+#define mc_setup_y10_qn (ff_hevc_rpi_shader + 2198)
-+#define mc_filter_y10_pxx (ff_hevc_rpi_shader + 2440)
-+#define mc_filter_y10_p00 (ff_hevc_rpi_shader + 2566)
-+#define mc_filter_y10_bxx (ff_hevc_rpi_shader + 2654)
-+#define mc_filter_y10_b00 (ff_hevc_rpi_shader + 2786)
-+#define mc_end (ff_hevc_rpi_shader + 2860)
-+
-+#endif
-diff --git a/libavcodec/rpi_hevc_shader.qasm b/libavcodec/rpi_hevc_shader.qasm
-new file mode 100644
-index 0000000000..77946a0443
---- /dev/null
-+++ b/libavcodec/rpi_hevc_shader.qasm
-@@ -0,0 +1,1821 @@
-+# Inter pred asm
-+#
-+# Logic here should be good to 14 bits without modification
-+# but only 8 & 10 are currently instantiated & tested
-+# 15 & 16 bits have different shift1, shift2 calc & I also suspect overflow
-+# in _p00 & _b00
-+
-+# The @ "mul_used", 0 annotations that occur by various mul blocks suppress
-+# the warning that we are using rotation & ra/rb registers. r0..3 can be
-+# rotated through all 16 elems; ra regs can only be rotated through their
-+# local 4. As it happens this is what is wanted here as we do not want the
-+# constants from the other half of the calc.
-+
-+# Number limits in P/B calculation
-+#
-+# In order to avoid issues with mul24 being an unsigned 24->32 bit multiplier
-+# we offset our intermediates s.t. they always end up +ve before the next
-+# multiply (may be -ve whilst summing but that doesn't matter).
-+#
-+# Range calc for up to 14 bits (Y-B pred):
-+#
-+# denom: [0, 7]
-+# bmax = (1 << bits) - 1
-+# off: [-(1 << (bits-1)), (1 << (bits-1)) - 1]
-+#
-+# wt_mul: [-128, 255]
-+# wt_off = off * 2 + 1: [-bmax, bmax]
-+#
-+# pel: [0, bmax]
-+# H-filter: [(-22*pel + 88*pel) >> (bits-8) + 0x4000] = [0x2a00, 0x97ff]
-+# V-filter: [(-22*hf + 88*hf) >> 6] = [0x580, 0xc28e]
-+# mul_t = (V_L0 + V_l1) * (wt_mul + 128): [0, 0x24624e6]
-+# mul_t - (V_l0 + V_l1)* 128: [-0xc28e00, 0x18396e4]
-+# adj_wt_off = (wt_off << ((denom + 6) - (bits - 8))) - 0x4000 * (wt_mul * 2):
-+# [wt_off << (21 - bits)] - [wt_mul << 15] = [-0x1fffff, 0x1fffff] - [-0x400000, 0x7f8000]
-+#
-+# This all looks good and is mostly bit depth independent - and as we manage
-+# to do unsigned multiplies everywhere (now) this should be good for any bit
-+# depth up to 14 (we could probably do 16 - but that requires a few tweaks
-+# to the shifts we don't currently have logic for)
-+
-+# PREREAD is the number of requests that we have sitting in the TMU request
-+# queue.
-+#
-+# There are 8 slots available in the TMU request Q for tm0s requests, but
-+# only 4 output FIFO entries and overflow is bad (corruption or crash)
-+# (If threaded then only 2 out FIFO entries, but we aren't.)
-+# In s/w we are effectively limited to the min vertical read which is >= 4
-+# so output FIFO is the limit.
-+#
-+# As the test for read-next is in the main part of the Luma loop (rather than
-+# the preload FIFO part) we are limited to min_luma_height - 1
-+# Min_luma_height is 4 so we can only have a preload of 3
-+# Beware that min_chroma_height (and width) is 2 so we can't do the same trick
-+# in chroma without abandoning preload pretty much entirely (which would be bad)
-+#
-+# Timing tests vs preload of 4 suggests this doesn't hurt us much
-+# Could have preread 4 for Chroma but when tested it didn't help
-+
-+.set PREREAD, 3
-+
-+# Offset added (effectively) at the exit of the H FIR filter
-+# This is enough to force the result +ve
-+# Is good if it is a power of 2 as that allows for >> without loss
-+#
-+# Worst case for a single Y FIR is *-22 so we need an offset of 256*22
-+# But we need twice offset to survive both H & V = 256*22*2 = 0x2c00
-+# Round up to next power of 2
-+
-+.set FIR_OFFSET, 0x4000
-+
-+# Block heights - 8 & 16 are the only numbers we currently support
-+
-+.set C_BLK_HEIGHT_8, 16
-+.set C_BLK_HEIGHT_16, 8
-+.set Y_BLK_HEIGHT_8, 16
-+.set Y_BLK_HEIGHT_16, 8
-+
-+# QPU counts - depend on block size
-+# If we have a 2-byte format & block_size > 8 then can only afford
-+# 8 QPUs
-+# These numbers must match the numbers in ff_hevc_rpi_shader_cmd.h
-+
-+.set N_QPU_8, 12
-+.set N_QPU_16, 12
-+
-+# Value to add to the weight multiplier to convert it into an unsigned value
-+# Should be power of two for convenience
-+
-+.set LOG2_MUL_ADD, 14
-+.set MUL_ADD, (1 << LOG2_MUL_ADD)
-+
-+# Fixed denom (max that it can be set to)
-+.set DENOM, 7
-+
-+# register allocation
-+#
-+
-+# ra0-3
-+# Used as temp and may be loop filter coeffs (split into .8s)
-+# or temp in loop. Check usage on an individual basis.
-+
-+# ra4-11
-+# V FIFO / temp / free
-+
-+# -- free -- ra12
-+
-+# -- free -- ra13
-+
-+# -- free -- ra14
-+
-+# -- free -- ra15
-+
-+# uniform: width:height
-+.set ra_width_height, ra16
-+.set ra_width, ra16.16b
-+.set ra_height, ra16.16a
-+
-+# y:y2 same layout as y_y2_next so we can update both together
-+.set ra_y_y2, ra17
-+.set ra_y2, ra17.16a
-+.set ra_y, ra17.16b
-+
-+# uniform: L1 weight (U on left, V on right)
-+# Only used in Y B
-+.set ra_wt_off_mul_l1, ra18
-+.set ra_wt_off_l1, ra18.16b
-+.set ra_wt_mul_l1, ra18.16a
-+
-+# y_next:y2_next same layout as y_y2 so we can update both together
-+.set ra_y_y2_next, ra19
-+.set ra_y_next, ra19.16b
-+.set ra_y2_next, ra19.16a
-+
-+# Setup: consts - subdivide a single register
-+.set ra_kff800100, ra20
-+.set ra_k256, ra20.16a
-+.set ra_k0, ra20.8a
-+.set ra_k1, ra20.8b
-+.set ra_k128, ra20.8c
-+.set ra_k255, ra20.8d
-+
-+# Loop: xshifts
-+.set ra_xshift, ra21.16a
-+.set ra_xshift_next, ra21.16b
-+
-+# Loop var: L0 weight (U on left, V on right)
-+# _off_ is not used in loop as we want to modify it before use
-+.set ra_wt_off_mul_l0, ra22
-+.set ra_wt_mul_l0, ra22.16a
-+.set ra_wt_off_l0, ra22.16b
-+
-+# Max pel value (for 8 bit we can get away with sat ops but not 9+)
-+# * Could merge with rb_pmask. For 10 bit Logically pmask needs 0xff in the
-+# 2nd byte but as the source should never be > 3 there 0x3ff should do
-+.set ra_blk_height_pmax, ra23
-+.set ra_pmax, ra23.16a
-+.set ra_blk_height, ra23.8c
-+# --free -- ra23.8d
-+
-+# Loop: src frame base (L0)
-+.set ra_base, ra24
-+
-+# Misc offsets
-+.set ra_fir_off_val_wt_den_p7, ra25
-+.set ra_wt_den_p7, ra25.8a
-+# -- free -- ra25.8b
-+.set ra_fir_off_val, ra25.16b
-+
-+# As it happens these constants are the same
-+.if FIR_OFFSET == MUL_ADD
-+# Weight multiplier unsigned add
-+.set ra_kmul_add, ra_fir_off_val
-+.else
-+.error "FIR_OFFSET != MUL_ADD: Need new register & init"
-+.endif
-+
-+# Loop: next src frame base (L0)
-+.set ra_base_next, ra26
-+
-+# Loop: height<<23 + width<<16 + vdw_setup_0
-+.set ra_dma0, ra27
-+
-+# Loop: destination address
-+.set ra_dest, ra28
-+
-+# Setup: Dup of rb_ef
-+# Lo bits are used as Y coeff 0 as that lets us combine test & coeff mul
-+# (top bits are ignored by mul24)
-+.set ra_ef, ra29
-+
-+# Use an even numbered register as a link register to avoid corrupting flags
-+.set ra_link, ra30
-+
-+# -- free -- ra31
-+
-+.set rb_xshift2, rb0
-+.set rb_xshift2_next, rb1
-+
-+# C: (elem & 1) == 0 ? elem * 2 : (elem + 4) * 2
-+.set rb_elem_x, rb2
-+
-+# El Flags
-+# After adding to self we have el even/odd on nc/c and lo/hi on nn/n
-+# Duped into ra_ef as sometimes that is easier to use
-+.set rb_ef, rb3
-+
-+# rb4-11
-+# Loop: V filter FIFO or V filter coeff
-+
-+# Loop var: offset to add before shift (round + weighting offsets)
-+# Exact value varies by loop
-+.set rb_wt_off, rb12
-+
-+# -- free -- rb13
-+
-+# -- free -- rb14
-+
-+# Loop: src frame base (L1)
-+.set rb_base2, rb15
-+
-+# Line pitch (128 for sand128)
-+.set rb_pitch, rb16
-+
-+# Loop count - 2 (set up TMU for next xfer)
-+.set rb_i_tmu, rb17
-+
-+# Loop count for min(height, 16)
-+# Y will reset & loop again if height > 16
-+.set rb_lcount, rb18
-+
-+# frame_base2_next
-+.set rb_base2_next, rb19
-+
-+# Setup: Height of Y+C in sand, (x&mask)*xpitch will give
-+# offset to the slice
-+.set rb_xpitch, rb20
-+
-+# These 3 consts each save 1 instruction in Y loop setup
-+# so whilst they are worthwhile they should be the 1st to die if we need
-+# another b reg
-+.set rb_y_coeffs_2, rb21 # 0x050b0a00
-+.set rb_y_coeffs_3, rb22 # 0x11283a40
-+.set rb_y_coeffs_5, rb23 # 0x0a0b0500
-+
-+# Setup: 0xff (8-bit) / 0xffff (9+ bit)
-+.set rb_pmask, rb24
-+
-+# vdw_setup_1(dst_pitch)
-+.set rb_dma1_base, rb25
-+
-+# Setup: pic width - 1
-+# In bytes so 8 bit luma is (width - 1)*1, 16 bit chroma is (width -1)*4 etc.
-+.set rb_max_x, rb26
-+
-+# vdw_setup_0 (depends on QPU number)
-+.set rb_dma0_base, rb27
-+
-+# Setup: vw_setup value to reset VPM write pointer
-+.set rb_vpm_init, rb28
-+
-+# Loop: vdw_setup_1(dst_pitch-width) = stride
-+.set rb_dma1, rb29
-+
-+# Setup: pic_height - 1
-+.set rb_max_y, rb30
-+
-+# Setup: FIR H offset
-+.set rb_fir_off_h, rb31
-+
-+
-+# With shifts only the bottom 5 bits are considered so -16=16, -15=17 etc.
-+.set i_shift16, -16
-+.set i_shift21, -11
-+.set i_shift23, -9
-+.set i_shift30, -2
-+
-+# Much of the setup code is common between Y & C
-+# Macros that express this - obviously these can't be overlapped
-+# so are probably unsuitable for loop code
-+
-+.macro m_calc_dma_regs, v_bit_depth, v_blk_height, r_vpm, r_dma
-+ mov r2, qpu_num
-+.if v_bit_depth <= 8
-+ # 8 bit version
-+ asr r1, r2, 2
-+ shl r1, r1, 6
-+ and r0, r2, 3
-+ or r0, r0, r1
-+
-+ mov r1, vpm_setup(0, 4, h8p(0, 0)) # 4 is stride - stride acts on ADDR which is Y[5:0],B[1:0] for 8 bit
-+ add r_vpm, r0, r1 # VPM 8bit storage
-+
-+ mov r1, vdw_setup_0(0, 0, dma_h8p(0,0,0)) # height,width added later
-+ shl r0, r0, 5
-+
-+.else
-+ # 16 bit version
-+ # Limited to 8 QPUs if blk height > 8
-+ asr r1, r2, 1
-+.if v_blk_height <= 8
-+ shl r1, r1, 4
-+.else
-+ shl r1, r1, 5
-+.endif
-+ and r0, r2, 1
-+ or r0, r0, r1
-+
-+ mov r1, vpm_setup(0, 2, h16p(0, 0)) # 2 is stride - stride acts on ADDR
-+ add r_vpm, r0, r1
-+
-+ # X = H * 8 so the YH from VPMVCD_WR_SETUP[ADDR] drops into
-+ # XY VPMVCD_WR_SETUP[VPMBASE] if shifted left 3 (+ 3 for pos of field in reg)
-+ mov r1, vdw_setup_0(0, 0, dma_h16p(0,0,0)) # height,width added later
-+ shl r0, r0, 6
-+.endif
-+ add r_dma, r0, r1 # DMA out
-+.endm
-+
-+
-+.macro m_setup_q0
-+ srel -, 12
-+.endm
-+
-+# Code start label
-+::mc_start
-+
-+################################################################################
-+# mc_setup_c
-+#
-+# typedef struct qpu_mc_pred_c_s_s {
-+# int16_t y;
-+# int16_t x;
-+# uint32_t base;
-+# uint32_t pic_cw; // C Width (== Y width / 2)
-+# uint32_t pic_ch; // C Height (== Y Height / 2)
-+# uint32_t stride2;
-+# uint32_t stride1;
-+# uint32_t wdenom;
-+# int16_t y2;
-+# int16_t x2;
-+# uint32_t base2;
-+# uint32_t next_fn;
-+# } qpu_mc_pred_c_s_t;
-+
-+.macro m_setup_c, v_bit_depth
-+
-+# Cannot use mul24 on x as x might be -ve, so must use shift
-+.if v_bit_depth <= 8
-+.set v_x_shift, 1
-+.set v_pmask, 0xff
-+.set v_blk_height, C_BLK_HEIGHT_8
-+.else
-+.set v_x_shift, 2
-+.set v_pmask, 0xffff
-+.set v_blk_height, C_BLK_HEIGHT_16
-+.endif
-+
-+ mov tmurs, 1 ; mov ra0, unif # No TMU swap ; x_y
-+
-+ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
-+ shl rb_ef, r0, i_shift30 ; mov ra_base, unif # ; ref_c_base
-+
-+# Read image dimensions
-+ sub r0, unif, 1 # pic c width
-+ shl rb_max_x, r0, v_x_shift # rb_max_x in bytes
-+ sub rb_max_y, unif, 1 # pic c height
-+
-+# load constants
-+ mov ra_kff800100, 0xff800100
-+ mov rb_pmask, v_pmask
-+ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
-+ mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
-+ mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
-+
-+# get source pitch
-+ mov ra_ef, rb_ef ; mov rb_xpitch, unif # ; stride2
-+ mov rb_pitch, unif # stride1
-+ mov r1, vdw_setup_1(0) # [rb_pitch delay] Merged with dst_stride shortly
-+ add rb_dma1_base, r1, rb_pitch # vdw_setup_1
-+
-+ and r0, 1, elem_num
-+ nop ; mul24 r0, r0, 5
-+.if v_bit_depth <= 8
-+ add rb_elem_x, r0, elem_num
-+.else
-+ add r0, r0, elem_num
-+ add rb_elem_x, r0, r0
-+.endif
-+
-+# Compute base address for first and second access
-+# ra_base ends up with t0s base
-+# rb_base2 ends up with t1s base
-+
-+ shl r0, ra0.16b, v_x_shift # [rb_elem_x delay]
-+ add r0, r0, rb_elem_x # Add elem no to x to get X for this slice
-+ max r0, r0, 0 ; mov ra_y, ra0.16a # ; stash Y
-+ min r0, r0, rb_max_x
-+
-+# Get shift
-+# Shift will always calculate as 0 for 9+ bit
-+# Ideally we can optimize the shift out of the code in these cases but for now
-+# it is tidier to leave it in
-+.if v_bit_depth <= 8
-+ shl ra_xshift_next, r0, 3
-+.else
-+ mov ra_xshift_next, 0 ; mov rb_xshift2_next, 0
-+.endif
-+
-+# In a single 32 bit word we get 1 or 2 UV pairs so mask bottom bits of xs if we need to
-+
-+.if v_bit_depth <= 8
-+ and r0, r0, -4
-+.endif
-+ sub r1, ra_k0, rb_pitch
-+ and r1, r0, r1
-+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+ add r0, r0, r1 ; mov ra0, unif # ; next_x2_y2
-+ add ra_base, ra_base, r0
-+
-+# Compute part of VPM to use for DMA output
-+# * We only get 8 QPUs if 16 bit - maybe reduce height and auto-loop?
-+ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base
-+
-+# And again for L1, but only worrying about frame2 stuff
-+
-+# Compute base address for first and second access
-+# ra_base ends up with t0s base
-+# rb_base2 ends up with t1s base
-+
-+ shl r0, ra0.16b, v_x_shift
-+ add r0, r0, rb_elem_x ; mov ra_y2, ra0.16a # Add QPU slice offset
-+ max r0, r0, 0 ; mov rb_base2, unif # ref_c_base2
-+ min r0, r0, rb_max_x
-+
-+# Get shift (already zero if 9+ bit so ignore)
-+.if v_bit_depth <= 8
-+ shl rb_xshift2_next, r0, 3
-+.endif
-+
-+# In a single 32 bit word we get 2 UV pairs so mask bottom bit of xs
-+
-+.if v_bit_depth <= 8
-+ and r0, r0, -4
-+.endif
-+ sub r1, ra_k0, rb_pitch
-+ and r1, r0, r1 ; mov r3, PREREAD
-+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+ add r0, r0, r1 ; mov r2, ra_y2
-+ add rb_base2, rb_base2, r0 ; mov r0, ra_y
-+
-+# Do preloads
-+# r0 = ra_y, r2 = ra_y2, r3 = PREREAD
-+
-+:1
-+ sub.setf r3, r3, 1
-+ max r1, r0, 0
-+ min r1, r1, rb_max_y
-+ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
-+ add t0s, ra_base, r1 ; mov ra_y, r0
-+
-+ max r1, r2, 0
-+ brr.anynz -, r:1b
-+ min r1, r1, rb_max_y
-+ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
-+ add t1s, rb_base2, r1 ; mov ra_y2, r2
-+# >>> .anynz 1b
-+
-+ mov ra_link, unif # link
-+# touch registers to keep simulator happy (and fills in delay slots)
-+ mov ra4, 0 ; mov rb4, 0
-+ bra -, ra_link
-+ mov ra5, 0 ; mov rb5, 0
-+ mov ra6, 0 ; mov rb6, 0
-+ mov ra7, 0 ; mov rb7, 0
-+# >>> ra_link
-+.endm
-+# Chroma setup entry points: mc_setup_c_q0 emits m_setup_q0 directly ahead of the 8-bit m_setup_c body at mc_setup_c_qn
-+::mc_setup_c_q0
-+ m_setup_q0
-+::mc_setup_c_qn
-+ m_setup_c 8
-+
-+################################################################################
-+#
-+# mc_filter_c_p
-+# Chroma P-prediction kernel; the struct below lists the uniforms in the order the macro reads them
-+# typedef struct qpu_mc_pred_c_p_s {
-+# int16_t y;
-+# int16_t x;
-+# uint32_t base;
-+# uint16_t h;
-+# uint16_t w;
-+# uint32_t coeffs_x;
-+# uint32_t coeffs_y;
-+# uint32_t wo_u;
-+# uint32_t wo_v;
-+# uint32_t dst_addr_c;
-+# uint32_t next_fn;
-+# } qpu_mc_pred_c_p_t;
-+
-+.macro m_filter_c_p, v_tmu, v_bit_depth
-+
-+.if v_bit_depth <= 8
-+.set v_x_shift, 1
-+.set v_x_mul, 2
-+.set v_v_shift, 8
-+# Shifts to get width & height in the right place in rb_dma0
-+.set v_dma_h_shift, 7
-+.set v_dma_wh_shift, i_shift16
-+.else
-+.set v_x_shift, 2
-+.set v_x_mul, 4
-+.set v_v_shift, i_shift16
-+# Shifts to get width & height in the right place in rb_dma0
-+.set v_dma_h_shift, 8
-+.set v_dma_wh_shift, 15
-+.endif
-+
-+.if v_tmu == 0
-+.set vrx_xshift, rb_xshift2 # b side more convenient
-+.set vrx_xshift_next, ra_xshift_next
-+.set vra_y_next, ra_y_next
-+.set vrx_base_next, ra_base_next
-+.set vra_y, ra_y
-+.set vra_base, ra_base
-+.set vr_txs, t0s
-+.else
-+.set vrx_xshift, ra_xshift # a side more convenient
-+.set vrx_xshift_next, rb_xshift2_next
-+.set vra_y_next, ra_y2_next
-+.set vrx_base_next, rb_base2_next
-+.set vra_y, ra_y2
-+.set vra_base, rb_base2
-+.set vr_txs, t1s
-+.endif
-+
-+# denom shift values
-+.set i_wt_den_p5, (DENOM + 13 - v_bit_depth)
-+.set i_wt_den_p6, (DENOM + 14 - v_bit_depth)
-+
-+# per-channel shifts were calculated on the *previous* invocation
-+# get base addresses and per-channel shifts for *next* invocation
-+ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y
-+
-+ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; base
-+
-+ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r0, r0 # r5 = 0
-+ add r0, r0, rb_elem_x ; mov ra_width_height, unif # r1=pitch2 mask ; width_height
-+ sub r1, r5, rb_pitch ; mov ra0, unif # ; H filter coeffs
-+ max r0, r0, r5 ; mov vrx_xshift, vrx_xshift_next
-+ min r0, r0, rb_max_x ; mov vra_y_next, ra2.16a
-+
-+.if v_bit_depth <= 8
-+ shl vrx_xshift_next, r0, 3
-+ and r0, r0, -4
-+.endif
-+ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=w*2 (we are working in pel pairs) ** x*2 already calced!
-+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+ add r0, r0, r1 ; mov ra3, unif # ; V filter coeffs
-+ add vrx_base_next, r3, r0 ; mov r1, ra_height
-+
-+# set up VPM write
-+ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U offset/weight
-+ add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
-+ add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_off_mul_l0, unif # ; V offset/weight
-+
-+# Misc final setup...
-+
-+ shl r0, r1, v_dma_h_shift ; mov ra_dest, unif # ; dst_addr
-+ add r0, r0, r2 ; mov r2, ra_fir_off_val # Combine width and height of destination area (r0=h<<v_dma_h_shift, r2=w*v_x_mul)
-+ shl r0, r0, v_dma_wh_shift ; mov rb10, ra3.8c # Shift into bits 16 upwards of the vdw_setup0 register
-+ add ra_dma0, r0, rb_dma0_base ; mov r1, ra_wt_off_l0 # ; r1=weight
-+ shl r1, r1, i_wt_den_p5 ; mul24 r0, r2, ra_wt_mul_l0
-+ sub rb_wt_off, r1, r0 ; mov r0, ra_kmul_add
-+ add ra_wt_mul_l0, ra_wt_mul_l0, r0 ; mov r5rep, -4 # ; loop counter (V FIFO fill = 4)
-+ mov rb11, ra3.8d ; mov ra_link, unif # ; Link
-+
-+# r5 = -4 (loop counter)
-+# ra_wt_mul_l0 = weight L0 + 128 (now unsigned)
-+# rb_wt_off = (offset * 2 + 1) << (wt_den + 5)
-+# rb31 = FIR value offset
-+
-+# FIFO: rb4, ra5, rb6, ra7
-+# Coeffs in ra3.8a, ra3.8b, rb10, rb11
-+
-+# We want (r0r1)
-+# U0U3 : V0V3 : U1U4 : V1V4 : U2U5 : V2V5 : ...
-+# We fetch (after shift)
-+# C0 : C3 : C1 : C4 : C2 : C5 : ...
-+
-+:1
-+# retrieve texture results and pick out bytes
-+# then submit two more texture requests
-+
-+.if v_tmu == 0
-+ sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu0
-+ shr r2, r4, vrx_xshift ; mov.ifz r3, vra_y_next
-+ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
-+ add.setf -, rb_ef, rb_ef ; mov.ifz vra_base, vrx_base_next
-+.else
-+ sub.setf -, r5, rb_i_tmu ; mov rb4, ra5 ; ldtmu1
-+ shr r2, r4, vrx_xshift ; mov.ifz vra_base, vrx_base_next
-+ shr r1, r2, v_v_shift ; mov.ifnz r3, vra_y
-+ add.setf -, rb_ef, rb_ef ; mov.ifz r3, vra_y_next # [r1 << delay]
-+.endif
-+
-+ add vra_y, r3, ra_k1 ; mov r0, r1 << 15
-+ max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
-+ min r3, r3, rb_max_y ; mov.ifnc r0, r2
-+
-+ and r1, r1, ra_pmax ; mul24 r3, r3, rb_pitch
-+.if v_tmu == 0
-+ add vr_txs, vra_base, r3 ; v8min r0, r0, rb_pmask # ; mask bytes
-+.else
-+ add vr_txs, vra_base, r3 ; v8min r0, r0, ra_pmax # ; mask bytes
-+.endif
-+
-+# apply horizontal filter
-+# The filter coeffs for the two halves of this are the same (unlike in the
-+# Y case) so it doesn't matter which ra0 we get them from
-+# Also as the two halves are locked together we don't need to separate the 1st
-+# r0 mul or the last r1 mul as they are valid for all QPUs
-+
-+ add r5rep, r5, 1 ; mul24 r3, ra0.8a, r0
-+ sub r2, rb_fir_off_h, r3 ; mul24 r3, ra0.8d, r1
-+ sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
-+ add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
-+ add.setf -, r5, r5 ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
-+
-+# V filter = - r4 * a + r5 * b + r6 * c - r7 * d (post FIFO shift)
-+# We would like to save the r5->r4 shift but we need a delay slot
-+# for both r7 & r6 which we can't find anything to put in if we have
-+# already multiplied r4 & r5!
-+ brr.anyn -, r:1b
-+ add r2, r2, r3 ; mul24 r0, ra7, rb10 # r6 post
-+ mov ra5, rb6 ; mul24 r1, rb6, ra3.8b # r5 post
-+ asr ra7, r2, v_bit_depth - 8 ; mov rb6, ra7
-+# >>> .anyn 1b
-+
-+ add r1, r1, r0 ; mul24 r0, rb4, ra3.8a # [ra7 delay]
-+ sub r1, r1, r0 ; mul24 r0, ra7, rb11
-+ sub r1, r1, r0
-+
-+ asr r1, r1, 6 ; mov r3, ra_blk_height # ; NxtLoop
-+ sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0
-+ add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add
-+ sub r1, r0, r1 ; v8subs r0, ra_height, r3 # ; NxtLoop
-+ brr.anyn -, r:1b
-+ asr r1, r1, i_wt_den_p6
-+ min r1, r1, ra_pmax ; mov -, vw_wait
-+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch # ; NxtLoop
-+# >>> .anyn 1b
-+
-+# r0 = remaining height (min 0)
-+# r2 = r3 * rb_pitch
-+# r3 = block_height
-+
-+# If looping again then we consumed 16 height last loop
-+# rb_dma1 (stride) remains constant
-+# rb_i_tmu remains const (based on total height)
-+# recalc ra_dma0, rb_lcount based on new segment height
-+
-+ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0
-+
-+# DMA out
-+ bra.anyz -, ra_link
-+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
-+ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW
-+ shl r1, r1, i_shift23
-+# >>> .anyz ra_link
-+
-+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
-+# We add to dma0 to reduce the number of output lines in the final block
-+ brr -, r:1b
-+ add rb_lcount, rb_lcount, r0
-+ add ra_dma0, ra_dma0, r1
-+ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer
-+# >>> 1b
-+.endm
-+
-+::mc_filter_c_p
-+ m_filter_c_p 0, 8
-+# Same kernel with v_tmu == 1: uses the ra_y2 / rb_base2 / t1s register set and ldtmu1
-+::mc_filter_c_p_l1
-+ m_filter_c_p 1, 8
-+
-+################################################################################
-+#
-+# mc_filter_c_b
-+# Chroma B-prediction kernel: L0 samples come from ldtmu0 and L1 samples from ldtmu1; both are filtered, weighted and summed
-+# typedef struct qpu_mc_pred_c_b_s {
-+# int16_t y;
-+# int16_t x;
-+# uint32_t base;
-+# uint16_t h;
-+# uint16_t w;
-+# uint32_t coeffs_x1;
-+# uint32_t coeffs_y1;
-+# int16_t weight_u1;
-+# int16_t weight_v1;
-+# int16_t y2;
-+# int16_t x2;
-+# uint32_t base2;
-+# uint32_t coeffs_x2;
-+# uint32_t coeffs_y2;
-+# uint32_t wo_u2;
-+# uint32_t wo_v2;
-+# uint32_t dst_addr_c;
-+# uint32_t next_fn;
-+# } qpu_mc_pred_c_b_t;
-+
-+.macro m_filter_c_b, v_bit_depth
-+
-+.if v_bit_depth <= 8
-+.set v_x_shift, 1
-+.set v_v_shift, 8
-+# Shifts to get width & height in the right place in ra_dma0
-+.set v_dma_h_shift, 7
-+.set v_dma_wh_shift, i_shift16
-+.else
-+.set v_x_shift, 2
-+.set v_v_shift, i_shift16
-+# Shifts to get width & height in the right place in ra_dma0
-+.set v_dma_h_shift, 8
-+.set v_dma_wh_shift, 15
-+.endif
-+.set v_x_mul, (1 << v_x_shift)
-+
-+# denom shift values
-+.set i_wt_den_p5, (DENOM + 13 - v_bit_depth) # not referenced in this macro
-+.set i_wt_den_p6, (DENOM + 14 - v_bit_depth)
-+
-+# per-channel shifts were calculated on the *previous* invocation
-+
-+# get base addresses and per-channel shifts for *next* invocation
-+ mov vw_setup, rb_vpm_init ; mov ra2, unif # ; x_y
-+
-+ add.setf -, rb_ef, rb_ef ; mov r3, unif # [ra2 delay] ; r3=base
-+
-+ shl r0, ra2.16b, v_x_shift ; v8subs r5rep, r1, r1 # x ; r5=0
-+ add r0, r0, rb_elem_x ; mov ra_y_next, ra2.16a
-+ sub r1, r5, rb_pitch ; mov ra_width_height, unif # r1=pitch2 mask ; width_height
-+ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
-+ min r0, r0, rb_max_x ; mov ra0, unif # ; L0 H filter coeffs
-+
-+.if v_bit_depth <= 8
-+ shl ra_xshift_next, r0, 3
-+.endif
-+
-+ and r0, r0, -4 ; mov ra2, unif # ; L0 V filter coeffs
-+ and r1, r0, r1 ; mul24 r2, ra_width, v_x_mul # r2=width*v_x_mul (we are working in pel pairs)
-+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+ add r0, r0, r1 ; mov r1, ra_height # Add stripe offsets ; r1=height
-+ add ra_base_next, r3, r0 ; mov rb_xshift2, rb_xshift2_next # ; xshift2 used because B
-+
-+# set up VPM write
-+
-+ sub rb_dma1, rb_dma1_base, r2 ; mov ra_wt_off_mul_l0, unif # Compute vdw_setup1(dst_pitch-width) ; U weight
-+ add rb_i_tmu, r1, (3-4) - PREREAD ; v8min r1, r1, ra_blk_height
-+ add rb_lcount, r1, (3-4) ; mov.ifc ra_wt_mul_l0, ra_wt_off_l0 # ; V weight
-+
-+ shl r0, r1, v_dma_h_shift ; mov ra3, unif # ; x2_y2
-+ add r0, r0, r2 ; mov r3, unif # [ra3 delay] ; base
-+ shl r0, r0, v_dma_wh_shift ; mov ra_y2_next, ra3.16a # Shift into bits 16 upwards of the vdw_setup0 register
-+ add ra_dma0, r0, rb_dma0_base ; mov r0, ra3.16b # r0=x
-+
-+# L1 - uniform layout could possibly be optimized
-+
-+ shl r0, r0, v_x_shift ; mov ra1, unif # r0=x<<shift ; L1 H filter coeffs
-+ add r0, r0, rb_elem_x ; mov ra3, unif # ; L1 V filter coeffs
-+ sub r1, r5, rb_pitch ; mov ra_wt_off_mul_l1, unif # [ra3 delay] r1=pitch2 mask ; U offset/weight
-+ max r0, r0, r5 ; mov ra9, rb_max_y
-+ min r0, r0, rb_max_x ; mov r2, ra_kmul_add
-+
-+.if v_bit_depth <= 8
-+ shl rb_xshift2_next, r0, 3
-+.endif
-+
-+ and r0, r0, -4 ; mov.ifc ra_wt_off_mul_l1, unif # ; V offset/weight
-+ and r1, r0, r1 ; mov r5rep, -4
-+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+ add r0, r0, r1 ; mov ra_dest, unif # Add stripe offsets ; dst_addr
-+ add rb_base2_next, r3, r0 ; mov r0, ra_fir_off_val
-+
-+ add ra_wt_mul_l0, ra_wt_mul_l0, r2 ; mul24 r1, r0, ra_wt_mul_l0
-+ add ra_wt_mul_l1, ra_wt_mul_l1, r2 ; mul24 r0, r0, ra_wt_mul_l1
-+ add r0, r0, r1 ; mov r1, ra_wt_off_l1 # ; L0 off unset
-+ shl r1, r1, i_wt_den_p6 ; mov rb11, ra3.8d
-+ sub rb_wt_off, r1, r0 ; mov ra_link, unif # ; link
-+
-+ mov ra10, rb_xshift2 ; mov rb7, ra2.8d
-+
-+# r5 loop counter (-4)
-+# ra0 H coeffs L0
-+# ra1 H coeffs L1
-+# ra2 V coeffs L0
-+# ra3 V coeffs L1
-+# ra9 rb_max_y alias
-+# ra10 rb_xshift2 alias
-+
-+:1
-+# retrieve texture results and pick out bytes
-+# then submit two more texture requests
-+ sub.setf -, r5, rb_i_tmu ; nop ; ldtmu0
-+ shr r2, r4, ra_xshift ; mov.ifz rb_base2, rb_base2_next
-+ shr r1, r2, v_v_shift ; mov.ifz ra_y_y2, ra_y_y2_next
-+ add.setf -, rb_ef, rb_ef ; mov.ifz ra_base, ra_base_next # [ra_y delay]
-+ add ra_y, 1, ra_y ; mov r3, ra_y
-+
-+ max r3, r3, ra_k0 ; mov r0, r1 << 15
-+ min r3, r3, ra9 ; mov.ifnc r1, r2 << 1
-+
-+ mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
-+ add t0s, ra_base, r3 ; v8min r0, r0, rb_pmask # ; masks bytes
-+
-+# L0 H-filter (-ra4*, +rb5, +rb6, -ra7)
-+
-+ and r1, r1, rb_pmask ; mul24 r2, ra0.8a, r0
-+ sub r2, rb_fir_off_h, r2 ; mul24 r3, ra0.8d, r1
-+ sub r2, r2, r3 ; mul24 r3, ra0.8b << 2, r0 << 2 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra0.8b << 12, r1 << 12 @ "mul_used", 0
-+ add r2, r2, r3 ; mul24 r3, ra0.8c << 4, r0 << 4 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra0.8c << 14, r1 << 14 @ "mul_used", 0
-+
-+ add r0, r2, r3 ; mul24 ra4, rb5, ra2.8a ; ldtmu1
-+
-+ shr r2, r4, ra10 ; mov rb5, rb6
-+ shr r1, r2, v_v_shift ; mov r3, ra_y2
-+ shr ra7, r0, v_bit_depth - 8 ; mov rb6, ra7 # [r1 << delay]
-+
-+ add ra_y2, r3, ra_k1 ; mov r0, r1 << 15
-+ max r3, r3, ra_k0 ; mov.ifnc r1, r2 << 1
-+ min r3, r3, rb_max_y ; v8min r1, r1, ra_pmax
-+
-+ mov.ifnc r0, r2 ; mul24 r3, r3, rb_pitch
-+ add t1s, rb_base2, r3 ; v8min r0, r0, ra_pmax # ; masks bytes
-+
-+# L1 H-filter (-r0*, +rb9, +rb10, -ra11)
-+
-+ add r5rep, r5, 1 ; mul24 r2, ra1.8a, r0
-+ sub r2, rb_fir_off_h, r2 ; mul24 r3, ra1.8d, r1
-+ sub r2, r2, r3 ; mul24 r3, ra1.8b << 2, r0 << 2 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra1.8b << 12, r1 << 12 @ "mul_used", 0
-+ add r2, r3, r2 ; mul24 r3, ra1.8c << 4, r0 << 4 @ "mul_used", 0
-+ add.setf -, r5, r5 ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
-+
-+ brr.anyn -, r:1b
-+ add r2, r2, r3 ; mul24 r0, rb9, ra3.8a
-+ mov rb9, rb10 ; mul24 r1, rb10, ra3.8b
-+ shr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
-+# >>> .anyn 1b
-+
-+ sub r2, r1, r0 ; mul24 r1, rb5, ra2.8b # L1 ; L0
-+ sub.setf -, r5, rb_lcount ; mov r0, ra4
-+ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
-+ add r1, r1, r0 ; mul24 r0, ra7, rb7
-+
-+ sub r1, r1, r0 ; mul24 r0, rb10, ra3.8c # L1
-+ add r2, r2, r0 ; mul24 r0, ra11, rb11 # L1
-+ sub r2, r2, r0
-+
-+ shr r1, r1, 6
-+ shr r2, r2, 6 ; mul24 r0, r1, ra_wt_mul_l0
-+ add r2, r2, r1 ; mul24 r1, r2, ra_wt_mul_l1
-+ add r1, r1, r0 ; mul24 r2, r2, ra_kmul_add
-+ sub r1, r1, r2 ; mov r3, ra_blk_height # ; NxtLoop
-+ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3 # ; NxtLoop
-+
-+ brr.anyn -, r:1b
-+ asr r1, r1, ra_wt_den_p7 # shift count comes from register ra_wt_den_p7, not an immediate
-+ min r1, r1, ra_pmax ; mov -, vw_wait
-+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch # ; NxtLoop
-+# >>> .anyn 1b
-+
-+# r0 = remaining height (min 0)
-+# r2 = r3 * rb_pitch
-+# r3 = block_height
-+
-+# If looping again then we consumed 16 height last loop
-+# rb_dma1 (stride) remains constant
-+# rb_i_tmu remains const (based on total height)
-+# recalc ra_dma0, rb_lcount based on new segment height
-+
-+ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # ; VDW setup 0
-+
-+# DMA out
-+ bra.anyz -, ra_link
-+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # ; Stride
-+ sub r1, r0, r3 ; mov vw_addr, ra_dest # ; start the VDW
-+ shl r1, r1, i_shift23
-+# >>> .anyz ra_link
-+
-+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
-+# We add to dma0 to reduce the number of output lines in the final block
-+ brr -, r:1b
-+ add rb_lcount, rb_lcount, r0
-+ add ra_dma0, ra_dma0, r1
-+ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer
-+# >>> 1b
-+.endm
-+
-+::mc_filter_c_b
-+ m_filter_c_b 8
-+
-+################################################################################
-+# Exit code used by both Luma & Chroma so place between them to avoid I-cache
-+# conflicts
-+# m_exit_drain: issues ldtmu0/ldtmu1 pairs alongside nop ALU ops (results unused), then reads vw_wait
-+.macro m_exit_drain
-+.if PREREAD == 2
-+# Special case 2 as loop is wasteful: two ldtmu0/ldtmu1 pairs, unrolled
-+ nop ; nop ; ldtmu0
-+ nop ; nop ; ldtmu1
-+ nop ; nop ; ldtmu0
-+ mov -, vw_wait ; nop ; ldtmu1
-+.else
-+ mov.setf r3, PREREAD - 1 # general case: r3 counts the remaining pairs down
-+:1
-+ brr.anynz -, r:1b
-+ nop ; nop ; ldtmu0
-+ nop ; nop ; ldtmu1
-+ sub.setf r3, r3, 1
-+ # >>> .anynz 1b
-+ mov -, vw_wait
-+.endif
-+.endm
-+
-+# This sync layout groups QPUs 0-3, 4-7, 8-11 (i.e. 1 group per TMU pair)
-+# All qpus start at the beginning and after that (group - 1) must have finished
-+# before (group) can start
-+#
-+# Requires setup code for QPU 0 to srel sem 12 (m_setup_q0) to start the chain
-+# Exit code will sacq sem 12 so everything is @ 0 on exit (this is important -
-+# lockup otherwise)
-+#
-+# There is some, currently ill defined, potential lockup if we have the VDM active
-+# whilst doing sem stuff so we wait first. ?? QPU stall from sem stalls VDM pipe too ??
-+#
-+# The code stalled when I had many waiters on a single sem so we have a
-+# "ripple" of srels to restart. Unsure why, may have been bug, but this works
-+# and we currently have both the memory & sems to support it.
-+.macro m_sync_q, n_qpu, n_quads
-+# Do not generate code for qpu >= quads * 4 - fns should never be called
-+.if n_qpu < n_quads * 4
-+ mov ra_link, unif # Can only branch to an a reg (not r0)
-+ mov -, vw_wait # [ra_link delay]
-+# Semaphore numbering: n_sem_sync = index of the first QPU in this quad, n_sem_in = this QPU, n_sem_out = the next QPU
-+.set n_sem_sync, n_qpu - (n_qpu % 4)
-+.set n_sem_in, n_qpu
-+.set n_sem_out, n_qpu + 1
-+
-+.if n_qpu % 4 == 0
-+# First QPU of a quad: the three sacq below pair with the srel on n_sem_sync issued by each of the other three members (.else branch)
-+.set n_sem_quad_in, 12 + n_qpu / 4
-+.set n_sem_quad_out, 12 + (((n_qpu / 4) + 1) % n_quads)
-+# Quad chain semaphores are 12 + quad index; the outgoing index wraps modulo n_quads
-+ sacq -, n_sem_sync
-+ sacq -, n_sem_sync
-+ sacq -, n_sem_sync
-+ bra -, ra_link
-+ sacq -, n_sem_quad_in
-+ srel -, n_sem_out
-+ srel -, n_sem_quad_out
-+
-+.else
-+ bra -, ra_link
-+ srel -, n_sem_sync
-+ sacq -, n_sem_in
-+.if n_sem_out % 4 != 0
-+ srel -, n_sem_out
-+.else
-+ nop # last QPU of the quad: no in-quad successor to release, so pad the slot after bra
-+.endif
-+.endif
-+.endif
-+.endm
-+
-+.set v_quads8, N_QPU_8 / 4
-+# One entry point per QPU index 0-11; m_sync_q emits no code for indices >= v_quads8 * 4
-+::mc_sync_q0
-+ m_sync_q 0, v_quads8
-+::mc_sync_q1
-+ m_sync_q 1, v_quads8
-+::mc_sync_q2
-+ m_sync_q 2, v_quads8
-+::mc_sync_q3
-+ m_sync_q 3, v_quads8
-+::mc_sync_q4
-+ m_sync_q 4, v_quads8
-+::mc_sync_q5
-+ m_sync_q 5, v_quads8
-+::mc_sync_q6
-+ m_sync_q 6, v_quads8
-+::mc_sync_q7
-+ m_sync_q 7, v_quads8
-+::mc_sync_q8
-+ m_sync_q 8, v_quads8
-+::mc_sync_q9
-+ m_sync_q 9, v_quads8
-+::mc_sync_q10
-+ m_sync_q 10, v_quads8
-+::mc_sync_q11
-+ m_sync_q 11, v_quads8
-+
-+# mc_exit()
-+# Chroma & Luma the same now
-+# m_exit_qn: run m_exit_drain, then signal thrend; the two nops are the instructions that follow it
-+.macro m_exit_qn
-+ m_exit_drain
-+ nop ; nop ; thrend
-+ nop
-+ nop
-+# >>> thrend <<<
-+.endm
-+# Both labels below share the single copy of the exit code
-+::mc_exit_c_qn
-+::mc_exit_y_qn
-+ m_exit_qn
-+
-+
-+
-+# mc_interrupt_exit12()
-+# m_exit_q0: as m_exit_qn, but first acquires sem 12 and writes 1 to the interrupt register right after thrend
-+.macro m_exit_q0
-+ m_exit_drain
-+ sacq -, 12 # exit must leave sem 12 at 0 (see the sync notes above m_sync_q)
-+ nop ; nop ; thrend
-+ mov interrupt, 1
-+ nop
-+# >>> thrend <<<
-+.endm
-+
-+::mc_exit_c_q0
-+::mc_exit_y_q0
-+ m_exit_q0
-+
-+# LUMA CODE
-+
-+# The idea is to form B predictions by doing 8 pixels from ref0 in parallel with 8 pixels from ref1.
-+# For P frames we make the second x,y coordinates offset by +8
-+
-+
-+################################################################################
-+# mc_setup
-+# Luma setup: loads constants, picture limits and pitches from uniforms, then issues the first PREREAD t0s/t1s request pairs
-+# typedef struct qpu_mc_pred_y_s_s {
-+# qpu_mc_src_t next_src1;
-+# qpu_mc_src_t next_src2;
-+# uint16_t pic_h;
-+# uint16_t pic_w;
-+# uint32_t stride2;
-+# uint32_t stride1;
-+# uint32_t wdenom;
-+# uint32_t next_fn;
-+# } qpu_mc_pred_y_s_t;
-+
-+.macro m_setup_y, v_bit_depth
-+
-+# Cannot use mul24 on x as x might be -ve, so must use shift
-+.if v_bit_depth <= 8
-+.set v_x_shift, 0
-+.set v_pmask, 0xff
-+.set v_blk_height, Y_BLK_HEIGHT_8
-+.else
-+.set v_x_shift, 1
-+.set v_pmask, 0xffff
-+.set v_blk_height, Y_BLK_HEIGHT_16
-+.endif
-+
-+
-+ # Need to save these because we need to know the frame dimensions before computing texture coordinates
-+ mov tmurs, 1 ; mov ra0, unif # No TMU swap ; x_y
-+ mov ra9, unif # ref_y_base
-+ mov ra1, unif # x2_y2
-+
-+
-+# load constants
-+ mov r0, [0,2,0,2,0,2,0,2,1,3,1,3,1,3,1,3]
-+ shl rb_ef, r0, i_shift30 ; mov ra11, unif # ; ref_y2_base
-+
-+ mov ra_kff800100, 0xff800100
-+ mov rb_pmask, v_pmask
-+ mov ra_blk_height_pmax, ((1 << v_bit_depth) - 1) | (v_blk_height << 16)
-+ mov rb_fir_off_h, (FIR_OFFSET << (v_bit_depth - 8))
-+ mov ra_fir_off_val_wt_den_p7, (FIR_OFFSET << 16) | (DENOM + 15 - v_bit_depth)
-+ mov rb_y_coeffs_2, 0x050b0a00
-+ mov rb_y_coeffs_3, 0x11283a40
-+ mov rb_y_coeffs_5, 0x0a0b0500
-+
-+# Compute part of VPM to use
-+
-+# Read image dimensions
-+ mov ra3, unif # width_height
-+ mov ra_ef, rb_ef ; mov rb_xpitch, unif # [ra3 delay] ; stride2
-+.if v_x_shift == 0
-+ sub rb_max_x, ra3.16b, 1
-+.else
-+ sub r0, ra3.16b, 1
-+ shl rb_max_x, r0, v_x_shift
-+.endif
-+ sub rb_max_y, ra3.16a, 1
-+ mov r3, elem_num ; mov rb_pitch, unif # stride1
-+
-+# get destination pitch
-+ mov r1, vdw_setup_1(0) # [rb_pitch delay]
-+ or rb_dma1_base, r1, rb_pitch
-+
-+# Compute base address for first and second access
-+ add r0, ra0.16b, r3 # Load x + elem_num
-+.if v_x_shift != 0
-+ shl r0, r0, v_x_shift
-+.endif
-+ max r0, r0, 0
-+ min r0, r0, rb_max_x
-+ shl ra_xshift_next, r0, 3 # Compute shifts
-+
-+# X is byte offset - we can only load words - mask
-+
-+ and r0, r0, -4 ; v8subs r2, r2, r2
-+ sub r2, r2, rb_pitch
-+ and r1, r0, r2
-+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+ add r0, r0, r1 # Add stripe offsets
-+ add ra_base, ra9, r0
-+
-+ # r3 still contains elem_num
-+ add r0, ra1.16b, r3 # Load x
-+.if v_x_shift != 0
-+ shl r0, r0, v_x_shift
-+.endif
-+ max r0, r0, 0
-+ min r0, r0, rb_max_x
-+ shl rb_xshift2_next, r0, 3 # Compute shifts
-+
-+ # r2 still contains mask
-+ and r0, r0, -4
-+ and r1, r0, r2
-+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+ add r0, r0, r1 # Add stripe offsets
-+ add rb_base2, ra11, r0
-+
-+# Do preloads
-+ nop ; mov r0, ra0.16a # ; r0 = y
-+ mov r3, PREREAD ; mov r2, ra1.16a # ; r2 = y2
-+
-+:1
-+ sub.setf r3, r3, 1
-+ max r1, r0, 0
-+ min r1, r1, rb_max_y
-+ add r0, r0, ra_k1 ; mul24 r1, r1, rb_pitch
-+ add t0s, ra_base, r1 ; mov ra_y, r0
-+
-+ max r1, r2, 0
-+ brr.anynz -, r:1b
-+ min r1, r1, rb_max_y
-+ add r2, r2, ra_k1 ; mul24 r1, r1, rb_pitch
-+ add t1s, rb_base2, r1 ; mov ra_y2, r2
-+# >>> .anynz 1b
-+
-+ m_calc_dma_regs v_bit_depth, v_blk_height, rb_vpm_init, rb_dma0_base
-+# NOTE(review): no unif read for the struct's wdenom field is visible in this macro body - confirm how that word is consumed
-+ mov ra_link, unif # Next fn
-+
-+# touch vertical context to keep simulator happy
-+ mov ra8, 0 ; mov rb8, 0 # [ra_link delay]
-+ bra -, ra_link
-+ mov ra9, 0 ; mov rb9, 0
-+ mov ra10, 0 ; mov rb10, 0
-+ mov ra11, 0 ; mov rb11, 0
-+# >>> ra_link
-+.endm
-+# Luma setup entry points: mc_setup_y_q0 emits m_setup_q0 directly ahead of the 8-bit m_setup_y body at mc_setup_y_qn
-+::mc_setup_y_q0
-+ m_setup_q0
-+::mc_setup_y_qn
-+ m_setup_y 8
-+
-+################################################################################
-+#
-+# Start of per-block setup code
-+# P and B blocks share the same setup code to save on Icache space
-+
-+# get base addresses and per-channel shifts for *next* invocation
-+# per-channel shifts were calculated on the *previous* invocation
-+
-+# 1st 3 instructions of per_block-setup in branch delay
-+#
-+# typedef struct qpu_mc_pred_y_p_s {
-+# qpu_mc_src_t next_src1;
-+# qpu_mc_src_t next_src2;
-+# uint16_t h;
-+# uint16_t w;
-+# uint32_t mymx21;
-+# uint32_t wo1;
-+# uint32_t wo2;
-+# uint32_t dst_addr;
-+# uint32_t next_fn;
-+# } qpu_mc_pred_y_p_t;
-+#
-+# m_luma_setup: brr to per_block_setup_N with ra_link as the branch destination register; m_per_block_setup ends with bra -, ra_link
-+.macro m_luma_setup, v_bit_depth
-+# Hack - QASM may well have label pasting but I have no idea how... (bit depths other than 8 or 10 emit no branch here)
-+.if v_bit_depth == 8
-+ brr ra_link, r:per_block_setup_8
-+.elif v_bit_depth == 10
-+ brr ra_link, r:per_block_setup_10
-+.endif
-+ mov ra0, unif ; mov r3, elem_num # y_x ; elem_num has implicit unpack??
-+ add.setf -, rb_ef, rb_ef ; v8subs r5rep, r2, r2 # [ra0 delay] ; r5 = 0
-+ add r0, ra0.16b, r3 ; mov rb_xshift2, rb_xshift2_next
-+.endm
-+
-+.macro m_per_block_setup, v_bit_depth
-+
-+.if v_bit_depth <= 8
-+.set v_x_shift, 0
-+.set v_x_mul, 1
-+# Shifts to get width & height in the right place in ra_dma0
-+.set v_dma_h_shift, 7
-+.set v_dma_wh_shift, i_shift16
-+.else
-+.set v_x_shift, 1
-+.set v_x_mul, 2
-+# Shifts to get width & height in the right place in ra_dma0
-+.set v_dma_h_shift, 8
-+.set v_dma_wh_shift, 15
-+.endif
-+# On entry (set by the m_luma_setup instructions after its brr): ra0 = y_x, r3 = elem_num, r5 = 0, r0 = ra0.16b + elem_num
-+.if v_x_shift != 0
-+ shl r0, r0, v_x_shift
-+.endif
-+ max r0, r0, r5 ; mov ra_xshift, ra_xshift_next
-+ min r0, r0, rb_max_x
-+
-+ shl ra_xshift_next, r0, 3 # Compute shifts
-+ and r0, r0, -4
-+ sub r2, r5, rb_pitch ; mov ra_base_next, unif # ; src1.base
-+ and r1, r0, r2 ; mov ra_y_next, ra0.16a
-+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+ add r0, r0, r1 ; mov ra1, unif # Add stripe offsets ; src2.x_y
-+ add ra_base_next, ra_base_next, r0 # [ra1 delay]
-+
-+ add r0, ra1.16b, r3 # Load x2
-+.if v_x_shift != 0
-+ shl r0, r0, v_x_shift
-+.endif
-+ max r0, r0, r5 ; mov ra_y2_next, ra1.16a
-+ min r0, r0, rb_max_x ; mov rb_base2_next, unif # ; src2.base
-+ shl rb_xshift2_next, r0, 3 # Compute shifts
-+ and r0, r0, -4 ; mov ra_width_height, unif # ; width_height
-+ and r1, r0, r2 ; mov vw_setup, rb_vpm_init # ; set up VPM write
-+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+ add r0, r0, r1 ; mul24 r1, ra_width, v_x_mul # Add stripe offsets ; r1 = width in bytes
-+ add rb_base2_next, rb_base2_next, r0
-+
-+# get width,height of block (unif load above), r1 = width * pel_size
-+ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height # Compute vdw_setup1(dst_pitch-width)
-+ add rb_i_tmu, r0, (7-8) - PREREAD ; v8min r0, r0, ra_blk_height
-+ add rb_lcount, r0, (7-8)
-+ shl r0, r0, v_dma_h_shift ; mov r3, ra_kmul_add # ; r3 return val
-+ add r0, r0, r1 # Combine width and height of destination area
-+ shl r0, r0, v_dma_wh_shift ; mov r2, ra_fir_off_val # Shift into bits 16 upwards of the vdw_setup0 register ; r2 return val
-+ add ra_dma0, r0, rb_dma0_base ; mov r0, unif # ; Packed filter offsets
-+
-+# get filter coefficients and discard unused B frame values
-+ shl.ifnn r0, r0, i_shift16 ; mov ra_wt_off_mul_l0, unif # Pick half to use ; L0 offset/weight
-+ shl ra8, r0, 3 ; mov rb5, ra_k255
-+
-+# Coeffs are all abs values here as that means mul24 works (no sign extend from .8)
-+
-+# 2nd half coeffs same as first if we can swap 8<->24 in the rotate val
-+# but I can't see a way of doing that that is cheap enough to be worth it
-+
-+# Picked out in a slightly random order to space out uniform loads
-+
-+ # 1
-+ mov r1, 0x01040400 # [ra8 delay]
-+ ror ra2.8b, r1, ra8.8d
-+ ror ra0.8b, r1, ra8.8c
-+ # 2
-+ ror ra2.8c, rb_y_coeffs_2, ra8.8d
-+ ror ra0.8c, rb_y_coeffs_2, ra8.8c
-+ # 0
-+ mov r1,0x00010100 # -ve [ra8 delay]
-+ ror r0, r1, ra8.8d ; mov ra_wt_off_mul_l1, unif # ; L1 Wt/Offset
-+ ror ra0.8a, r1, ra8.8c ; v8min rb4, r0, rb5
-+ # 7
-+ shl r1, r1, 8 ; mov.ifn ra_wt_off_mul_l0, ra_wt_off_mul_l1 # r1 = 0x01010000
-+ ror r0, r1, ra8.8d ; mov ra_dest, unif # ; Destination address
-+ ror ra1.8d, r1, ra8.8c ; v8min rb11, r0, rb5
-+ # 3
-+ ror ra2.8d, rb_y_coeffs_3, ra8.8d
-+ ror ra0.8d, rb_y_coeffs_3, ra8.8c
-+ # 5
-+ ror ra3.8b, rb_y_coeffs_5, ra8.8d
-+ ror ra1.8b, rb_y_coeffs_5, ra8.8c
-+ # 6
-+ mov r1,0x04040100
-+ ror ra3.8c, r1, ra8.8d
-+ ror ra1.8c, r1, ra8.8c ; mov r5rep, -8 # ; r5 return val
-+
-+ bra -, ra_link
-+ # 4
-+ mov r1,0x3a281100
-+ ror r0, r1, ra8.8d ; mov ra_link, unif # ; link - load after we've used its previous val
-+ ror ra1.8a, r1, ra8.8c ; v8min rb8, r0, rb5
-+# >>> branch ra_link
-+
-+# r5 = -8
-+# r2 = fir_off_val
-+# r3 = 128
-+.endm
-+
-+:per_block_setup_8
-+ m_per_block_setup 8
-+
-+
-+
-+################################################################################
-+#
-+# mc_filter_y_pxx
-+# Luma P prediction: 8-term horizontal filter, 8-term vertical filter, weighting, VPM write and DMA out
-+# Setup (& therefore uniform struct) shared with _bxx
-+# Struct in m_luma_setup
-+#
-+# We can have 2 separate P reqs here as long as they mate to generate a
-+# rectangular output block (i.e. h0 = h1, w0 = 8)
-+#
-+# At this point we have already issued PREREAD pairs of texture requests for the current block
-+
-+.macro m_filter_y_pxx, v_bit_depth
-+
-+# denom shift values
-+.set i_wt_den_p5, (DENOM + 13 - v_bit_depth)
-+.set i_wt_den_p6, (DENOM + 14 - v_bit_depth)
-+
-+ m_luma_setup v_bit_depth
-+# r2, r3 and r5 used below are the per_block_setup return values (ra_fir_off_val, ra_kmul_add, -8)
-+ shl r1, ra_wt_off_l0, i_wt_den_p5
-+ add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0 # r2 = 0x4000 so mul24 safe even with -ve wt_mul
-+ sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4
-+
-+# retrieve texture results and pick out bytes
-+# then submit two more texture requests
-+
-+# This loop is identical to the B loop from here --->
-+:1
-+ add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef
-+
-+ max r2, ra_y, 0 ; mov r1, 0
-+ min r2, r2, rb_max_y ; mov r3, ra_k1
-+ add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0
-+ add t0s, ra_base, r2 ; mov rb5, rb6
-+ shr r0, r4, ra_xshift ; mov rb6, rb7
-+
-+ max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 # ; masks out all but wanted bytes
-+ shr r1, r4, rb_xshift2 ; mov rb7, ra8
-+ min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax
-+ add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch
-+ add t1s, rb_base2, r2 ; mov ra8, ra9
-+
-+# apply horizontal filter
-+ add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0
-+ mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0
-+ sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0
-+ add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
-+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
-+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
-+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
-+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
-+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
-+ add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
-+
-+ brr.anyn -, r:1b
-+ sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b
-+ mov ra9, rb10 ; mul24 r0, rb10, ra3.8b
-+ asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
-+ # >>> .anyn 1b (r5 + r5)
-+
-+ # apply vertical filter and write to VPM
-+ # - r4* + r5 - r6 + r7 + r8 - r9 + r10 - r11
-+
-+ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
-+ sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d
-+ add r1, r1, r0 ; mul24 r0, ra8, rb8
-+ add r1, r1, r0 ; mul24 r0, rb10, ra3.8c
-+ add r1, r1, r0 ; mul24 r0, ra11, rb11
-+# <--- to here
-+ sub.setf -, r5, rb_i_tmu ; mov r3, ra_blk_height # ; NxtLoop: r3 = block height
-+ sub r1, r1, ra4 ; mov.ifz rb_base2, rb_base2_next
-+ sub r1, r1, r0 ; mov.ifz ra_base, ra_base_next
-+
-+ asr r1, r1, 6 ; mov.ifz ra_y_y2, ra_y_y2_next
-+ sub.setf -, r5, rb_lcount ; mul24 r0, r1, ra_wt_mul_l0
-+ add r0, r0, rb_wt_off ; mul24 r1, r1, ra_kmul_add
-+ sub r1, r0, r1 ; v8subs r0, ra_height, r3 # ; NxtLoop: r0 = remaining height (0 saturate)
-+
-+ brr.anyn -, r:1b
-+ asr r1, r1, i_wt_den_p6
-+ min r1, r1, ra_pmax ; mov -, vw_wait
-+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch # ; NxtLoop
-+# >>> branch.anyn 1b (r5 - rb_lcount)
-+
-+# r0 = remaining height (min 0)
-+# r2 = r3 * rb_pitch
-+# r3 = block_height
-+
-+# If looping again then we consumed 16 height last loop
-+# rb_dma1 (stride) remains constant
-+# rb_i_tmu remains const (based on total height)
-+# recalc ra_dma0, rb_lcount based on new segment height
-+
-+ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0
-+
-+# DMA out
-+ bra.anyz -, ra_link
-+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
-+ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW
-+ shl r1, r1, i_shift23
-+# >>> .anyz ra_link
-+
-+# Here r1 = cur_blk_height - 16 so it will be 0 or -ve
-+# We add to dma0 to reduce the number of output lines in the final block
-+ brr -, r:1b
-+ add rb_lcount, rb_lcount, r0
-+ add ra_dma0, ra_dma0, r1
-+ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VDM write pointer
-+# >>> 1b
-+.endm
-+
-+::mc_filter_y_pxx
-+ m_filter_y_pxx 8
-+
-+
-+################################################################################
-+
-+# mc_filter_b(y_x, base, y2_x2, base2, width_height, my2_mx2_my_mx, offsetweight0, this_dst, next_kernel)
-+#
-+# Setup (& therefore uniform struct) shared with _pxx
-+# Struct in m_luma_setup
-+#
-+# l0 calc in els 0-7, L1 in 8-15
-+# Only els 0-7 write data that is stored back to ram (els 8-15 may write tosh)
-+#
-+# At this point we have already issued PREREAD pairs of texture requests for the current block
-+
-+.macro m_filter_y_bxx, v_bit_depth
-+# Luma B filter body: v_bit_depth sets the weight-denominator shifts below and is passed on to m_luma_setup
-+# denom shift values
-+.set i_wt_den_p5, (DENOM + 13 - v_bit_depth)
-+.set i_wt_den_p6, (DENOM + 14 - v_bit_depth)
-+
-+ m_luma_setup v_bit_depth
-+
-+ shl r1, ra_wt_off_l0, i_wt_den_p6
-+ add ra_wt_mul_l0, ra_wt_mul_l0, r3 ; mul24 r0, r2, ra_wt_mul_l0
-+ sub r1, r1, r0 ; mul24 r0, r2, ra_wt_mul_l1
-+ sub rb_wt_off, r1, r0 ; mov ra_ef.8a, rb4
-+
-+# This loop is identical to the P loop from here --->
-+:1
-+ add.setf -, ra_ef, ra_ef ; mul24 ra4, rb5, ra_ef
-+
-+ max r2, ra_y, 0 ; mov r1, 0
-+ min r2, r2, rb_max_y ; mov r3, ra_k1
-+ add ra_y, ra_y, r3 ; mul24 r2, r2, rb_pitch ; ldtmu0
-+ add t0s, ra_base, r2 ; mov rb5, rb6
-+ shr r0, r4, ra_xshift ; mov rb6, rb7
-+
-+ max r2, ra_y2, r1 ; v8min r0, r0, rb_pmask ; ldtmu1 # ; masks out all but wanted bytes
-+ shr r1, r4, rb_xshift2 ; mov rb7, ra8
-+ min r2, r2, rb_max_y ; v8min r1, r1, ra_pmax
-+ add ra_y2, ra_y2, r3 ; mul24 r2, r2, rb_pitch
-+ add t1s, rb_base2, r2 ; mov ra8, ra9
-+
-+# apply horizontal filter
-+ add r5rep, r5, r3 ; mul24 r2, ra0.8a << 8, r1 << 8 @ "mul_used", 0
-+ mov r3, rb_fir_off_h ; mul24.ifnn r2, ra0.8a, r0
-+ sub r2, r3, r2 ; mul24 r3, ra0.8b << 1, r0 << 1 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra0.8b << 9, r1 << 9 @ "mul_used", 0
-+ add r2, r2, r3 ; mul24 r3, ra0.8c << 2, r0 << 2 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra0.8c << 10, r1 << 10 @ "mul_used", 0
-+ sub r2, r2, r3 ; mul24 r3, ra0.8d << 3, r0 << 3 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra0.8d << 11, r1 << 11 @ "mul_used", 0
-+ add r2, r2, r3 ; mul24 r3, ra1.8a << 4, r0 << 4 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra1.8a << 12, r1 << 12 @ "mul_used", 0
-+ add r2, r2, r3 ; mul24 r3, ra1.8b << 5, r0 << 5 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra1.8b << 13, r1 << 13 @ "mul_used", 0
-+ sub r2, r2, r3 ; mul24 r3, ra1.8c << 6, r0 << 6 @ "mul_used", 0
-+ nop ; mul24.ifn r3, ra1.8c << 14, r1 << 14 @ "mul_used", 0
-+ add r2, r2, r3 ; mul24 r3, ra1.8d << 7, r0 << 7 @ "mul_used", 0
-+ add.setf -, r5, r5 ; mul24.ifn r3, ra1.8d << 15, r1 << 15 @ "mul_used", 0
-+
-+ brr.anyn -, r:1b
-+ sub r2, r2, r3 ; mul24 r1, rb5, ra2.8b
-+ mov ra9, rb10 ; mul24 r0, rb10, ra3.8b
-+ asr ra11, r2, v_bit_depth - 8 ; mov rb10, ra11
-+ # >>> .anyn 1b (r5 + r5)
-+
-+ # apply vertical filter and write to VPM
-+ # - r4* + r5 - r6 + r7 + r8 - r9 + r10 - r11
-+
-+ sub r1, r1, r0 ; mul24 r0, rb6, ra2.8c
-+ sub r1, r1, r0 ; mul24 r0, rb7, ra2.8d
-+ add r1, r1, r0 ; mul24 r0, ra8, rb8
-+ add r1, r1, r0 ; mul24 r0, rb10, ra3.8c
-+ add r1, r1, r0 ; mul24 r0, ra11, rb11
-+# <--- to here
-+ sub r1, r1, ra4
-+ sub r1, r1, r0 ; mov r2, rb_wt_off
-+
-+ asr r1, r1, 6
-+ sub.setf -, r5, rb_i_tmu ; mul24 r0, r1, ra_wt_mul_l0
-+ mov.ifz rb_base2, rb_base2_next ; mul24 r1, r1, ra_kmul_add
-+ sub r1, r0, r1 ; mov.ifz ra_y_y2, ra_y_y2_next
-+ sub.setf -, r5, rb_lcount ; mov.ifz ra_base, ra_base_next
-+ add r1, r1, r2 ; mov r0, r1 << 8
-+ add r1, r1, r0 ; mov r3, ra_blk_height # ; NxtLoop: r3 = block height
-+
-+ brr.anyn -, r:1b
-+ asr r1, r1, ra_wt_den_p7 ; mul24 r2, r3, rb_pitch # ; NxtLoop
-+ min r1, r1, ra_pmax ; mov -, vw_wait
-+ max vpm, r1, 0 ; v8subs r0, ra_height, r3 # ; NxtLoop: r0 = remaining height (0 saturate)
-+# >>> branch.anyn 1b (r5 - rb_lcount)
-+
-+# r0 = remaining height (min 0)
-+# r2 = r3 * rb_pitch
-+# r3 = block_height
-+
-+# If looping again then we consumed block_height last loop
-+# rb_dma1 (stride) remains constant
-+# rb_i_tmu remains const (based on total height)
-+# recalc ra_dma0, rb_lcount based on new segment height
-+
-+ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0
-+
-+# DMA out
-+ bra.anyz -, ra_link
-+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
-+ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW
-+ shl r1, r1, i_shift23
-+# >>> .anyz ra_link (ra_height - remaining height)
-+
-+# Here r1 = cur_blk_height - blk_height so it will be 0 or -ve
-+# We add to dma0 to reduce the number of output lines in the final block
-+ brr -, r:1b
-+ add rb_lcount, rb_lcount, r0
-+ add ra_dma0, ra_dma0, r1
-+ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VPM write pointer
-+# >>> 1b
-+.endm
-+
-+::mc_filter_y_bxx
-+ m_filter_y_bxx 8 # v_bit_depth = 8
-+
-+################################################################################
-+#
-+# typedef struct qpu_mc_pred_y_p00_s {
-+# qpu_mc_src_t next_src1;
-+# uint16_t h;
-+# uint16_t w;
-+# uint32_t wo1;
-+# uint32_t dst_addr;
-+# uint32_t next_fn;
-+# } qpu_mc_pred_y_p00_t;
-+
-+.macro m_filter_y_p00, v_bit_depth
-+# Luma P block using weight/offset only (no filter taps in the loop); uniforms are read in qpu_mc_pred_y_p00_t field order
-+.if v_bit_depth <= 8
-+.set v_x_shift, 0
-+.set v_x_mul, 1
-+# Shifts to get width & height in the right place in ra_dma0
-+.set v_dma_h_shift, 7
-+.set v_dma_wh_shift, i_shift16
-+.else
-+.set v_x_shift, 1
-+.set v_x_mul, 2
-+# Shifts to get width & height in the right place in ra_dma0
-+.set v_dma_h_shift, 8
-+.set v_dma_wh_shift, 15
-+.endif
-+
-+ mov ra0, unif ; mov r0, elem_num # y_x
-+ mov ra_xshift, ra_xshift_next ; v8subs r5rep, r5, r5 # [ra0 delay] ; r5 = 0
-+ add r0, ra0.16b, r0 ; mov ra_base_next, unif # ; src1.base
-+.if v_x_shift != 0
-+ shl r0, r0, v_x_shift
-+.endif
-+
-+ max r0, r0, r5 ; mov ra_y_next, ra0.16a
-+ min r0, r0, rb_max_x ; mov ra_width_height, unif # ; width_height
-+
-+ shl ra_xshift_next, r0, 3 # Compute shifts
-+ and r0, r0, -4
-+ sub r2, r5, rb_pitch ; mov ra_wt_off_mul_l0, unif # ; weight_offset
-+ and r1, r0, r2
-+ xor r0, r0, r1 ; mul24 r1, r1, rb_xpitch
-+ add r0, r0, r1 ; mov ra_dest, unif # Add stripe offsets ; dest addr
-+ add ra_base_next, ra_base_next, r0 ; mov vw_setup, rb_vpm_init # [ra_width delay] ; set up VPM write
-+
-+# get width,height of block (unif load above)
-+# Compute vdw_setup1(dst_pitch-width)
-+ shl r1, ra_width, v_x_shift
-+ sub rb_dma1, rb_dma1_base, r1 ; mov r0, ra_height
-+ sub rb_i_tmu, r0, PREREAD ; v8min r0, r0, ra_blk_height
-+ shl r0, r0, v_dma_h_shift ; mov rb_lcount, r0
-+ add r0, r0, r1 # Combine width and height of destination area
-+ shl rb_wt_off, ra_wt_off_l0, DENOM + 7
-+ shl r0, r0, v_dma_wh_shift ; mov ra_link, unif # Shift into bits 16 upwards of the vdw_setup0 register ; link
-+ add ra_dma0, r0, rb_dma0_base
-+
-+:1
-+ sub.setf -, r5, rb_i_tmu ; v8adds r5rep, r5, ra_k1
-+ nop ; mov.ifz ra_y, ra_y_next ; ldtmu0
-+ shr r0, r4, ra_xshift ; mov r3, rb_pitch
-+
-+ max r2, ra_y, 0 # y
-+ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
-+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
-+ add t0s, ra_base, r2 ; v8min r0, r0, rb_pmask
-+
-+ sub.setf -, r5, rb_lcount ; mul24 r1, r0, ra_wt_mul_l0
-+ shl r1, r1, 8 ; mov r3, ra_blk_height
-+ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
-+
-+ brr.anyn -, r:1b
-+ asr r1, r1, DENOM + 8
-+ min r1, r1, ra_pmax ; mov -, vw_wait
-+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
-+# >>> branch.anyn 1b
-+
-+# r0 = remaining height (min 0)
-+# r2 = r3 * rb_pitch
-+# r3 = block_height
-+
-+# If looping again then we consumed block_height (r3 = ra_blk_height) last loop
-+# rb_dma1 (stride) remains constant
-+# rb_i_tmu remains const (based on total height)
-+# recalc ra_dma0, rb_lcount based on new segment height
-+
-+ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # VDW setup 0
-+
-+# DMA out
-+ bra.anyz -, ra_link
-+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # Stride
-+ sub r1, r0, r3 ; mov vw_addr, ra_dest # start the VDW
-+ shl r1, r1, i_shift23
-+# >>> .anyz ra_link
-+
-+# Here r1 = cur_blk_height - blk_height so it will be 0 or -ve
-+# We add to dma0 to reduce the number of output lines in the final block
-+ brr -, r:1b
-+ add rb_lcount, rb_lcount, r0
-+ add ra_dma0, ra_dma0, r1
-+ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VPM write pointer
-+# >>> 1b
-+.endm
-+
-+::mc_filter_y_p00
-+ m_filter_y_p00 8 # v_bit_depth = 8, so the ".if v_bit_depth <= 8" settings apply
-+
-+################################################################################
-+
-+.macro m_filter_y_b00, v_bit_depth
-+# luma setup does a fair bit more than we need calculating filter coeffs
-+# that we will never use but it saves I-cache to use it (also simple!)
-+ m_luma_setup v_bit_depth
-+
-+# Fix up vals that were expecting a filter (somewhat icky)
-+ mov r2, 1
-+ add rb_i_tmu, rb_i_tmu, r2 ; mov r1, ra_wt_off_mul_l0 # Need in rX rather than raX for <<8 to do what we want
-+ shl rb_wt_off, ra_wt_off_l0, DENOM + 8 ; v8subs r5quad, r5, r5 # [r1 << delay] ; r5quad OK for zero
-+ nop ; mov.ifnz ra_wt_off_mul_l0, r1 << 8
-+
-+:1
-+ sub.setf -, r5, rb_i_tmu ; nop ; ldtmu1
-+ shr r1, r4, rb_xshift2 ; mov.ifz ra_y_y2, ra_y_y2_next ; ldtmu0
-+ shr r0, r4, ra_xshift ; mov r3, rb_pitch
-+
-+ max r2, ra_y, 0 # y
-+ min r2, r2, rb_max_y ; mov.ifz ra_base, ra_base_next
-+ add ra_y, ra_y, 1 ; mul24 r2, r2, r3
-+ add t0s, ra_base, r2 ; mov.ifz rb_base2, rb_base2_next
-+
-+ max r2, ra_y2, 0
-+ min r2, r2, rb_max_y
-+ add ra_y2, ra_y2, 1 ; mul24 r2, r2, r3
-+ add t1s, rb_base2, r2 ; v8min r0, r0, ra_pmax # v8min masks out all but bottom byte
-+ and r1, r1, rb_pmask ; mul24 r0, r0, ra_wt_mul_l0
-+
-+ sub.setf -, r5, rb_lcount ; mul24 r1, r1, ra_wt_mul_l1
-+ add r1, r0, r1 ; v8adds r5rep, r5, ra_k1
-+
-+ shl r1, r1, 8 ; mov r3, ra_blk_height
-+ add r1, r1, rb_wt_off ; v8subs r0, ra_height, r3
-+
-+ brr.anyn -, r:1b
-+ asr r1, r1, (DENOM + 9) - 32 # -32 to get valid shift immediate
-+ min r1, r1, ra_pmax ; mov -, vw_wait
-+ max vpm, r1, ra_k0 ; mul24 r2, r3, rb_pitch
-+# >>> branch.anyn 1b
-+
-+# r0 = remaining height (min 0)
-+# r2 = r3 * rb_pitch
-+# r3 = block_height
-+
-+# If looping again then we consumed block_height (r3 = ra_blk_height) last loop
-+# rb_dma1 (stride) remains constant
-+# rb_i_tmu remains const (based on total height)
-+# recalc ra_dma0, rb_lcount based on new segment height
-+
-+ mov.setf ra_height, r0 ; mov vw_setup, ra_dma0 # ; VDW setup 0
-+
-+# DMA out
-+ bra.anyz -, ra_link
-+ min r0, r0, r3 ; mov vw_setup, rb_dma1 # ; Stride
-+ sub r1, r0, r3 ; mov vw_addr, ra_dest # ; start the VDW
-+ shl r1, r1, i_shift23
-+# >>> .anyz ra_link
-+
-+# Here r1 = cur_blk_height - blk_height so it will be 0 or -ve
-+# We add to dma0 to reduce the number of output lines in the final block
-+ brr -, r:1b
-+ add rb_lcount, rb_lcount, r0
-+ add ra_dma0, ra_dma0, r1
-+ add ra_dest, ra_dest, r2 ; mov vw_setup, rb_vpm_init # ; Reset our VPM write pointer
-+# >>> 1b
-+.endm
-+
-+::mc_filter_y_b00
-+ m_filter_y_b00 8 # v_bit_depth = 8 (forwarded to m_luma_setup)
-+
-+################################################################################
-+################################################################################
-+# 10 BIT
-+
-+::mc_setup_c10_q0
-+ m_setup_q0
-+::mc_setup_c10_qn
-+ m_setup_c 10
-+
-+::mc_filter_c10_p
-+ m_filter_c_p 0, 10
-+
-+::mc_filter_c10_p_l1
-+ m_filter_c_p 1, 10
-+
-+
-+::mc_filter_c10_b
-+ m_filter_c_b 10
-+
-+# Even if these fns are the same as for other bit depths we want our own copy
-+# to keep the code we are using in a single lump to avoid (direct map) cache
-+# thrashing
-+.set v_quads10, N_QPU_16 / 4 # second argument of every m_sync_q expansion below
-+
-+::mc_sync10_q0
-+ m_sync_q 0, v_quads10
-+::mc_sync10_q1
-+ m_sync_q 1, v_quads10
-+::mc_sync10_q2
-+ m_sync_q 2, v_quads10
-+::mc_sync10_q3
-+ m_sync_q 3, v_quads10
-+::mc_sync10_q4
-+ m_sync_q 4, v_quads10
-+::mc_sync10_q5
-+ m_sync_q 5, v_quads10
-+::mc_sync10_q6
-+ m_sync_q 6, v_quads10
-+::mc_sync10_q7
-+ m_sync_q 7, v_quads10
-+::mc_sync10_q8
-+ m_sync_q 8, v_quads10
-+::mc_sync10_q9
-+ m_sync_q 9, v_quads10
-+::mc_sync10_q10
-+ m_sync_q 10, v_quads10
-+::mc_sync10_q11
-+ m_sync_q 11, v_quads10
-+
-+::mc_exit_y10_q0
-+::mc_exit_c10_q0
-+ m_exit_q0 # one expansion shared by the luma and chroma q0 exit labels
-+
-+::mc_exit_y10_qn
-+::mc_exit_c10_qn
-+ m_exit_qn # one expansion shared by the luma and chroma qn exit labels
-+
-+::mc_setup_y10_q0
-+ m_setup_q0
-+::mc_setup_y10_qn
-+ m_setup_y 10
-+
-+:per_block_setup_10
-+ m_per_block_setup 10
-+
-+::mc_filter_y10_pxx
-+ m_filter_y_pxx 10
-+
-+::mc_filter_y10_p00
-+ m_filter_y_p00 10
-+
-+::mc_filter_y10_bxx
-+ m_filter_y_bxx 10
-+
-+::mc_filter_y10_b00
-+ m_filter_y_b00 10
-+
-+
-+
-+::mc_end
-+# Do not add code here because mc_end must appear after all other code.
-diff --git a/libavcodec/rpi_hevc_shader_cmd.h b/libavcodec/rpi_hevc_shader_cmd.h
-new file mode 100644
-index 0000000000..2f06987bb9
---- /dev/null
-+++ b/libavcodec/rpi_hevc_shader_cmd.h
-@@ -0,0 +1,128 @@
-+#ifndef RPI_SHADER_CMD_H
-+#define RPI_SHADER_CMD_H
-+
-+#pragma pack(push, 4)
-+
-+#if RPI_QPU_EMU_C && RPI_QPU_EMU_Y
-+// Both emulation flags set: addresses are host pointers. A mixed setting (only one flag set) falls through to the 32-bit types below and we get a lot of warnings.
-+typedef const uint8_t * qpu_mc_src_addr_t;
-+typedef uint8_t * qpu_mc_dst_addr_t;
-+#else
-+typedef uint32_t qpu_mc_src_addr_t;
-+typedef uint32_t qpu_mc_dst_addr_t;
-+#endif
-+
-+typedef struct qpu_mc_src_s // Source descriptor: position and base address of the block to fetch
-+{
-+ int16_t y; // signed row; the C emulation clamps out-of-picture values
-+ int16_t x; // signed column in pixels (the C emulation multiplies by PW to get bytes)
-+ qpu_mc_src_addr_t base;
-+} qpu_mc_src_t;
-+
-+
-+typedef struct qpu_mc_pred_c_p_s { // Chroma P command
-+ qpu_mc_src_t next_src; // source for the NEXT command on this list, not this one
-+ uint16_t h;
-+ uint16_t w;
-+ uint32_t coeffs_x; // packed filter coefficients; the C emulation recovers mx from them with fctom()
-+ uint32_t coeffs_y; // as coeffs_x, for my
-+ uint32_t wo_u; // packed weight/offset for U: low 16 bits = weight (wweight), upper bits = offset (woff_p)
-+ uint32_t wo_v; // as wo_u, for V
-+ qpu_mc_dst_addr_t dst_addr_c;
-+ uint32_t next_fn; // code address of the next command; must stay the last field
-+} qpu_mc_pred_c_p_t;
-+
-+typedef struct qpu_mc_pred_c_b_s { // Chroma B command
-+ qpu_mc_src_t next_src1; // L0 source for the next command
-+ uint16_t h;
-+ uint16_t w;
-+ uint32_t coeffs_x1;
-+ uint32_t coeffs_y1;
-+ int16_t weight_u1; // first-source weights are plain 16-bit values (no packed offset)
-+ int16_t weight_v1;
-+ qpu_mc_src_t next_src2; // L1 source for the next command
-+ uint32_t coeffs_x2;
-+ uint32_t coeffs_y2;
-+ uint32_t wo_u2; // packed weight/offset: weight via wweight(), offset via woff_b() in the C emulation
-+ uint32_t wo_v2;
-+ qpu_mc_dst_addr_t dst_addr_c;
-+ uint32_t next_fn; // code address of the next command; must stay the last field
-+} qpu_mc_pred_c_b_t;
-+
-+typedef struct qpu_mc_pred_c_s_s { // Chroma setup command: picture geometry plus the first sources
-+ qpu_mc_src_t next_src1;
-+ uint32_t pic_cw; // C Width (== Y width / 2)
-+ uint32_t pic_ch; // C Height (== Y Height / 2)
-+ uint32_t stride2;
-+ uint32_t stride1;
-+ qpu_mc_src_t next_src2;
-+ uint32_t next_fn; // code address of the next command; must stay the last field
-+} qpu_mc_pred_c_s_t;
-+
-+typedef struct qpu_mc_pred_c_s { // Any chroma command; which member is valid is not stored here
-+ union {
-+ qpu_mc_pred_c_p_t p;
-+ qpu_mc_pred_c_b_t b;
-+ qpu_mc_pred_c_s_t s;
-+ };
-+} qpu_mc_pred_c_t;
-+
-+
-+typedef struct qpu_mc_pred_y_p_s { // Luma command with two sources; the C emulation uses it for y_pxx, y_bxx and y_b00
-+ qpu_mc_src_t next_src1;
-+ qpu_mc_src_t next_src2;
-+ uint16_t h;
-+ uint16_t w;
-+ uint32_t mymx21; // four packed bytes: the emulation reads bits 0-7 and 8-15 for source 1, bits 16-23 and 24-31 for source 2
-+ uint32_t wo1; // packed weight/offset for source 1 (low 16 bits = weight)
-+ uint32_t wo2; // packed weight/offset for source 2
-+ qpu_mc_dst_addr_t dst_addr;
-+ uint32_t next_fn; // code address of the next command; must stay the last field
-+} qpu_mc_pred_y_p_t;
-+
-+typedef struct qpu_mc_pred_y_p00_s { // Luma P command with a single source and no mymx field
-+ qpu_mc_src_t next_src1;
-+ uint16_t h;
-+ uint16_t w;
-+ uint32_t wo1; // packed weight/offset (low 16 bits = weight)
-+ qpu_mc_dst_addr_t dst_addr;
-+ uint32_t next_fn; // code address of the next command; must stay the last field
-+} qpu_mc_pred_y_p00_t;
-+
-+typedef struct qpu_mc_pred_y_s_s { // Luma setup command: picture geometry plus the first sources
-+ qpu_mc_src_t next_src1;
-+ qpu_mc_src_t next_src2;
-+ uint16_t pic_h;
-+ uint16_t pic_w; // in pixels (the C emulation multiplies by PW)
-+ uint32_t stride2;
-+ uint32_t stride1;
-+ uint32_t next_fn; // code address of the next command; must stay the last field
-+} qpu_mc_pred_y_s_t;
-+
-+// Any luma command. Only a useful structure in that it allows us to return something other than a void *
-+typedef struct qpu_mc_pred_y_s {
-+ union {
-+ qpu_mc_pred_y_p_t p;
-+ qpu_mc_pred_y_p00_t p00;
-+ qpu_mc_pred_y_s_t s;
-+ };
-+} qpu_mc_pred_y_t;
-+
-+typedef union qpu_mc_pred_cmd_u { // Any command, luma or chroma
-+ qpu_mc_pred_y_t y;
-+ qpu_mc_pred_c_t c;
-+ uint32_t data[1]; // raw 32-bit word view of the same storage
-+} qpu_mc_pred_cmd_t;
-+
-+#define QPU_MC_PRED_N_Y8 12
-+#define QPU_MC_PRED_N_C8 12
-+
-+#define QPU_MC_PRED_N_Y10 12
-+#define QPU_MC_PRED_N_C10 12
-+
-+#define QPU_MC_DENOM 7 // passed as the weight denominator to every weighted hevcdsp call in the C emulation
-+
-+#pragma pack(pop)
-+
-+#endif
-+
-diff --git a/libavcodec/rpi_hevc_shader_template.c b/libavcodec/rpi_hevc_shader_template.c
-new file mode 100644
-index 0000000000..577850a6b4
---- /dev/null
-+++ b/libavcodec/rpi_hevc_shader_template.c
-@@ -0,0 +1,61 @@
-+#include "hevc.h"
-+#include "rpi_hevcdec.h"
-+#include "libavutil/rpi_sand_fns.h"
-+#include "rpi_hevc_shader_cmd.h"
-+#include "rpi_hevc_shader_template.h"
-+
-+typedef struct shader_track_s // Per-queue state kept while the command lists are walked in C
-+{
-+ const union qpu_mc_pred_cmd_u *qpu_mc_curr; // where to resume this queue; NULL means start at the queue base
-+ const struct qpu_mc_src_s *last_l0; // first source descriptor saved from the previous command
-+ const struct qpu_mc_src_s *last_l1; // second source descriptor saved from the previous command
-+ uint32_t width; // pic_width * PW
-+ uint32_t height;
-+ uint32_t stride2;
-+ uint32_t stride1;
-+} shader_track_t;
-+
-+static int wtoidx(const unsigned int w) // Block width -> first index of the hevcdsp function tables
-+{
-+ static const uint8_t pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 }; // widths not listed are zero-initialised, i.e. map to index 0
-+ return pel_weight[w]; // NOTE(review): no range check; w > 64 reads past the 65-entry table
-+}
-+
-+static const int fctom(uint32_t x) // Recover the fractional move from packed filter coefficients; NOTE(review): const on a by-value return type has no effect
-+{
-+ int rv;
-+ // As it happens we can take the 2nd filter term & divide it by 8
-+ // (dropping fractions) to get the fractional move
-+ rv = 8 - ((x >> 11) & 0xf); // >> 11 = skip the low byte (8) and divide by 8 (3)
-+ av_assert2(rv >= 0 && rv <= 7);
-+ return rv;
-+}
-+
-+static inline int32_t ext(int32_t x, unsigned int shl, unsigned int shr) // Shift left by shl, then right by shr, to extract a signed bit-field
-+{
-+ return (x << shl) >> shr; // NOTE(review): << on a negative or overflowing int32_t is undefined in ISO C, and >> of a negative value is implementation-defined; this relies on arithmetic shifts
-+}
-+
-+static inline int woff_p(HEVCRpiContext *const s, int32_t x) // Offset part of a packed weight/offset word, P prediction
-+{
-+ return ext(x, 0, 17 + s->ps.sps->bit_depth - 8); // right shift by 17 at 8-bit depth, one more per extra bit of depth
-+}
-+
-+static inline int woff_b(HEVCRpiContext *const s, int32_t x) // Offset part of a packed weight/offset word, B prediction
-+{
-+ return ext(x - 0x10000, 0, 16 + s->ps.sps->bit_depth - 8); // subtracts 1 from the upper half before shifting; shift is one less than in woff_p
-+}
-+
-+static inline int wweight(int32_t x) // Weight part of a packed weight/offset word
-+{
-+ return ext(x, 16, 16); // keeps the low 16 bits, sign-extended (given arithmetic shifts)
-+}
-+
-+
-+#define PW 1 // bytes per pixel; this pass generates the functions with suffix 8
-+#include "rpi_hevc_shader_template_fn.h"
-+
-+#undef PW
-+#define PW 2 // second pass generates the functions with suffix 16
-+#include "rpi_hevc_shader_template_fn.h"
-+
-diff --git a/libavcodec/rpi_hevc_shader_template.h b/libavcodec/rpi_hevc_shader_template.h
-new file mode 100644
-index 0000000000..304d73ea4a
---- /dev/null
-+++ b/libavcodec/rpi_hevc_shader_template.h
-@@ -0,0 +1,22 @@
-+#ifndef LIBAVCODEC_RPI_SHADER_TEMPLATE_H
-+#define LIBAVCODEC_RPI_SHADER_TEMPLATE_H
-+// NOTE(review): uint8_t is used below but nothing is included here; the includer must provide <stdint.h> first
-+struct HEVCRpiContext;
-+struct HEVCRpiInterPredEnv;
-+
-+void ff_hevc_rpi_shader_c8(struct HEVCRpiContext *const s, // template instance built with PW == 1
-+ const struct HEVCRpiInterPredEnv *const ipe_y,
-+ const struct HEVCRpiInterPredEnv *const ipe_c);
-+
-+void ff_hevc_rpi_shader_c16(struct HEVCRpiContext *const s, // template instance built with PW == 2
-+ const struct HEVCRpiInterPredEnv *const ipe_y,
-+ const struct HEVCRpiInterPredEnv *const ipe_c);
-+
-+void rpi_sand_dump8(const char * const name, // debug dump to stdout; x, y, w, h in pixels, strides in bytes
-+ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c);
-+
-+void rpi_sand_dump16(const char * const name, // as rpi_sand_dump8 but reads 16-bit pixels
-+ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c);
-+
-+#endif
-+
-diff --git a/libavcodec/rpi_hevc_shader_template_fn.h b/libavcodec/rpi_hevc_shader_template_fn.h
-new file mode 100644
-index 0000000000..59b00d537b
---- /dev/null
-+++ b/libavcodec/rpi_hevc_shader_template_fn.h
-@@ -0,0 +1,475 @@
-+#define STRCAT(x,y) x##y
-+
-+#if PW == 1 // PW (bytes per pixel) must be defined by the including file; there is no include guard because this file is included once per PW
-+#define pixel uint8_t
-+#define FUNC(f) STRCAT(f, 8)
-+#elif PW == 2
-+#define pixel uint16_t
-+#define FUNC(f) STRCAT(f, 16)
-+#else
-+#error Unexpected PW
-+#endif
-+
-+#define PATCH_STRIDE (16 * PW) // byte stride of the temporary patch buffers: 16 pixels per row
-+
-+static void FUNC(dup_lr)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride) // Fill w bytes of each of h rows at dst with that row's single pixel at src
-+{
-+ for (unsigned int i = 0; i != h; ++i, dst += stride, src += stride) {
-+ const pixel s = *(const pixel *)src; // the one pixel replicated across this row
-+ pixel * d = (pixel *)dst;
-+ for (unsigned int j = 0; j < w; j += PW) { // w is a byte count, so step by one pixel's worth of bytes
-+ *d++ = s;
-+ }
-+ }
-+}
-+
-+static void FUNC(dup_tb)(uint8_t * dst, const uint8_t * src, unsigned int w, unsigned int h, unsigned int stride) // Copy the single w-byte row at src into h consecutive rows starting at dst
-+{
-+ for (unsigned int i = 0; i != h; ++i, dst += stride) { // src is deliberately not advanced
-+ memcpy(dst, src, w);
-+ }
-+}
-+
-+static void FUNC(get_patch_y)(const shader_track_t * const st, // Fetch a _w x _h pixel luma patch into dst, replicating edge pixels where it lies outside the picture
-+ uint8_t * dst, const unsigned int dst_stride,
-+ const qpu_mc_src_t *src,
-+ unsigned int _w, unsigned int _h)
-+{
-+ int x = src->x * PW; // x and w are held in bytes from here on
-+ int y = src->y;
-+ int w = _w * PW;
-+ int h = _h;
-+ int dl = 0; // bytes to synthesise on the left
-+ int dr = 0; // bytes to synthesise on the right
-+ int dt = 0; // rows to synthesise above
-+ int db = 0; // rows to synthesise below
-+
-+ if (x < 0) {
-+ if (-x >= w) // patch entirely left of the picture: keep exactly one pixel column
-+ x = PW - w;
-+ dl = -x;
-+ w += x;
-+ x = 0;
-+ }
-+ if (x + w > st->width) { // compared as unsigned (st->width is uint32_t); x is >= 0 at this point
-+ if (x >= st->width) // patch entirely right of the picture: keep exactly one pixel column
-+ x = st->width - PW;
-+ dr = (x + w) - st->width;
-+ w = st->width - x;
-+ }
-+
-+ // Y
-+ if (y < 0) {
-+ if (-y >= h) // patch entirely above the picture: keep exactly one row
-+ y = 1 - h;
-+ dt = -y;
-+ h += y;
-+ y = 0;
-+ }
-+ if (y + h > st->height) {
-+ if (y >= st->height) // patch entirely below the picture: keep exactly one row
-+ y = st->height - 1;
-+ db = (y + h) - st->height;
-+ h = st->height - y;
-+ }
-+
-+ dst += dl + dt * dst_stride; // dst now addresses the first pixel that really comes from the picture
-+ FUNC(av_rpi_sand_to_planar_y)(dst, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h);
-+
-+ // Edge dup
-+ if (dl != 0)
-+ FUNC(dup_lr)(dst - dl, dst, dl, h, dst_stride);
-+ if (dr != 0)
-+ FUNC(dup_lr)(dst + w, dst + w - PW, dr, h, dst_stride);
-+ w += dl + dr; // rows are full width again, so top/bottom copies include the side padding
-+ dst -= dl;
-+
-+ if (dt != 0)
-+ FUNC(dup_tb)(dst - dt * dst_stride, dst, w, dt, dst_stride);
-+ if (db != 0)
-+ FUNC(dup_tb)(dst + h * dst_stride, dst + (h - 1) * dst_stride, w, db, dst_stride);
-+}
-+
-+
-+
-+static void FUNC(get_patch_c)(const shader_track_t * const st, // Chroma version of get_patch_y: fills separate U and V patches, replicating edge pixels
-+ uint8_t * dst_u, uint8_t * dst_v, const unsigned int dst_stride,
-+ const qpu_mc_src_t *src,
-+ unsigned int _w, unsigned int _h)
-+{
-+ int x = src->x * PW; // x and w are held in bytes from here on
-+ int y = src->y;
-+ int w = _w * PW;
-+ int h = _h;
-+ int dl = 0; // bytes to synthesise on the left
-+ int dr = 0; // bytes to synthesise on the right
-+ int dt = 0; // rows to synthesise above
-+ int db = 0; // rows to synthesise below
-+ const int width = st->width; // signed copies, so the comparisons below are signed (unlike get_patch_y)
-+ const int height = st->height;
-+
-+ if (x < 0) {
-+ if (-x >= w) // patch entirely left of the picture: keep exactly one pixel column
-+ x = PW - w;
-+ dl = -x;
-+ w += x;
-+ x = 0;
-+ }
-+ if (x + w > width) {
-+ if (x >= width) // patch entirely right of the picture: keep exactly one pixel column
-+ x = width - PW;
-+ dr = (x + w) - width;
-+ w = width - x;
-+ }
-+
-+ // Y
-+ if (y < 0) {
-+ if (-y >= h) // patch entirely above the picture: keep exactly one row
-+ y = 1 - h;
-+ dt = -y;
-+ h += y;
-+ y = 0;
-+ }
-+ if (y + h > height) {
-+ if (y >= height) // patch entirely below the picture: keep exactly one row
-+ y = height - 1;
-+ db = (y + h) - height;
-+ h = height - y;
-+ }
-+
-+ dst_u += dl + dt * dst_stride; // both planes now address the first pixel that really comes from the picture
-+ dst_v += dl + dt * dst_stride;
-+ FUNC(av_rpi_sand_to_planar_c)(dst_u, dst_stride, dst_v, dst_stride, (const uint8_t *)src->base, st->stride1, st->stride2, x, y, w, h);
-+
-+ // Edge dup
-+ if (dl != 0)
-+ {
-+ FUNC(dup_lr)(dst_u - dl, dst_u, dl, h, dst_stride);
-+ FUNC(dup_lr)(dst_v - dl, dst_v, dl, h, dst_stride);
-+ }
-+ if (dr != 0)
-+ {
-+ FUNC(dup_lr)(dst_u + w, dst_u + w - PW, dr, h, dst_stride);
-+ FUNC(dup_lr)(dst_v + w, dst_v + w - PW, dr, h, dst_stride);
-+ }
-+ w += dl + dr; // rows are full width again, so top/bottom copies include the side padding
-+ dst_u -= dl;
-+ dst_v -= dl;
-+
-+ if (dt != 0)
-+ {
-+ FUNC(dup_tb)(dst_u - dt * dst_stride, dst_u, w, dt, dst_stride);
-+ FUNC(dup_tb)(dst_v - dt * dst_stride, dst_v, w, dt, dst_stride);
-+ }
-+ if (db != 0)
-+ {
-+ FUNC(dup_tb)(dst_u + h * dst_stride, dst_u + (h - 1) * dst_stride, w, db, dst_stride);
-+ FUNC(dup_tb)(dst_v + h * dst_stride, dst_v + (h - 1) * dst_stride, w, db, dst_stride);
-+ }
-+}
-+
-+// x, y, w, h in pixels
-+// stride1, stride2 in bytes
-+void FUNC(rpi_sand_dump)(const char * const name, // Debug aid: print a w x h window of the buffer at base to stdout as hex
-+ const uint8_t * const base, const int stride1, const int stride2, int x, int y, int w, int h, const int is_c)
-+{
-+ const int mask = stride2 == 0 ? ~0 : stride1 - 1; // stride2 == 0: whole byte offset stays in the "& mask" term and the stride2 term is zero
-+
-+ printf("%s (%d,%d) %dx%d\n", name, x, y, w, h);
-+
-+ if (is_c) { // chroma: two samples are printed per pixel position
-+ x *= 2;
-+ w *= 2;
-+ }
-+
-+ for (int i = y; i != y + h; ++i) {
-+ for (int j = x; j != x + w; ++j) {
-+ const uint8_t * p = base + ((j*PW) & mask) + i * stride1 + ((j*PW) & ~mask) * stride2; // computed even for negative i/j but only dereferenced when both are >= 0
-+ char sep = is_c && (j & 1) == 0 ? ':' : ' '; // ':' joins the two samples of a chroma pair
-+#if PW == 1
-+ if (j < 0 || i < 0)
-+ printf("..%c", sep);
-+ else
-+ printf("%02x%c", *(const pixel*)p, sep);
-+#else
-+ if (j < 0 || i < 0)
-+ printf("...%c", sep);
-+ else
-+ printf("%03x%c", *(const pixel*)p, sep);
-+#endif
-+ }
-+ printf("\n");
-+ }
-+}
-+
-+
-+void FUNC(ff_hevc_rpi_shader_c)(HEVCRpiContext *const s, // Walk the queued QPU prediction commands on the CPU, running each through the s->hevcdsp functions
-+ const HEVCRpiInterPredEnv *const ipe_y,
-+ const HEVCRpiInterPredEnv *const ipe_c)
-+{
-+ for (int c_idx = 0; c_idx < 2; ++c_idx) // pass 0 = ipe_y (luma), pass 1 = ipe_c (chroma)
-+ {
-+ const HEVCRpiInterPredEnv *const ipe = c_idx == 0 ? ipe_y : ipe_c;
-+ shader_track_t tracka[QPU_N_MAX] = {{NULL}}; // per-queue state, zeroed afresh for each plane
-+ unsigned int exit_n = 0; // number of queues that have reached their exit command
-+
-+ if (ipe == NULL || !ipe->used) {
-+ continue;
-+ }
-+
-+ do { // each iteration runs every queue up to its next sync (or exit) command
-+ for (unsigned int i = 0; i != ipe->n; ++i) {
-+ const HEVCRpiInterPredQ * const q = ipe->q + i;
-+ shader_track_t * const st = tracka + i;
-+ const qpu_mc_pred_cmd_t * cmd = st->qpu_mc_curr == NULL ? q->qpu_mc_base : st->qpu_mc_curr; // resume where this queue stopped, or start at its base
-+
-+ for (;;) {
-+ const uint32_t link = (cmd == q->qpu_mc_base) ? q->code_setup : ((uint32_t *)cmd)[-1]; // the word before a command is the previous command's next_fn; the first command is always setup
-+
-+ if (link == q->code_setup) {
-+ if (c_idx == 0) {
-+ // Luma
-+ const qpu_mc_pred_y_s_t *const c = &cmd->y.s;
-+
-+ st->height = c->pic_h;
-+ st->width = c->pic_w * PW;
-+ st->stride1 = c->stride1;
-+ st->stride2 = c->stride2;
-+ st->last_l0 = &c->next_src1;
-+ st->last_l1 = &c->next_src2;
-+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1); // commands are packed back to back
-+ }
-+ else {
-+ // Chroma
-+ const qpu_mc_pred_c_s_t *const c = &cmd->c.s;
-+
-+ st->height = c->pic_ch;
-+ st->width = c->pic_cw * PW;
-+ st->stride1 = c->stride1;
-+ st->stride2 = c->stride2;
-+ st->last_l0 = &c->next_src1;
-+ st->last_l1 = &c->next_src2;
-+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
-+ }
-+ }
-+ else if (link == s->qpu.y_pxx) {
-+ const qpu_mc_pred_y_p_t *const c = &cmd->y.p;
-+ const int w1 = FFMIN(c->w, 8); // the first source supplies up to 8 columns
-+ const int w2 = c->w - w1; // any remaining columns come from the second source
-+
-+ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
-+ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
-+
-+ FUNC(get_patch_y)(st,
-+ patch_y1, PATCH_STRIDE,
-+ st->last_l0,
-+ 16, c->h + 7); // 7 extra rows; presumably the qpel filter margin - TODO confirm
-+ if (w2 > 0) {
-+ FUNC(get_patch_y)(st,
-+ patch_y2, PATCH_STRIDE,
-+ st->last_l1,
-+ 16, c->h + 7);
-+ }
-+
-+ // wo[offset] = offset*2+1
-+ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w1)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0](
-+ (uint8_t *)c->dst_addr, st->stride1, patch_y1 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, // source pointer skips 3 rows and 3 pixels of the patch
-+ c->h, QPU_MC_DENOM, wweight(c->wo1), woff_p(s, c->wo1), (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), w1);
-+ if (w2 > 0) {
-+ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(w2)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0](
-+ (uint8_t *)c->dst_addr + 8 * PW, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, // second half is written 8 pixels to the right
-+ c->h, QPU_MC_DENOM, wweight(c->wo2), woff_p(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), w2);
-+ }
-+ st->last_l0 = &c->next_src1;
-+ st->last_l1 = &c->next_src2;
-+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
-+ }
-+ else if (link == s->qpu.y_bxx) {
-+ const qpu_mc_pred_y_p_t *const c = &cmd->y.p;
-+
-+ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
-+ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
-+ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE]; // intermediate result of the first source
-+
-+ FUNC(get_patch_y)(st,
-+ patch_y1, PATCH_STRIDE,
-+ st->last_l0,
-+ 16, c->h + 7);
-+ FUNC(get_patch_y)(st,
-+ patch_y2, PATCH_STRIDE,
-+ st->last_l1,
-+ 16, c->h + 7);
-+
-+ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][(c->mymx21 & 0xff00) != 0][(c->mymx21 & 0xff) != 0](
-+ patch_y3, patch_y1+ 3 * (PATCH_STRIDE + PW), PATCH_STRIDE,
-+ c->h, (c->mymx21 & 0xff), ((c->mymx21 >> 8) & 0xff), c->w);
-+
-+ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][(c->mymx21 & 0xff000000) != 0][(c->mymx21 & 0xff0000) != 0](
-+ (uint8_t *)c->dst_addr, st->stride1, patch_y2 + 3 * (PATCH_STRIDE + PW), PATCH_STRIDE, patch_y3,
-+ c->h, QPU_MC_DENOM, wweight(c->wo1), wweight(c->wo2),
-+ 0, woff_b(s, c->wo2), ((c->mymx21 >> 16) & 0xff), ((c->mymx21 >> 24) & 0xff), c->w); // only wo2 supplies an offset; the other offset argument is 0
-+ st->last_l0 = &c->next_src1;
-+ st->last_l1 = &c->next_src2;
-+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
-+ }
-+ else if (link == s->qpu.y_p00) {
-+ const qpu_mc_pred_y_p00_t *const c = &cmd->y.p00;
-+
-+ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
-+
-+ FUNC(get_patch_y)(st,
-+ patch_y1, PATCH_STRIDE,
-+ st->last_l0,
-+ 16, c->h + 7); // NOTE(review): fetches h + 7 rows although the patch is read from its first row below; the y_b00 path fetches only c->h - confirm
-+
-+ // wo[offset] = offset*2+1
-+ s->hevcdsp.put_hevc_qpel_uni_w[wtoidx(c->w)][0][0](
-+ (uint8_t *)c->dst_addr, st->stride1, patch_y1, PATCH_STRIDE,
-+ c->h, QPU_MC_DENOM, wweight(c->wo1), woff_p(s, c->wo1), 0, 0, c->w);
-+
-+ st->last_l0 = &c->next_src1; // this command carries a single source, so last_l1 is left unchanged
-+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
-+ }
-+ else if (link == s->qpu.y_b00) {
-+ const qpu_mc_pred_y_p_t *const c = &cmd->y.p;
-+
-+ uint8_t patch_y1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
-+ uint8_t patch_y2[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
-+ int16_t patch_y3[MAX_PB_SIZE * MAX_PB_SIZE];
-+
-+ av_assert0(c->w <= 16 && c->h <= 64);
-+
-+ FUNC(get_patch_y)(st,
-+ patch_y1, PATCH_STRIDE,
-+ st->last_l0,
-+ 16, c->h);
-+ FUNC(get_patch_y)(st,
-+ patch_y2, PATCH_STRIDE,
-+ st->last_l1,
-+ 16, c->h);
-+
-+ s->hevcdsp.put_hevc_qpel[wtoidx(c->w)][0][0](
-+ patch_y3, patch_y1, PATCH_STRIDE,
-+ c->h, 0, 0, c->w);
-+
-+ s->hevcdsp.put_hevc_qpel_bi_w[wtoidx(c->w)][0][0](
-+ (uint8_t *)c->dst_addr, st->stride1, patch_y2, PATCH_STRIDE, patch_y3,
-+ c->h, QPU_MC_DENOM, wweight(c->wo1), wweight(c->wo2),
-+ 0, woff_b(s, c->wo2), 0, 0, c->w);
-+ st->last_l0 = &c->next_src1;
-+ st->last_l1 = &c->next_src2;
-+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
-+ }
-+ else if (link == s->qpu.c_pxx) {
-+ const qpu_mc_pred_c_p_t *const c = &cmd->c.p;
-+ const int mx = fctom(c->coeffs_x);
-+ const int my = fctom(c->coeffs_y);
-+
-+ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
-+ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
-+ uint8_t patch_u3[8 * 16 * PW]; // planar U result, 8 pixels per row
-+ uint8_t patch_v3[8 * 16 * PW]; // planar V result, 8 pixels per row
-+
-+ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3); // 3 extra columns and rows around the block
-+
-+ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
-+ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE, // source pointer skips 1 row and 1 pixel of the patch
-+ c->h, QPU_MC_DENOM, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w);
-+ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
-+ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE,
-+ c->h, QPU_MC_DENOM, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w);
-+
-+ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h);
-+
-+ st->last_l0 = &c->next_src;
-+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
-+ }
-+ else if (link == s->qpu.c_pxx_l1) { // same as c_pxx but reads from and updates last_l1
-+ const qpu_mc_pred_c_p_t *const c = &cmd->c.p;
-+ const int mx = fctom(c->coeffs_x);
-+ const int my = fctom(c->coeffs_y);
-+
-+ uint8_t patch_u1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
-+ uint8_t patch_v1[PATCH_STRIDE * 72]; // (Max width + 8) * (max height + 8)
-+ uint8_t patch_u3[8 * 16 * PW];
-+ uint8_t patch_v3[8 * 16 * PW];
-+
-+ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3);
-+
-+ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
-+ patch_u3, 8 * PW, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE,
-+ c->h, QPU_MC_DENOM, wweight(c->wo_u), woff_p(s, c->wo_u), mx, my, c->w);
-+ s->hevcdsp.put_hevc_epel_uni_w[wtoidx(c->w)][my != 0][mx != 0](
-+ patch_v3, 8 * PW, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE,
-+ c->h, QPU_MC_DENOM, wweight(c->wo_v), woff_p(s, c->wo_v), mx, my, c->w);
-+
-+ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h);
-+
-+ st->last_l1 = &c->next_src;
-+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
-+ }
-+ else if (link == s->qpu.c_bxx) {
-+ const qpu_mc_pred_c_b_t *const c = &cmd->c.b;
-+ const int mx1 = fctom(c->coeffs_x1);
-+ const int my1 = fctom(c->coeffs_y1);
-+ const int mx2 = fctom(c->coeffs_x2);
-+ const int my2 = fctom(c->coeffs_y2);
-+
-+ uint8_t patch_u1[PATCH_STRIDE * 72];
-+ uint8_t patch_v1[PATCH_STRIDE * 72];
-+ uint8_t patch_u2[PATCH_STRIDE * 72];
-+ uint8_t patch_v2[PATCH_STRIDE * 72];
-+ uint8_t patch_u3[8 * 16 * PW];
-+ uint8_t patch_v3[8 * 16 * PW];
-+ uint16_t patch_u4[MAX_PB_SIZE * MAX_PB_SIZE]; // NOTE(review): the luma paths declare the matching intermediate buffer as int16_t - confirm which type put_hevc_epel expects
-+ uint16_t patch_v4[MAX_PB_SIZE * MAX_PB_SIZE];
-+
-+ FUNC(get_patch_c)(st, patch_u1, patch_v1, PATCH_STRIDE, st->last_l0, 8+3, c->h + 3);
-+ FUNC(get_patch_c)(st, patch_u2, patch_v2, PATCH_STRIDE, st->last_l1, 8+3, c->h + 3);
-+
-+ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0](
-+ patch_u4, patch_u1 + PATCH_STRIDE + PW, PATCH_STRIDE,
-+ c->h, mx1, my1, c->w);
-+ s->hevcdsp.put_hevc_epel[wtoidx(c->w)][my1 != 0][mx1 != 0](
-+ patch_v4, patch_v1 + PATCH_STRIDE + PW, PATCH_STRIDE,
-+ c->h, mx1, my1, c->w);
-+
-+ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0](
-+ patch_u3, 8 * PW, patch_u2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_u4,
-+ c->h, QPU_MC_DENOM, c->weight_u1, wweight(c->wo_u2),
-+ 0, woff_b(s, c->wo_u2), mx2, my2, c->w);
-+ s->hevcdsp.put_hevc_epel_bi_w[wtoidx(c->w)][my2 != 0][mx2 != 0](
-+ patch_v3, 8 * PW, patch_v2 + PATCH_STRIDE + PW, PATCH_STRIDE, patch_v4,
-+ c->h, QPU_MC_DENOM, c->weight_v1, wweight(c->wo_v2),
-+ 0, woff_b(s, c->wo_v2), mx2, my2, c->w);
-+
-+ FUNC(av_rpi_planar_to_sand_c)((uint8_t *)c->dst_addr_c, st->stride1, st->stride2, patch_u3, 8 * PW, patch_v3, 8 * PW, 0, 0, c->w * PW, c->h);
-+
-+ st->last_l0 = &c->next_src1;
-+ st->last_l1 = &c->next_src2;
-+ cmd = (const qpu_mc_pred_cmd_t *)(c + 1);
-+ }
-+ else if (link == q->code_sync) {
-+ cmd = (const qpu_mc_pred_cmd_t *)((uint32_t *)cmd + 1); // a sync command is one 32-bit word; step over it and yield to the next queue
-+ break;
-+ }
-+ else if (link == q->code_exit) {
-+ // We expect exit to occur without other sync
-+ av_assert0(i == exit_n); // i.e. queues must reach exit in index order within one pass
-+ ++exit_n;
-+ break;
-+ }
-+ else {
-+ av_assert0(0); // unknown link value
-+ }
-+ }
-+
-+ st->qpu_mc_curr = cmd; // remember where to resume after the sync
-+ }
-+ } while (exit_n == 0); // stop once a pass has seen exit commands
-+ }
-+}
-+
-+#undef FUNC
-+#undef pixel
-+
-diff --git a/libavcodec/rpi_hevc_transform.s b/libavcodec/rpi_hevc_transform.s
-new file mode 100644
-index 0000000000..3caef20137
---- /dev/null
-+++ b/libavcodec/rpi_hevc_transform.s
-@@ -0,0 +1,444 @@
-+# ******************************************************************************
-+# Argon Design Ltd.
-+# (c) Copyright 2015 Argon Design Ltd. All rights reserved.
-+#
-+# Module : HEVC
-+# Author : Peter de Rivaz
-+# ******************************************************************************
-+
-+# USE_STACK = 1 means temporary data stored on the stack (requires build with larger stack)
-+# USE_STACK = 0 means temporary data stored in fixed per-VPU data buffers (requires modifications to vasm to handle instruction encoding for PC relative instructions)
-+.set USE_STACK, 0
-+
-+# Lines that fail to assemble start with #:
-+# The script insert_magic_opcodes.sh inserts the machine code directly for these.
-+# HEVC VPU Transform
-+#
-+# Transform matrix can be thought of as
-+# output row vector = input row vector * transMatrix2
-+#
-+# The even rows of the matrix are symmetric
-+# The odd rows of the matrix are antisymmetric
-+#
-+# So only need to compute the first half of the results, then can compute the remainder with a butterfly
-+#
-+# EXAMPLE
-+# (a b c d) (1 2 2 1)
-+# (3 4 -4 -3)
-+# (5 6 6 5)
-+# (7 8 -8 -7)
-+#
-+# x=(a c)(1 2) = 1a+5c 2a+6c
-+# (5 6)
-+#
-+# y=(b d)(3 4) = 3b+7d 4b+8d
-+# (7 8)
-+#
-+# u=x+y = 1a+5c+3b+7d 2a+4b+6c+8d
-+# v=x-y = 1a+5c-3b-7d 2a+6c-4b-8d
-+#
-+# Final results are (u , v[::-1])
-+#
-+#
-+# For 32x1 input, load even rows into HX(0++,0), odd rows into HX(16++,0)
-+# Apply the even matrix first and stop before rounding
-+# Then apply the odd matrix in a full manner:
-+#
-+# First step is to compute partial products with the first input (16 cycles)
-+# 1a 3b 5c 7d 16x1 input coefficients produce 16x16 output
-+# 2a 4b 6c 8d
-+# 2a -4b 6c -8d
-+# 1a -3b 5c -7d
-+#
-+# Second step is to sum partial products into final position (8 cycles)
-+# 1a+3b+5c+7d
-+# 2a+4b+6c+8d
-+# 2a-4b+6c-8d
-+# 1a-3b+5c-7d
-+#
-+# Then can apply butterfly to combine even results and odd results + rounding to produce 16 rows of output at a time (need to save in transposed format)
-+#
-+# For 16x16 no butterfly is required and can store final results in original location (Could do 2 16x16s in parallel to make use of the trick - saves on the adds)
-+#
-+# For 8x8 we could compute two in parallel.
-+#
-+#
-+
-+# Columns are transformed first
-+#
-+# Store top left half of transMatrix2 in HX(32,0)
-+# Store bottom left half of transMatrix2 in HX(32,32)
-+#
-+# For 16x16
-+# HX(0:15,0) contains input data before transform
-+# HY(0:15,0) contains 32bit output data after transform
-+# HX(32,0) contains even rows of left half of transMatrix2
-+# HX(32,32) contains odd rows of left half of transMatrix2
-+# HY(48,0) contains partial products ready for summing
-+#
-+
-+
-+# hevc_trans_16x16(short *transMatrix2, short *coeffs, int num) # TODO add size so we can branch to correct implementation (or perhaps have coeffs32 and num32 as secondary inputs!)
-+# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory)
-+# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
-+# num: number of 16x16 transforms to be done
-+# coeffs32
-+# num32: number of 32x32 transforms
-+# command 0 for transform, 1 for memclear16(int16_t *dst,num16)
-+#
-+
-+.equ TRANS_SHIFT, 20 - BIT_DEPTH # Second-pass right shift; BIT_DEPTH is not defined in this file and must be supplied when assembling
-+.equ TRANS_RND2, 1 << (TRANS_SHIFT - 1) # Rounding constant for the second pass: half of 1 << TRANS_SHIFT
-+.equ TRANS_ASL2, 16 - TRANS_SHIFT # Left shift used instead, so the rounded result ends up in the top half of each 32-bit word
-+
-+
-+hevc_trans_16x16: # In: r0=transMatrix2, r1=coeffs, r2=num (compressed count in the top 16 bits), r3=coeffs32, r4=num32
-+ push r6-r15, lr # TODO cut down number of used registers
-+ mov r14,r3 # coeffs32
-+ mov r15,r4 # num32
-+ mov r3, 16*2 # Stride of transMatrix2 in bytes
-+ vldh HX(32++,0),(r0 += r3) REP 16 # This is the 16x16 matrix, a transform is equivalent to multiplying input row vector * matrix
-+
-+ add r0, 16*16*2 # For 32x32 transforms we also need this matrix
-+ vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
-+
-+ # Now use r0 to describe which matrix we are working on.
-+ # Allows us to prefetch the next block of coefficients for efficiency.
-+ mov r0,0 # This describes the location where we read our coefficients from
-+ mov r3,16*2 # Stride of coefficients in bytes (TODO remove)
-+ mov r7,16*16*2 # Total block size
-+ mov r8,64*16 # Value used to swap from current to next VRF location
-+ mov r4,64 # Constant used for rounding first pass
-+ mov r5,TRANS_RND2 # Constant used for rounding second pass
-+
-+ sub sp,sp,64+16*16*2 # Reserve scratch space below sp (used through r11 by unpack16x16) so an interrupt that uses the stack cannot overwrite it
-+
-+ add r11,sp,64 # Space for 32 bytes before, and rounding
-+ lsr r11,5
-+ lsl r11,5 # Make sure r11 is rounded to multiple of 2**5==32
-+
-+ lsr r10, r2, 16 # Number of compressed blocks stored in top short
-+ extu r2,16 # Keep only the low 16 bits: the total number of 16x16 transforms
-+ # At start of block r0,r1 point to the current block (that has already been loaded)
-+ # r0 VRF location of current block
-+ # r1 address of current block
-+ # r2 number of 16*16 transforms to do
-+ # r3 Stride of coefficients (==32)
-+ # r4 TRANS_RND1 (64)
-+ # r5 TRANS_RND2
-+ # r6 temporary used inside col_trans16
-+ # r7 16*16*2 total bytes in block
-+ # r8 64*16 VRF switch locations
-+ # r9 temporary in unpack_coeff for index
-+ # r10 number of 16x16 transforms using compression
-+ # r11 unpacked data buffer (16*16 shorts) (preceded by 16 shorts of packed data buffer)
-+ # r12 temporary counter in unpack_coeff
-+ # r13
-+ # r14 Save information for 32 bit transform (coeffs location)
-+ # r15 Save information for 32 bit transform (number of transforms)
-+ cmp r2,0
-+ beq done16x16s
-+block_loop:
-+ # With compressed coefficients, we don't use prefetch as we don't want to issue unnecessary memory requests
-+ cmp r10,0
-+ mov r6, r1 # Default load address is the block itself; unpack16x16 replaces r6 for compressed blocks
-+ beq not_compressed
-+ sub r10, 1
-+ bl unpack16x16
-+not_compressed:
-+ #mov r6,r1 # DEBUG without compress
-+ vldh HX(0++,0)+r0,(r6 += r3) REP 16 # Load the 16 rows of the block from r6 into the VRF
-+ #eor r0,r8
-+ #add r1,r7
-+ # Prefetch the next block
-+ #bl unpack16x16
-+ #vldh HX(0++,0)+r0,(r6 += r3) REP 16
-+ #vmov HX(0++,0)+r0,0 REP 16 # DEBUG
-+ #eor r0,r8
-+ #sub r1,r7
-+
-+ # Transform the current block
-+ bl col_trans_16
-+ vadd HY(0++,0)+r0,HY(0++,0)+r0,r4 REP 16 # Now add on rounding, shift down by 7, and saturate
-+ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # 9+7=16 so this ends up with the output saturated and in the top half of the word.
-+ vasl HY(0++,0)+r0,HY(0++,0)+r0,9 REP 16 # This should be saturating, but the instruction above does not assemble?
-+ vmov VX(0,0++)+r0, HX(0++,32)+r0 REP 16 # For simplicity transpose this back to the original position
-+
-+ bl col_trans_16
-+ vadd HY(0++,0)+r0,HY(0++,0)+r0,r5 REP 16 # Now add on the second-pass rounding (TRANS_RND2); the shift below stands in for a shift down by TRANS_SHIFT
-+ #vsasls HY(0++,0)+r0,HY(0++,0)+r0,4 REP 16 # 4+12=16 so this ends up with the output saturated and in the top half of the word.
-+ vasl HY(0++,0)+r0,HY(0++,0)+r0,TRANS_ASL2 REP 16 # This should be saturating, but the instruction above does not assemble? (Probably because it ends with ls which is interpreted as a condition flag)
-+
-+ # Save results - note there has been a transposition during the processing so we save columns
-+ vsth VX(0,32++)+r0, (r1 += r3) REP 16
-+
-+ # Move onto next block
-+ eor r0,r8
-+ add r1,r7
-+
-+ addcmpbgt r2,-1,0,block_loop # Decrement the remaining count and loop while it is still > 0
-+done16x16s:
-+
-+ add sp,sp,64+16*16*2 # Release the scratch space reserved on the stack at entry
-+ # Now go and do any 32x32 transforms
-+ b hevc_trans_32x32
-+
-+ pop r6-r15, pc # Not reached: the branch above is unconditional and hevc_trans_32x32 ends with this same pop
-+# This returns a value in r6 that says where to load the data from.
-+# We load data 16 shorts at a time from memory (uncached), and store to stack space to allow us to process it.
-+unpack16x16: # In: r1=packed input, r11=unpacked buffer, r3=row stride. Out: r6=r11. Uses r9, r12; r4/r5 are reloaded before returning
-+# Clear out destination
-+ vmov HX(0,0)+r0,0 # Zero one VRF row...
-+ mov r6, r11
-+ vsth HX(0,0)+r0,(r6 += r3) REP 16 # ...and store that row 16 times to zero the unpacked buffer at r11
-+ mov r5, r1 # Moving pointer to input coefficients
-+unpack_outer_loop:
-+ # Loop until we find the end
-+ vldh HX(0,0)+r0,(r5) # TODO would prefetch help here while unpacking previous?
-+ sub r6,r11,32 # Packed staging area is the 32 bytes immediately below r11
-+ #add r6,pc,packed_data-$ # Packed data
-+ vsth HX(0,0)+r0,(r6) # Store into packed data
-+ mov r12,0 # Counts the 32-bit words consumed from this 32-byte chunk
-+unpack_loop:
-+ ld r4,(r6) # Next packed word
-+ add r6,r6,4
-+ lsr r9,r4,16 # r9 is destination value
-+ cmp r4,0 # {value,index}
-+ extu r4,8 # Keep the low 8 bits as the index; the beq below still acts on the cmp above, so flags are presumably left alone here
-+ beq done_unpack # An all-zero word terminates the packed data
-+ sth r9,(r11, r4) # Write the value into the unpacked buffer at index r4
-+ addcmpblt r12,1,8,unpack_loop # 8 words per 32-byte chunk
-+# # Read next 16
-+ add r5,32
-+ b unpack_outer_loop
-+done_unpack:
-+# # Set new load location
-+ mov r6, r11
-+ #add r6,pc,unpacked_data-$
-+# # Restore constants
-+ mov r4,64
-+ mov r5,TRANS_RND2
-+# pop r6-r15, pc
-+ b lr
-+
-+# r1,r2,r3 r7,r8 should be preserved
-+# HX(0++,0)+r0 is the block to be transformed
-+# HX(32++,0) is the 16x16 matrix of transform coefficients (r6 is overwritten as the loop bound, it is not an offset)
-+# Use HY(48,0) for intermediate results
-+# r0 can be used, but should be returned to its original value at the end
-+col_trans_16:
-+ add r6,r0,16 # Final value for this loop
-+col_trans_16_loop:
-+ # First compute partial products for a single column
-+ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,0++) REP 16
-+ # Then sum up the results and place back
-+ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
-+ addcmpblt r0,1,r6,col_trans_16_loop # Step r0 by one per pass: 16 passes in total
-+ sub r0,16 # put r0 back to its original value
-+ b lr
-+
-+col_trans_odd_16: # Same loop as col_trans_16, but multiplies by the odd matrix held at HX(32,32); clobbers r6, restores r0
-+ add r6,r0,16 # Final value for this loop
-+col_trans_odd_16_loop:
-+ # First compute partial products for a single column
-+ vmul32s HY(48++,0), VX(0,0)+r0, VX(32,32++) REP 16
-+ # Then sum up the results and place back
-+ vadd VY(0,0)+r0, VY(48,0++), VY(48,8++) REP 8 CLRA SACC
-+ addcmpblt r0,1,r6,col_trans_odd_16_loop # Step r0 by one per pass: 16 passes in total
-+ sub r0,16 # put r0 back to its original value
-+ b lr
-+
-+# r1/r10 input pointer
-+# r0,r4,r5,r6 free
-+# r8/r9 output storage
-+# NOTE: any pre-assembled copy of this code must be regenerated to pick up the done_unpack32 branch fix below
-+# Store packed coefficients at r9-32
-+# Store unpacked at r9+32*32 shorts, i.e. r9+32*32*2 bytes (because transform works on even/odd rows on input, but writes all rows)
-+unpack32x32:
-+# Clear out destination
-+ vmov HX(0,0),0
-+ add r0, r9, 32*32*2 # Unpacked buffer
-+ mov r4, 32
-+ vsth HX(0,0),(r0 += r4) REP 64 # 64 stores of 32 bytes zero the whole 32*32*2-byte unpacked buffer
-+unpack_outer_loop32:
-+ # Loop until we find the end
-+ vldh HX(0,0),(r1) # TODO would prefetch help here while unpacking previous?
-+ sub r6,r9,32 # Packed staging area is the 32 bytes immediately below r9
-+ #add r6,pc,packed_data-$ # Packed data
-+ vsth HX(0,0),(r6) # Store into packed data
-+ mov r8,0 # Counts the 32-bit words consumed from this 32-byte chunk (caller reloads r8 afterwards)
-+unpack_loop32:
-+ ld r4,(r6)
-+ add r6,r6,4
-+ lsr r5,r4,16 # r5 is destination value
-+ cmp r4,0 # {value,index}
-+ extu r4,10 # Index is the low 10 bits for a 32x32 block
-+ beq done_unpack32 # Zero word ends the data; previously jumped to done_unpack, the 16x16 exit, which also rewrote r4, r5 and r6
-+ sth r5,(r0, r4)
-+ addcmpblt r8,1,8,unpack_loop32 # 8 words per 32-byte chunk
-+# # Read next 16
-+ add r1,32
-+ b unpack_outer_loop32
-+done_unpack32:
-+ b lr
-+# hevc_trans_32x32(short *transMatrix2, short *coeffs, int num)
-+# transMatrix2: address of the constant matrix (must be at 32 byte aligned address in Videocore memory) Even followed by odd
-+# coeffs: address of the transform coefficients (must be at 32 byte aligned address in Videocore memory)
-+# num: number of 32x32 transforms to be done in low 16, number of packed in high 16
-+#
-+# Note that the 32x32 transforms are stored in reverse order, this means that the unpacked ones appear first!
-+hevc_trans_32x32: # Reached by branch from hevc_trans_16x16: coeffs arrive in r14, num in r15, and r6-r15/lr are already pushed
-+ mov r1,r14 # coeffs
-+ mov r2,r15 # num
-+ lsr r15,r15,16 # Number that are packed
-+ extu r2,16 # Total number
-+
-+ # Fetch odd transform matrix
-+ #mov r3, 16*2 # Stride of transMatrix2 in bytes (and of coefficients)
-+ #vldh HX(32++,0),(r0 += r3) REP 16 # This is the even 16x16 matrix
-+ #add r0, 16*16*2
-+ #vldh HX(32++,32),(r0 += r3) REP 16 # This is the odd 16x16 matrix
-+
-+ mov r3, 32*2*2 # Stride used to fetch alternate rows of our input coefficient buffer
-+ mov r7, 16*16*2 # Total block size
-+
-+.if USE_STACK
-+ # Stack base allocation
-+ sub sp,sp,32*32*4+64 # Allocate some space on the stack for us to store 32*32 shorts as temporary results (needs to be aligned) and another 32*32 for unpacking
-+ # set r8 to 32byte aligned stack pointer with 32 bytes of space before it
-+ add r8,sp,63
-+ lsr r8,5
-+ lsl r8,5
-+.else
-+#:version r8
-+ .half 0x00e8 #AUTOINSERTED
-+ btst r8,16 # Tests bit 16 of the value just read; presumably identifies which VPU is running - TODO confirm
-+#:add r8,pc,intermediate_results-$
-+ .half 0xbfe8
-+ .half intermediate_results-($-2)
-+ beq on_vpu1
-+ add r8,r8,32*32*2*2+16*2 # Move to secondary storage
-+on_vpu1:
-+.endif
-+ mov r9,r8 # Backup of the temporary storage
-+ mov r10,r1 # Backup of the coefficient buffer
-+
-+ cmp r2,0
-+ beq done32x32s
-+block_loop32:
-+
-+ # Transform the first 16 columns
-+ mov r1,r10 # Input Coefficient buffer
-+ mov r8,r9 # Output temporary storage
-+ # Unpacked are first, so need to only do unpacking when r2(=num left) <= r15 (=num packed)
-+ cmp r2,r15
-+ bgt not_compressed_32
-+ bl unpack32x32
-+ add r1,r9,32*32*2 # Uncompressed into temporary storage
-+ mov r8,r9 # Transform into here
-+not_compressed_32:
-+ # COLUMN TRANSFORM
-+ mov r4, 64 # Constant used for rounding first pass
-+ mov r5, 9 # left shift used for rounding first pass
-+
-+ bl trans32
-+ # Transform the second 16 columns
-+ add r8,32*16*2
-+ add r1,32
-+ bl trans32
-+
-+ # ROW TRANSFORM
-+ mov r4, TRANS_RND2 # Constant used for rounding second pass
-+ mov r5, TRANS_ASL2 # left shift used for rounding second pass
-+
-+ mov r1,r9 # Input temporary storage
-+ mov r8,r10 # Output Coefficient buffer
-+ bl trans32
-+ # Transform the second 16 columns
-+ add r8,32*16*2
-+ add r1,32
-+ bl trans32
-+
-+ add r10, 32*32*2 # move onto next block of coefficients
-+ addcmpbgt r2,-1,0,block_loop32 # Decrement the remaining count and loop while it is still > 0
-+done32x32s:
-+
-+.if USE_STACK
-+ add sp,sp,32*32*4+64# Restore stack
-+.endif
-+
-+ pop r6-r15, pc # Balances the push at the top of hevc_trans_16x16 and returns to its caller
-+
-+trans32: # In: r1=input, r8=output, r3=input row stride, r4=rounding constant, r5=left shift. Uses r0 and r6
-+ push lr
-+ # We can no longer afford the VRF space to do prefetching when doing 32x32
-+ # Fetch the even rows
-+ vldh HX(0++,0),(r1 += r3) REP 16
-+ # Fetch the odd rows
-+ vldh HX(16++,0),64(r1 += r3) REP 16 # First odd row is 32 shorts ahead of r1
-+
-+ # Transform the even rows using even matrix
-+ mov r0, 0 # Even rows
-+ bl col_trans_16
-+
-+ # Now transform the odd rows using odd matrix
-+ mov r0, 64*16 # Odd rows
-+ bl col_trans_odd_16
-+
-+ # Now apply butterfly to compute the first 16 results
-+ vadd HY(48++,0),HY(0++,0),HY(16++,0) REP 16
-+ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding,
-+ vasl HY(48++,0),HY(48++,0),r5 REP 16 # then shift left by r5 so the 16-bit result sits in the top half of each word (plain vasl, see the saturation note in hevc_trans_16x16)
-+ # 16bit results now in HX(48,32)
-+ mov r0,r8
-+ mov r6,32*2
-+ vsth VX(48,32++),(r0+=r6) REP 16
-+
-+ # Now apply butterfly to compute the second 16 results (in reverse order)
-+ vsub HY(63,0),HY(0 ,0),HY(16,0)
-+ vsub HY(62,0),HY(1 ,0),HY(17,0)
-+ vsub HY(61,0),HY(2 ,0),HY(18,0)
-+ vsub HY(60,0),HY(3 ,0),HY(19,0)
-+ vsub HY(59,0),HY(4 ,0),HY(20,0)
-+ vsub HY(58,0),HY(5 ,0),HY(21,0)
-+ vsub HY(57,0),HY(6 ,0),HY(22,0)
-+ vsub HY(56,0),HY(7 ,0),HY(23,0)
-+ vsub HY(55,0),HY(8 ,0),HY(24,0)
-+ vsub HY(54,0),HY(9 ,0),HY(25,0)
-+ vsub HY(53,0),HY(10,0),HY(26,0)
-+ vsub HY(52,0),HY(11,0),HY(27,0)
-+ vsub HY(51,0),HY(12,0),HY(28,0)
-+ vsub HY(50,0),HY(13,0),HY(29,0)
-+ vsub HY(49,0),HY(14,0),HY(30,0)
-+ vsub HY(48,0),HY(15,0),HY(31,0)
-+ vadd HY(48++,0),HY(48++,0),r4 REP 16 # add on rounding,
-+ vasl HY(48++,0),HY(48++,0),r5 REP 16 # then shift left by r5 so the 16-bit result sits in the top half of each word (plain vasl, see the saturation note in hevc_trans_16x16)
-+ add r0,r8,32 # Second half of the output is written 32 bytes further on
-+ vsth VX(48,32++),(r0+=r6) REP 16
-+ pop pc
-+
-+.if USE_STACK == 0
-+ .balign 32
-+
-+# .space directives generate 0's in the bin so avoid unnecessary padding by
-+# just setting to appropriate value
-+.equ intermediate_results, $+16*2 # 16*2 bytes past this point, leaving room for packed_buffer in the layout below
-+
-+# Layout goes:
-+#
-+#packed_buffer:
-+# .space 16*2
-+#intermediate_results:
-+# .space 32*32*2
-+#unpacked_buffer:
-+# .space 32*32*2
-+#
-+#packed_buffer2:
-+# .space 16*2
-+#intermediate_results2:
-+# .space 32*32*2
-+#unpacked_buffer2:
-+# .space 32*32*2
-+.endif
-+
-+
-diff --git a/libavcodec/rpi_hevc_transform10.h b/libavcodec/rpi_hevc_transform10.h
-new file mode 100644
-index 0000000000..1c364492d0
---- /dev/null
-+++ b/libavcodec/rpi_hevc_transform10.h
-@@ -0,0 +1,94 @@
-+static const unsigned char rpi_hevc_transform10 [] = { // Pre-assembled code bytes, presumably rpi_hevc_transform.s built with BIT_DEPTH=10 (TODO confirm); regenerate rather than hand-edit
-+0xa9, 0x03, 0x3e, 0x40, 0x4f, 0x40, 0x03, 0xb0, // 0000
-+0x20, 0x00, 0x0c, 0xf8, 0x38, 0x88, 0x80, 0x03, // 0008
-+0xc0, 0xf8, 0x00, 0x00, 0x40, 0xb0, 0x00, 0x02, // 0010
-+0x0c, 0xf8, 0x38, 0xa8, 0x80, 0x03, 0xc0, 0xf8, // 0018
-+0x00, 0x00, 0x00, 0x60, 0x03, 0xb0, 0x20, 0x00, // 0020
-+0x07, 0xb0, 0x00, 0x02, 0x08, 0xb0, 0x00, 0x04, // 0028
-+0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, 0x00, 0x02, // 0030
-+0x59, 0xb0, 0xc0, 0xfd, 0x0b, 0x12, 0x5b, 0x7a, // 0038
-+0x5b, 0x7c, 0x4a, 0xc3, 0x50, 0x17, 0x02, 0x6f, // 0040
-+0x02, 0x6a, 0x32, 0x18, 0x0a, 0x6a, 0x16, 0x40, // 0048
-+0x04, 0x18, 0x1a, 0x66, 0x80, 0x90, 0x32, 0x00, // 0050
-+0x0c, 0xf8, 0x38, 0x80, 0x80, 0x03, 0xc0, 0x08, // 0058
-+0x18, 0x00, 0x80, 0x90, 0x51, 0x00, 0x04, 0xff, // 0060
-+0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, 0x10, 0x00, // 0068
-+0x4c, 0xfe, 0x30, 0xc0, 0x09, 0x04, 0x20, 0x08, // 0070
-+0x00, 0x00, 0x04, 0xfc, 0x38, 0x90, 0x80, 0x02, // 0078
-+0xc0, 0x0b, 0x02, 0x00, 0x80, 0x90, 0x40, 0x00, // 0080
-+0x04, 0xff, 0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, // 0088
-+0x14, 0x00, 0x4c, 0xfe, 0x30, 0xc0, 0x06, 0x04, // 0090
-+0x20, 0x08, 0x00, 0x00, 0x8c, 0xf8, 0x2c, 0xe0, // 0098
-+0x80, 0x03, 0x20, 0x30, 0x04, 0x00, 0x80, 0x45, // 00a0
-+0x71, 0x42, 0xf2, 0x8c, 0xd1, 0xc0, 0x59, 0xb0, // 00a8
-+0x40, 0x02, 0x00, 0x9e, 0x6d, 0x00, 0x29, 0x03, // 00b0
-+0x00, 0xf4, 0x38, 0x80, 0x00, 0x0c, 0xb6, 0x40, // 00b8
-+0x8c, 0xf8, 0x20, 0xe0, 0x80, 0x03, 0x00, 0x30, // 00c0
-+0x18, 0x00, 0x15, 0x40, 0x08, 0xf0, 0x38, 0x80, // 00c8
-+0x85, 0x0b, 0x66, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 00d0
-+0x24, 0xe0, 0x86, 0x03, 0x0c, 0x60, 0x64, 0x08, // 00d8
-+0x46, 0x62, 0x49, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 00e0
-+0x84, 0x6e, 0x07, 0x18, 0x69, 0xa0, 0x04, 0x5f, // 00e8
-+0x1c, 0x8b, 0xf7, 0xc8, 0x45, 0x76, 0x6b, 0x1f, // 00f0
-+0xb6, 0x40, 0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, // 00f8
-+0x00, 0x02, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0100
-+0xa4, 0xff, 0x24, 0xcc, 0x60, 0x02, 0x00, 0xf8, // 0108
-+0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0110
-+0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0118
-+0x00, 0x67, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0120
-+0xa4, 0xff, 0x24, 0xcc, 0xe0, 0x02, 0x00, 0xf8, // 0128
-+0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0130
-+0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0138
-+0x00, 0x67, 0x5a, 0x00, 0x00, 0xf4, 0x38, 0x80, // 0140
-+0x00, 0x04, 0x20, 0xb5, 0x00, 0x08, 0x04, 0xb0, // 0148
-+0x20, 0x00, 0x8e, 0xf8, 0x20, 0xe0, 0x80, 0x03, // 0150
-+0xc0, 0x43, 0x00, 0x00, 0x08, 0xf0, 0x38, 0x80, // 0158
-+0x81, 0x03, 0x26, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 0160
-+0x20, 0xe0, 0x86, 0x03, 0x08, 0x60, 0x64, 0x08, // 0168
-+0x46, 0x62, 0x45, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 0170
-+0xa4, 0x6e, 0x7f, 0x90, 0xbf, 0xff, 0x65, 0xa0, // 0178
-+0x04, 0x07, 0x18, 0x8b, 0xf6, 0xc8, 0x41, 0x76, // 0180
-+0x6a, 0x1f, 0x5a, 0x00, 0xe1, 0x40, 0xf2, 0x40, // 0188
-+0x0f, 0x7b, 0x02, 0x6f, 0x03, 0xb0, 0x80, 0x00, // 0190
-+0x07, 0xb0, 0x00, 0x02, 0xe8, 0x00, 0x08, 0x6d, // 0198
-+0xe8, 0xbf, 0x60, 0x01, 0x03, 0x18, 0x48, 0xb0, // 01a0
-+0x20, 0x10, 0x89, 0x40, 0x1a, 0x40, 0x02, 0x6a, // 01a8
-+0x24, 0x18, 0xa1, 0x40, 0x98, 0x40, 0xf2, 0x4a, // 01b0
-+0x06, 0x1e, 0xff, 0x9f, 0xc5, 0xff, 0x21, 0xb5, // 01b8
-+0x00, 0x08, 0x98, 0x40, 0x04, 0xb0, 0x40, 0x00, // 01c0
-+0x95, 0x60, 0x80, 0x90, 0x18, 0x00, 0x48, 0xb0, // 01c8
-+0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x13, 0x00, // 01d0
-+0x04, 0xb0, 0x00, 0x02, 0x65, 0x60, 0x91, 0x40, // 01d8
-+0xa8, 0x40, 0x80, 0x90, 0x0c, 0x00, 0x48, 0xb0, // 01e0
-+0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x07, 0x00, // 01e8
-+0x4a, 0xb0, 0x00, 0x08, 0xf2, 0x8c, 0xdf, 0xc0, // 01f0
-+0x29, 0x03, 0xef, 0x03, 0x0c, 0xf8, 0x38, 0x80, // 01f8
-+0x80, 0x03, 0xc0, 0xf8, 0x04, 0x00, 0x0c, 0xf8, // 0200
-+0x38, 0x84, 0xc0, 0x03, 0xc0, 0xf8, 0x04, 0x00, // 0208
-+0x00, 0x60, 0xff, 0x9f, 0x79, 0xff, 0x00, 0xb0, // 0210
-+0x00, 0x04, 0xff, 0x9f, 0x85, 0xff, 0x04, 0xff, // 0218
-+0x30, 0xcc, 0x10, 0x03, 0xe0, 0xfb, 0x3e, 0x00, // 0220
-+0x04, 0xff, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0228
-+0x10, 0x00, 0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, // 0230
-+0xe0, 0xfb, 0x14, 0x00, 0x80, 0x40, 0x06, 0xb0, // 0238
-+0x40, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, 0x80, 0x03, // 0240
-+0xe0, 0x63, 0x00, 0x00, 0x20, 0xf7, 0xf0, 0xcf, // 0248
-+0x10, 0x03, 0x20, 0xf7, 0xb0, 0xcf, 0x11, 0x13, // 0250
-+0x20, 0xf7, 0x70, 0xcf, 0x12, 0x23, 0x20, 0xf7, // 0258
-+0x30, 0xcf, 0x13, 0x33, 0x20, 0xf7, 0xf0, 0xce, // 0260
-+0x14, 0x43, 0x20, 0xf7, 0xb0, 0xce, 0x15, 0x53, // 0268
-+0x20, 0xf7, 0x70, 0xce, 0x16, 0x63, 0x20, 0xf7, // 0270
-+0x30, 0xce, 0x17, 0x73, 0x20, 0xf7, 0xf0, 0xcd, // 0278
-+0x18, 0x83, 0x20, 0xf7, 0xb0, 0xcd, 0x19, 0x93, // 0280
-+0x20, 0xf7, 0x70, 0xcd, 0x1a, 0xa3, 0x20, 0xf7, // 0288
-+0x30, 0xcd, 0x1b, 0xb3, 0x20, 0xf7, 0xf0, 0xcc, // 0290
-+0x1c, 0xc3, 0x20, 0xf7, 0xb0, 0xcc, 0x1d, 0xd3, // 0298
-+0x20, 0xf7, 0x70, 0xcc, 0x1e, 0xe3, 0x20, 0xf7, // 02a0
-+0x30, 0xcc, 0x1f, 0xf3, 0x04, 0xff, 0x33, 0xcc, // 02a8
-+0x80, 0x03, 0xe0, 0xfb, 0x10, 0x00, 0x4c, 0xfe, // 02b0
-+0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, 0x14, 0x00, // 02b8
-+0x00, 0xb5, 0x20, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, // 02c0
-+0x80, 0x03, 0xe0, 0x63, 0x00, 0x00, 0x6f, 0x03, // 02c8
-+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d0
-+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d8
-+};
-diff --git a/libavcodec/rpi_hevc_transform8.h b/libavcodec/rpi_hevc_transform8.h
-new file mode 100644
-index 0000000000..1128a2c054
---- /dev/null
-+++ b/libavcodec/rpi_hevc_transform8.h
-@@ -0,0 +1,94 @@
-+static const unsigned char rpi_hevc_transform8 [] = { // Pre-assembled code bytes, presumably rpi_hevc_transform.s built with BIT_DEPTH=8 (TODO confirm); regenerate rather than hand-edit
-+0xa9, 0x03, 0x3e, 0x40, 0x4f, 0x40, 0x03, 0xb0, // 0000
-+0x20, 0x00, 0x0c, 0xf8, 0x38, 0x88, 0x80, 0x03, // 0008
-+0xc0, 0xf8, 0x00, 0x00, 0x40, 0xb0, 0x00, 0x02, // 0010
-+0x0c, 0xf8, 0x38, 0xa8, 0x80, 0x03, 0xc0, 0xf8, // 0018
-+0x00, 0x00, 0x00, 0x60, 0x03, 0xb0, 0x20, 0x00, // 0020
-+0x07, 0xb0, 0x00, 0x02, 0x08, 0xb0, 0x00, 0x04, // 0028
-+0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, 0x00, 0x08, // 0030
-+0x59, 0xb0, 0xc0, 0xfd, 0x0b, 0x12, 0x5b, 0x7a, // 0038
-+0x5b, 0x7c, 0x4a, 0xc3, 0x50, 0x17, 0x02, 0x6f, // 0040
-+0x02, 0x6a, 0x32, 0x18, 0x0a, 0x6a, 0x16, 0x40, // 0048
-+0x04, 0x18, 0x1a, 0x66, 0x80, 0x90, 0x32, 0x00, // 0050
-+0x0c, 0xf8, 0x38, 0x80, 0x80, 0x03, 0xc0, 0x08, // 0058
-+0x18, 0x00, 0x80, 0x90, 0x51, 0x00, 0x04, 0xff, // 0060
-+0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, 0x10, 0x00, // 0068
-+0x4c, 0xfe, 0x30, 0xc0, 0x09, 0x04, 0x20, 0x08, // 0070
-+0x00, 0x00, 0x04, 0xfc, 0x38, 0x90, 0x80, 0x02, // 0078
-+0xc0, 0x0b, 0x02, 0x00, 0x80, 0x90, 0x40, 0x00, // 0080
-+0x04, 0xff, 0x30, 0xc0, 0x80, 0x03, 0x20, 0x08, // 0088
-+0x14, 0x00, 0x4c, 0xfe, 0x30, 0xc0, 0x04, 0x04, // 0090
-+0x20, 0x08, 0x00, 0x00, 0x8c, 0xf8, 0x2c, 0xe0, // 0098
-+0x80, 0x03, 0x20, 0x30, 0x04, 0x00, 0x80, 0x45, // 00a0
-+0x71, 0x42, 0xf2, 0x8c, 0xd1, 0xc0, 0x59, 0xb0, // 00a8
-+0x40, 0x02, 0x00, 0x9e, 0x6d, 0x00, 0x29, 0x03, // 00b0
-+0x00, 0xf4, 0x38, 0x80, 0x00, 0x0c, 0xb6, 0x40, // 00b8
-+0x8c, 0xf8, 0x20, 0xe0, 0x80, 0x03, 0x00, 0x30, // 00c0
-+0x18, 0x00, 0x15, 0x40, 0x08, 0xf0, 0x38, 0x80, // 00c8
-+0x85, 0x0b, 0x66, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 00d0
-+0x24, 0xe0, 0x86, 0x03, 0x0c, 0x60, 0x64, 0x08, // 00d8
-+0x46, 0x62, 0x49, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 00e0
-+0x84, 0x6e, 0x07, 0x18, 0x69, 0xa0, 0x04, 0x5f, // 00e8
-+0x1c, 0x8b, 0xf7, 0xc8, 0x45, 0x76, 0x6b, 0x1f, // 00f0
-+0xb6, 0x40, 0x04, 0xb0, 0x40, 0x00, 0x05, 0xb0, // 00f8
-+0x00, 0x08, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0100
-+0xa4, 0xff, 0x24, 0xcc, 0x60, 0x02, 0x00, 0xf8, // 0108
-+0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0110
-+0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0118
-+0x00, 0x67, 0x5a, 0x00, 0x06, 0xb4, 0x10, 0x00, // 0120
-+0xa4, 0xff, 0x24, 0xcc, 0xe0, 0x02, 0x00, 0xf8, // 0128
-+0x3e, 0x00, 0x03, 0xff, 0x37, 0xd0, 0x78, 0x03, // 0130
-+0xe0, 0x03, 0xbe, 0x0b, 0x10, 0x8b, 0xf6, 0x5b, // 0138
-+0x00, 0x67, 0x5a, 0x00, 0x00, 0xf4, 0x38, 0x80, // 0140
-+0x00, 0x04, 0x20, 0xb5, 0x00, 0x08, 0x04, 0xb0, // 0148
-+0x20, 0x00, 0x8e, 0xf8, 0x20, 0xe0, 0x80, 0x03, // 0150
-+0xc0, 0x43, 0x00, 0x00, 0x08, 0xf0, 0x38, 0x80, // 0158
-+0x81, 0x03, 0x26, 0xb5, 0xe0, 0xff, 0x88, 0xf0, // 0160
-+0x20, 0xe0, 0x86, 0x03, 0x08, 0x60, 0x64, 0x08, // 0168
-+0x46, 0x62, 0x45, 0xc3, 0x50, 0x27, 0x04, 0x6a, // 0170
-+0xa4, 0x6e, 0x7f, 0x90, 0xbf, 0xff, 0x65, 0xa0, // 0178
-+0x04, 0x07, 0x18, 0x8b, 0xf6, 0xc8, 0x41, 0x76, // 0180
-+0x6a, 0x1f, 0x5a, 0x00, 0xe1, 0x40, 0xf2, 0x40, // 0188
-+0x0f, 0x7b, 0x02, 0x6f, 0x03, 0xb0, 0x80, 0x00, // 0190
-+0x07, 0xb0, 0x00, 0x02, 0xe8, 0x00, 0x08, 0x6d, // 0198
-+0xe8, 0xbf, 0x60, 0x01, 0x03, 0x18, 0x48, 0xb0, // 01a0
-+0x20, 0x10, 0x89, 0x40, 0x1a, 0x40, 0x02, 0x6a, // 01a8
-+0x24, 0x18, 0xa1, 0x40, 0x98, 0x40, 0xf2, 0x4a, // 01b0
-+0x06, 0x1e, 0xff, 0x9f, 0xc5, 0xff, 0x21, 0xb5, // 01b8
-+0x00, 0x08, 0x98, 0x40, 0x04, 0xb0, 0x40, 0x00, // 01c0
-+0x95, 0x60, 0x80, 0x90, 0x18, 0x00, 0x48, 0xb0, // 01c8
-+0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x13, 0x00, // 01d0
-+0x04, 0xb0, 0x00, 0x08, 0x45, 0x60, 0x91, 0x40, // 01d8
-+0xa8, 0x40, 0x80, 0x90, 0x0c, 0x00, 0x48, 0xb0, // 01e0
-+0x00, 0x04, 0x41, 0x76, 0x80, 0x90, 0x07, 0x00, // 01e8
-+0x4a, 0xb0, 0x00, 0x08, 0xf2, 0x8c, 0xdf, 0xc0, // 01f0
-+0x29, 0x03, 0xef, 0x03, 0x0c, 0xf8, 0x38, 0x80, // 01f8
-+0x80, 0x03, 0xc0, 0xf8, 0x04, 0x00, 0x0c, 0xf8, // 0200
-+0x38, 0x84, 0xc0, 0x03, 0xc0, 0xf8, 0x04, 0x00, // 0208
-+0x00, 0x60, 0xff, 0x9f, 0x79, 0xff, 0x00, 0xb0, // 0210
-+0x00, 0x04, 0xff, 0x9f, 0x85, 0xff, 0x04, 0xff, // 0218
-+0x30, 0xcc, 0x10, 0x03, 0xe0, 0xfb, 0x3e, 0x00, // 0220
-+0x04, 0xff, 0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, // 0228
-+0x10, 0x00, 0x4c, 0xfe, 0x33, 0xcc, 0x80, 0x03, // 0230
-+0xe0, 0xfb, 0x14, 0x00, 0x80, 0x40, 0x06, 0xb0, // 0238
-+0x40, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, 0x80, 0x03, // 0240
-+0xe0, 0x63, 0x00, 0x00, 0x20, 0xf7, 0xf0, 0xcf, // 0248
-+0x10, 0x03, 0x20, 0xf7, 0xb0, 0xcf, 0x11, 0x13, // 0250
-+0x20, 0xf7, 0x70, 0xcf, 0x12, 0x23, 0x20, 0xf7, // 0258
-+0x30, 0xcf, 0x13, 0x33, 0x20, 0xf7, 0xf0, 0xce, // 0260
-+0x14, 0x43, 0x20, 0xf7, 0xb0, 0xce, 0x15, 0x53, // 0268
-+0x20, 0xf7, 0x70, 0xce, 0x16, 0x63, 0x20, 0xf7, // 0270
-+0x30, 0xce, 0x17, 0x73, 0x20, 0xf7, 0xf0, 0xcd, // 0278
-+0x18, 0x83, 0x20, 0xf7, 0xb0, 0xcd, 0x19, 0x93, // 0280
-+0x20, 0xf7, 0x70, 0xcd, 0x1a, 0xa3, 0x20, 0xf7, // 0288
-+0x30, 0xcd, 0x1b, 0xb3, 0x20, 0xf7, 0xf0, 0xcc, // 0290
-+0x1c, 0xc3, 0x20, 0xf7, 0xb0, 0xcc, 0x1d, 0xd3, // 0298
-+0x20, 0xf7, 0x70, 0xcc, 0x1e, 0xe3, 0x20, 0xf7, // 02a0
-+0x30, 0xcc, 0x1f, 0xf3, 0x04, 0xff, 0x33, 0xcc, // 02a8
-+0x80, 0x03, 0xe0, 0xfb, 0x10, 0x00, 0x4c, 0xfe, // 02b0
-+0x33, 0xcc, 0x80, 0x03, 0xe0, 0xfb, 0x14, 0x00, // 02b8
-+0x00, 0xb5, 0x20, 0x00, 0x8c, 0xf8, 0x2f, 0xe0, // 02c0
-+0x80, 0x03, 0xe0, 0x63, 0x00, 0x00, 0x6f, 0x03, // 02c8
-+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d0
-+0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // 02d8
-+};
-diff --git a/libavcodec/rpi_hevcdec.c b/libavcodec/rpi_hevcdec.c
-new file mode 100644
-index 0000000000..39a63c77de
---- /dev/null
-+++ b/libavcodec/rpi_hevcdec.c
-@@ -0,0 +1,6016 @@
-+/*
-+ * HEVC video Decoder
-+ *
-+ * Copyright (C) 2012 - 2013 Guillaume Martres
-+ * Copyright (C) 2012 - 2013 Mickael Raulet
-+ * Copyright (C) 2012 - 2013 Gildas Cocherel
-+ * Copyright (C) 2012 - 2013 Wassim Hamidouche
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "libavutil/attributes.h"
-+#include "libavutil/common.h"
-+#include "libavutil/display.h"
-+#include "libavutil/internal.h"
-+#include "libavutil/mastering_display_metadata.h"
-+#include "libavutil/md5.h"
-+#include "libavutil/opt.h"
-+#include "libavutil/pixdesc.h"
-+#include "libavutil/stereo3d.h"
-+
-+#include "bswapdsp.h"
-+#include "bytestream.h"
-+#include "golomb.h"
-+#include "hevc.h"
-+#include "rpi_hevc_data.h"
-+#include "rpi_hevc_parse.h"
-+#include "rpi_hevcdec.h"
-+#include "rpi_hevc_cabac_fns.h"
-+#include "profiles.h"
-+#include "hwaccel.h"
-+
-+#include "rpi_qpu.h"
-+#include "rpi_hevc_shader.h"
-+#include "rpi_hevc_shader_cmd.h"
-+#include "rpi_hevc_shader_template.h"
-+#include "rpi_zc.h"
-+#include "libavutil/rpi_sand_fns.h"
-+
-+#include "pthread.h"
-+#include <stdatomic.h>
-+
-+#define DEBUG_DECODE_N 0 // 0 = do all, n = frames idr onwards
-+
-+#define PACK2(hi,lo) (((hi) << 16) | ((lo) & 0xffff)) // hi goes in bits 31..16, the low 16 bits of lo in bits 15..0
-+
-+#ifndef av_mod_uintp2 // Local fallback, used only when av_mod_uintp2 is not already defined
-+static av_always_inline av_const unsigned av_mod_uintp2_c(unsigned a, unsigned p) // Returns a modulo 2^p, i.e. the low p bits of a; p must be < 32
-+{
-+ return a & ((1U << p) - 1); // 1U rather than 1: a signed shift is undefined once p reaches 31
-+}
-+# define av_mod_uintp2 av_mod_uintp2_c
-+#endif
-+
-+const uint8_t ff_hevc_rpi_pel_weight[65] = { [2] = 0, [4] = 1, [6] = 2, [8] = 3, [12] = 4, [16] = 5, [24] = 6, [32] = 7, [48] = 8, [64] = 9 }; // Lookup indexed by a size 0..64, giving 0..9 for the listed sizes; unlisted entries are 0
-+static void rpi_begin(const HEVCRpiContext * const s, HEVCRpiJob * const jb, const unsigned int ctu_ts_first); // Forward declaration of a file-local function
-+
-+#define MC_DUMMY_X (-32) // presumably the X coordinate used for placeholder MC requests - TODO confirm against users
-+#define MC_DUMMY_Y (-32) // presumably the Y coordinate used for placeholder MC requests - TODO confirm against users
-+
-+// UV & Y both have min 4x4 pred (no 2x2 chroma)
-+// Allow for even spread +1 for setup, +1 for rounding
-+// As we have load sharing this can (in theory) be exceeded so we have to
-+// check after each CTU, but it is a good base size
-+
-+// Worst case (all 4x4) commands per CTU
-+#define QPU_Y_CMD_PER_CTU_MAX (16 * 16)
-+#define QPU_C_CMD_PER_CTU_MAX (8 * 8)
-+
-+#define QPU_MAX_CTU_PER_LINE ((HEVC_RPI_MAX_WIDTH + 63) / 64) // HEVC_RPI_MAX_WIDTH / 64, rounded up
-+
-+#define QPU_GRPS (QPU_N_MAX / QPU_N_GRP)
-+#define QPU_CTU_PER_GRP ((QPU_MAX_CTU_PER_LINE + QPU_GRPS - 1) / QPU_GRPS) // CTUs per line divided among the groups, rounded up
-+
-+#define QPU_Y_CMD_SLACK_PER_Q (QPU_Y_CMD_PER_CTU_MAX / 2) // Half a worst-case CTU of extra commands per QPU
-+#define QPU_C_CMD_SLACK_PER_Q (QPU_C_CMD_PER_CTU_MAX / 2) // Half a worst-case CTU of extra commands per QPU
-+
-+// Total cmds to allocate - allow for slack & setup
-+#define QPU_Y_COMMANDS (QPU_CTU_PER_GRP * QPU_GRPS * QPU_Y_CMD_PER_CTU_MAX + (1 + QPU_Y_CMD_SLACK_PER_Q) * QPU_N_MAX)
-+#define QPU_C_COMMANDS (QPU_CTU_PER_GRP * QPU_GRPS * QPU_C_CMD_PER_CTU_MAX + (1 + QPU_C_CMD_SLACK_PER_Q) * QPU_N_MAX)
-+
-+#define QPU_Y_SYNCS (QPU_N_MAX * (16 + 2))
-+#define QPU_C_SYNCS (QPU_N_MAX * (8 + 2))
-+
-+// The QPU code for UV blocks only works up to a block width of 8
-+#define RPI_CHROMA_BLOCK_WIDTH 8
-+
-+#define ENCODE_COEFFS(c0, c1, c2, c3) (((c0) & 0xff) | ((c1) & 0xff) << 8 | ((c2) & 0xff) << 16 | ((c3) & 0xff) << 24) // Packs four values, each masked to 8 bits, into one word: c0 in bits 7..0 up to c3 in bits 31..24
-+
-+
-+// Eight sets of four tap magnitudes, packed one byte per tap by ENCODE_COEFFS. Actual filter goes -ve, +ve, +ve, -ve using these values
-+static const uint32_t rpi_filter_coefs[8] = {
-+ ENCODE_COEFFS( 0, 64, 0, 0),
-+ ENCODE_COEFFS( 2, 58, 10, 2),
-+ ENCODE_COEFFS( 4, 54, 16, 2),
-+ ENCODE_COEFFS( 6, 46, 28, 4),
-+ ENCODE_COEFFS( 4, 36, 36, 4),
-+ ENCODE_COEFFS( 4, 28, 46, 6),
-+ ENCODE_COEFFS( 2, 16, 54, 4),
-+ ENCODE_COEFFS( 2, 10, 58, 2)
-+};
-+
-+// Function arrays by QPU: each table has 12 entries, one per QPU; the entries are presumably code labels from rpi_hevc_shader.h
-+
-+static const int * const inter_pred_setup_c_qpu[12] = { // QPU 0 uses the _q0 routine, the other 11 share _qn
-+ mc_setup_c_q0, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn,
-+ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn,
-+ mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn, mc_setup_c_qn
-+};
-+
-+static const int * const inter_pred_setup_c10_qpu[12] = { // "10" counterpart of inter_pred_setup_c_qpu, same q0/qn split
-+ mc_setup_c10_q0, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn,
-+ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn,
-+ mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn, mc_setup_c10_qn
-+};
-+
-+static const int * const inter_pred_setup_y_qpu[12] = { // QPU 0 uses the _q0 routine, the other 11 share _qn
-+ mc_setup_y_q0, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn,
-+ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn,
-+ mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn, mc_setup_y_qn
-+};
-+
-+static const int * const inter_pred_setup_y10_qpu[12] = { // "10" counterpart of inter_pred_setup_y_qpu, same q0/qn split
-+ mc_setup_y10_q0, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn,
-+ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn,
-+ mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn, mc_setup_y10_qn
-+};
-+
-+static const int * const inter_pred_sync_qpu[12] = { // Unlike setup/exit, every QPU has its own sync routine
-+ mc_sync_q0, mc_sync_q1, mc_sync_q2, mc_sync_q3,
-+ mc_sync_q4, mc_sync_q5, mc_sync_q6, mc_sync_q7,
-+ mc_sync_q8, mc_sync_q9, mc_sync_q10, mc_sync_q11
-+};
-+
-+static const int * const inter_pred_sync10_qpu[12] = { // "10" counterpart of inter_pred_sync_qpu, again one routine per QPU
-+ mc_sync10_q0, mc_sync10_q1, mc_sync10_q2, mc_sync10_q3,
-+ mc_sync10_q4, mc_sync10_q5, mc_sync10_q6, mc_sync10_q7,
-+ mc_sync10_q8, mc_sync10_q9, mc_sync10_q10, mc_sync10_q11
-+};
-+
-+static const int * const inter_pred_exit_c_qpu[12] = { // QPU 0 uses the _q0 routine, the other 11 share _qn
-+ mc_exit_c_q0, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn,
-+ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn,
-+ mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn, mc_exit_c_qn
-+};
-+
-+static const int * const inter_pred_exit_c10_qpu[12] = { // "10" counterpart of inter_pred_exit_c_qpu, same q0/qn split
-+ mc_exit_c10_q0, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn,
-+ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn,
-+ mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn, mc_exit_c10_qn
-+};
-+
-+static const int * const inter_pred_exit_y_qpu[12] = { // QPU 0 uses the _q0 routine, the other 11 share _qn
-+ mc_exit_y_q0, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn,
-+ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn,
-+ mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn, mc_exit_y_qn
-+};
-+
-+static const int * const inter_pred_exit_y10_qpu[12] = { // "10" counterpart of inter_pred_exit_y_qpu, same q0/qn split
-+ mc_exit_y10_q0, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn,
-+ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn,
-+ mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn, mc_exit_y10_qn
-+};
-+
-+typedef struct ipe_chan_info_s
-+{
-+ const uint8_t bit_depth;
-+ const uint8_t n;
-+ const int * const * setup_fns;
-+ const int * const * sync_fns;
-+ const int * const * exit_fns;
-+} ipe_chan_info_t;
-+
-+typedef struct ipe_init_info_s
-+{
-+ ipe_chan_info_t luma;
-+ ipe_chan_info_t chroma;
-+} ipe_init_info_t;
-+
-+static void set_bytes(uint8_t * b, const unsigned int stride, const int ln, unsigned int a)
-+{
-+ switch (ln)
-+ {
-+ default: // normally 0
-+ *b = a;
-+ break;
-+ case 1:
-+ a |= a << 8;
-+ *(uint16_t *)b = a;
-+ b += stride;
-+ *(uint16_t *)b = a;
-+ break;
-+ case 2:
-+ a |= a << 8;
-+ a |= a << 16;
-+ *(uint32_t *)b = a;
-+ b += stride;
-+ *(uint32_t *)b = a;
-+ b += stride;
-+ *(uint32_t *)b = a;
-+ b += stride;
-+ *(uint32_t *)b = a;
-+ break;
-+ case 3:
-+ {
-+ unsigned int i;
-+ uint64_t d;
-+ a |= a << 8;
-+ a |= a << 16;
-+ d = ((uint64_t)a << 32) | a;
-+ for (i = 0; i != 8; ++i, b += stride)
-+ *(uint64_t *)b = d;
-+ break;
-+ }
-+ case 4:
-+ {
-+ unsigned int i;
-+ uint64_t d;
-+ a |= a << 8;
-+ a |= a << 16;
-+ d = ((uint64_t)a << 32) | a;
-+ for (i = 0; i != 16; ++i, b += stride)
-+ {
-+ *(uint64_t *)b = d;
-+ *(uint64_t *)(b + 8) = d;
-+ }
-+ break;
-+ }
-+ }
-+}
-+
-+// We expect this to be called with ln = (log2_cb_size - 3) so range = -1..3
-+// (4 not required)
-+static void set_stash2(uint8_t * b_u, uint8_t * b_l, const int ln, unsigned int a)
-+{
-+ switch (ln)
-+ {
-+ default: // 0 or -1
-+ *b_u = a;
-+ *b_l = a;
-+ break;
-+ case 1:
-+ a |= a << 8;
-+ *(uint16_t *)b_u = a;
-+ *(uint16_t *)b_l = a;
-+ break;
-+ case 2:
-+ a |= a << 8;
-+ a |= a << 16;
-+ *(uint32_t *)b_u = a;
-+ *(uint32_t *)b_l = a;
-+ break;
-+ case 3:
-+ a |= a << 8;
-+ a |= a << 16;
-+ *(uint32_t *)b_u = a;
-+ *(uint32_t *)(b_u + 4) = a;
-+ *(uint32_t *)b_l = a;
-+ *(uint32_t *)(b_l + 4) = a;
-+ break;
-+ case 4:
-+ a |= a << 8;
-+ a |= a << 16;
-+ *(uint32_t *)b_u = a;
-+ *(uint32_t *)(b_u + 4) = a;
-+ *(uint32_t *)(b_u + 8) = a;
-+ *(uint32_t *)(b_u + 12) = a;
-+ *(uint32_t *)b_l = a;
-+ *(uint32_t *)(b_l + 4) = a;
-+ *(uint32_t *)(b_l + 8) = a;
-+ *(uint32_t *)(b_l + 12) = a;
-+ break;
-+ }
-+}
-+
-+static void zap_cabac_stash(uint8_t * b, const int ln)
-+{
-+ switch (ln)
-+ {
-+ default: // 0
-+ *b = 0;
-+ break;
-+ case 1:
-+ *(uint16_t *)b = 0;
-+ break;
-+ case 2:
-+ *(uint32_t *)b = 0;
-+ break;
-+ case 3:
-+ *(uint32_t *)b = 0;
-+ *(uint32_t *)(b + 4) = 0;
-+ break;
-+ }
-+}
-+
-+
-+
-+// Set a small square block of bits in a bitmap
-+// Bits must be aligned on their size boundary (which will be true of all split CBs)
-+static void set_bits(uint8_t * f, const unsigned int x, const unsigned int stride, const unsigned int ln)
-+{
-+ unsigned int n;
-+ const unsigned int sh = (x & 7);
-+
-+ f += (x >> 3);
-+
-+ av_assert2(ln <= 3);
-+ av_assert2((x & ((1 << ln) - 1)) == 0);
-+
-+ switch (ln)
-+ {
-+ default: // 1
-+ f[0] |= 1 << sh;
-+ break;
-+ case 1: // 3 * 2
-+ n = 3 << sh;
-+ f[0] |= n;
-+ f[stride] |= n;
-+ break;
-+ case 2: // 0xf * 4
-+ n = 0xf << sh;
-+ f[0] |= n;
-+ f[stride] |= n;
-+ f[stride * 2] |= n;
-+ f[stride * 3] |= n;
-+ break;
-+ case 3: // 0xff * 8
-+ for (n = 0; n != 8; ++n, f += stride)
-+ *f = 0xff;
-+ break;
-+ }
-+}
-+
-+static const ipe_init_info_t ipe_init_infos[9] = { // Alloc for bit depths of 8-16
-+ { // 8
-+ .luma = {8, QPU_MC_PRED_N_Y8, inter_pred_setup_y_qpu, inter_pred_sync_qpu, inter_pred_exit_y_qpu},
-+ .chroma = {8, QPU_MC_PRED_N_C8, inter_pred_setup_c_qpu, inter_pred_sync_qpu, inter_pred_exit_c_qpu}
-+ },
-+ { // 9
-+ .luma = {0},
-+ .chroma = {0}
-+ },
-+ { // 10
-+ .luma = {10, QPU_MC_PRED_N_Y10, inter_pred_setup_y10_qpu, inter_pred_sync10_qpu, inter_pred_exit_y10_qpu},
-+ .chroma = {10, QPU_MC_PRED_N_C10, inter_pred_setup_c10_qpu, inter_pred_sync10_qpu, inter_pred_exit_c10_qpu}
-+ }
-+
-+};
-+
-+static void set_ipe_from_ici(HEVCRpiInterPredEnv * const ipe, const ipe_chan_info_t * const ici)
-+{
-+ const unsigned int n = ici->n;
-+ const unsigned int q1_size = (ipe->gptr.numbytes / n) & ~3; // Round down to word
-+
-+ ipe->n = n;
-+ ipe->max_fill = q1_size - ipe->min_gap;
-+ for(unsigned int i = 0; i < n; i++) {
-+ HEVCRpiInterPredQ * const q = ipe->q + i;
-+ q->qpu_mc_curr = q->qpu_mc_base =
-+ (qpu_mc_pred_cmd_t *)(ipe->gptr.arm + i * q1_size);
-+ q->code_setup = qpu_fn(ici->setup_fns[i]);
-+ q->code_sync = qpu_fn(ici->sync_fns[i]);
-+ q->code_exit = qpu_fn(ici->exit_fns[i]);
-+ }
-+}
-+
-+static void rpi_hevc_qpu_set_fns(HEVCRpiContext * const s, const unsigned int bit_depth)
-+{
-+ av_assert0(bit_depth >= 8 && bit_depth <= 16);
-+
-+ rpi_hevc_qpu_init_fn(&s->qpu, bit_depth);
-+}
-+
-+// Unsigned Trivial MOD
-+static inline unsigned int utmod(const unsigned int x, const unsigned int n)
-+{
-+ return x >= n ? x - n : x;
-+}
-+
-+// returns pq->job_n++
-+static inline unsigned int pass_queue_inc_job_n(HEVCRpiPassQueue * const pq)
-+{
-+ unsigned int const x2 = pq->job_n;
-+ pq->job_n = utmod(x2 + 1, RPI_MAX_JOBS);
-+ return x2;
-+}
-+
-+static void pass_queue_init(HEVCRpiPassQueue * const pq, HEVCRpiContext * const s, HEVCRpiWorkerFn * const worker, sem_t * const psem_out, const int n)
-+{
-+ pq->terminate = 0;
-+ pq->job_n = 0;
-+ pq->context = s;
-+ pq->worker = worker;
-+ pq->psem_out = psem_out;
-+ pq->pass_n = n;
-+ pq->started = 0;
-+ sem_init(&pq->sem_in, 0, 0);
-+}
-+
-+static void pass_queue_kill(HEVCRpiPassQueue * const pq)
-+{
-+ sem_destroy(&pq->sem_in);
-+}
-+
-+static inline void rpi_sem_wait(sem_t * const sem)
-+{
-+ while (sem_wait(sem) != 0) {
-+ av_assert0(errno == EINTR);
-+ }
-+}
-+
-+static void pass_queue_submit_job(HEVCRpiPassQueue * const pq)
-+{
-+ sem_post(&pq->sem_in);
-+}
-+
-+static inline void pass_queue_do_all(HEVCRpiContext * const s, HEVCRpiJob * const jb)
-+{
-+ // Do the various passes - common with the worker code
-+ for (unsigned int i = 0; i != RPI_PASSES; ++i) {
-+ s->passq[i].worker(s, jb);
-+ }
-+}
-+
-+
-+#if 0
-+static void dump_jbc(const HEVCRpiJobCtl *const jbc, const char * const func)
-+{
-+ int x;
-+ sem_getvalue((sem_t *)&jbc->sem_out, &x);
-+ printf("%s: jbc: in=%d, out=%d, sum=%d\n", func, jbc->offload_in, jbc->offload_out, x);
-+}
-+#endif
-+
-+
-+static HEVCRpiJob * job_alloc(HEVCRpiJobCtl * const jbc, HEVCRpiLocalContext * const lc)
-+{
-+ HEVCRpiJob * jb;
-+ HEVCRpiJobGlobal * const jbg = jbc->jbg;
-+
-+ pthread_mutex_lock(&jbg->lock);
-+ // Check local 1st
-+ if ((jb = jbc->jb1) != NULL)
-+ {
-+ // Only 1 - very easy :-)
-+ jbc->jb1 = NULL;
-+ }
-+ else
-+ {
-+ // Now look for global free chain
-+ if ((jb = jbg->free1) != NULL)
-+ {
-+ // Found one - unlink it
-+ jbg->free1 = jb->next;
-+ jb->next = NULL;
-+ }
-+ else
-+ {
-+ // Out of places to look - wait for one to become free - add to Qs
-+
-+ // Global
-+ // If "good" lc then add after the last "good" el in the chain
-+ // otherwise add to the tail
-+ if (jbg->wait_tail == NULL || jbg->wait_tail->last_progress_good || !lc->last_progress_good)
-+ {
-+ // Add to end as we had to wait last time or wait Q empty
-+ if ((lc->jw_prev = jbg->wait_tail) == NULL)
-+ jbg->wait_head = lc;
-+ else
-+ lc->jw_prev->jw_next = lc;
-+ lc->jw_next = NULL;
-+ jbg->wait_tail = lc;
-+ }
-+ else
-+ {
-+ // This is a "good" lc that we need to poke into the middle
-+ // of the Q
-+ // We know that the Q isn't empty and there is at least one
-+ // !last_progress_good el in it from the previous test
-+
-+ HEVCRpiLocalContext * const p = jbg->wait_good; // Insert after
-+
-+ if (p == NULL)
-+ {
-+ // No current good els - add to head
-+ lc->jw_next = jbg->wait_head;
-+ jbg->wait_head = lc;
-+ }
-+ else
-+ {
-+ lc->jw_next = p->jw_next;
-+ p->jw_next = lc;
-+ }
-+
-+ lc->jw_next->jw_prev = lc;
-+ lc->jw_prev = p;
-+ }
-+
-+ // If "good" then we are now the last good waiting el
-+ if (lc->last_progress_good)
-+ jbg->wait_good = lc;
-+
-+ // Local
-+ if ((lc->ljw_prev = jbc->lcw_tail) == NULL)
-+ jbc->lcw_head = lc;
-+ else
-+ lc->ljw_prev->ljw_next = lc;
-+ lc->ljw_next = NULL;
-+ jbc->lcw_tail = lc;
-+ }
-+ }
-+
-+ pthread_mutex_unlock(&jbg->lock);
-+
-+ if (jb == NULL) // Need to wait
-+ {
-+ rpi_sem_wait(&lc->jw_sem);
-+ jb = lc->jw_job; // Set by free code
-+ }
-+
-+ return jb;
-+}
-+
-+
-+static void job_free(HEVCRpiJobCtl * const jbc0, HEVCRpiJob * const jb)
-+{
-+ HEVCRpiJobGlobal * const jbg = jbc0->jbg; // This jbc only used to find jbg so we can get the lock
-+ HEVCRpiJobCtl * jbc = jb->jbc_local;
-+ HEVCRpiLocalContext * lc = NULL;
-+
-+ pthread_mutex_lock(&jbg->lock);
-+
-+ if (jbc != NULL)
-+ {
-+ av_assert1(jbc->jb1 == NULL);
-+
-+ // Release to Local if nothing waiting there
-+ if ((lc = jbc->lcw_head) == NULL)
-+ jbc->jb1 = jb;
-+ }
-+ else
-+ {
-+ // Release to global if nothing waiting there
-+ if ((lc = jbg->wait_head) == NULL)
-+ {
-+ jb->next = jbg->free1;
-+ jbg->free1 = jb;
-+ }
-+ else
-+ {
-+ // ? seems somehow mildly ugly...
-+ jbc = lc->context->jbc;
-+ }
-+ }
-+
-+ if (lc != NULL)
-+ {
-+ // Something was waiting
-+
-+ // Unlink
-+ // Global
-+ if (lc->jw_next == NULL)
-+ jbg->wait_tail = lc->jw_prev;
-+ else
-+ lc->jw_next->jw_prev = lc->jw_prev;
-+
-+ if (lc->jw_prev == NULL)
-+ jbg->wait_head = lc->jw_next;
-+ else
-+ lc->jw_prev->jw_next = lc->jw_next;
-+
-+ // Local
-+ if (lc->ljw_next == NULL)
-+ jbc->lcw_tail = lc->ljw_prev;
-+ else
-+ lc->ljw_next->ljw_prev = lc->ljw_prev;
-+
-+ if (lc->ljw_prev == NULL)
-+ jbc->lcw_head = lc->ljw_next;
-+ else
-+ lc->ljw_prev->ljw_next = lc->ljw_next;
-+
-+ // Update good if required
-+ if (jbg->wait_good == lc)
-+ jbg->wait_good = lc->jw_prev;
-+
-+ // Prod
-+ lc->jw_job = jb;
-+ sem_post(&lc->jw_sem);
-+ }
-+
-+ pthread_mutex_unlock(&jbg->lock);
-+}
-+
-+static void job_lc_kill(HEVCRpiLocalContext * const lc)
-+{
-+ sem_destroy(&lc->jw_sem);
-+}
-+
-+static void job_lc_init(HEVCRpiLocalContext * const lc)
-+{
-+ lc->jw_next = NULL;
-+ lc->jw_prev = NULL;
-+ lc->ljw_next = NULL;
-+ lc->ljw_prev = NULL;
-+ lc->jw_job = NULL;
-+ sem_init(&lc->jw_sem, 0, 0);
-+}
-+
-+// Returns:
-+// 0 if we have waited for MV or expect to wait for recon
-+// 1 if we haven't waited for MV & do not need to wait for recon
-+static int progress_good(const HEVCRpiContext *const s, const HEVCRpiJob * const jb)
-+{
-+ if (jb->waited) // reset by rpi_begin
-+ return 0;
-+ for (unsigned int i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i)
-+ {
-+ if (jb->progress_req[i] >= 0 && s->DPB[i].tf.progress != NULL &&
-+ ((volatile int *)(s->DPB[i].tf.progress->data))[0] < jb->progress_req[i])
-+ return 0;
-+ }
-+ return 1;
-+}
-+
-+// Submit job if it is full (indicated by having ctu_ts_last set >= 0)
-+static inline void worker_submit_job(HEVCRpiContext *const s, HEVCRpiLocalContext * const lc)
-+{
-+ HEVCRpiJobCtl *const jbc = s->jbc;
-+ HEVCRpiJob * const jb = lc->jb0;
-+
-+ av_assert1(jb != NULL);
-+
-+ if (jb->ctu_ts_last < 0) {
-+ return;
-+ }
-+
-+ lc->last_progress_good = progress_good(s, jb);
-+ jb->waited = !lc->last_progress_good;
-+ lc->jb0 = NULL;
-+
-+ if (s->offload_recon)
-+ {
-+ pthread_mutex_lock(&jbc->in_lock);
-+ jbc->offloadq[jbc->offload_in] = jb;
-+ jbc->offload_in = utmod(jbc->offload_in + 1, RPI_MAX_JOBS);
-+ pthread_mutex_unlock(&jbc->in_lock);
-+
-+ pass_queue_submit_job(s->passq + 0); // Consumes job eventually
-+ }
-+ else
-+ {
-+ pass_queue_do_all(s, jb); // Consumes job before return
-+ }
-+}
-+
-+
-+// Call worker_pass0_ready to wait until the s->pass0_job slot becomes
-+// available to receive the next job.
-+//
-+// Now safe against multiple callers - needed for tiles
-+// "normal" and WPP will only call here one at a time
-+static inline void worker_pass0_ready(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
-+{
-+ HEVCRpiJobCtl * const jbc = s->jbc;
-+
-+ // It is legit for us to already have a job allocated - do nothing in this case
-+ if (lc->jb0 != NULL)
-+ return;
-+
-+ if (s->offload_recon)
-+ rpi_sem_wait(&jbc->sem_out); // This sem will stop this frame grabbing too much
-+
-+ lc->jb0 = job_alloc(jbc, lc);
-+
-+ rpi_begin(s, lc->jb0, lc->ts);
-+}
-+
-+// Free up a job without submission
-+static void worker_free(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
-+{
-+ HEVCRpiJobCtl * const jbc = s->jbc;
-+ HEVCRpiJob * const jb = lc->jb0;
-+
-+ if (jb == NULL) {
-+ return;
-+ }
-+
-+ lc->jb0 = NULL;
-+
-+ job_free(jbc, jb);
-+
-+ // If offload then poke sem_out too
-+ if (s->offload_recon) {
-+ sem_post(&jbc->sem_out);
-+ }
-+}
-+
-+
-+// Call this to wait for all jobs to have completed at the end of a frame
-+// Slightly icky as there is no clean way to wait for a sem to count up
-+// Not reentrant - call on main thread only
-+static void worker_wait(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc)
-+{
-+ HEVCRpiJobCtl * const jbc = s->jbc;
-+ int i = 0;
-+
-+ // We shouldn't reach here with an unsubmitted job
-+ av_assert1(lc->jb0 == NULL);
-+
-+ // If no offload then there can't be anything to wait for
-+ if (!s->offload_recon) {
-+ return;
-+ }
-+
-+ if (sem_getvalue(&jbc->sem_out, &i) == 0 && i < RPI_MAX_JOBS)
-+ {
-+ for (i = 0; i != RPI_MAX_JOBS; ++i) {
-+ rpi_sem_wait(&jbc->sem_out);
-+ }
-+ for (i = 0; i != RPI_MAX_JOBS; ++i) {
-+ sem_post(&jbc->sem_out);
-+ }
-+ }
-+}
-+
-+static void * pass_worker(void *arg)
-+{
-+ HEVCRpiPassQueue *const pq = (HEVCRpiPassQueue *)arg;
-+ HEVCRpiContext *const s = pq->context;
-+
-+ for (;;)
-+ {
-+ rpi_sem_wait(&pq->sem_in);
-+
-+ if (pq->terminate)
-+ break;
-+
-+ pq->worker(s, s->jbc->offloadq[pass_queue_inc_job_n(pq)]);
-+ // * should really set jb->passes_done here
-+
-+ sem_post(pq->psem_out);
-+ }
-+ return NULL;
-+}
-+
-+static void pass_queues_start_all(HEVCRpiContext *const s)
-+{
-+ unsigned int i;
-+ HEVCRpiPassQueue * const pqs = s->passq;
-+
-+ for (i = 0; i != RPI_PASSES; ++i)
-+ {
-+ av_assert0(pthread_create(&pqs[i].thread, NULL, pass_worker, pqs + i) == 0);
-+ pqs[i].started = 1;
-+ }
-+}
-+
-+static void pass_queues_term_all(HEVCRpiContext *const s)
-+{
-+ unsigned int i;
-+ HEVCRpiPassQueue * const pqs = s->passq;
-+
-+ for (i = 0; i != RPI_PASSES; ++i)
-+ pqs[i].terminate = 1;
-+ for (i = 0; i != RPI_PASSES; ++i)
-+ {
-+ if (pqs[i].started)
-+ sem_post(&pqs[i].sem_in);
-+ }
-+ for (i = 0; i != RPI_PASSES; ++i)
-+ {
-+ if (pqs[i].started) {
-+ pthread_join(pqs[i].thread, NULL);
-+ pqs[i].started = 0;
-+ }
-+ }
-+}
-+
-+static void pass_queues_kill_all(HEVCRpiContext *const s)
-+{
-+ unsigned int i;
-+ HEVCRpiPassQueue * const pqs = s->passq;
-+
-+ for (i = 0; i != RPI_PASSES; ++i)
-+ pass_queue_kill(pqs + i);
-+}
-+
-+
-+static void worker_pic_free_one(HEVCRpiJob * const jb)
-+{
-+ // Free coeff stuff - allocation not the same for all buffers
-+ HEVCRpiCoeffsEnv * const cf = &jb->coeffs;
-+
-+ if (cf->s[0].buf != NULL)
-+ av_freep(&cf->mptr);
-+ if (cf->s[2].buf != NULL)
-+ gpu_free(&cf->gptr);
-+ memset(cf, 0, sizeof(*cf));
-+}
-+
-+static int worker_pic_alloc_one(HEVCRpiJob * const jb, const unsigned int coeff_count)
-+{
-+ HEVCRpiCoeffsEnv * const cf = &jb->coeffs;
-+
-+ if (gpu_malloc_cached((coeff_count + 32*32) * sizeof(cf->s[2].buf[0]), &cf->gptr) != 0)
-+ goto fail;
-+ cf->s[2].buf = (int16_t *)cf->gptr.arm;
-+ cf->s[3].buf = cf->s[2].buf + coeff_count;
-+
-+ // Must be 64 byte aligned for our zero zapping code so over-allocate &
-+ // round
-+ if ((cf->mptr = av_malloc(coeff_count * sizeof(cf->s[0].buf[0]) + 63)) == NULL)
-+ goto fail;
-+ cf->s[0].buf = (void *)(((intptr_t)cf->mptr + 63) & ~63);
-+ return 0;
-+
-+fail:
-+ av_log(NULL, AV_LOG_ERROR, "%s: Allocation failed\n", __func__);
-+ worker_pic_free_one(jb);
-+ return -1;
-+}
-+
-+static void worker_pic_reset(HEVCRpiCoeffsEnv * const cf)
-+{
-+ unsigned int i;
-+ for (i = 0; i != 4; ++i) {
-+ cf->s[i].n = 0;
-+#if RPI_COMPRESS_COEFFS
-+ cf->s[i].packed = 1;
-+ cf->s[i].packed_n = 0;
-+#endif
-+ }
-+}
-+
-+int16_t * rpi_alloc_coeff_buf(HEVCRpiJob * const jb, const int buf_no, const int n)
-+{
-+ HEVCRpiCoeffEnv *const cfe = jb->coeffs.s + buf_no;
-+ int16_t * const coeffs = (buf_no != 3) ? cfe->buf + cfe->n : cfe->buf - (cfe->n + n);
-+ cfe->n += n;
-+ return coeffs;
-+}
-+
-+void ff_hevc_rpi_progress_wait_field(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
-+ const HEVCRpiFrame * const ref, const int val, const int field)
-+{
-+ if (ref->tf.progress != NULL && ((int *)ref->tf.progress->data)[field] < val) {
-+ HEVCRpiContext *const fs = ref->tf.owner[field]->priv_data;
-+ HEVCRpiFrameProgressState * const pstate = fs->progress_states + field;
-+ sem_t * sem = NULL;
-+
-+ av_assert0(pthread_mutex_lock(&pstate->lock) == 0);
-+ if (((volatile int *)ref->tf.progress->data)[field] < val) {
-+ HEVCRpiFrameProgressWait * const pwait = &jb->progress_wait;
-+
-+ av_assert1(pwait->req == -1 && pwait->next == NULL);
-+ jb->waited = 1; // Remember that we had to wait for later scheduling
-+
-+ pwait->req = val;
-+ pwait->next = NULL;
-+ if (pstate->first == NULL)
-+ pstate->first = pwait;
-+ else
-+ pstate->last->next = pwait;
-+ pstate->last = pwait;
-+ sem = &pwait->sem;
-+ }
-+ pthread_mutex_unlock(&pstate->lock);
-+
-+ if (sem != NULL) {
-+ rpi_sem_wait(sem);
-+ }
-+ }
-+}
-+
-+void ff_hevc_rpi_progress_signal_field(HEVCRpiContext * const s, const int val, const int field)
-+{
-+ HEVCRpiFrameProgressState *const pstate = s->progress_states + field;
-+
-+ ((int *)s->ref->tf.progress->data)[field] = val;
-+
-+ av_assert0(pthread_mutex_lock(&pstate->lock) == 0);
-+ {
-+ HEVCRpiFrameProgressWait ** ppwait = &pstate->first;
-+ HEVCRpiFrameProgressWait * pwait;
-+
-+ while ((pwait = *ppwait) != NULL) {
-+ if (pwait->req > val)
-+ {
-+ ppwait = &pwait->next;
-+ pstate->last = pwait;
-+ }
-+ else
-+ {
-+ *ppwait = pwait->next;
-+ pwait->req = -1;
-+ pwait->next = NULL;
-+ sem_post(&pwait->sem);
-+ }
-+ }
-+ }
-+ pthread_mutex_unlock(&pstate->lock);
-+}
-+
-+static void ff_hevc_rpi_progress_init_state(HEVCRpiFrameProgressState * const pstate)
-+{
-+ pstate->first = NULL;
-+ pstate->last = NULL;
-+ pthread_mutex_init(&pstate->lock, NULL);
-+}
-+
-+static void ff_hevc_rpi_progress_init_wait(HEVCRpiFrameProgressWait * const pwait)
-+{
-+ pwait->req = -1;
-+ pwait->next = NULL;
-+ sem_init(&pwait->sem, 0, 0);
-+}
-+
-+static void ff_hevc_rpi_progress_kill_state(HEVCRpiFrameProgressState * const pstate)
-+{
-+ av_assert1(pstate->first == NULL);
-+ pthread_mutex_destroy(&pstate->lock);
-+}
-+
-+static void ff_hevc_rpi_progress_kill_wait(HEVCRpiFrameProgressWait * const pwait)
-+{
-+ sem_destroy(&pwait->sem);
-+}
-+
-+
-+/**
-+ * NOTE: Each function hls_foo correspond to the function foo in the
-+ * specification (HLS stands for High Level Syntax).
-+ */
-+
-+/**
-+ * Section 5.7
-+ */
-+
-+// Realloc the entry point arrays
-+static int alloc_entry_points(RpiSliceHeader * const sh, const int n)
-+{
-+ if (sh->entry_point_offset == NULL || n > sh->offsets_allocated || n == 0)
-+ {
-+ // Round up alloc to multiple of 32
-+ int a = (n + 31) & ~31;
-+
-+ // We don't care about the previous contents so probably fastest to simply discard
-+ av_freep(&sh->entry_point_offset);
-+ av_freep(&sh->offset);
-+ av_freep(&sh->size);
-+
-+ if (a != 0)
-+ {
-+ sh->entry_point_offset = av_malloc_array(a, sizeof(unsigned));
-+ sh->offset = av_malloc_array(a, sizeof(int));
-+ sh->size = av_malloc_array(a, sizeof(int));
-+
-+ if (!sh->entry_point_offset || !sh->offset || !sh->size) {
-+ sh->num_entry_point_offsets = 0;
-+ sh->offsets_allocated = 0;
-+ return AVERROR(ENOMEM);
-+ }
-+ }
-+
-+ sh->offsets_allocated = a;
-+ }
-+
-+ return 0;
-+}
-+
-+/* free everything allocated by pic_arrays_init() */
-+static void pic_arrays_free(HEVCRpiContext *s)
-+{
-+ av_freep(&s->sao);
-+ av_freep(&s->deblock);
-+
-+ av_freep(&s->cabac_stash_up);
-+ s->cabac_stash_left = NULL; // freed with _up
-+
-+ av_freep(&s->mvf_up);
-+ av_freep(&s->mvf_left);
-+
-+ av_freep(&s->is_pcm);
-+ av_freep(&s->is_intra_store);
-+ s->is_intra = NULL;
-+ av_freep(&s->rpl_tab);
-+ s->rpl_tab_size = 0;
-+
-+ av_freep(&s->qp_y_tab);
-+ av_freep(&s->tab_slice_address);
-+ av_freep(&s->filter_slice_edges);
-+
-+ av_freep(&s->bs_horizontal);
-+ s->bs_vertical = NULL; // freed with H
-+ av_freep(&s->bsf_stash_left);
-+ av_freep(&s->bsf_stash_up);
-+
-+ av_freep(&s->rpl_up);
-+ av_freep(&s->rpl_left);
-+
-+ alloc_entry_points(&s->sh, 0);
-+
-+ av_buffer_pool_uninit(&s->col_mvf_pool);
-+}
-+
-+/* allocate arrays that depend on frame dimensions */
-+static int pic_arrays_init(HEVCRpiContext * const s, const HEVCRpiSPS * const sps)
-+{
-+ const unsigned int log2_min_cb_size = sps->log2_min_cb_size;
-+ const unsigned int width = sps->width;
-+ const unsigned int height = sps->height;
-+ const unsigned int pic_size_in_cb = ((width >> log2_min_cb_size) + 1) *
-+ ((height >> log2_min_cb_size) + 1);
-+ const unsigned int ctb_count = sps->ctb_size;
-+
-+ {
-+ unsigned int w = ((width + HEVC_RPI_BS_STRIDE1_PEL_MASK) & ~HEVC_RPI_BS_STRIDE1_PEL_MASK);
-+ unsigned int h = ((height + 15) & ~15);
-+
-+ s->bs_stride2 = h >> HEVC_RPI_BS_COL_BYTES_SHR; // Column size
-+ s->bs_size = s->bs_stride2 * (w >> HEVC_RPI_BS_STRIDE1_PEL_SHIFT); // col size * cols
-+ }
-+
-+ s->sao = av_mallocz(ctb_count * sizeof(*s->sao) + 8); // Our sao code overreads this array slightly
-+ s->deblock = av_mallocz_array(ctb_count, sizeof(*s->deblock));
-+ if (!s->sao || !s->deblock)
-+ goto fail;
-+
-+ s->cabac_stash_up = av_malloc((((width + 63) & ~63) >> 3) + (((height + 63) & ~63) >> 3));
-+ s->cabac_stash_left = s->cabac_stash_up + (((width + 63) & ~63) >> 3);
-+ if (s->cabac_stash_up == NULL)
-+ goto fail;
-+
-+ // Round width up to max ctb size
-+ s->mvf_up = av_malloc((((width + 63) & ~63) >> LOG2_MIN_PU_SIZE) * sizeof(*s->mvf_up));
-+ // * Only needed if we have H tiles
-+ s->mvf_left = av_malloc((((height + 63) & ~63) >> LOG2_MIN_PU_SIZE) * sizeof(*s->mvf_up));
-+
-+ // We can overread by 1 line & one byte in deblock so alloc & zero
-+ // We don't need to zero the extra @ start of frame as it will never be
-+ // written
-+ s->is_pcm = av_mallocz(sps->pcm_width * (sps->pcm_height + 1) + 1);
-+ s->is_intra_store = av_mallocz(sps->pcm_width * (sps->pcm_height + 1) + 1);
-+ if (s->is_pcm == NULL || s->is_intra_store == NULL)
-+ goto fail;
-+
-+ s->filter_slice_edges = av_mallocz(ctb_count);
-+ s->tab_slice_address = av_malloc_array(ctb_count,
-+ sizeof(*s->tab_slice_address));
-+ s->qp_y_tab = av_malloc_array(pic_size_in_cb,
-+ sizeof(*s->qp_y_tab));
-+ if (!s->qp_y_tab || !s->filter_slice_edges || !s->tab_slice_address)
-+ goto fail;
-+
-+ s->bs_horizontal = av_mallocz(s->bs_size * 2);
-+ s->bs_vertical = s->bs_horizontal + s->bs_size;
-+ if (s->bs_horizontal == NULL)
-+ goto fail;
-+
-+ s->rpl_up = av_mallocz(sps->ctb_width * sizeof(*s->rpl_up));
-+ s->rpl_left = av_mallocz(sps->ctb_height * sizeof(*s->rpl_left));
-+ if (s->rpl_left == NULL || s->rpl_up == NULL)
-+ goto fail;
-+
-+ if ((s->bsf_stash_left = av_mallocz(((height + 63) & ~63) >> 4)) == NULL ||
-+ (s->bsf_stash_up = av_mallocz(((width + 63) & ~63) >> 4)) == NULL)
-+ goto fail;
-+
-+ s->col_mvf_stride = (width + 15) >> 4;
-+ s->col_mvf_pool = av_buffer_pool_init(((height + 15) >> 4) * s->col_mvf_stride * sizeof(ColMvField),
-+ av_buffer_allocz);
-+ if (s->col_mvf_pool == NULL)
-+ goto fail;
-+
-+ return 0;
-+
-+fail:
-+ pic_arrays_free(s);
-+ return AVERROR(ENOMEM);
-+}
-+
-+static void default_pred_weight_table(HEVCRpiContext * const s)
-+{
-+ unsigned int i;
-+ const unsigned int wt = 1 << QPU_MC_DENOM;
-+ s->sh.luma_log2_weight_denom = 0;
-+ s->sh.chroma_log2_weight_denom = 0;
-+ for (i = 0; i < s->sh.nb_refs[L0]; i++) {
-+ s->sh.luma_weight_l0[i] = wt;
-+ s->sh.luma_offset_l0[i] = 0;
-+ s->sh.chroma_weight_l0[i][0] = wt;
-+ s->sh.chroma_weight_l0[i][1] = wt;
-+ s->sh.chroma_offset_l0[i][0] = 0;
-+ s->sh.chroma_offset_l0[i][1] = 0;
-+ }
-+ for (i = 0; i < s->sh.nb_refs[L1]; i++) {
-+ s->sh.luma_weight_l1[i] = wt;
-+ s->sh.luma_offset_l1[i] = 0;
-+ s->sh.chroma_weight_l1[i][0] = wt;
-+ s->sh.chroma_weight_l1[i][1] = wt;
-+ s->sh.chroma_offset_l1[i][0] = 0;
-+ s->sh.chroma_offset_l1[i][1] = 0;
-+ }
-+}
-+
-+static int get_weights(HEVCRpiContext * const s, GetBitContext * const gb,
-+ const unsigned int refs,
-+ int16_t * luma_weight, int16_t * luma_offset,
-+ int16_t * chroma_weight, int16_t * chroma_offset)
-+{
-+ unsigned int luma_flags;
-+ unsigned int chroma_flags;
-+ unsigned int i;
-+ const unsigned int wp_offset_bd_shift = s->ps.sps->high_precision_offsets_enabled_flag ? 0 : (s->ps.sps->bit_depth - 8);
-+ const int wp_offset_half_range = s->ps.sps->wp_offset_half_range;
-+ const unsigned int luma_weight_base = 1 << QPU_MC_DENOM;
-+ const unsigned int chroma_weight_base = 1 << QPU_MC_DENOM;
-+ const unsigned int luma_weight_shift = (QPU_MC_DENOM - s->sh.luma_log2_weight_denom);
-+ const unsigned int chroma_weight_shift = (QPU_MC_DENOM - s->sh.chroma_log2_weight_denom);
-+
-+ if (refs == 0)
-+ return 0;
-+
-+ luma_flags = get_bits(gb, refs);
-+ chroma_flags = ctx_cfmt(s) == 0 ? 0 : get_bits(gb, refs);
-+ i = 1 << (refs - 1);
-+
-+ do
-+ {
-+ if ((luma_flags & i) != 0)
-+ {
-+ const int delta_weight = get_se_golomb(gb);
-+ const int offset = get_se_golomb(gb);
-+ if (delta_weight < -128 || delta_weight > 127 ||
-+ offset < -wp_offset_half_range || offset >= wp_offset_half_range)
-+ {
-+ return AVERROR_INVALIDDATA;
-+ }
-+ *luma_weight++ = luma_weight_base + (delta_weight << luma_weight_shift);
-+ *luma_offset++ = offset << wp_offset_bd_shift;
-+ }
-+ else
-+ {
-+ *luma_weight++ = luma_weight_base;
-+ *luma_offset++ = 0;
-+ }
-+
-+ if ((chroma_flags & i) != 0)
-+ {
-+ unsigned int j;
-+ for (j = 0; j != 2; ++j)
-+ {
-+ const int delta_weight = get_se_golomb(gb);
-+ const int delta_offset = get_se_golomb(gb);
-+
-+ if (delta_weight < -128 || delta_weight > 127 ||
-+ delta_offset < -4 * wp_offset_half_range || delta_offset >= 4 * wp_offset_half_range)
-+ {
-+ return AVERROR_INVALIDDATA;
-+ }
-+
-+ *chroma_weight++ = chroma_weight_base + (delta_weight << chroma_weight_shift);
-+ *chroma_offset++ = av_clip(
-+ wp_offset_half_range + delta_offset -
-+ ((wp_offset_half_range * ((1 << s->sh.chroma_log2_weight_denom) + delta_weight)) >> s->sh.chroma_log2_weight_denom),
-+ -wp_offset_half_range, wp_offset_half_range - 1) << wp_offset_bd_shift;
-+ }
-+ }
-+ else
-+ {
-+ *chroma_weight++ = chroma_weight_base;
-+ *chroma_weight++ = chroma_weight_base;
-+ *chroma_offset++ = 0;
-+ *chroma_offset++ = 0;
-+ }
-+ } while ((i >>= 1) != 0);
-+
-+ return 0;
-+}
-+
-+static int pred_weight_table(HEVCRpiContext *s, GetBitContext *gb)
-+{
-+ int err;
-+ const unsigned int luma_log2_weight_denom = get_ue_golomb_long(gb);
-+ const unsigned int chroma_log2_weight_denom = (ctx_cfmt(s) == 0) ? 0 : luma_log2_weight_denom + get_se_golomb(gb);
-+
-+ if (luma_log2_weight_denom > 7 ||
-+ chroma_log2_weight_denom > 7)
-+ {
-+ av_log(s->avctx, AV_LOG_ERROR, "Invalid prediction weight denom: luma=%d, chroma=%d\n",
-+ luma_log2_weight_denom, chroma_log2_weight_denom);
-+ return AVERROR_INVALIDDATA;
-+ }
-+
-+ s->sh.luma_log2_weight_denom = luma_log2_weight_denom;
-+ s->sh.chroma_log2_weight_denom = chroma_log2_weight_denom;
-+
-+ if ((err = get_weights(s, gb, s->sh.nb_refs[L0],
-+ s->sh.luma_weight_l0, s->sh.luma_offset_l0,
-+ s->sh.chroma_weight_l0[0], s->sh.chroma_offset_l0[0])) != 0 ||
-+ (err = get_weights(s, gb, s->sh.nb_refs[L1],
-+ s->sh.luma_weight_l1, s->sh.luma_offset_l1,
-+ s->sh.chroma_weight_l1[0], s->sh.chroma_offset_l1[0])) != 0)
-+ {
-+ av_log(s->avctx, AV_LOG_ERROR, "Invalid prediction weight or offset\n");
-+ return err;
-+ }
-+
-+ return 0;
-+}
-+
-+static int decode_lt_rps(HEVCRpiContext *s, LongTermRPS *rps, GetBitContext *gb)
-+{
-+ const HEVCRpiSPS *sps = s->ps.sps;
-+ int max_poc_lsb = 1 << sps->log2_max_poc_lsb;
-+ int prev_delta_msb = 0;
-+ unsigned int nb_sps = 0, nb_sh;
-+ int i;
-+
-+ rps->nb_refs = 0;
-+ if (!sps->long_term_ref_pics_present_flag)
-+ return 0;
-+
-+ if (sps->num_long_term_ref_pics_sps > 0)
-+ nb_sps = get_ue_golomb_long(gb);
-+ nb_sh = get_ue_golomb_long(gb);
-+
-+ if (nb_sps > sps->num_long_term_ref_pics_sps)
-+ return AVERROR_INVALIDDATA;
-+ if (nb_sh + (uint64_t)nb_sps > FF_ARRAY_ELEMS(rps->poc))
-+ return AVERROR_INVALIDDATA;
-+
-+ rps->nb_refs = nb_sh + nb_sps;
-+
-+ for (i = 0; i < rps->nb_refs; i++) {
-+ uint8_t delta_poc_msb_present;
-+
-+ if (i < nb_sps) {
-+ uint8_t lt_idx_sps = 0;
-+
-+ if (sps->num_long_term_ref_pics_sps > 1)
-+ lt_idx_sps = get_bits(gb, av_ceil_log2(sps->num_long_term_ref_pics_sps));
-+
-+ rps->poc[i] = sps->lt_ref_pic_poc_lsb_sps[lt_idx_sps];
-+ rps->used[i] = sps->used_by_curr_pic_lt_sps_flag[lt_idx_sps];
-+ } else {
-+ rps->poc[i] = get_bits(gb, sps->log2_max_poc_lsb);
-+ rps->used[i] = get_bits1(gb);
-+ }
-+
-+ delta_poc_msb_present = get_bits1(gb);
-+ if (delta_poc_msb_present) {
-+ int64_t delta = get_ue_golomb_long(gb);
-+ int64_t poc;
-+
-+ if (i && i != nb_sps)
-+ delta += prev_delta_msb;
-+
-+ poc = rps->poc[i] + s->poc - delta * max_poc_lsb - s->sh.pic_order_cnt_lsb;
-+ if (poc != (int32_t)poc)
-+ return AVERROR_INVALIDDATA;
-+ rps->poc[i] = poc;
-+ prev_delta_msb = delta;
-+ }
-+ }
-+
-+ return 0;
-+}
-+
-+/*
-+ * Publish the stream properties carried by an SPS (and the VPS it refers to)
-+ * on the codec context: pixel format, coded and cropped dimensions,
-+ * reorder depth, profile/level, sample aspect ratio, colour description and
-+ * frame rate.
-+ */
-+static void export_stream_params(AVCodecContext *avctx, const HEVCRpiParamSets *ps,
-+                                 const HEVCRpiSPS *sps)
-+{
-+    const HEVCRpiVPS *vps = (const HEVCRpiVPS*)ps->vps_list[sps->vps_id]->data;
-+    const HEVCRpiWindow *crop = &sps->output_window;
-+    unsigned int ticks = 0, scale = 0;
-+
-+    avctx->pix_fmt      = sps->pix_fmt;
-+    avctx->coded_width  = sps->width;
-+    avctx->coded_height = sps->height;
-+    // Displayed size is the coded size minus the output (cropping) window
-+    avctx->width        = sps->width - crop->left_offset - crop->right_offset;
-+    avctx->height       = sps->height - crop->top_offset - crop->bottom_offset;
-+    avctx->has_b_frames = sps->temporal_layer[sps->max_sub_layers - 1].num_reorder_pics;
-+    avctx->profile      = sps->ptl.general_ptl.profile_idc;
-+    avctx->level        = sps->ptl.general_ptl.level_idc;
-+
-+    ff_set_sar(avctx, sps->vui.sar);
-+
-+    // Full range only when the VUI both signals a video type and asks for it
-+    avctx->color_range = AVCOL_RANGE_MPEG;
-+    if (sps->vui.video_signal_type_present_flag && sps->vui.video_full_range_flag)
-+        avctx->color_range = AVCOL_RANGE_JPEG;
-+
-+    if (!sps->vui.colour_description_present_flag) {
-+        avctx->color_primaries = AVCOL_PRI_UNSPECIFIED;
-+        avctx->color_trc       = AVCOL_TRC_UNSPECIFIED;
-+        avctx->colorspace      = AVCOL_SPC_UNSPECIFIED;
-+    } else {
-+        avctx->color_primaries = sps->vui.colour_primaries;
-+        avctx->color_trc       = sps->vui.transfer_characteristic;
-+        avctx->colorspace      = sps->vui.matrix_coeffs;
-+    }
-+
-+    // VPS timing takes precedence over VUI timing
-+    if (vps->vps_timing_info_present_flag) {
-+        ticks = vps->vps_num_units_in_tick;
-+        scale = vps->vps_time_scale;
-+    } else if (sps->vui.vui_timing_info_present_flag) {
-+        ticks = sps->vui.vui_num_units_in_tick;
-+        scale = sps->vui.vui_time_scale;
-+    }
-+
-+    if (ticks == 0 || scale == 0)
-+        return;
-+
-+    // den/num are passed swapped on purpose: framerate = time_scale / units_in_tick
-+    av_reduce(&avctx->framerate.den, &avctx->framerate.num,
-+              ticks, scale, 1 << 30);
-+}
-+
-+/*
-+ * Negotiate the output pixel format for an SPS. Only the software format
-+ * from the SPS is offered (no hardware formats); the list is terminated by
-+ * AV_PIX_FMT_NONE and handed to ff_thread_get_format().
-+ */
-+static enum AVPixelFormat get_format(HEVCRpiContext *s, const HEVCRpiSPS *sps)
-+{
-+    enum AVPixelFormat candidates[4];
-+
-+    // Admit to no h/w formats
-+    candidates[0] = sps->pix_fmt;
-+    candidates[1] = AV_PIX_FMT_NONE;
-+
-+    if (candidates[0] == AV_PIX_FMT_NONE)
-+        return AV_PIX_FMT_NONE;
-+
-+    return ff_thread_get_format(s->avctx, candidates);
-+}
-+
-+/*
-+ * Return non-zero when this decoder can handle the SPS: its pixel format
-+ * must be a sand format and its dimensions must not exceed the RPI limits.
-+ */
-+static int is_sps_supported(const HEVCRpiSPS * const sps)
-+{
-+    if (!av_rpi_is_sand_format(sps->pix_fmt))
-+        return 0;
-+    if (!(sps->width <= HEVC_RPI_MAX_WIDTH))
-+        return 0;
-+    return sps->height <= HEVC_RPI_MAX_HEIGHT;
-+}
-+
-+/*
-+ * Make `sps` the active sequence parameter set.
-+ *
-+ * Frees the per-picture arrays of the previous SPS, then (for a non-NULL,
-+ * supported SPS) reallocates them, exports the stream parameters to the
-+ * codec context, initialises the prediction/DSP/QPU function tables for the
-+ * new bit depth and allocates the SAO line buffers.
-+ *
-+ * @param s       decoder context
-+ * @param sps     new SPS, or NULL to just drop the current one
-+ * @param pix_fmt pixel format to store in s->avctx->pix_fmt
-+ * @return 0 on success, AVERROR_DECODER_NOT_FOUND if the SPS is not
-+ *         supported, AVERROR(ENOMEM) if the SAO buffers cannot be
-+ *         allocated, or the error returned by pic_arrays_init()
-+ */
-+static int set_sps(HEVCRpiContext * const s, const HEVCRpiSPS * const sps,
-+                   const enum AVPixelFormat pix_fmt)
-+{
-+    int ret;
-+
-+    pic_arrays_free(s);
-+    s->ps.sps = NULL;
-+    s->ps.vps = NULL;
-+
-+    if (sps == NULL)
-+        return 0;
-+
-+    if (!is_sps_supported(sps))
-+        return AVERROR_DECODER_NOT_FOUND;
-+
-+    ret = pic_arrays_init(s, sps);
-+    if (ret < 0)
-+        goto fail;
-+
-+    export_stream_params(s->avctx, &s->ps, sps);
-+
-+    // export_stream_params() stored sps->pix_fmt; the caller's choice wins
-+    s->avctx->pix_fmt = pix_fmt;
-+
-+    ff_hevc_rpi_pred_init(&s->hpc, sps->bit_depth);
-+    ff_hevc_rpi_dsp_init (&s->hevcdsp, sps->bit_depth);
-+
-+    // * We don't support cross_component_prediction_enabled_flag but as that
-+    //   must be 0 unless we have 4:4:4 there is no point testing for it as we
-+    //   only deal with sand which is never 4:4:4
-+    //   [support wouldn't be hard]
-+
-+    rpi_hevc_qpu_set_fns(s, sps->bit_depth);
-+
-+    av_freep(&s->sao_pixel_buffer_h[0]);
-+    av_freep(&s->sao_pixel_buffer_v[0]);
-+
-+    if (sps->sao_enabled)
-+    {
-+        const unsigned int c_count = (ctx_cfmt(s) != 0) ? 3 : 1;
-+        unsigned int c_idx;
-+        size_t vsize[3] = {0};
-+        size_t hsize[3] = {0};
-+
-+        for(c_idx = 0; c_idx < c_count; c_idx++) {
-+            int w = sps->width >> ctx_hshift(s, c_idx);
-+            int h = sps->height >> ctx_vshift(s, c_idx);
-+            // ctb height & width are a min of 8 so this must a multiple of 16
-+            // so no point rounding up!
-+            hsize[c_idx] = (w * 2 * sps->ctb_height) << sps->pixel_shift;
-+            vsize[c_idx] = (h * 2 * sps->ctb_width) << sps->pixel_shift;
-+        }
-+
-+        // Allocate as a single lump so we can extend h[1] & v[1] into h[2] & v[2]
-+        // when we have plaited chroma
-+        s->sao_pixel_buffer_h[0] = av_malloc(hsize[0] + hsize[1] + hsize[2]);
-+        s->sao_pixel_buffer_v[0] = av_malloc(vsize[0] + vsize[1] + vsize[2]);
-+        if (s->sao_pixel_buffer_h[0] == NULL || s->sao_pixel_buffer_v[0] == NULL) {
-+            // Do not derive the chroma pointers from a NULL base; release
-+            // whichever half did get allocated and report OOM
-+            av_freep(&s->sao_pixel_buffer_h[0]);
-+            av_freep(&s->sao_pixel_buffer_v[0]);
-+            ret = AVERROR(ENOMEM);
-+            goto fail;
-+        }
-+        s->sao_pixel_buffer_h[1] = s->sao_pixel_buffer_h[0] + hsize[0];
-+        s->sao_pixel_buffer_h[2] = s->sao_pixel_buffer_h[1] + hsize[1];
-+        s->sao_pixel_buffer_v[1] = s->sao_pixel_buffer_v[0] + vsize[0];
-+        s->sao_pixel_buffer_v[2] = s->sao_pixel_buffer_v[1] + vsize[1];
-+    }
-+
-+    s->ps.sps = sps;
-+    s->ps.vps = (HEVCRpiVPS*) s->ps.vps_list[s->ps.sps->vps_id]->data;
-+
-+    return 0;
-+
-+fail:
-+    pic_arrays_free(s);
-+    s->ps.sps = NULL;
-+    return ret;
-+}
-+
-+/* Return 1 when a chroma QP offset lies in the permitted range [-12, 12]. */
-+static inline int qp_offset_valid(const int qp_offset)
-+{
-+    return !(qp_offset < -12 || qp_offset > 12);
-+}
-+
-+/*
-+ * Parse one slice segment header from s->HEVClc->gb into s->sh.
-+ *
-+ * Besides filling the slice header this activates the PPS (and, when it
-+ * changes, the SPS via set_sps()/get_format()), updates s->poc / s->pocTid0,
-+ * the slice index and the entry point table, and derives slice_qp.
-+ *
-+ * @return 0 on success (s->slice_initialized is then set to 1), a negative
-+ *         AVERROR code on invalid data or allocation failure
-+ */
-+static int hls_slice_header(HEVCRpiContext * const s)
-+{
-+    GetBitContext * const gb = &s->HEVClc->gb;
-+    RpiSliceHeader * const sh = &s->sh;
-+    int i, ret;
-+
-+    // Coded parameters
-+    sh->first_slice_in_pic_flag = get_bits1(gb);
-+    if ((IS_IDR(s) || IS_BLA(s)) && sh->first_slice_in_pic_flag) {
-+        s->seq_decode = (s->seq_decode + 1) & 0xff;
-+        s->max_ra     = INT_MAX;
-+        if (IS_IDR(s))
-+            ff_hevc_rpi_clear_refs(s);
-+    }
-+    sh->no_output_of_prior_pics_flag = 0;
-+    if (IS_IRAP(s))
-+        sh->no_output_of_prior_pics_flag = get_bits1(gb);
-+
-+    sh->pps_id = get_ue_golomb_long(gb);
-+    if (sh->pps_id >= HEVC_MAX_PPS_COUNT || !s->ps.pps_list[sh->pps_id]) {
-+        av_log(s->avctx, AV_LOG_ERROR, "PPS id out of range: %d\n", sh->pps_id);
-+        return AVERROR_INVALIDDATA;
-+    }
-+    if (!sh->first_slice_in_pic_flag &&
-+        s->ps.pps != (HEVCRpiPPS*)s->ps.pps_list[sh->pps_id]->data) {
-+        av_log(s->avctx, AV_LOG_ERROR, "PPS changed between slices.\n");
-+        return AVERROR_INVALIDDATA;
-+    }
-+    s->ps.pps = (HEVCRpiPPS*)s->ps.pps_list[sh->pps_id]->data;
-+    if (s->nal_unit_type == HEVC_NAL_CRA_NUT && s->last_eos == 1)
-+        sh->no_output_of_prior_pics_flag = 1;
-+
-+    // SPS referenced by the PPS differs from the active one: switch over
-+    if (s->ps.sps != (HEVCRpiSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data) {
-+        const HEVCRpiSPS *sps = (HEVCRpiSPS*)s->ps.sps_list[s->ps.pps->sps_id]->data;
-+        const HEVCRpiSPS *last_sps = s->ps.sps;
-+        enum AVPixelFormat pix_fmt;
-+
-+        if (last_sps && IS_IRAP(s) && s->nal_unit_type != HEVC_NAL_CRA_NUT) {
-+            if (sps->width != last_sps->width || sps->height != last_sps->height ||
-+                sps->temporal_layer[sps->max_sub_layers - 1].max_dec_pic_buffering !=
-+                last_sps->temporal_layer[last_sps->max_sub_layers - 1].max_dec_pic_buffering)
-+                sh->no_output_of_prior_pics_flag = 0;
-+        }
-+        ff_hevc_rpi_clear_refs(s);
-+
-+        ret = set_sps(s, sps, sps->pix_fmt);
-+        if (ret < 0)
-+            return ret;
-+
-+        pix_fmt = get_format(s, sps);
-+        if (pix_fmt < 0)
-+            return pix_fmt;
-+
-+//        ret = set_sps(s, sps, pix_fmt);
-+//        if (ret < 0)
-+//            return ret;
-+
-+        s->avctx->pix_fmt = pix_fmt;
-+
-+        s->seq_decode = (s->seq_decode + 1) & 0xff;
-+        s->max_ra     = INT_MAX;
-+    }
-+
-+    sh->dependent_slice_segment_flag = 0;
-+    if (!sh->first_slice_in_pic_flag) {
-+        int slice_address_length;
-+
-+        if (s->ps.pps->dependent_slice_segments_enabled_flag)
-+            sh->dependent_slice_segment_flag = get_bits1(gb);
-+
-+        slice_address_length = av_ceil_log2(s->ps.sps->ctb_size);
-+        sh->slice_segment_addr = get_bitsz(gb, slice_address_length);
-+        if (sh->slice_segment_addr >= s->ps.sps->ctb_size) {
-+            av_log(s->avctx, AV_LOG_ERROR,
-+                   "Invalid slice segment address: %u.\n",
-+                   sh->slice_segment_addr);
-+            return AVERROR_INVALIDDATA;
-+        }
-+
-+        if (!sh->dependent_slice_segment_flag) {
-+            sh->slice_addr = sh->slice_segment_addr;
-+            s->slice_idx++;
-+        }
-+    } else {
-+        sh->slice_segment_addr = sh->slice_addr = 0;
-+        s->slice_idx           = 0;
-+        s->slice_initialized   = 0;
-+    }
-+
-+    if (!sh->dependent_slice_segment_flag) {
-+        s->slice_initialized = 0;
-+
-+        for (i = 0; i < s->ps.pps->num_extra_slice_header_bits; i++)
-+            skip_bits(gb, 1);  // slice_reserved_undetermined_flag[]
-+
-+        sh->slice_type = get_ue_golomb_long(gb);
-+        if (!(sh->slice_type == HEVC_SLICE_I ||
-+              sh->slice_type == HEVC_SLICE_P ||
-+              sh->slice_type == HEVC_SLICE_B)) {
-+            av_log(s->avctx, AV_LOG_ERROR, "Unknown slice type: %d.\n",
-+                   sh->slice_type);
-+            return AVERROR_INVALIDDATA;
-+        }
-+        if (IS_IRAP(s) && sh->slice_type != HEVC_SLICE_I) {
-+            av_log(s->avctx, AV_LOG_ERROR, "Inter slices in an IRAP frame.\n");
-+            return AVERROR_INVALIDDATA;
-+        }
-+
-+        // when flag is not present, picture is inferred to be output
-+        sh->pic_output_flag = 1;
-+        if (s->ps.pps->output_flag_present_flag)
-+            sh->pic_output_flag = get_bits1(gb);
-+
-+        if (s->ps.sps->separate_colour_plane_flag)
-+            sh->colour_plane_id = get_bits(gb, 2);
-+
-+        if (!IS_IDR(s)) {
-+            int poc, pos;
-+
-+            sh->pic_order_cnt_lsb = get_bits(gb, s->ps.sps->log2_max_poc_lsb);
-+            poc = ff_hevc_rpi_compute_poc(s->ps.sps, s->pocTid0, sh->pic_order_cnt_lsb, s->nal_unit_type);
-+            if (!sh->first_slice_in_pic_flag && poc != s->poc) {
-+                av_log(s->avctx, AV_LOG_WARNING,
-+                       "Ignoring POC change between slices: %d -> %d\n", s->poc, poc);
-+                if (s->avctx->err_recognition & AV_EF_EXPLODE)
-+                    return AVERROR_INVALIDDATA;
-+                poc = s->poc;
-+            }
-+            s->poc = poc;
-+
-+            sh->short_term_ref_pic_set_sps_flag = get_bits1(gb);
-+            pos = get_bits_left(gb);
-+            if (!sh->short_term_ref_pic_set_sps_flag) {
-+                ret = ff_hevc_rpi_decode_short_term_rps(gb, s->avctx, &sh->slice_rps, s->ps.sps, 1);
-+                if (ret < 0)
-+                    return ret;
-+
-+                sh->short_term_rps = &sh->slice_rps;
-+            } else {
-+                int numbits, rps_idx;
-+
-+                if (!s->ps.sps->nb_st_rps) {
-+                    av_log(s->avctx, AV_LOG_ERROR, "No ref lists in the SPS.\n");
-+                    return AVERROR_INVALIDDATA;
-+                }
-+
-+                numbits = av_ceil_log2(s->ps.sps->nb_st_rps);
-+                rps_idx = numbits > 0 ? get_bits(gb, numbits) : 0;
-+                sh->short_term_rps = &s->ps.sps->st_rps[rps_idx];
-+            }
-+            sh->short_term_ref_pic_set_size = pos - get_bits_left(gb);
-+
-+            pos = get_bits_left(gb);
-+            ret = decode_lt_rps(s, &sh->long_term_rps, gb);
-+            if (ret < 0) {
-+                // Tolerated unless the user asked for strict error handling
-+                av_log(s->avctx, AV_LOG_WARNING, "Invalid long term RPS.\n");
-+                if (s->avctx->err_recognition & AV_EF_EXPLODE)
-+                    return AVERROR_INVALIDDATA;
-+            }
-+            sh->long_term_ref_pic_set_size = pos - get_bits_left(gb);
-+
-+            if (s->ps.sps->sps_temporal_mvp_enabled_flag)
-+                sh->slice_temporal_mvp_enabled_flag = get_bits1(gb);
-+            else
-+                sh->slice_temporal_mvp_enabled_flag = 0;
-+        } else {
-+            s->sh.short_term_rps = NULL;
-+            s->poc               = 0;
-+        }
-+
-+        /* 8.3.1 */
-+        if (sh->first_slice_in_pic_flag && s->temporal_id == 0 &&
-+            s->nal_unit_type != HEVC_NAL_TRAIL_N &&
-+            s->nal_unit_type != HEVC_NAL_TSA_N   &&
-+            s->nal_unit_type != HEVC_NAL_STSA_N  &&
-+            s->nal_unit_type != HEVC_NAL_RADL_N  &&
-+            s->nal_unit_type != HEVC_NAL_RADL_R  &&
-+            s->nal_unit_type != HEVC_NAL_RASL_N  &&
-+            s->nal_unit_type != HEVC_NAL_RASL_R)
-+            s->pocTid0 = s->poc;
-+
-+        if (s->ps.sps->sao_enabled) {
-+            sh->slice_sample_adaptive_offset_flag[0] = get_bits1(gb);
-+            if (ctx_cfmt(s) != 0) {
-+                // One coded flag covers both chroma planes
-+                sh->slice_sample_adaptive_offset_flag[1] =
-+                sh->slice_sample_adaptive_offset_flag[2] = get_bits1(gb);
-+            }
-+        } else {
-+            sh->slice_sample_adaptive_offset_flag[0] = 0;
-+            sh->slice_sample_adaptive_offset_flag[1] = 0;
-+            sh->slice_sample_adaptive_offset_flag[2] = 0;
-+        }
-+
-+        sh->nb_refs[L0] = sh->nb_refs[L1] = 0;
-+        if (sh->slice_type == HEVC_SLICE_P || sh->slice_type == HEVC_SLICE_B) {
-+            int nb_refs;
-+
-+            sh->nb_refs[L0] = s->ps.pps->num_ref_idx_l0_default_active;
-+            if (sh->slice_type == HEVC_SLICE_B)
-+                sh->nb_refs[L1] = s->ps.pps->num_ref_idx_l1_default_active;
-+
-+            if (get_bits1(gb)) { // num_ref_idx_active_override_flag
-+                sh->nb_refs[L0] = get_ue_golomb_long(gb) + 1;
-+                if (sh->slice_type == HEVC_SLICE_B)
-+                    sh->nb_refs[L1] = get_ue_golomb_long(gb) + 1;
-+            }
-+            if (sh->nb_refs[L0] > HEVC_MAX_REFS || sh->nb_refs[L1] > HEVC_MAX_REFS) {
-+                av_log(s->avctx, AV_LOG_ERROR, "Too many refs: %d/%d.\n",
-+                       sh->nb_refs[L0], sh->nb_refs[L1]);
-+                return AVERROR_INVALIDDATA;
-+            }
-+
-+            sh->rpl_modification_flag[0] = 0;
-+            sh->rpl_modification_flag[1] = 0;
-+            nb_refs = ff_hevc_rpi_frame_nb_refs(s);
-+            if (!nb_refs) {
-+                av_log(s->avctx, AV_LOG_ERROR, "Zero refs for a frame with P or B slices.\n");
-+                return AVERROR_INVALIDDATA;
-+            }
-+
-+            if (s->ps.pps->lists_modification_present_flag && nb_refs > 1) {
-+                sh->rpl_modification_flag[0] = get_bits1(gb);
-+                if (sh->rpl_modification_flag[0]) {
-+                    for (i = 0; i < sh->nb_refs[L0]; i++)
-+                        sh->list_entry_lx[0][i] = get_bits(gb, av_ceil_log2(nb_refs));
-+                }
-+
-+                if (sh->slice_type == HEVC_SLICE_B) {
-+                    sh->rpl_modification_flag[1] = get_bits1(gb);
-+                    if (sh->rpl_modification_flag[1] == 1)
-+                        for (i = 0; i < sh->nb_refs[L1]; i++)
-+                            sh->list_entry_lx[1][i] = get_bits(gb, av_ceil_log2(nb_refs));
-+                }
-+            }
-+
-+            if (sh->slice_type == HEVC_SLICE_B)
-+                sh->mvd_l1_zero_flag = get_bits1(gb);
-+
-+            if (s->ps.pps->cabac_init_present_flag)
-+                sh->cabac_init_flag = get_bits1(gb);
-+            else
-+                sh->cabac_init_flag = 0;
-+
-+            sh->collocated_ref_idx = 0;
-+            if (sh->slice_temporal_mvp_enabled_flag) {
-+                sh->collocated_list = L0;
-+                if (sh->slice_type == HEVC_SLICE_B)
-+                    sh->collocated_list = !get_bits1(gb);
-+
-+                if (sh->nb_refs[sh->collocated_list] > 1) {
-+                    sh->collocated_ref_idx = get_ue_golomb_long(gb);
-+                    if (sh->collocated_ref_idx >= sh->nb_refs[sh->collocated_list]) {
-+                        av_log(s->avctx, AV_LOG_ERROR,
-+                               "Invalid collocated_ref_idx: %d.\n",
-+                               sh->collocated_ref_idx);
-+                        return AVERROR_INVALIDDATA;
-+                    }
-+                }
-+            }
-+
-+            if ((s->ps.pps->weighted_pred_flag   && sh->slice_type == HEVC_SLICE_P) ||
-+                (s->ps.pps->weighted_bipred_flag && sh->slice_type == HEVC_SLICE_B))
-+            {
-+                if ((ret = pred_weight_table(s, gb)) != 0)
-+                    return ret;
-+            }
-+            else
-+            {
-+                // Give us unit weights
-+                default_pred_weight_table(s);
-+            }
-+
-+            sh->max_num_merge_cand = 5 - get_ue_golomb_long(gb);
-+            if (sh->max_num_merge_cand < 1 || sh->max_num_merge_cand > 5) {
-+                av_log(s->avctx, AV_LOG_ERROR,
-+                       "Invalid number of merging MVP candidates: %d.\n",
-+                       sh->max_num_merge_cand);
-+                return AVERROR_INVALIDDATA;
-+            }
-+        }
-+
-+        sh->slice_qp_delta = get_se_golomb(gb);
-+
-+        if (s->ps.pps->pic_slice_level_chroma_qp_offsets_present_flag) {
-+            sh->slice_cb_qp_offset = get_se_golomb(gb);
-+            sh->slice_cr_qp_offset = get_se_golomb(gb);
-+            // Both the slice offsets and their sum with the PPS offsets must be in range
-+            if (!qp_offset_valid(sh->slice_cb_qp_offset) ||
-+                !qp_offset_valid(s->ps.pps->cb_qp_offset + sh->slice_cb_qp_offset) ||
-+                !qp_offset_valid(sh->slice_cr_qp_offset) ||
-+                !qp_offset_valid(s->ps.pps->cr_qp_offset + sh->slice_cr_qp_offset))
-+            {
-+                // Arguments follow the labels: PPS cb/cr first, then slice cb/cr
-+                av_log(s->avctx, AV_LOG_ERROR, "Bad chroma offset (pps:%d/%d; slice=%d/%d)\n",
-+                       s->ps.pps->cb_qp_offset, s->ps.pps->cr_qp_offset,
-+                       sh->slice_cb_qp_offset, sh->slice_cr_qp_offset);
-+                return AVERROR_INVALIDDATA;
-+            }
-+        } else
-+        {
-+            sh->slice_cb_qp_offset = 0;
-+            sh->slice_cr_qp_offset = 0;
-+        }
-+
-+        if (s->ps.pps->chroma_qp_offset_list_enabled_flag)
-+            sh->cu_chroma_qp_offset_enabled_flag = get_bits1(gb);
-+        else
-+            sh->cu_chroma_qp_offset_enabled_flag = 0;
-+
-+        if (s->ps.pps->deblocking_filter_control_present_flag) {
-+            int deblocking_filter_override_flag = 0;
-+
-+            if (s->ps.pps->deblocking_filter_override_enabled_flag)
-+                deblocking_filter_override_flag = get_bits1(gb);
-+
-+            if (deblocking_filter_override_flag) {
-+                sh->disable_deblocking_filter_flag = get_bits1(gb);
-+                if (!sh->disable_deblocking_filter_flag) {
-+                    int beta_offset_div2 = get_se_golomb(gb);
-+                    int tc_offset_div2   = get_se_golomb(gb);
-+                    if (beta_offset_div2 < -6 || beta_offset_div2 > 6 ||
-+                        tc_offset_div2   < -6 || tc_offset_div2   > 6) {
-+                        av_log(s->avctx, AV_LOG_ERROR,
-+                               "Invalid deblock filter offsets: %d, %d\n",
-+                               beta_offset_div2, tc_offset_div2);
-+                        return AVERROR_INVALIDDATA;
-+                    }
-+                    sh->beta_offset = beta_offset_div2 * 2;
-+                    sh->tc_offset   = tc_offset_div2 * 2;
-+                }
-+            } else {
-+                sh->disable_deblocking_filter_flag = s->ps.pps->disable_dbf;
-+                sh->beta_offset                    = s->ps.pps->beta_offset;
-+                sh->tc_offset                      = s->ps.pps->tc_offset;
-+            }
-+        } else {
-+            sh->disable_deblocking_filter_flag = 0;
-+            sh->beta_offset                    = 0;
-+            sh->tc_offset                      = 0;
-+        }
-+
-+        if (s->ps.pps->seq_loop_filter_across_slices_enabled_flag &&
-+            (sh->slice_sample_adaptive_offset_flag[0] ||
-+             sh->slice_sample_adaptive_offset_flag[1] ||
-+             !sh->disable_deblocking_filter_flag)) {
-+            sh->slice_loop_filter_across_slices_enabled_flag = get_bits1(gb);
-+        } else {
-+            sh->slice_loop_filter_across_slices_enabled_flag = s->ps.pps->seq_loop_filter_across_slices_enabled_flag;
-+        }
-+        sh->no_dblk_boundary_flags =
-+            (sh->slice_loop_filter_across_slices_enabled_flag ? 0 :
-+                BOUNDARY_UPPER_SLICE | BOUNDARY_LEFT_SLICE) |
-+            (s->ps.pps->loop_filter_across_tiles_enabled_flag ? 0 :
-+                BOUNDARY_UPPER_TILE | BOUNDARY_LEFT_TILE);
-+
-+
-+    } else if (!s->slice_initialized) {
-+        av_log(s->avctx, AV_LOG_ERROR, "Independent slice segment missing.\n");
-+        return AVERROR_INVALIDDATA;
-+    }
-+
-+    sh->num_entry_point_offsets = 0;
-+    sh->offload_wpp = 0;
-+    sh->offload_tiles = 0;
-+
-+    if (s->ps.pps->tiles_enabled_flag || s->ps.pps->entropy_coding_sync_enabled_flag) {
-+        unsigned num_entry_point_offsets = get_ue_golomb_long(gb);
-+        const int bits_left = get_bits_left(gb);
-+        // It would be possible to bound this tighter but this here is simpler.
-+        // bits_left is signed: test it for overread first, otherwise a negative
-+        // value would be converted to a huge unsigned bound.
-+        if (bits_left < 0 || num_entry_point_offsets > (unsigned)bits_left) {
-+            av_log(s->avctx, AV_LOG_ERROR, "num_entry_point_offsets %u is invalid\n", num_entry_point_offsets);
-+            return AVERROR_INVALIDDATA;
-+        }
-+
-+        sh->num_entry_point_offsets = num_entry_point_offsets;
-+        if (sh->num_entry_point_offsets > 0) {
-+            int offset_len = get_ue_golomb_long(gb) + 1;
-+
-+            if (offset_len < 1 || offset_len > 32) {
-+                sh->num_entry_point_offsets = 0;
-+                av_log(s->avctx, AV_LOG_ERROR, "offset_len %d is invalid\n", offset_len);
-+                return AVERROR_INVALIDDATA;
-+            }
-+
-+            if ((ret = alloc_entry_points(sh, sh->num_entry_point_offsets)) < 0)
-+            {
-+                av_log(s->avctx, AV_LOG_ERROR, "Failed to allocate memory\n");
-+                return ret;
-+            }
-+
-+            for (i = 0; i < sh->num_entry_point_offsets; i++) {
-+                uint32_t val_minus1 = get_bits_long(gb, offset_len);
-+                if (val_minus1 > (1 << 28))
-+                {
-+                    // We can declare offsets of > 2^28 bad without loss of generality
-+                    // Will check actual bounds wrt NAL later, but this keeps
-+                    // the values within bounds we can deal with easily
-+                    av_log(s->avctx, AV_LOG_ERROR, "entry_point_offset_minus1 %u invalid\n", (unsigned)val_minus1);
-+                    return AVERROR_INVALIDDATA;
-+                }
-+                sh->entry_point_offset[i] = val_minus1 + 1; // +1 to get the size
-+            }
-+
-+            // Do we want to offload this
-+            if (s->threads_type != 0)
-+            {
-+                sh->offload_tiles = (!s->ps.pps->tile_wpp_inter_disable || sh->slice_type == HEVC_SLICE_I) &&
-+                    s->ps.pps->num_tile_columns > 1;
-+                // * We only cope with WPP in a single column
-+                //   Probably want to deal with that case as tiles rather than WPP anyway
-+                // ?? Not actually sure that the main code deals with WPP + multi-col correctly
-+                sh->offload_wpp = s->ps.pps->entropy_coding_sync_enabled_flag &&
-+                    s->ps.pps->num_tile_columns == 1;
-+            }
-+        }
-+    }
-+
-+    if (s->ps.pps->slice_header_extension_present_flag) {
-+        unsigned int length = get_ue_golomb_long(gb);
-+        if (length*8LL > get_bits_left(gb)) {
-+            av_log(s->avctx, AV_LOG_ERROR, "too many slice_header_extension_data_bytes\n");
-+            return AVERROR_INVALIDDATA;
-+        }
-+        for (i = 0; i < length; i++)
-+            skip_bits(gb, 8);  // slice_header_extension_data_byte
-+    }
-+
-+    // Inferred parameters
-+    sh->slice_qp = 26U + s->ps.pps->pic_init_qp_minus26 + sh->slice_qp_delta;
-+    if (sh->slice_qp > 51 ||
-+        sh->slice_qp < -s->ps.sps->qp_bd_offset) {
-+        av_log(s->avctx, AV_LOG_ERROR,
-+               "The slice_qp %d is outside the valid range "
-+               "[%d, 51].\n",
-+               sh->slice_qp,
-+               -s->ps.sps->qp_bd_offset);
-+        return AVERROR_INVALIDDATA;
-+    }
-+
-+    if (get_bits_left(gb) < 0) {
-+        av_log(s->avctx, AV_LOG_ERROR,
-+               "Overread slice header by %d bits\n", -get_bits_left(gb));
-+        return AVERROR_INVALIDDATA;
-+    }
-+
-+    s->slice_initialized = 1;
-+    return 0;
-+}
-+
-+/*
-+ * Decode the SAO parameters for the CTB at (rx, ry) into s->sao.
-+ *
-+ * If a merge flag is decoded as set, the parameters are copied from the
-+ * left or upper CTB. Otherwise each colour plane's type, offsets and band
-+ * position / edge class are decoded; plane 2 reuses the type and edge class
-+ * of plane 1.
-+ */
-+static void hls_sao_param(const HEVCRpiContext *s, HEVCRpiLocalContext * const lc, const int rx, const int ry)
-+{
-+    RpiSAOParams * const cur = s->sao + rx + ry * s->ps.sps->ctb_width;
-+    const int n_planes = ctx_cfmt(s) != 0 ? 3 : 1;
-+    int plane, k;
-+
-+    if (s->sh.slice_sample_adaptive_offset_flag[0] ||
-+        s->sh.slice_sample_adaptive_offset_flag[1]) {
-+        // Merge-left is tried first, then merge-up
-+        if ((lc->ctb_avail & AVAIL_L) != 0 &&
-+            ff_hevc_rpi_sao_merge_flag_decode(lc)) {
-+            *cur = cur[-1];
-+            return;
-+        }
-+        if ((lc->ctb_avail & AVAIL_U) != 0 &&
-+            ff_hevc_rpi_sao_merge_flag_decode(lc)) {
-+            *cur = cur[-(int)s->ps.sps->ctb_width];
-+            return;
-+        }
-+    }
-+
-+    for (plane = 0; plane < n_planes; plane++) {
-+        const unsigned int log2_scale = plane == 0 ? s->ps.pps->log2_sao_offset_scale_luma :
-+                                                     s->ps.pps->log2_sao_offset_scale_chroma;
-+        int magnitude[4];
-+        char negative[4] = {0};
-+
-+        if (!s->sh.slice_sample_adaptive_offset_flag[plane]) {
-+            cur->type_idx[plane] = SAO_NOT_APPLIED;
-+            continue;
-+        }
-+
-+        if (plane != 2) {
-+            cur->type_idx[plane] = ff_hevc_rpi_sao_type_idx_decode(lc);
-+        } else {
-+            // Second chroma plane inherits from the first
-+            cur->type_idx[2] = cur->type_idx[1];
-+            cur->eo_class[2] = cur->eo_class[1];
-+        }
-+
-+        // ** Could use BY22 here quite plausibly - this is all bypass stuff
-+        //    though only per CTB so not very timing critical
-+
-+        if (cur->type_idx[plane] == SAO_NOT_APPLIED)
-+            continue;
-+
-+        for (k = 0; k < 4; k++)
-+            magnitude[k] = ff_hevc_rpi_sao_offset_abs_decode(s, lc);
-+
-+        if (cur->type_idx[plane] == SAO_BAND) {
-+            // Signs are only coded for non-zero offsets
-+            for (k = 0; k < 4; k++) {
-+                if (magnitude[k] != 0)
-+                    negative[k] = ff_hevc_rpi_sao_offset_sign_decode(lc);
-+            }
-+            cur->band_position[plane] = ff_hevc_rpi_sao_band_position_decode(lc);
-+        } else if (plane != 2) {
-+            cur->eo_class[plane] = ff_hevc_rpi_sao_eo_class_decode(lc);
-+        }
-+
-+        // Inferred parameters
-+        cur->offset_val[plane][0] = 0;
-+        for (k = 0; k < 4; k++) {
-+            // Edge offsets: last two entries are negative; band offsets: coded sign
-+            const int flip = (cur->type_idx[plane] == SAO_EDGE) ? (k > 1) : negative[k];
-+
-+            cur->offset_val[plane][k + 1] = magnitude[k] << log2_scale;
-+            if (flip)
-+                cur->offset_val[plane][k + 1] = -cur->offset_val[plane][k + 1];
-+        }
-+    }
-+}
-+
-+#if 0
-+// Disabled: decode the cross-component prediction scale for plane idx into
-+// lc->tu.res_scale_val (0 when the coded magnitude is zero). Always returns 0.
-+static int hls_cross_component_pred(HEVCRpiLocalContext * const lc, const int idx) {
-+    const int log2_abs_plus1 = ff_hevc_rpi_log2_res_scale_abs(lc, idx); // 0..4
-+
-+    if (log2_abs_plus1 == 0) {
-+        lc->tu.res_scale_val = 0;
-+    } else {
-+        const int sign_flag = ff_hevc_rpi_res_scale_sign_flag(lc, idx);
-+        lc->tu.res_scale_val = (1 << (log2_abs_plus1 - 1)) *
-+                               (1 - 2 * sign_flag);
-+    }
-+
-+    return 0;
-+}
-+#endif
-+
-+/* Claim the next free intra prediction command slot of a job and return it. */
-+static inline HEVCPredCmd * rpi_new_intra_cmd(HEVCRpiJob * const jb)
-+{
-+    HEVCPredCmd * const slot = jb->intra.cmds + jb->intra.n;
-+
-+    jb->intra.n++;
-+    return slot;
-+}
-+
-+// Compile-time construction of tb_flags, a 16x16 table of AVAIL_* bitmasks
-+// indexed by x + y * 16.
-+// A0 emits a single designated initializer for entry (x, y) from five
-+// booleans (up, left, up-left, up-right, down-left).
-+#define A0(x, y, U, L, UL, UR, DL) \
-+ [(x)+(y)*16] = (((U) ? AVAIL_U : 0) | ((L) ? AVAIL_L : 0) | ((UL) ? AVAIL_UL : 0) | ((UR) ? AVAIL_UR : 0) | ((DL) ? AVAIL_DL : 0))
-+
-+// A1 expands to a 2x2 group of A0 entries. The top-left entry inherits the
-+// caller's U/L/UL, the top-right inherits U and UR, the bottom-left inherits
-+// L and DL, and the bottom-right always has U, L, UL but never UR or DL.
-+#define A1(x, y, U, L, UL, UR, DL) \
-+ A0((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A0((x) + 1, (y) + 0, (U), 1, (U), (UR), 0 ),\
-+ A0((x) + 0, (y) + 1, 1, (L), (L), 1, (DL)), A0((x) + 1, (y) + 1, 1, 1, 1, 0, 0 )
-+
-+// A2..A4 apply the same 2x2 rule recursively to 4x4, 8x8 and 16x16 groups.
-+#define A2(x, y, U, L, UL, UR, DL) \
-+ A1((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A1((x) + 2, (y) + 0, (U), 1, (U), (UR), 0 ),\
-+ A1((x) + 0, (y) + 2, 1, (L), (L), 1, (DL)), A1((x) + 2, (y) + 2, 1, 1, 1, 0, 0 )
-+
-+#define A3(x, y, U, L, UL, UR, DL) \
-+ A2((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A2((x) + 4, (y) + 0, (U), 1, (U), (UR), 0 ),\
-+ A2((x) + 0, (y) + 4, 1, (L), (L), 1, (DL)), A2((x) + 4, (y) + 4, 1, 1, 1, 0, 0 )
-+
-+#define A4(x, y, U, L, UL, UR, DL) \
-+ A3((x) + 0, (y) + 0, (U), (L), (UL), (U), (L) ), A3((x) + 8, (y) + 0, (U), 1, (U), (UR), 0 ),\
-+ A3((x) + 0, (y) + 8, 1, (L), (L), 1, (DL)), A3((x) + 8, (y) + 8, 1, 1, 1, 0, 0 )
-+
-+// Whole table, built with every neighbour outside the 16x16 area marked
-+// unavailable; callers OR in the real outside availability themselves.
-+static const uint8_t tb_flags[16 * 16] = {A4(0, 0, 0, 0, 0, 0, 0)};
-+
-+// Compute the AVAIL_* neighbour availability mask for the w x h block at
-+// picture position (x, y), combining the per-CTB availability in
-+// lc->ctb_avail with the in-CTB pattern from tb_flags.
-+unsigned int ff_hevc_rpi_tb_avail_flags(
-+ const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc,
-+ const unsigned int x, const unsigned int y, const unsigned int w, const unsigned int h)
-+{
-+ const unsigned int ctb_mask = ~0U << s->ps.sps->log2_ctb_size;
-+ // Position of the block relative to the origin of its CTB
-+ const unsigned int tb_x = x & ~ctb_mask;
-+ const unsigned int tb_y = y & ~ctb_mask;
-+ const unsigned int ctb_avail = lc->ctb_avail;
-+
-+ // Table entry for the block's top-left corner; the table is addressed in
-+ // units of 4 samples with a row stride of 16 entries
-+ const uint8_t * const tb_f = tb_flags + (tb_x >> 2) + (tb_y >> 2) * 16;
-+
-+ // L, U and UL are available if either the CTB neighbours or the in-CTB
-+ // pattern say so
-+ unsigned int f = (ctb_avail | tb_f[0]) & (AVAIL_L | AVAIL_U | AVAIL_UL);
-+
-+ // This deals with both the U & L edges
-+ if ((tb_x | tb_y) != 0 && (~f & (AVAIL_L | AVAIL_U)) == 0)
-+ f |= AVAIL_UL;
-+
-+ // UR: when the block ends before end_of_ctb_x, take it from the CTB's U bit
-+ // (shifted into the UR position) on the CTB's top row, otherwise from the
-+ // table entry of the block's rightmost column. When the block reaches
-+ // end_of_ctb_x only the CTB's own UR bit can apply, and only on the top row.
-+ if (x + w < lc->end_of_ctb_x)
-+ f |= (tb_y == 0 ? ctb_avail >> (AVAIL_S_U - AVAIL_S_UR) : tb_f[(w - 1) >> 2]) & AVAIL_UR;
-+ else if (tb_y == 0)
-+ f |= (ctb_avail & AVAIL_UR);
-+#if AVAIL_S_U - AVAIL_S_UR < 0
-+#error Shift problem
-+#endif
-+
-+ // Never any D if Y beyond eoctb
-+ // DL: on the CTB's left column take it from the CTB's L bit (shifted into
-+ // the DL position), otherwise from the table entry of the block's bottom row
-+ if (y + h < lc->end_of_ctb_y)
-+ f |= (tb_x == 0 ? ctb_avail << (AVAIL_S_DL - AVAIL_S_L) : tb_f[((h - 1) >> 2) * 16]) & AVAIL_DL;
-+#if AVAIL_S_DL - AVAIL_S_L < 0
-+#error Shift problem
-+#endif
-+
-+// printf("(%#x, %#x): %dx%d ca=%02x, ful=%02x, ftr=%02x, fdl=%02x, eox=%#x, eoy=%#x\n", x, y, w, h,
-+// lc->ctb_avail, tb_f[0], tb_f[(w - 1) >> 2], tb_f[((h - 1) >> 2) * 16],
-+// lc->end_of_ctb_x, lc->end_of_ctb_y);
-+
-+ return f;
-+}
-+
-+#undef A0
-+#undef A1
-+#undef A2
-+#undef A3
-+#undef A4
-+
-+/*
-+ * Queue an intra prediction command on the current job for the block at
-+ * (x0, y0) of size 1 << log2_trafo_size. Nothing is queued for c_idx > 1:
-+ * with sand formats U & V are done together on the U call.
-+ */
-+static void do_intra_pred(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int log2_trafo_size, int x0, int y0, int c_idx,
-+                          unsigned int avail)
-+{
-+    HEVCPredCmd *cmd;
-+
-+    // If rpi_enabled then sand - U & V done on U call
-+    if (c_idx > 1)
-+        return;
-+
-+    cmd = rpi_new_intra_cmd(lc->jb0);
-+    cmd->type        = RPI_PRED_INTRA + c_idx;
-+    cmd->size        = log2_trafo_size;
-+    cmd->avail       = avail;
-+    cmd->i_pred.x    = x0;
-+    cmd->i_pred.y    = y0;
-+    cmd->i_pred.mode = (c_idx == 0) ? lc->tu.intra_pred_mode : lc->tu.intra_pred_mode_c;
-+
-+//    printf("(%#x, %#x) c_idx=%d, s=%d, a=%#x\n", x0, y0, c_idx, 1 << log2_trafo_size, avail);
-+}
-+
-+// Bit positions of the chroma coded-block flags when packed into one word
-+#define CBF_CB0_S 0
-+#define CBF_CB1_S 1 // CB1 must be CB0 + 1
-+#define CBF_CR0_S 2
-+#define CBF_CR1_S 3
-+
-+// Single-bit masks corresponding to the positions above
-+#define CBF_CB0 (1 << CBF_CB0_S)
-+#define CBF_CR0 (1 << CBF_CR0_S)
-+#define CBF_CB1 (1 << CBF_CB1_S)
-+#define CBF_CR1 (1 << CBF_CR1_S)
-+
-+// Process one transform unit: queue the intra prediction commands (for intra
-+// CUs), decode cu_qp_delta / chroma QP offset when still wanted, and decode
-+// the luma and chroma residuals flagged by cbf_luma / cbf_chroma.
-+// * Only good for chroma_idx == 1
-+//
-+// cbf_chroma is a mask of CBF_CB0 / CBF_CR0 bits. Chroma is handled at the
-+// 8-aligned position (x0 & ~7, y0 & ~7) and, for 4x4 luma blocks, only on the
-+// last sub-block (blk_idx == 3).
-+// Returns 0 on success or AVERROR_INVALIDDATA if cu_qp_delta is out of range.
-+static int hls_transform_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
-+                              const unsigned int x0, const unsigned int y0,
-+                              const unsigned int log2_cb_size, const unsigned int log2_trafo_size,
-+                              const unsigned int blk_idx, const int cbf_luma,
-+                              const unsigned int cbf_chroma)
-+{
-+    const unsigned int log2_trafo_size_c = FFMAX(2, log2_trafo_size - 1);
-+    const unsigned int x0_c = x0 & ~7;
-+    const unsigned int y0_c = y0 & ~7;
-+
-+    enum ScanType scan_idx   = SCAN_DIAG;
-+    enum ScanType scan_idx_c = SCAN_DIAG;
-+
-+    if (lc->cu.pred_mode == MODE_INTRA)
-+    {
-+        const unsigned int trafo_size = 1 << log2_trafo_size;
-+        const unsigned int avail = ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, trafo_size, trafo_size);
-+
-+        do_intra_pred(s, lc, log2_trafo_size, x0, y0, 0, avail);
-+
-+        if (log2_trafo_size > 2)
-+            do_intra_pred(s, lc, log2_trafo_size_c, x0_c, y0_c, 1, avail);
-+        else if (blk_idx == 3)
-+            // Chroma covers the whole 8x8 luma area, so recompute availability for it
-+            do_intra_pred(s, lc, log2_trafo_size_c, x0_c, y0_c, 1,
-+                          ff_hevc_rpi_tb_avail_flags(s, lc, x0_c, y0_c, 8, 8));
-+
-+        // Small intra blocks pick their coefficient scan from the prediction mode
-+        if (log2_trafo_size < 4) {
-+            if (lc->tu.intra_pred_mode >= 6 &&
-+                lc->tu.intra_pred_mode <= 14) {
-+                scan_idx = SCAN_VERT;
-+            } else if (lc->tu.intra_pred_mode >= 22 &&
-+                       lc->tu.intra_pred_mode <= 30) {
-+                scan_idx = SCAN_HORIZ;
-+            }
-+
-+            if (lc->tu.intra_pred_mode_c >= 6 &&
-+                lc->tu.intra_pred_mode_c <= 14) {
-+                scan_idx_c = SCAN_VERT;
-+            } else if (lc->tu.intra_pred_mode_c >= 22 &&
-+                       lc->tu.intra_pred_mode_c <= 30) {
-+                scan_idx_c = SCAN_HORIZ;
-+            }
-+        }
-+    }
-+
-+    // Nothing coded for this TU
-+    if (!cbf_luma && cbf_chroma == 0)
-+        return 0;
-+
-+    if (lc->tu.is_cu_qp_delta_wanted)
-+    {
-+        const int qp_delta = ff_hevc_rpi_cu_qp_delta(lc);
-+        const unsigned int cb_mask = ~0U << log2_cb_size;
-+
-+        if (qp_delta < -(26 + (s->ps.sps->qp_bd_offset >> 1)) ||
-+            qp_delta >  (25 + (s->ps.sps->qp_bd_offset >> 1)))
-+        {
-+            av_log(s->avctx, AV_LOG_ERROR,
-+                   "The cu_qp_delta %d is outside the valid range "
-+                   "[%d, %d].\n",
-+                   qp_delta,
-+                   -(26 + (s->ps.sps->qp_bd_offset >> 1)),
-+                    (25 + (s->ps.sps->qp_bd_offset >> 1)));
-+            return AVERROR_INVALIDDATA;
-+        }
-+
-+        lc->tu.is_cu_qp_delta_wanted = 0;
-+        lc->tu.cu_qp_delta = qp_delta;
-+        ff_hevc_rpi_set_qPy(s, lc, x0 & cb_mask, y0 & cb_mask);
-+    }
-+
-+    // * Not main profile & untested due to no conform streams
-+    if (lc->tu.cu_chroma_qp_offset_wanted && cbf_chroma &&
-+        !lc->cu.cu_transquant_bypass_flag) {
-+        int cu_chroma_qp_offset_flag = ff_hevc_rpi_cu_chroma_qp_offset_flag(lc);
-+        if (cu_chroma_qp_offset_flag) {
-+            int cu_chroma_qp_offset_idx = 0;
-+            if (s->ps.pps->chroma_qp_offset_list_len_minus1 > 0) {
-+                cu_chroma_qp_offset_idx = ff_hevc_rpi_cu_chroma_qp_offset_idx(s, lc);
-+            }
-+            lc->tu.qp_divmod6[1] += s->ps.pps->cb_qp_offset_list[cu_chroma_qp_offset_idx];
-+            lc->tu.qp_divmod6[2] += s->ps.pps->cr_qp_offset_list[cu_chroma_qp_offset_idx];
-+        }
-+        lc->tu.cu_chroma_qp_offset_wanted = 0;
-+    }
-+
-+    if (cbf_luma)
-+        ff_hevc_rpi_hls_residual_coding(s, lc, x0, y0, log2_trafo_size, scan_idx, 0);
-+
-+    if (log2_trafo_size > 2 || blk_idx == 3)
-+    {
-+        if ((cbf_chroma & CBF_CB0) != 0)
-+            ff_hevc_rpi_hls_residual_coding(s, lc, x0_c, y0_c,
-+                                            log2_trafo_size_c, scan_idx_c, 1);
-+        if ((cbf_chroma & CBF_CR0) != 0)
-+            ff_hevc_rpi_hls_residual_coding(s, lc, x0_c, y0_c,
-+                                            log2_trafo_size_c, scan_idx_c, 2);
-+    }
-+
-+    return 0;
-+}
-+
-+/*
-+ * Set the s->is_pcm bits covering the coding block at (x0, y0); the bitmap
-+ * is addressed in 8-sample units with a row stride of sps->pcm_width.
-+ */
-+static inline void set_deblocking_bypass(const HEVCRpiContext * const s, const int x0, const int y0, const int log2_cb_size)
-+{
-+    set_bits(s->is_pcm + (y0 >> 3) * s->ps.sps->pcm_width,
-+             x0 >> 3,
-+             s->ps.sps->pcm_width,
-+             log2_cb_size - 3);
-+}
-+
-+
-+// Recursively decode a transform tree node of size 1 << log2_trafo_size at
-+// (x0, y0). The node is either split into four children (SUBDIVIDE) or
-+// handed to hls_transform_unit() as a leaf. cbf_c0 carries the chroma CBF
-+// bits of the parent node. Returns 0 on success or a negative error code
-+// propagated from a child / the transform unit.
-+static int hls_transform_tree(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
-+ const unsigned int x0, const unsigned int y0,
-+ const unsigned int log2_trafo_size,
-+ const unsigned int trafo_depth, const unsigned int blk_idx,
-+ const unsigned int cbf_c0)
-+{
-+ // When trafo_size == 2 hls_transform_unit uses c0 so put in c1
-+ unsigned int cbf_c1 = cbf_c0;
-+ int split_transform_flag;
-+ int ret;
-+
-+ // Pick the intra prediction modes for this node: per sub-block when the CU
-+ // is intra-split (chroma per sub-block only for chroma format 3),
-+ // otherwise the CU's single mode
-+ if (lc->cu.intra_split_flag) {
-+ if (trafo_depth == 1) {
-+ lc->tu.intra_pred_mode = lc->pu.intra_pred_mode[blk_idx];
-+ if (ctx_cfmt(s) == 3) {
-+ lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[blk_idx];
-+ lc->tu.chroma_mode_c = lc->pu.chroma_mode_c[blk_idx];
-+ } else {
-+ lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[0];
-+ lc->tu.chroma_mode_c = lc->pu.chroma_mode_c[0];
-+ }
-+ }
-+ } else {
-+ lc->tu.intra_pred_mode = lc->pu.intra_pred_mode[0];
-+ lc->tu.intra_pred_mode_c = lc->pu.intra_pred_mode_c[0];
-+ lc->tu.chroma_mode_c = lc->pu.chroma_mode_c[0];
-+ }
-+
-+ // The split flag is read from the bitstream only when both outcomes are
-+ // permitted; otherwise it is inferred from the size limits and CU mode
-+ if (log2_trafo_size <= s->ps.sps->log2_max_trafo_size &&
-+ log2_trafo_size > s->ps.sps->log2_min_tb_size &&
-+ trafo_depth < lc->cu.max_trafo_depth &&
-+ !(lc->cu.intra_split_flag && trafo_depth == 0))
-+ {
-+ split_transform_flag = ff_hevc_rpi_split_transform_flag_decode(lc, log2_trafo_size);
-+ } else {
-+ int inter_split = s->ps.sps->max_transform_hierarchy_depth_inter == 0 &&
-+ lc->cu.pred_mode == MODE_INTER &&
-+ lc->cu.part_mode != PART_2Nx2N &&
-+ trafo_depth == 0;
-+
-+ split_transform_flag = log2_trafo_size > s->ps.sps->log2_max_trafo_size ||
-+ (lc->cu.intra_split_flag && trafo_depth == 0) ||
-+ inter_split;
-+ }
-+
-+ // Chroma CBFs are decoded only for planes whose parent CBF bit was set;
-+ // for 4x4 nodes (outside chroma format 3) the parent's bits are kept as-is
-+ if (log2_trafo_size > 2 || ctx_cfmt(s) == 3)
-+ {
-+ const int wants_c1 = ctx_cfmt(s) == 2 && (!split_transform_flag || log2_trafo_size == 3);
-+ cbf_c1 = 0;
-+
-+ if ((cbf_c0 & CBF_CB0) != 0)
-+ {
-+ cbf_c1 = ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CB0_S;
-+ if (wants_c1)
-+ cbf_c1 |= ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CB1_S;
-+ }
-+
-+ if ((cbf_c0 & CBF_CR0) != 0)
-+ {
-+ cbf_c1 |= ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CR0_S;
-+ if (wants_c1)
-+ cbf_c1 |= ff_hevc_rpi_cbf_cb_cr_decode(lc, trafo_depth) << CBF_CR1_S;
-+ }
-+ }
-+
-+ if (split_transform_flag) {
-+ const int trafo_size_split = 1 << (log2_trafo_size - 1);
-+ const int x1 = x0 + trafo_size_split;
-+ const int y1 = y0 + trafo_size_split;
-+
-+// Recurse into one quadrant, returning immediately on error
-+#define SUBDIVIDE(x, y, idx) \
-+do { \
-+ ret = hls_transform_tree(s, lc, x, y, \
-+ log2_trafo_size - 1, trafo_depth + 1, idx, \
-+ cbf_c1); \
-+ if (ret < 0) \
-+ return ret; \
-+} while (0)
-+
-+ SUBDIVIDE(x0, y0, 0);
-+ SUBDIVIDE(x1, y0, 1);
-+ SUBDIVIDE(x0, y1, 2);
-+ SUBDIVIDE(x1, y1, 3);
-+
-+#undef SUBDIVIDE
-+ } else {
-+ // If trafo_size == 2 then we should have cbf_c == 0 here but as we can't have
-+ // trafo_size == 2 with depth == 0 the issue is moot
-+ // cbf_luma is inferred as 1 (no bitstream read, thanks to the || short
-+ // circuit) for a non-intra root node without chroma CBFs
-+ const int cbf_luma = ((lc->cu.pred_mode != MODE_INTRA && trafo_depth == 0 && cbf_c1 == 0) ||
-+ ff_hevc_rpi_cbf_luma_decode(lc, trafo_depth));
-+
-+ ret = hls_transform_unit(s, lc, x0, y0,
-+ log2_trafo_size + trafo_depth, log2_trafo_size,
-+ blk_idx, cbf_luma, cbf_c1);
-+ if (ret < 0)
-+ return ret;
-+
-+ if (!s->sh.disable_deblocking_filter_flag) {
-+ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_trafo_size, cbf_luma);
-+ }
-+ }
-+ return 0;
-+}
-+
-+
-+/*
-+ * Unpack raw PCM samples from `pcm` (`length` as passed to init_get_bits())
-+ * into the current frame for the cb_size x cb_size block at (x0, y0): luma
-+ * first, then chroma at the chroma-subsampled position and size.
-+ * Returns 0 on success or the error from init_get_bits().
-+ */
-+static int pcm_extract(const HEVCRpiContext * const s, const uint8_t * pcm, const int length, const int x0, const int y0, const int cb_size)
-+{
-+    GetBitContext gb;
-+    const int err = init_get_bits(&gb, pcm, length);
-+
-+    if (err < 0)
-+        return err;
-+
-+    // Luma plane
-+    s->hevcdsp.put_pcm(av_rpi_sand_frame_pos_y(s->frame, x0, y0),
-+                       frame_stride1(s->frame, 0),
-+                       cb_size, cb_size,
-+                       &gb, s->ps.sps->pcm.bit_depth);
-+
-+    // Chroma, read from the same bit reader straight after luma
-+    s->hevcdsp.put_pcm_c(av_rpi_sand_frame_pos_c(s->frame,
-+                                                 x0 >> ctx_hshift(s, 1),
-+                                                 y0 >> ctx_vshift(s, 1)),
-+                         s->frame->linesize[1],
-+                         cb_size >> ctx_hshift(s, 1),
-+                         cb_size >> ctx_vshift(s, 1),
-+                         &gb, s->ps.sps->pcm.bit_depth_chroma);
-+
-+    return 0;
-+}
-+
-+
-+// x * 2^(y*2)
-+static inline unsigned int xyexp2(const unsigned int x, const unsigned int y)
-+{
-+ return x << (y * 2);
-+}
-+
-+static int hls_pcm_sample(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0, unsigned int log2_cb_size)
-+{
-+ // Length in bits
-+ const unsigned int length = xyexp2(s->ps.sps->pcm.bit_depth, log2_cb_size) +
-+ xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - ctx_vshift(s, 1)) +
-+ xyexp2(s->ps.sps->pcm.bit_depth_chroma, log2_cb_size - ctx_vshift(s, 2));
-+
-+ const uint8_t * const pcm = ff_hevc_rpi_cabac_skip_bytes(&lc->cc, (length + 7) >> 3);
-+
-+ if (!s->sh.disable_deblocking_filter_flag)
-+ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size, 0);
-+
-+ // Copy coeffs
-+ {
-+ const int blen = (length + 7) >> 3;
-+ // Round allocated bytes up to nearest 32 to avoid alignment confusion
-+ // Allocation is in int16_t s
-+ // As we are only using 1 byte per sample and the coeff buffer allows 2 per
-+ // sample this rounding doesn't affect the total size we need to allocate for
-+ // the coeff buffer
-+ int16_t * const coeffs = rpi_alloc_coeff_buf(lc->jb0, 0, ((blen + 31) & ~31) >> 1);
-+ memcpy(coeffs, pcm, blen);
-+
-+ // Our coeff stash assumes that any partially allocated 64byte lump
-+ // is zeroed so make that true.
-+ {
-+ uint8_t * const eopcm = (uint8_t *)coeffs + blen;
-+ if ((-(intptr_t)eopcm & 63) != 0)
-+ memset(eopcm, 0, -(intptr_t)eopcm & 63);
-+ }
-+
-+ // Add command
-+ {
-+ HEVCPredCmd *const cmd = rpi_new_intra_cmd(lc->jb0);
-+ cmd->type = RPI_PRED_I_PCM;
-+ cmd->size = log2_cb_size;
-+ cmd->i_pcm.src = coeffs;
-+ cmd->i_pcm.x = x0;
-+ cmd->i_pcm.y = y0;
-+ cmd->i_pcm.src_len = length;
-+ }
-+ return 0;
-+ }
-+}
-+
-+
-+static void hevc_await_progress(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const HEVCRpiFrame * const ref,
-+ const MvXY xy, const int y0, const int height)
-+{
-+ if (s->threads_type != 0) {
-+ const int y = FFMAX(0, (MV_Y(xy) >> 2) + y0 + height + 9);
-+
-+ // Progress has to be attached to current job as the actual wait
-+ // is in worker_core which can't use lc
-+ int16_t *const pr = lc->jb0->progress_req + ref->dpb_no;
-+ if (*pr < y) {
-+ *pr = y;
-+ }
-+ }
-+}
-+
-+static void hevc_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
-+ const int x0, const int y0, const int nPbW,
-+ const int nPbH,
-+ HEVCRpiMvField * const mv)
-+{
-+ enum InterPredIdc inter_pred_idc = PRED_L0;
-+ int mvp_flag;
-+ const unsigned int avail = ff_hevc_rpi_tb_avail_flags(s, lc, x0, y0, nPbW, nPbH);
-+
-+ mv->pred_flag = 0;
-+ if (s->sh.slice_type == HEVC_SLICE_B)
-+ inter_pred_idc = ff_hevc_rpi_inter_pred_idc_decode(lc, nPbW, nPbH);
-+
-+ if (inter_pred_idc != PRED_L1) {
-+ MvXY mvd;
-+
-+ if (s->sh.nb_refs[L0])
-+ mv->ref_idx[0]= ff_hevc_rpi_ref_idx_lx_decode(lc, s->sh.nb_refs[L0]);
-+
-+ mv->pred_flag = PF_L0;
-+ mvd = ff_hevc_rpi_hls_mvd_coding(lc);
-+ mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc);
-+ ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, avail,
-+ mv, mvp_flag, 0);
-+ mv->xy[0] = mvxy_add(mv->xy[0], mvd);
-+ }
-+
-+ if (inter_pred_idc != PRED_L0) {
-+ MvXY mvd = 0;
-+
-+ if (s->sh.nb_refs[L1])
-+ mv->ref_idx[1] = ff_hevc_rpi_ref_idx_lx_decode(lc, s->sh.nb_refs[L1]);
-+
-+ if (s->sh.mvd_l1_zero_flag != 1 || inter_pred_idc != PRED_BI)
-+ mvd = ff_hevc_rpi_hls_mvd_coding(lc);
-+
-+ mv->pred_flag += PF_L1;
-+ mvp_flag = ff_hevc_rpi_mvp_lx_flag_decode(lc);
-+ ff_hevc_rpi_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, avail,
-+ mv, mvp_flag, 1);
-+ mv->xy[1] = mvxy_add(mv->xy[1], mvd);
-+ }
-+}
-+
-+
-+static HEVCRpiInterPredQ *
-+rpi_nxt_pred(HEVCRpiInterPredEnv * const ipe, const unsigned int load_val, const uint32_t fn)
-+{
-+ HEVCRpiInterPredQ * yp = NULL;
-+ HEVCRpiInterPredQ * ypt = ipe->q + ipe->curr;
-+ const unsigned int max_fill = ipe->max_fill;
-+ unsigned int load = UINT_MAX;
-+
-+ for (unsigned int i = 0; i != ipe->n_grp; ++i, ++ypt) {
-+ // We will always have enough room between the Qs but if we are
-+ // running critically low due to poor scheduling then use fill size
-+ // rather than load to determine QPU. This has obvious dire
-+ // performance implications but (a) it is better than crashing
-+ // and (b) it should (almost) never happen
-+ const unsigned int tfill = (char *)ypt->qpu_mc_curr - (char *)ypt->qpu_mc_base;
-+ const unsigned int tload = tfill > max_fill ? tfill + 0x1000000 : ypt->load;
-+
-+ if (tload < load)
-+ {
-+ yp = ypt;
-+ load = tload;
-+ }
-+ }
-+
-+ yp->load += load_val;
-+ ipe->used_grp = 1;
-+ yp->qpu_mc_curr->data[-1] = fn; // Link is always last el of previous cmd
-+
-+ return yp;
-+}
-+
-+
-+static void rpi_inter_pred_sync(HEVCRpiInterPredEnv * const ipe)
-+{
-+ for (unsigned int i = 0; i != ipe->n; ++i) {
-+ HEVCRpiInterPredQ * const q = ipe->q + i;
-+ const unsigned int qfill = (char *)q->qpu_mc_curr - (char *)q->qpu_mc_base;
-+
-+ q->qpu_mc_curr->data[-1] = q->code_sync;
-+ q->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(q->qpu_mc_curr->data + 1);
-+ q->load = (qfill >> 7); // Have a mild preference for emptier Qs to balance memory usage
-+ }
-+}
-+
-+// Returns 0 on success
-+// We no longer check for Q fullness as wew have emergncy code in ctu alloc
-+// * However it might be an idea to have some means of spotting that we've used it
-+static int rpi_inter_pred_next_ctu(HEVCRpiInterPredEnv * const ipe)
-+{
-+ if (!ipe->used_grp)
-+ return 0;
-+
-+ if ((ipe->curr += ipe->n_grp) >= ipe->n)
-+ {
-+ ipe->curr = 0;
-+ rpi_inter_pred_sync(ipe);
-+ }
-+ ipe->used = 1;
-+ ipe->used_grp = 0;
-+
-+ return 0;
-+}
-+
-+static void rpi_inter_pred_reset(HEVCRpiInterPredEnv * const ipe)
-+{
-+ unsigned int i;
-+
-+ ipe->curr = 0;
-+ ipe->used = 0;
-+ ipe->used_grp = 0;
-+ for (i = 0; i != ipe->n; ++i) {
-+ HEVCRpiInterPredQ * const q = ipe->q + i;
-+ q->qpu_mc_curr = q->qpu_mc_base;
-+ q->load = 0;
-+ q->last_l0 = NULL;
-+ q->last_l1 = NULL;
-+ }
-+}
-+
-+static void rpi_inter_pred_alloc(HEVCRpiInterPredEnv * const ipe,
-+ const unsigned int n_max, const unsigned int n_grp,
-+ const unsigned int total_size, const unsigned int min_gap)
-+{
-+ memset(ipe, 0, sizeof(*ipe));
-+ av_assert0((ipe->q = av_mallocz(n_max * sizeof(*ipe->q))) != NULL);
-+ ipe->n_grp = n_grp;
-+ ipe->min_gap = min_gap;
-+
-+ gpu_malloc_cached(total_size, &ipe->gptr);
-+}
-+
-+
-+#if RPI_QPU_EMU_Y
-+#define get_mc_address_y(f) ((f)->data[0])
-+#else
-+#define get_mc_address_y(f) get_vc_address_y(f)
-+#endif
-+#if RPI_QPU_EMU_C
-+#define get_mc_address_u(f) ((f)->data[1])
-+#else
-+#define get_mc_address_u(f) get_vc_address_u(f)
-+#endif
-+
-+static inline uint32_t pack_wo_p(const int off, const int mul)
-+{
-+ return PACK2(off * 2 + 1, mul);
-+}
-+
-+static inline uint32_t pack_wo_b(const int off0, const int off1, const int mul)
-+{
-+ return PACK2(off0 + off1 + 1, mul);
-+}
-+
-+
-+static void
-+rpi_pred_y(const HEVCRpiContext *const s, HEVCRpiJob * const jb,
-+ const int x0, const int y0,
-+ const int nPbW, const int nPbH,
-+ const MvXY mv_xy,
-+ const int weight_mul,
-+ const int weight_offset,
-+ AVFrame *const src_frame)
-+{
-+ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0);
-+ const unsigned int mx = MV_X(mv_xy) & 3;
-+ const unsigned int my = MV_Y(mv_xy) & 3;
-+ const unsigned int my_mx = (my << 8) | mx;
-+ const uint32_t my2_mx2_my_mx = (my_mx << 16) | my_mx;
-+ const qpu_mc_src_addr_t src_vc_address_y = get_mc_address_y(src_frame);
-+ qpu_mc_dst_addr_t dst_addr = get_mc_address_y(s->frame) + y_off;
-+ const uint32_t wo = pack_wo_p(weight_offset, weight_mul);
-+ HEVCRpiInterPredEnv * const ipe = &jb->luma_ip;
-+ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame);
-+
-+ if (my_mx == 0)
-+ {
-+ const int x1 = x0 + (MV_X(mv_xy) >> 2);
-+ const int y1 = y0 + (MV_Y(mv_xy) >> 2);
-+ const int bh = nPbH;
-+
-+ for (int start_x = 0; start_x < nPbW; start_x += 16)
-+ {
-+ const int bw = FFMIN(nPbW - start_x, 16);
-+ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_p00);
-+ qpu_mc_src_t *const src1 = yp->last_l0;
-+ qpu_mc_pred_y_p00_t *const cmd_y = &yp->qpu_mc_curr->y.p00;
-+
-+#if RPI_TSTATS
-+ {
-+ HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats;
-+ ++ts->y_pred1_x0y0;
-+
-+ if (nPbW > 8)
-+ ++ts->y_pred1_wgt8;
-+ else
-+ ++ts->y_pred1_wle8;
-+
-+ if (nPbH > 16)
-+ ++ts->y_pred1_hgt16;
-+ else
-+ ++ts->y_pred1_hle16;
-+ }
-+#endif
-+
-+ src1->x = x1 + start_x;
-+ src1->y = y1;
-+ src1->base = src_vc_address_y;
-+ cmd_y->w = bw;
-+ cmd_y->h = bh;
-+ cmd_y->wo1 = wo;
-+ cmd_y->dst_addr = dst_addr + (start_x << xshl);
-+ yp->last_l0 = &cmd_y->next_src1;
-+ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
-+ }
-+ }
-+ else
-+ {
-+ const int x1_m3 = x0 + (MV_X(mv_xy) >> 2) - 3;
-+ const int y1_m3 = y0 + (MV_Y(mv_xy) >> 2) - 3;
-+ const unsigned int bh = nPbH;
-+ int start_x = 0;
-+
-+#if 1
-+ // As Y-pred operates on two independant 8-wide src blocks we can merge
-+ // this pred with the previous one if it the previous one is 8 pel wide,
-+ // the same height as the current block, immediately to the left of our
-+ // current dest block and mono-pred.
-+
-+ qpu_mc_pred_y_p_t *const last_y8_p = jb->last_y8_p;
-+ if (last_y8_p != NULL && last_y8_p->h == bh && last_y8_p->dst_addr + (8 << xshl) == dst_addr)
-+ {
-+ const int bw = FFMIN(nPbW, 8);
-+ qpu_mc_src_t *const last_y8_src2 = jb->last_y8_l1;
-+
-+ last_y8_src2->x = x1_m3;
-+ last_y8_src2->y = y1_m3;
-+ last_y8_src2->base = src_vc_address_y;
-+ last_y8_p->w += bw;
-+ last_y8_p->mymx21 = PACK2(my2_mx2_my_mx, last_y8_p->mymx21);
-+ last_y8_p->wo2 = wo;
-+
-+ jb->last_y8_p = NULL;
-+ jb->last_y8_l1 = NULL;
-+ start_x = bw;
-+#if RPI_TSTATS
-+ ++((HEVCRpiStats *)&s->tstats)->y_pred1_y8_merge;
-+#endif
-+ }
-+#endif
-+
-+ for (; start_x < nPbW; start_x += 16)
-+ {
-+ const int bw = FFMIN(nPbW - start_x, 16);
-+ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_pxx);
-+ qpu_mc_src_t *const src1 = yp->last_l0;
-+ qpu_mc_src_t *const src2 = yp->last_l1;
-+ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p;
-+#if RPI_TSTATS
-+ {
-+ HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats;
-+ if (mx == 0 && my == 0)
-+ ++ts->y_pred1_x0y0;
-+ else if (mx == 0)
-+ ++ts->y_pred1_x0;
-+ else if (my == 0)
-+ ++ts->y_pred1_y0;
-+ else
-+ ++ts->y_pred1_xy;
-+
-+ if (nPbW > 8)
-+ ++ts->y_pred1_wgt8;
-+ else
-+ ++ts->y_pred1_wle8;
-+
-+ if (nPbH > 16)
-+ ++ts->y_pred1_hgt16;
-+ else
-+ ++ts->y_pred1_hle16;
-+ }
-+#endif
-+ src1->x = x1_m3 + start_x;
-+ src1->y = y1_m3;
-+ src1->base = src_vc_address_y;
-+ if (bw <= 8)
-+ {
-+ src2->x = MC_DUMMY_X;
-+ src2->y = MC_DUMMY_Y;
-+#if RPI_QPU_EMU_Y
-+ src2->base = s->qpu_dummy_frame_emu;
-+#else
-+ src2->base = s->qpu_dummy_frame_qpu;
-+#endif
-+ }
-+ else
-+ {
-+ src2->x = x1_m3 + start_x + 8;
-+ src2->y = y1_m3;
-+ src2->base = src_vc_address_y;
-+ }
-+ cmd_y->w = bw;
-+ cmd_y->h = bh;
-+ cmd_y->mymx21 = my2_mx2_my_mx;
-+ cmd_y->wo1 = wo;
-+ cmd_y->wo2 = wo;
-+ cmd_y->dst_addr = dst_addr + (start_x << xshl);
-+ yp->last_l0 = &cmd_y->next_src1;
-+ yp->last_l1 = &cmd_y->next_src2;
-+ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
-+
-+ if (bw == 8) {
-+ jb->last_y8_l1 = src2;
-+ jb->last_y8_p = cmd_y;
-+ }
-+ }
-+ }
-+}
-+
-+static void
-+rpi_pred_y_b(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
-+ const int x0, const int y0,
-+ const int nPbW, const int nPbH,
-+ const struct HEVCRpiMvField *const mv_field,
-+ const AVFrame *const src_frame,
-+ const AVFrame *const src_frame2)
-+{
-+ const unsigned int y_off = av_rpi_sand_frame_off_y(s->frame, x0, y0);
-+ const MvXY const mv = mv_field->xy[0];
-+ const MvXY const mv2 = mv_field->xy[1];
-+
-+ const unsigned int mx = MV_X(mv) & 3;
-+ const unsigned int my = MV_Y(mv) & 3;
-+ const unsigned int my_mx = (my<<8) | mx;
-+ const unsigned int mx2 = MV_X(mv2) & 3;
-+ const unsigned int my2 = MV_Y(mv2) & 3;
-+ const unsigned int my2_mx2 = (my2<<8) | mx2;
-+ const uint32_t my2_mx2_my_mx = (my2_mx2 << 16) | my_mx;
-+ const unsigned int ref_idx0 = mv_field->ref_idx[0];
-+ const unsigned int ref_idx1 = mv_field->ref_idx[1];
-+ const uint32_t wo1 = pack_wo_b(s->sh.luma_offset_l0[ref_idx0], s->sh.luma_offset_l1[ref_idx1], s->sh.luma_weight_l0[ref_idx0]);
-+ const uint32_t wo2 = pack_wo_b(s->sh.luma_offset_l0[ref_idx0], s->sh.luma_offset_l1[ref_idx1], s->sh.luma_weight_l1[ref_idx1]);
-+
-+ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame);
-+ qpu_mc_dst_addr_t dst = get_mc_address_y(s->frame) + y_off;
-+ const qpu_mc_src_addr_t src1_base = get_mc_address_y(src_frame);
-+ const qpu_mc_src_addr_t src2_base = get_mc_address_y(src_frame2);
-+ HEVCRpiInterPredEnv * const ipe = &jb->luma_ip;
-+
-+ if (my2_mx2_my_mx == 0)
-+ {
-+ const int x1 = x0 + (MV_X(mv) >> 2);
-+ const int y1 = y0 + (MV_Y(mv) >> 2);
-+ const int x2 = x0 + (MV_X(mv2) >> 2);
-+ const int y2 = y0 + (MV_Y(mv2) >> 2);
-+ const int bh = nPbH;
-+
-+ // Can do chunks a full 16 wide if we don't want the H filter
-+ for (int start_x=0; start_x < nPbW; start_x += 16)
-+ {
-+ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh, s->qpu.y_b00);
-+ qpu_mc_src_t *const src1 = yp->last_l0;
-+ qpu_mc_src_t *const src2 = yp->last_l1;
-+ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p;
-+#if RPI_TSTATS
-+ {
-+ HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats;
-+ ++ts->y_pred2_x0y0;
-+
-+ if (nPbH > 16)
-+ ++ts->y_pred2_hgt16;
-+ else
-+ ++ts->y_pred2_hle16;
-+ }
-+#endif
-+ src1->x = x1 + start_x;
-+ src1->y = y1;
-+ src1->base = src1_base;
-+ src2->x = x2 + start_x;
-+ src2->y = y2;
-+ src2->base = src2_base;
-+ cmd_y->w = FFMIN(nPbW - start_x, 16);
-+ cmd_y->h = bh;
-+ cmd_y->mymx21 = 0;
-+ cmd_y->wo1 = wo1;
-+ cmd_y->wo2 = wo2;
-+ cmd_y->dst_addr = dst + (start_x << xshl);
-+ yp->last_l0 = &cmd_y->next_src1;
-+ yp->last_l1 = &cmd_y->next_src2;
-+ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
-+ }
-+ }
-+ else
-+ {
-+ // Filter requires a run-up of 3
-+ const int x1 = x0 + (MV_X(mv) >> 2) - 3;
-+ const int y1 = y0 + (MV_Y(mv) >> 2) - 3;
-+ const int x2 = x0 + (MV_X(mv2) >> 2) - 3;
-+ const int y2 = y0 + (MV_Y(mv2) >> 2) - 3;
-+ const int bh = nPbH;
-+
-+ for (int start_x=0; start_x < nPbW; start_x += 8)
-+ { // B blocks work 8 at a time
-+ // B weights aren't doubled as the QPU code does the same
-+ // amount of work as it does for P
-+ HEVCRpiInterPredQ *const yp = rpi_nxt_pred(ipe, bh + 7, s->qpu.y_bxx);
-+ qpu_mc_src_t *const src1 = yp->last_l0;
-+ qpu_mc_src_t *const src2 = yp->last_l1;
-+ qpu_mc_pred_y_p_t *const cmd_y = &yp->qpu_mc_curr->y.p;
-+#if RPI_TSTATS
-+ {
-+ HEVCRpiStats *const ts = (HEVCRpiStats *)&s->tstats;
-+ const unsigned int mmx = mx | mx2;
-+ const unsigned int mmy = my | my2;
-+ if (mmx == 0 && mmy == 0)
-+ ++ts->y_pred2_x0y0;
-+ else if (mmx == 0)
-+ ++ts->y_pred2_x0;
-+ else if (mmy == 0)
-+ ++ts->y_pred2_y0;
-+ else
-+ ++ts->y_pred2_xy;
-+
-+ if (nPbH > 16)
-+ ++ts->y_pred2_hgt16;
-+ else
-+ ++ts->y_pred2_hle16;
-+ }
-+#endif
-+ src1->x = x1 + start_x;
-+ src1->y = y1;
-+ src1->base = src1_base;
-+ src2->x = x2 + start_x;
-+ src2->y = y2;
-+ src2->base = src2_base;
-+ cmd_y->w = FFMIN(nPbW - start_x, 8);
-+ cmd_y->h = bh;
-+ cmd_y->mymx21 = my2_mx2_my_mx;
-+ cmd_y->wo1 = wo1;
-+ cmd_y->wo2 = wo2;
-+ cmd_y->dst_addr = dst + (start_x << xshl);
-+ yp->last_l0 = &cmd_y->next_src1;
-+ yp->last_l1 = &cmd_y->next_src2;
-+ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_y + 1);
-+ }
-+ }
-+}
-+
-+// h/v shifts fixed at one as that is all the qasm copes with
-+static void
-+rpi_pred_c(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
-+ const unsigned int lx, const int x0_c, const int y0_c,
-+ const int nPbW_c, const int nPbH_c,
-+ const MvXY const mv,
-+ const int16_t * const c_weights,
-+ const int16_t * const c_offsets,
-+ AVFrame * const src_frame)
-+{
-+ const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c);
-+ const int hshift = 1; // = s->ps.sps->hshift[1];
-+ const int vshift = 1; // = s->ps.sps->vshift[1];
-+
-+ const int x1_c = x0_c + (MV_X(mv) >> (2 + hshift)) - 1;
-+ const int y1_c = y0_c + (MV_Y(mv) >> (2 + hshift)) - 1;
-+ const qpu_mc_src_addr_t src_base_u = get_mc_address_u(src_frame);
-+ const uint32_t x_coeffs = rpi_filter_coefs[av_mod_uintp2(MV_X(mv), 2 + hshift) << (1 - hshift)];
-+ const uint32_t y_coeffs = rpi_filter_coefs[av_mod_uintp2(MV_Y(mv), 2 + vshift) << (1 - vshift)];
-+ const uint32_t wo_u = pack_wo_p(c_offsets[0], c_weights[0]);
-+ const uint32_t wo_v = pack_wo_p(c_offsets[1], c_weights[1]);
-+ qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off;
-+ HEVCRpiInterPredEnv * const ipe = &jb->chroma_ip;
-+ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1;
-+ const unsigned int bh = nPbH_c;
-+ const uint32_t qfn = lx == 0 ? s->qpu.c_pxx : s->qpu.c_pxx_l1;
-+
-+ for(int start_x=0; start_x < nPbW_c; start_x+=RPI_CHROMA_BLOCK_WIDTH)
-+ {
-+ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh + 3, qfn);
-+ qpu_mc_pred_c_p_t * const cmd_c = &cp->qpu_mc_curr->c.p;
-+ qpu_mc_src_t ** const plast_lx = (lx == 0) ? &cp->last_l0 : &cp->last_l1;
-+ qpu_mc_src_t * const last_lx = *plast_lx;
-+ const int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH);
-+
-+ last_lx->x = x1_c + start_x;
-+ last_lx->y = y1_c;
-+ last_lx->base = src_base_u;
-+ cmd_c->h = bh;
-+ cmd_c->w = bw;
-+ cmd_c->coeffs_x = x_coeffs;
-+ cmd_c->coeffs_y = y_coeffs;
-+ cmd_c->wo_u = wo_u;
-+ cmd_c->wo_v = wo_v;
-+ cmd_c->dst_addr_c = dst_base_u + (start_x << xshl);
-+ *plast_lx = &cmd_c->next_src;
-+ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(cmd_c + 1);
-+ }
-+ return;
-+}
-+
-+// h/v shifts fixed at one as that is all the qasm copes with
-+static void
-+rpi_pred_c_b(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
-+ const int x0_c, const int y0_c,
-+ const int nPbW_c, const int nPbH_c,
-+ const struct HEVCRpiMvField * const mv_field,
-+ const int16_t * const c_weights,
-+ const int16_t * const c_offsets,
-+ const int16_t * const c_weights2,
-+ const int16_t * const c_offsets2,
-+ AVFrame * const src_frame,
-+ AVFrame * const src_frame2)
-+{
-+ const unsigned int c_off = av_rpi_sand_frame_off_c(s->frame, x0_c, y0_c);
-+ const int hshift = 1; // s->ps.sps->hshift[1];
-+ const int vshift = 1; // s->ps.sps->vshift[1];
-+ const MvXY const mv = mv_field->xy[0];
-+ const MvXY const mv2 = mv_field->xy[1];
-+
-+ const unsigned int mx = av_mod_uintp2(MV_X(mv), 2 + hshift);
-+ const unsigned int my = av_mod_uintp2(MV_Y(mv), 2 + vshift);
-+ const uint32_t coefs0_x = rpi_filter_coefs[mx << (1 - hshift)];
-+ const uint32_t coefs0_y = rpi_filter_coefs[my << (1 - vshift)]; // Fractional part of motion vector
-+ const int x1_c = x0_c + (MV_X(mv) >> (2 + hshift)) - 1;
-+ const int y1_c = y0_c + (MV_Y(mv) >> (2 + hshift)) - 1;
-+
-+ const unsigned int mx2 = av_mod_uintp2(MV_X(mv2), 2 + hshift);
-+ const unsigned int my2 = av_mod_uintp2(MV_Y(mv2), 2 + vshift);
-+ const uint32_t coefs1_x = rpi_filter_coefs[mx2 << (1 - hshift)];
-+ const uint32_t coefs1_y = rpi_filter_coefs[my2 << (1 - vshift)]; // Fractional part of motion vector
-+
-+ const int x2_c = x0_c + (MV_X(mv2) >> (2 + hshift)) - 1;
-+ const int y2_c = y0_c + (MV_Y(mv2) >> (2 + hshift)) - 1;
-+
-+ const uint32_t wo_u2 = pack_wo_b(c_offsets[0], c_offsets2[0], c_weights2[0]);
-+ const uint32_t wo_v2 = pack_wo_b(c_offsets[1], c_offsets2[1], c_weights2[1]);
-+
-+ const qpu_mc_dst_addr_t dst_base_u = get_mc_address_u(s->frame) + c_off;
-+ const qpu_mc_src_addr_t src1_base = get_mc_address_u(src_frame);
-+ const qpu_mc_src_addr_t src2_base = get_mc_address_u(src_frame2);
-+ HEVCRpiInterPredEnv * const ipe = &jb->chroma_ip;
-+ const unsigned int xshl = av_rpi_sand_frame_xshl(s->frame) + 1;
-+ const unsigned int bh = nPbH_c;
-+
-+ for (int start_x=0; start_x < nPbW_c; start_x += RPI_CHROMA_BLOCK_WIDTH)
-+ {
-+ const unsigned int bw = FFMIN(nPbW_c-start_x, RPI_CHROMA_BLOCK_WIDTH);
-+
-+ HEVCRpiInterPredQ * const cp = rpi_nxt_pred(ipe, bh * 2 + 3, s->qpu.c_bxx);
-+ qpu_mc_pred_c_b_t * const u = &cp->qpu_mc_curr->c.b;
-+ qpu_mc_src_t * const src_l0 = cp->last_l0;
-+ qpu_mc_src_t * const src_l1 = cp->last_l1;
-+
-+ src_l0->x = x1_c + start_x;
-+ src_l0->y = y1_c;
-+ src_l0->base = src1_base;
-+ src_l1->x = x2_c + start_x;
-+ src_l1->y = y2_c;
-+ src_l1->base = src2_base;
-+
-+ u[0].h = bh;
-+ u[0].w = bw;
-+ u[0].coeffs_x1 = coefs0_x;
-+ u[0].coeffs_y1 = coefs0_y;
-+ u[0].weight_u1 = c_weights[0]; // Weight L0 U
-+ u[0].weight_v1 = c_weights[1]; // Weight L0 V
-+ u[0].coeffs_x2 = coefs1_x;
-+ u[0].coeffs_y2 = coefs1_y;
-+ u[0].wo_u2 = wo_u2;
-+ u[0].wo_v2 = wo_v2;
-+ u[0].dst_addr_c = dst_base_u + (start_x << xshl);
-+
-+ cp->last_l0 = &u[0].next_src1;
-+ cp->last_l1 = &u[0].next_src2;
-+ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1);
-+ }
-+}
-+
-+
-+static inline void
-+col_stash(const HEVCRpiContext * const s,
-+ const unsigned int x0, const unsigned int y0, const unsigned int w0, const unsigned int h0,
-+ const HEVCRpiMvField * const mvf)
-+{
-+ ColMvField * const col_mvf = s->ref->col_mvf;
-+ const unsigned int x = (x0 + 15) >> 4;
-+ const unsigned int y = (y0 + 15) >> 4;
-+ const unsigned int w = ((x0 + 15 + w0) >> 4) - x;
-+ const unsigned int h = ((y0 + 15 + h0) >> 4) - y;
-+
-+ if (col_mvf != NULL && w != 0 && h != 0)
-+ {
-+ // Only record MV from the top left of the 16x16 block
-+
-+ const RefPicList * const rpl = s->refPicList;
-+ const ColMvField cmv = {
-+ .L = {
-+ {
-+ .poc = (mvf->pred_flag & PF_L0) == 0 ?
-+ COL_POC_INTRA :
-+ COL_POC_MAKE_INTER(rpl[0].isLongTerm[mvf->ref_idx[0]], rpl[0].list[mvf->ref_idx[0]]),
-+ .xy = mvf->xy[0]
-+ },
-+ {
-+ .poc = (mvf->pred_flag & PF_L1) == 0 ?
-+ COL_POC_INTRA :
-+ COL_POC_MAKE_INTER(rpl[1].isLongTerm[mvf->ref_idx[1]], rpl[1].list[mvf->ref_idx[1]]),
-+ .xy = mvf->xy[1]
-+ }
-+ }
-+ };
-+
-+ ColMvField * p = col_mvf + y * s->col_mvf_stride + x;
-+ const unsigned int stride = s->col_mvf_stride - w;
-+ unsigned int j = h;
-+
-+ do
-+ {
-+ unsigned int k = w;
-+ do
-+ {
-+ *p++ = cmv;
-+ } while (--k != 0);
-+ p += stride;
-+ } while (--j != 0);
-+ }
-+}
-+
-+static void hls_prediction_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
-+ const unsigned int x0, const unsigned int y0,
-+ const unsigned int nPbW, const unsigned int nPbH,
-+ const unsigned int log2_cb_size, const unsigned int partIdx, const unsigned int idx)
-+{
-+ HEVCRpiJob * const jb = lc->jb0;
-+
-+ struct HEVCRpiMvField current_mv = {{0}};
-+ const RefPicList *const refPicList = s->refPicList;
-+ const HEVCRpiFrame *ref0 = NULL, *ref1 = NULL;
-+
-+ if (lc->cu.pred_mode != MODE_SKIP)
-+ lc->pu.merge_flag = ff_hevc_rpi_merge_flag_decode(lc);
-+
-+ if (lc->cu.pred_mode == MODE_SKIP || lc->pu.merge_flag) {
-+ const unsigned int merge_idx = s->sh.max_num_merge_cand <= 1 ? 0 :
-+ ff_hevc_rpi_merge_idx_decode(s, lc);
-+
-+ ff_hevc_rpi_luma_mv_merge_mode(s, lc, x0, y0, nPbW, nPbH, log2_cb_size,
-+ partIdx, merge_idx, &current_mv);
-+ } else {
-+ hevc_luma_mv_mvp_mode(s, lc, x0, y0, nPbW, nPbH, &current_mv);
-+ }
-+
-+ {
-+ HEVCRpiMvField * p = mvf_stash_ptr(s, lc, x0, y0);
-+ unsigned int i, j;
-+
-+ for (j = 0; j < nPbH >> LOG2_MIN_PU_SIZE; j++)
-+ {
-+ for (i = 0; i < nPbW >> LOG2_MIN_PU_SIZE; i++)
-+ p[i] = current_mv;
-+ p += MVF_STASH_WIDTH_PU;
-+ }
-+ }
-+
-+ col_stash(s, x0, y0, nPbW, nPbH, &current_mv);
-+
-+ if (current_mv.pred_flag & PF_L0) {
-+ ref0 = refPicList[0].ref[current_mv.ref_idx[0]];
-+ if (!ref0)
-+ return;
-+ hevc_await_progress(s, lc, ref0, current_mv.xy[0], y0, nPbH);
-+ }
-+ if (current_mv.pred_flag & PF_L1) {
-+ ref1 = refPicList[1].ref[current_mv.ref_idx[1]];
-+ if (!ref1)
-+ return;
-+ hevc_await_progress(s, lc, ref1, current_mv.xy[1], y0, nPbH);
-+ }
-+
-+ if (current_mv.pred_flag == PF_L0) {
-+ const int x0_c = x0 >> ctx_hshift(s, 1);
-+ const int y0_c = y0 >> ctx_vshift(s, 1);
-+ const int nPbW_c = nPbW >> ctx_hshift(s, 1);
-+ const int nPbH_c = nPbH >> ctx_vshift(s, 1);
-+
-+ rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, current_mv.xy[0],
-+ s->sh.luma_weight_l0[current_mv.ref_idx[0]], s->sh.luma_offset_l0[current_mv.ref_idx[0]],
-+ ref0->frame);
-+
-+ if (ctx_cfmt(s) != 0) {
-+ rpi_pred_c(s, jb, 0, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.xy[0],
-+ s->sh.chroma_weight_l0[current_mv.ref_idx[0]], s->sh.chroma_offset_l0[current_mv.ref_idx[0]],
-+ ref0->frame);
-+ return;
-+ }
-+ } else if (current_mv.pred_flag == PF_L1) {
-+ const int x0_c = x0 >> ctx_hshift(s, 1);
-+ const int y0_c = y0 >> ctx_vshift(s, 1);
-+ const int nPbW_c = nPbW >> ctx_hshift(s, 1);
-+ const int nPbH_c = nPbH >> ctx_vshift(s, 1);
-+
-+ rpi_pred_y(s, jb, x0, y0, nPbW, nPbH, current_mv.xy[1],
-+ s->sh.luma_weight_l1[current_mv.ref_idx[1]], s->sh.luma_offset_l1[current_mv.ref_idx[1]],
-+ ref1->frame);
-+
-+ if (ctx_cfmt(s) != 0) {
-+ rpi_pred_c(s, jb, 1, x0_c, y0_c, nPbW_c, nPbH_c, current_mv.xy[1],
-+ s->sh.chroma_weight_l1[current_mv.ref_idx[1]], s->sh.chroma_offset_l1[current_mv.ref_idx[1]],
-+ ref1->frame);
-+ return;
-+ }
-+ } else if (current_mv.pred_flag == PF_BI) {
-+ const int x0_c = x0 >> ctx_hshift(s, 1);
-+ const int y0_c = y0 >> ctx_vshift(s, 1);
-+ const int nPbW_c = nPbW >> ctx_hshift(s, 1);
-+ const int nPbH_c = nPbH >> ctx_vshift(s, 1);
-+
-+ rpi_pred_y_b(s, jb, x0, y0, nPbW, nPbH, &current_mv, ref0->frame, ref1->frame);
-+
-+ if (ctx_cfmt(s) != 0) {
-+ rpi_pred_c_b(s, jb, x0_c, y0_c, nPbW_c, nPbH_c,
-+ &current_mv,
-+ s->sh.chroma_weight_l0[current_mv.ref_idx[0]],
-+ s->sh.chroma_offset_l0[current_mv.ref_idx[0]],
-+ s->sh.chroma_weight_l1[current_mv.ref_idx[1]],
-+ s->sh.chroma_offset_l1[current_mv.ref_idx[1]],
-+ ref0->frame,
-+ ref1->frame);
-+ return;
-+ }
-+ }
-+}
-+
-+static void set_ipm(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
-+ const unsigned int x0, const unsigned int y0,
-+ const unsigned int log2_cb_size,
-+ const unsigned int ipm)
-+{
-+ const unsigned int x_pu = x0 >> LOG2_MIN_PU_SIZE;
-+ const unsigned int y_pu = y0 >> LOG2_MIN_PU_SIZE;
-+
-+ {
-+ const unsigned int ctb_mask = ~(~0U << (s->ps.sps->log2_ctb_size - LOG2_MIN_PU_SIZE));
-+ set_stash2(lc->ipm_left + (y_pu & ctb_mask), lc->ipm_up + (x_pu & ctb_mask), log2_cb_size - LOG2_MIN_PU_SIZE, ipm);
-+ }
-+
-+ // If IRAP then everything is Intra & we avoid ever looking at these
-+ // stashes so don't bother setting them
-+ if (!s->is_irap && lc->cu.pred_mode == MODE_INTRA)
-+ {
-+ if (s->is_intra != NULL)
-+ {
-+ set_bits(s->is_intra + (y0 >> LOG2_MIN_CU_SIZE) * s->ps.sps->pcm_width, x0 >> LOG2_MIN_CU_SIZE, s->ps.sps->pcm_width, log2_cb_size - LOG2_MIN_CU_SIZE);
-+ }
-+
-+ {
-+ HEVCRpiMvField * p = mvf_stash_ptr(s, lc, x0, y0);
-+ const unsigned int size_in_pus = (1 << log2_cb_size) >> LOG2_MIN_PU_SIZE; // min_pu <= log2_cb so >= 1
-+ unsigned int n = size_in_pus;
-+
-+ do
-+ {
-+ memset(p, 0, size_in_pus * sizeof(*p));
-+ p += MVF_STASH_WIDTH_PU;
-+ } while (--n != 0);
-+ }
-+
-+
-+ if (s->ref->col_mvf != NULL && ((x0 | y0) & 0xf) == 0)
-+ {
-+ // Only record top left stuff
-+ // Blocks should always be alinged on size boundries
-+ // so cannot have overflow from a small block
-+
-+ ColMvField * p = s->ref->col_mvf + (y0 >> 4) * s->col_mvf_stride + (x0 >> 4);
-+ const unsigned int size_in_col = log2_cb_size < 4 ? 1 : (1 << (log2_cb_size - 4));
-+ const unsigned int stride = s->col_mvf_stride - size_in_col;
-+ unsigned int j = size_in_col;
-+
-+ do
-+ {
-+ unsigned int k = size_in_col;
-+ do
-+ {
-+ p->L[0].poc = COL_POC_INTRA;
-+ p->L[0].xy = 0;
-+ p->L[1].poc = COL_POC_INTRA;
-+ p->L[1].xy = 0;
-+ ++p;
-+ } while (--k != 0);
-+ p += stride;
-+ } while (--j != 0);
-+ }
-+ }
-+}
-+
-+static inline void intra_prediction_unit_default_value(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
-+ const unsigned int x0, const unsigned int y0,
-+ const unsigned int log2_cb_size)
-+{
-+ set_ipm(s, lc, x0, y0, log2_cb_size, INTRA_DC);
-+}
-+
-+
-+/**
-+ * 8.4.1
-+ */
-+static int luma_intra_pred_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
-+ int x0, int y0, int log2_pu_size,
-+ int prev_intra_luma_pred_flag,
-+ const unsigned int idx)
-+{
-+ const unsigned int ctb_mask = ~(~0U << s->ps.sps->log2_ctb_size);
-+ const unsigned int xb_pu = (x0 & ctb_mask) >> LOG2_MIN_PU_SIZE;
-+ const unsigned int yb_pu = (y0 & ctb_mask) >> LOG2_MIN_PU_SIZE;
-+
-+ // Up does not cross boundries so as we always scan 1 slice-tile-line in an
-+ // lc we can just keep 1 CTB lR stashes
-+ // Left is reset to DC @ Start of Line/Tile/Slice in fill_job
-+ const unsigned int cand_up = yb_pu == 0 ? INTRA_DC : lc->ipm_up[xb_pu];
-+ const unsigned int cand_left = lc->ipm_left[yb_pu];
-+
-+ unsigned int intra_pred_mode;
-+ unsigned int a, b, c;
-+
-+ if (cand_left == cand_up) {
-+ if (cand_left < 2) {
-+ a = INTRA_PLANAR;
-+ b = INTRA_DC;
-+ c = INTRA_ANGULAR_26;
-+ } else {
-+ a = cand_left;
-+ b = 2 + ((cand_left - 2 - 1 + 32) & 31);
-+ c = 2 + ((cand_left - 2 + 1) & 31);
-+ }
-+ } else {
-+ a = cand_left;
-+ b = cand_up;
-+ c = (cand_left != INTRA_PLANAR && cand_up != INTRA_PLANAR) ?
-+ INTRA_PLANAR :
-+ (cand_left != INTRA_DC && cand_up != INTRA_DC) ?
-+ INTRA_DC :
-+ INTRA_ANGULAR_26;
-+ }
-+
-+ if (prev_intra_luma_pred_flag) {
-+ intra_pred_mode = idx == 0 ? a : idx == 1 ? b : c;
-+ } else {
-+ // Sort lowest 1st
-+ if (a > b)
-+ FFSWAP(int, a, b);
-+ if (a > c)
-+ FFSWAP(int, a, c);
-+ if (b > c)
-+ FFSWAP(int, b, c);
-+
-+ intra_pred_mode = idx;
-+ if (intra_pred_mode >= a)
-+ intra_pred_mode++;
-+ if (intra_pred_mode >= b)
-+ intra_pred_mode++;
-+ if (intra_pred_mode >= c)
-+ intra_pred_mode++;
-+ }
-+
-+ /* write the intra prediction units into the mv array */
-+ set_ipm(s, lc, x0, y0, log2_pu_size, intra_pred_mode);
-+ return intra_pred_mode;
-+}
-+
-+static const uint8_t tab_mode_idx[] = {
-+ 0, 1, 2, 2, 2, 2, 3, 5, 7, 8, 10, 12, 13, 15, 17, 18, 19, 20,
-+ 21, 22, 23, 23, 24, 24, 25, 25, 26, 27, 27, 28, 28, 29, 29, 30, 31};
-+
-+static void intra_prediction_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
-+ const unsigned int x0, const unsigned int y0,
-+ const unsigned int log2_cb_size)
-+{
-+ static const uint8_t intra_chroma_table[4] = { 0, 26, 10, 1 };
-+ uint8_t prev_intra_luma_pred_flag[4];
-+ int split = lc->cu.part_mode == PART_NxN;
-+ const unsigned int split_size = (1 << (log2_cb_size - 1));
-+ int chroma_mode;
-+ const unsigned int n = split ? 4 : 1;
-+ unsigned int i;
-+
-+ for (i = 0; i != n; i++)
-+ prev_intra_luma_pred_flag[i] = ff_hevc_rpi_prev_intra_luma_pred_flag_decode(lc);
-+
-+ for (i = 0; i < n; i++) {
-+ // depending on mode idx is mpm or luma_pred_mode
-+ const unsigned int idx = prev_intra_luma_pred_flag[i] ?
-+ ff_hevc_rpi_mpm_idx_decode(lc) :
-+ ff_hevc_rpi_rem_intra_luma_pred_mode_decode(lc);
-+
-+ lc->pu.intra_pred_mode[i] =
-+ luma_intra_pred_mode(s, lc,
-+ x0 + ((i & 1) == 0 ? 0 : split_size),
-+ y0 + ((i & 2) == 0 ? 0 : split_size),
-+ log2_cb_size - split,
-+ prev_intra_luma_pred_flag[i], idx);
-+ }
-+
-+ if (ctx_cfmt(s) == 3) {
-+ for (i = 0; i < n; i++) {
-+ lc->pu.chroma_mode_c[i] = chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc);
-+ if (chroma_mode != 4) {
-+ if (lc->pu.intra_pred_mode[i] == intra_chroma_table[chroma_mode])
-+ lc->pu.intra_pred_mode_c[i] = 34;
-+ else
-+ lc->pu.intra_pred_mode_c[i] = intra_chroma_table[chroma_mode];
-+ } else {
-+ lc->pu.intra_pred_mode_c[i] = lc->pu.intra_pred_mode[i];
-+ }
-+ }
-+ } else if (ctx_cfmt(s) == 2) {
-+ int mode_idx;
-+ lc->pu.chroma_mode_c[0] = chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc);
-+ if (chroma_mode != 4) {
-+ if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode])
-+ mode_idx = 34;
-+ else
-+ mode_idx = intra_chroma_table[chroma_mode];
-+ } else {
-+ mode_idx = lc->pu.intra_pred_mode[0];
-+ }
-+ lc->pu.intra_pred_mode_c[0] = tab_mode_idx[mode_idx];
-+ } else if (ctx_cfmt(s) != 0) {
-+ chroma_mode = ff_hevc_rpi_intra_chroma_pred_mode_decode(lc);
-+ if (chroma_mode != 4) {
-+ if (lc->pu.intra_pred_mode[0] == intra_chroma_table[chroma_mode])
-+ lc->pu.intra_pred_mode_c[0] = 34;
-+ else
-+ lc->pu.intra_pred_mode_c[0] = intra_chroma_table[chroma_mode];
-+ } else {
-+ lc->pu.intra_pred_mode_c[0] = lc->pu.intra_pred_mode[0];
-+ }
-+ }
-+}
-+
-+static int hls_coding_unit(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
-+ const unsigned int x0, const unsigned int y0, const unsigned int log2_cb_size)
-+{
-+ const unsigned int cb_size = 1 << log2_cb_size;
-+ const unsigned int log2_min_cb_size = s->ps.sps->log2_min_cb_size;
-+ const unsigned int min_cb_width = s->ps.sps->min_cb_width;
-+ const unsigned int x_cb = x0 >> log2_min_cb_size;
-+ const unsigned int y_cb = y0 >> log2_min_cb_size;
-+ const unsigned int idx = log2_cb_size - 2;
-+ const unsigned int qp_block_mask = (1 << s->ps.pps->log2_min_cu_qp_delta_size) - 1;
-+ int skip_flag = 0;
-+
-+ lc->cu.x = x0;
-+ lc->cu.y = y0;
-+ lc->cu.x_split = x0;
-+ lc->cu.y_split = y0;
-+
-+ lc->cu.pred_mode = MODE_INTRA;
-+ lc->cu.part_mode = PART_2Nx2N;
-+ lc->cu.intra_split_flag = 0;
-+ lc->cu.cu_transquant_bypass_flag = 0;
-+ lc->pu.intra_pred_mode[0] = 1;
-+ lc->pu.intra_pred_mode[1] = 1;
-+ lc->pu.intra_pred_mode[2] = 1;
-+ lc->pu.intra_pred_mode[3] = 1;
-+
-+ if (s->ps.pps->transquant_bypass_enable_flag) {
-+ lc->cu.cu_transquant_bypass_flag = ff_hevc_rpi_cu_transquant_bypass_flag_decode(lc);
-+ if (lc->cu.cu_transquant_bypass_flag)
-+ set_deblocking_bypass(s, x0, y0, log2_cb_size);
-+ }
-+
-+ if (s->sh.slice_type != HEVC_SLICE_I) {
-+ lc->cu.pred_mode = MODE_INTER;
-+ skip_flag = ff_hevc_rpi_skip_flag_decode(s, lc, x0, y0, x_cb, y_cb);
-+ }
-+
-+ if (skip_flag) {
-+ lc->cu.pred_mode = MODE_SKIP;
-+
-+ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx);
-+ intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size);
-+
-+ if (!s->sh.disable_deblocking_filter_flag)
-+ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size, 0);
-+ } else {
-+ int pcm_flag = 0;
-+
-+ if (s->sh.slice_type != HEVC_SLICE_I)
-+ lc->cu.pred_mode = ff_hevc_rpi_pred_mode_decode(lc);
-+ if (lc->cu.pred_mode != MODE_INTRA ||
-+ log2_cb_size == s->ps.sps->log2_min_cb_size) {
-+ lc->cu.part_mode = ff_hevc_rpi_part_mode_decode(s, lc, log2_cb_size);
-+ lc->cu.intra_split_flag = lc->cu.part_mode == PART_NxN &&
-+ lc->cu.pred_mode == MODE_INTRA;
-+ }
-+
-+ if (lc->cu.pred_mode == MODE_INTRA) {
-+ if (lc->cu.part_mode == PART_2Nx2N &&
-+ log2_cb_size <= s->ps.sps->pcm.log2_max_pcm_cb_size && // 0 if not enabled
-+ log2_cb_size >= s->ps.sps->pcm.log2_min_pcm_cb_size &&
-+ ff_hevc_rpi_pcm_flag_decode(lc) != 0)
-+ {
-+ int ret;
-+ pcm_flag = 1;
-+ intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size);
-+ if ((ret = hls_pcm_sample(s, lc, x0, y0, log2_cb_size)) < 0)
-+ return ret;
-+
-+ if (s->ps.sps->pcm.loop_filter_disable_flag)
-+ set_deblocking_bypass(s, x0, y0, log2_cb_size);
-+ } else {
-+ intra_prediction_unit(s, lc, x0, y0, log2_cb_size);
-+ }
-+ } else {
-+ intra_prediction_unit_default_value(s, lc, x0, y0, log2_cb_size);
-+ switch (lc->cu.part_mode) {
-+ case PART_2Nx2N:
-+ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size, log2_cb_size, 0, idx);
-+ break;
-+ case PART_2NxN:
-+ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size / 2, log2_cb_size, 0, idx);
-+ lc->cu.y_split = y0 + cb_size / 2;
-+ hls_prediction_unit(s, lc, x0, y0 + cb_size / 2, cb_size, cb_size / 2, log2_cb_size, 1, idx);
-+ break;
-+ case PART_Nx2N:
-+ hls_prediction_unit(s, lc, x0, y0, cb_size / 2, cb_size, log2_cb_size, 0, idx - 1);
-+ lc->cu.x_split = x0 + cb_size / 2;
-+ hls_prediction_unit(s, lc, x0 + cb_size / 2, y0, cb_size / 2, cb_size, log2_cb_size, 1, idx - 1);
-+ break;
-+ case PART_2NxnU:
-+ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size / 4, log2_cb_size, 0, idx);
-+ lc->cu.y_split = y0 + cb_size / 4;
-+ hls_prediction_unit(s, lc, x0, y0 + cb_size / 4, cb_size, cb_size / 4 * 3, log2_cb_size, 1, idx);
-+ break;
-+ case PART_2NxnD:
-+ hls_prediction_unit(s, lc, x0, y0, cb_size, cb_size / 4 * 3, log2_cb_size, 0, idx);
-+ lc->cu.y_split = y0 + cb_size / 4 * 3;
-+ hls_prediction_unit(s, lc, x0, y0 + cb_size / 4 * 3, cb_size, cb_size / 4, log2_cb_size, 1, idx);
-+ break;
-+ case PART_nLx2N:
-+ hls_prediction_unit(s, lc, x0, y0, cb_size / 4, cb_size, log2_cb_size, 0, idx - 2);
-+ lc->cu.x_split = x0 + cb_size / 4;
-+ hls_prediction_unit(s, lc, x0 + cb_size / 4, y0, cb_size * 3 / 4, cb_size, log2_cb_size, 1, idx - 2);
-+ break;
-+ case PART_nRx2N:
-+ hls_prediction_unit(s, lc, x0, y0, cb_size / 4 * 3, cb_size, log2_cb_size, 0, idx - 2);
-+ lc->cu.x_split = x0 + cb_size / 4 * 3;
-+ hls_prediction_unit(s, lc, x0 + cb_size / 4 * 3, y0, cb_size / 4, cb_size, log2_cb_size, 1, idx - 2);
-+ break;
-+ case PART_NxN:
-+ hls_prediction_unit(s, lc, x0, y0, cb_size / 2, cb_size / 2, log2_cb_size, 0, idx - 1);
-+ lc->cu.x_split = x0 + cb_size / 2;
-+ hls_prediction_unit(s, lc, x0 + cb_size / 2, y0, cb_size / 2, cb_size / 2, log2_cb_size, 1, idx - 1);
-+ lc->cu.y_split = y0 + cb_size / 2;
-+ hls_prediction_unit(s, lc, x0, y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 2, idx - 1);
-+ hls_prediction_unit(s, lc, x0 + cb_size / 2, y0 + cb_size / 2, cb_size / 2, cb_size / 2, log2_cb_size, 3, idx - 1);
-+ break;
-+ }
-+ }
-+
-+ if (!pcm_flag) {
-+ int rqt_root_cbf = 1;
-+
-+ if (lc->cu.pred_mode != MODE_INTRA &&
-+ !(lc->cu.part_mode == PART_2Nx2N && lc->pu.merge_flag)) {
-+ rqt_root_cbf = ff_hevc_rpi_no_residual_syntax_flag_decode(lc);
-+ }
-+ if (rqt_root_cbf) {
-+ const unsigned int cbf_c = ctx_cfmt(s) == 0 ? 0 : (CBF_CR0 | CBF_CB0);
-+ int ret;
-+
-+ lc->cu.max_trafo_depth = lc->cu.pred_mode == MODE_INTRA ?
-+ s->ps.sps->max_transform_hierarchy_depth_intra + lc->cu.intra_split_flag :
-+ s->ps.sps->max_transform_hierarchy_depth_inter;
-+ // transform_tree does deblock_boundary_strengths
-+ ret = hls_transform_tree(s, lc, x0, y0,
-+ log2_cb_size, 0, 0, cbf_c);
-+ if (ret < 0)
-+ return ret;
-+ } else {
-+ if (!s->sh.disable_deblocking_filter_flag)
-+ ff_hevc_rpi_deblocking_boundary_strengths(s, lc, x0, y0, log2_cb_size, 0);
-+ }
-+ }
-+ }
-+
-+ // If the delta is still wanted then we haven't read the delta & therefore need to set qp here
-+ if (lc->tu.is_cu_qp_delta_wanted)
-+ ff_hevc_rpi_set_qPy(s, lc, x0, y0);
-+
-+ if(((x0 + (1<<log2_cb_size)) & qp_block_mask) == 0 &&
-+ ((y0 + (1<<log2_cb_size)) & qp_block_mask) == 0) {
-+ lc->qPy_pred = lc->qp_y;
-+ }
-+
-+ set_bytes(s->qp_y_tab + y_cb * min_cb_width + x_cb, min_cb_width, log2_cb_size - log2_min_cb_size, lc->qp_y & 0xff);
-+
-+ set_stash2(s->cabac_stash_up + (x0 >> 3), s->cabac_stash_left + (y0 >> 3), log2_cb_size - 3, (lc->ct_depth << 1) | skip_flag);
-+
-+ return 0;
-+}
-+
-+// Returns:
-+// < 0 Error
-+// 0 More data wanted
-+// 1 EoSlice / EoPicture
-+static int hls_coding_quadtree(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const int x0, const int y0,
-+ const int log2_cb_size, const unsigned int cb_depth)
-+{
-+ const int cb_size = 1 << log2_cb_size;
-+ int ret;
-+ int split_cu;
-+
-+ lc->ct_depth = cb_depth;
-+ split_cu = (log2_cb_size > s->ps.sps->log2_min_cb_size);
-+ if (x0 + cb_size <= s->ps.sps->width &&
-+ y0 + cb_size <= s->ps.sps->height &&
-+ split_cu)
-+ {
-+ split_cu = ff_hevc_rpi_split_coding_unit_flag_decode(s, lc, cb_depth, x0, y0);
-+ }
-+
-+ // Qp delta (and offset) need to remain wanted if cb_size < min until
-+ // a coded block is found so we still initial state at depth 0 (outside
-+ // this fn) and only reset here
-+ if (s->ps.pps->cu_qp_delta_enabled_flag &&
-+ log2_cb_size >= s->ps.pps->log2_min_cu_qp_delta_size)
-+ {
-+ lc->tu.is_cu_qp_delta_wanted = 1;
-+ lc->tu.cu_qp_delta = 0;
-+ }
-+ if (s->sh.cu_chroma_qp_offset_enabled_flag &&
-+ log2_cb_size >= s->ps.pps->log2_min_cu_qp_delta_size)
-+ {
-+ lc->tu.cu_chroma_qp_offset_wanted = 1;
-+ }
-+
-+ lc->tu.qp_divmod6[0] = s->ps.pps->qp_bd_x[0];
-+ lc->tu.qp_divmod6[1] = s->ps.pps->qp_bd_x[1] + s->sh.slice_cb_qp_offset;
-+ lc->tu.qp_divmod6[2] = s->ps.pps->qp_bd_x[2] + s->sh.slice_cr_qp_offset;
-+
-+ if (split_cu) {
-+ int qp_block_mask = (1 << s->ps.pps->log2_min_cu_qp_delta_size) - 1;
-+ const int cb_size_split = cb_size >> 1;
-+ const int x1 = x0 + cb_size_split;
-+ const int y1 = y0 + cb_size_split;
-+
-+ int more_data = 0;
-+
-+ more_data = hls_coding_quadtree(s, lc, x0, y0, log2_cb_size - 1, cb_depth + 1);
-+ if (more_data < 0)
-+ return more_data;
-+
-+ if (more_data && x1 < s->ps.sps->width) {
-+ more_data = hls_coding_quadtree(s, lc, x1, y0, log2_cb_size - 1, cb_depth + 1);
-+ if (more_data < 0)
-+ return more_data;
-+ }
-+ if (more_data && y1 < s->ps.sps->height) {
-+ more_data = hls_coding_quadtree(s, lc, x0, y1, log2_cb_size - 1, cb_depth + 1);
-+ if (more_data < 0)
-+ return more_data;
-+ }
-+ if (more_data && x1 < s->ps.sps->width &&
-+ y1 < s->ps.sps->height) {
-+ more_data = hls_coding_quadtree(s, lc, x1, y1, log2_cb_size - 1, cb_depth + 1);
-+ if (more_data < 0)
-+ return more_data;
-+ }
-+
-+ if(((x0 + (1<<log2_cb_size)) & qp_block_mask) == 0 &&
-+ ((y0 + (1<<log2_cb_size)) & qp_block_mask) == 0)
-+ lc->qPy_pred = lc->qp_y;
-+
-+ if (more_data)
-+ return ((x1 + cb_size_split) < s->ps.sps->width ||
-+ (y1 + cb_size_split) < s->ps.sps->height);
-+ else
-+ return 0;
-+ } else {
-+ ret = hls_coding_unit(s, lc, x0, y0, log2_cb_size);
-+ if (ret < 0)
-+ return ret;
-+ if ((!((x0 + cb_size) %
-+ (1 << (s->ps.sps->log2_ctb_size))) ||
-+ (x0 + cb_size >= s->ps.sps->width)) &&
-+ (!((y0 + cb_size) %
-+ (1 << (s->ps.sps->log2_ctb_size))) ||
-+ (y0 + cb_size >= s->ps.sps->height))) {
-+ int end_of_slice_flag = ff_hevc_rpi_get_cabac_terminate(&lc->cc);
-+ return !end_of_slice_flag;
-+ } else {
-+ return 1;
-+ }
-+ }
-+
-+ return 0; // NEVER
-+}
-+
-+static void hls_decode_neighbour(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
-+ const int x_ctb, const int y_ctb, const int ctb_addr_ts)
-+{
-+ const unsigned int ctb_size = 1 << s->ps.sps->log2_ctb_size;
-+ const unsigned int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
-+ const unsigned int ctb_addr_rs_in_slice = ctb_addr_rs - s->sh.slice_addr; // slice_addr = RS addr of start of slice
-+ const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[ctb_addr_ts];
-+ const unsigned int line_w = s->ps.sps->ctb_width;
-+
-+ s->tab_slice_address[ctb_addr_rs] = s->sh.slice_addr;
-+
-+ lc->end_of_ctb_x = FFMIN(x_ctb + ctb_size, s->ps.sps->width);
-+ lc->end_of_ctb_y = FFMIN(y_ctb + ctb_size, s->ps.sps->height);
-+
-+ lc->boundary_flags = 0;
-+
-+ if ((ctb_flags & CTB_TS_FLAGS_SOTL) != 0)
-+ lc->boundary_flags |= BOUNDARY_LEFT_TILE;
-+ if (x_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - 1])
-+ lc->boundary_flags |= BOUNDARY_LEFT_SLICE;
-+ if ((ctb_flags & CTB_TS_FLAGS_TOT) != 0)
-+ lc->boundary_flags |= BOUNDARY_UPPER_TILE;
-+ if (y_ctb > 0 && s->tab_slice_address[ctb_addr_rs] != s->tab_slice_address[ctb_addr_rs - line_w])
-+ lc->boundary_flags |= BOUNDARY_UPPER_SLICE;
-+
-+ // Use line width rather than tile width for addr_in_slice test as
-+ // addr_in_slice is in raster units
-+
-+ lc->ctb_avail =
-+ ((lc->boundary_flags & (BOUNDARY_LEFT_SLICE | BOUNDARY_LEFT_TILE)) == 0 ? AVAIL_L : 0) |
-+ ((lc->boundary_flags & (BOUNDARY_UPPER_SLICE | BOUNDARY_UPPER_TILE)) == 0 ? AVAIL_U : 0) |
-+ ((lc->boundary_flags & (BOUNDARY_LEFT_TILE | BOUNDARY_UPPER_TILE)) == 0 &&
-+ (ctb_addr_rs_in_slice > line_w) ? AVAIL_UL : 0) |
-+ ((ctb_flags & (CTB_TS_FLAGS_EOTL | CTB_TS_FLAGS_TOT)) == 0 &&
-+ (ctb_addr_rs_in_slice + 1 >= line_w) ? AVAIL_UR : 0);
-+ // Down-left never avail at CTB level
-+}
-+
-+
-+static void rpi_execute_dblk_cmds(const HEVCRpiContext * const s, HEVCRpiJob * const jb)
-+{
-+ int y = ff_hevc_rpi_hls_filter_blk(s, jb->bounds,
-+ (s->ps.pps->ctb_ts_flags[jb->ctu_ts_last] & CTB_TS_FLAGS_EOT) != 0);
-+
-+ // Signal
-+ if (y > 0) {
-+ // Cast away const as progress is held in s, but this really shouldn't confuse anything
-+ ff_hevc_rpi_progress_signal_recon((HEVCRpiContext *)s, y - 1);
-+ }
-+
-+ // Job done now
-+ // ? Move outside this fn
-+ job_free(s->jbc, jb);
-+}
-+
-+// I-pred, transform_and_add for all blocks types done here
-+// All ARM
-+static void rpi_execute_pred_cmds(const HEVCRpiContext * const s, HEVCRpiJob * const jb)
-+{
-+ unsigned int i;
-+ HEVCRpiIntraPredEnv * const iap = &jb->intra;
-+ const HEVCPredCmd *cmd = iap->cmds;
-+
-+#if !RPI_WORKER_WAIT_PASS_0
-+ rpi_sem_wait(&jb->sem);
-+ rpi_cache_flush_execute(jb->rfe); // Invalidate data set up in pass1
-+#endif
-+
-+ for (i = iap->n; i > 0; i--, cmd++)
-+ {
-+ switch (cmd->type)
-+ {
-+ case RPI_PRED_INTRA:
-+ s->hpc.intra_pred[cmd->size - 2](s, cmd->i_pred.mode, cmd->i_pred.x, cmd->i_pred.y, cmd->avail);
-+ break;
-+ case RPI_PRED_INTRA_C:
-+ s->hpc.intra_pred_c[cmd->size - 2](s, cmd->i_pred.mode, cmd->i_pred.x, cmd->i_pred.y, cmd->avail);
-+ break;
-+ case RPI_PRED_ADD_RESIDUAL:
-+ s->hevcdsp.add_residual[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride);
-+ break;
-+ case RPI_PRED_ADD_DC:
-+ s->hevcdsp.add_residual_dc[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc);
-+ break;
-+ case RPI_PRED_ADD_RESIDUAL_U:
-+ s->hevcdsp.add_residual_u[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc);
-+ break;
-+ case RPI_PRED_ADD_RESIDUAL_V:
-+ s->hevcdsp.add_residual_v[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride, cmd->ta.dc);
-+ break;
-+ case RPI_PRED_ADD_RESIDUAL_C:
-+ s->hevcdsp.add_residual_c[cmd->size - 2](cmd->ta.dst, (int16_t *)cmd->ta.buf, cmd->ta.stride);
-+ break;
-+ case RPI_PRED_ADD_DC_U:
-+ case RPI_PRED_ADD_DC_V:
-+ s->hevcdsp.add_residual_dc_c[cmd->size - 2](cmd->dc.dst, cmd->dc.stride, cmd->dc.dc);
-+ break;
-+
-+ case RPI_PRED_I_PCM:
-+ pcm_extract(s, cmd->i_pcm.src, cmd->i_pcm.src_len, cmd->i_pcm.x, cmd->i_pcm.y, 1 << cmd->size);
-+ break;
-+
-+ default:
-+ av_log(s->avctx, AV_LOG_PANIC, "Bad command %d in worker pred Q\n", cmd->type);
-+ abort();
-+ }
-+ }
-+
-+ // Mark done
-+ iap->n = 0;
-+}
-+
-+
-+// Set initial uniform job values & zero ctu_count
-+static void rpi_begin(const HEVCRpiContext * const s, HEVCRpiJob * const jb, const unsigned int ctu_ts_first)
-+{
-+ unsigned int i;
-+ HEVCRpiInterPredEnv *const cipe = &jb->chroma_ip;
-+ HEVCRpiInterPredEnv *const yipe = &jb->luma_ip;
-+ const HEVCRpiSPS * const sps = s->ps.sps;
-+
-+ const uint16_t pic_width_y = sps->width;
-+ const uint16_t pic_height_y = sps->height;
-+
-+ const uint16_t pic_width_c = sps->width >> ctx_hshift(s, 1);
-+ const uint16_t pic_height_c = sps->height >> ctx_vshift(s, 1);
-+
-+ // We expect the pointer to change if we use another sps
-+ if (sps != jb->sps)
-+ {
-+ worker_pic_free_one(jb);
-+
-+ set_ipe_from_ici(cipe, &ipe_init_infos[s->ps.sps->bit_depth - 8].chroma);
-+ set_ipe_from_ici(yipe, &ipe_init_infos[s->ps.sps->bit_depth - 8].luma);
-+
-+ {
-+ const int coefs_per_luma = HEVC_MAX_CTB_SIZE * HEVC_RPI_MAX_WIDTH;
-+ const int coefs_per_chroma = (coefs_per_luma * 2) >> (ctx_vshift(s, 1) + ctx_hshift(s, 1));
-+ worker_pic_alloc_one(jb, coefs_per_luma + coefs_per_chroma);
-+ }
-+
-+ jb->sps = sps;
-+ }
-+
-+ jb->waited = 0;
-+ jb->ctu_ts_first = ctu_ts_first;
-+ jb->ctu_ts_last = -1;
-+
-+ rpi_inter_pred_reset(cipe);
-+ for (i = 0; i < cipe->n; i++) {
-+ HEVCRpiInterPredQ * const cp = cipe->q + i;
-+ qpu_mc_pred_c_s_t * const u = &cp->qpu_mc_base->c.s;
-+
-+ u->next_src1.x = 0;
-+ u->next_src1.y = 0;
-+ u->next_src1.base = 0;
-+ u->pic_cw = pic_width_c;
-+ u->pic_ch = pic_height_c;
-+ u->stride2 = av_rpi_sand_frame_stride2(s->frame);
-+ u->stride1 = av_rpi_sand_frame_stride1(s->frame);
-+ cp->last_l0 = &u->next_src1;
-+
-+ u->next_fn = 0;
-+ u->next_src2.x = 0;
-+ u->next_src2.y = 0;
-+ u->next_src2.base = 0;
-+ cp->last_l1 = &u->next_src2;
-+
-+ cp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(u + 1);
-+ }
-+
-+ rpi_inter_pred_reset(yipe);
-+ for (i = 0; i < yipe->n; i++) {
-+ HEVCRpiInterPredQ * const yp = yipe->q + i;
-+ qpu_mc_pred_y_s_t * const y = &yp->qpu_mc_base->y.s;
-+
-+ y->next_src1.x = 0;
-+ y->next_src1.y = 0;
-+ y->next_src1.base = 0;
-+ y->next_src2.x = 0;
-+ y->next_src2.y = 0;
-+ y->next_src2.base = 0;
-+ y->pic_h = pic_height_y;
-+ y->pic_w = pic_width_y;
-+ y->stride2 = av_rpi_sand_frame_stride2(s->frame);
-+ y->stride1 = av_rpi_sand_frame_stride1(s->frame);
-+ y->next_fn = 0;
-+ yp->last_l0 = &y->next_src1;
-+ yp->last_l1 = &y->next_src2;
-+
-+ yp->qpu_mc_curr = (qpu_mc_pred_cmd_t *)(y + 1);
-+ }
-+
-+ jb->last_y8_p = NULL;
-+ jb->last_y8_l1 = NULL;
-+
-+ for (i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i) {
-+ jb->progress_req[i] = -1;
-+ }
-+
-+ worker_pic_reset(&jb->coeffs);
-+}
-+
-+
-+#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C
-+static unsigned int mc_terminate_add_qpu(const HEVCRpiContext * const s,
-+ const vpu_qpu_job_h vqj,
-+ rpi_cache_flush_env_t * const rfe,
-+ HEVCRpiInterPredEnv * const ipe)
-+{
-+ unsigned int i;
-+ uint32_t mail[QPU_N_MAX][QPU_MAIL_EL_VALS];
-+ unsigned int max_block = 0;
-+
-+ if (!ipe->used) {
-+ return 0;
-+ }
-+
-+ if (ipe->curr != 0) {
-+ rpi_inter_pred_sync(ipe);
-+ }
-+
-+ // Add final commands to Q
-+ for(i = 0; i != ipe->n; ++i) {
-+ HEVCRpiInterPredQ * const yp = ipe->q + i;
-+ qpu_mc_src_t *const p0 = yp->last_l0;
-+ qpu_mc_src_t *const p1 = yp->last_l1;
-+ const unsigned int block_size = (char *)yp->qpu_mc_curr - (char *)yp->qpu_mc_base;
-+
-+ if (block_size > max_block)
-+ max_block = block_size;
-+
-+ yp->qpu_mc_curr->data[-1] = yp->code_exit;
-+
-+ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched
-+ p0->x = MC_DUMMY_X;
-+ p0->y = MC_DUMMY_Y;
-+ p0->base = s->qpu_dummy_frame_qpu;
-+ p1->x = MC_DUMMY_X;
-+ p1->y = MC_DUMMY_Y;
-+ p1->base = s->qpu_dummy_frame_qpu;
-+
-+ yp->last_l0 = NULL;
-+ yp->last_l1 = NULL;
-+
-+ // Add to mailbox list
-+ mail[i][0] = ipe->gptr.vc + ((uint8_t *)yp->qpu_mc_base - ipe->gptr.arm);
-+ mail[i][1] = yp->code_setup;
-+ }
-+
-+ // We don't need invalidate here as the uniforms aren't changed by the QPU
-+ // and leaving them in ARM cache avoids (pointless) pre-reads when writing
-+ // new values which seems to give us a small performance advantage
-+ //
-+ // In most cases we will not have a completely packed set of uniforms and as
-+ // we have a 2d invalidate we writeback all uniform Qs to the depth of the
-+ // fullest
-+ rpi_cache_flush_add_gm_blocks(rfe, &ipe->gptr, RPI_CACHE_FLUSH_MODE_WRITEBACK,
-+ (uint8_t *)ipe->q[0].qpu_mc_base - ipe->gptr.arm, max_block,
-+ ipe->n, ipe->max_fill + ipe->min_gap);
-+ vpu_qpu_job_add_qpu(vqj, ipe->n, (uint32_t *)mail);
-+
-+ return 1;
-+}
-+#endif
-+
-+#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
-+static unsigned int mc_terminate_add_emu(const HEVCRpiContext * const s,
-+ const vpu_qpu_job_h vqj,
-+ rpi_cache_flush_env_t * const rfe,
-+ HEVCRpiInterPredEnv * const ipe)
-+{
-+ unsigned int i;
-+ if (!ipe->used) {
-+ return 0;
-+ }
-+
-+ if (ipe->curr != 0) {
-+ rpi_inter_pred_sync(ipe);
-+ }
-+
-+ // Add final commands to Q
-+ for(i = 0; i != ipe->n; ++i) {
-+ HEVCRpiInterPredQ * const yp = ipe->q + i;
-+ qpu_mc_src_t *const p0 = yp->last_l0;
-+ qpu_mc_src_t *const p1 = yp->last_l1;
-+
-+ yp->qpu_mc_curr->data[-1] = yp->code_exit;
-+
-+ // Need to set the srcs for L0 & L1 to something that can be (pointlessly) prefetched
-+ p0->x = MC_DUMMY_X;
-+ p0->y = MC_DUMMY_Y;
-+ p0->base = s->qpu_dummy_frame_emu;
-+ p1->x = MC_DUMMY_X;
-+ p1->y = MC_DUMMY_Y;
-+ p1->base = s->qpu_dummy_frame_emu;
-+
-+ yp->last_l0 = NULL;
-+ yp->last_l1 = NULL;
-+ }
-+
-+ return 1;
-+}
-+#endif
-+
-+
-+#if RPI_QPU_EMU_Y
-+#define mc_terminate_add_y mc_terminate_add_emu
-+#else
-+#define mc_terminate_add_y mc_terminate_add_qpu
-+#endif
-+#if RPI_QPU_EMU_C
-+#define mc_terminate_add_c mc_terminate_add_emu
-+#else
-+#define mc_terminate_add_c mc_terminate_add_qpu
-+#endif
-+
-+
-+static void flush_frame(HEVCRpiContext *s,AVFrame *frame)
-+{
-+ rpi_cache_buf_t cbuf;
-+ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init(&cbuf);
-+ rpi_cache_flush_add_frame(rfe, frame, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE);
-+ rpi_cache_flush_finish(rfe);
-+}
-+
-+static void job_gen_bounds(const HEVCRpiContext * const s, HEVCRpiJob * const jb)
-+{
-+ const unsigned int rs0 = s->ps.pps->ctb_addr_ts_to_rs[jb->ctu_ts_first];
-+ const unsigned int rs1 = s->ps.pps->ctb_addr_ts_to_rs[jb->ctu_ts_last];
-+ const unsigned int ctb_width = s->ps.sps->ctb_width;
-+ RpiBlk *const bounds = &jb->bounds;
-+ av_assert1(jb->ctu_ts_first <= jb->ctu_ts_last);
-+ bounds->x = (rs0 % ctb_width) << s->ps.sps->log2_ctb_size;
-+ bounds->y = (rs0 / ctb_width) << s->ps.sps->log2_ctb_size;
-+ bounds->w = ((rs1 - rs0) % ctb_width + 1) << s->ps.sps->log2_ctb_size;
-+ bounds->h = ((rs1 - rs0) / ctb_width + 1) << s->ps.sps->log2_ctb_size;
-+}
-+
-+#if RPI_PASSES == 2
-+static void worker_core2(HEVCRpiContext * const s, HEVCRpiJob * const jb)
-+{
-+ // Perform intra prediction and residual reconstruction
-+ rpi_execute_pred_cmds(s, jb);
-+
-+ // Perform deblocking for CTBs in this row
-+ rpi_execute_dblk_cmds(s, jb);
-+}
-+#endif
-+
-+// Core execution tasks
-+static void worker_core(const HEVCRpiContext * const s, HEVCRpiJob * const jb)
-+{
-+ int pred_y, pred_c;
-+ vpu_qpu_job_env_t qvbuf;
-+ const vpu_qpu_job_h vqj = vpu_qpu_job_init(&qvbuf);
-+#if RPI_WORKER_WAIT_PASS_0
-+ int do_wait;
-+#endif
-+
-+ {
-+ const HEVCRpiCoeffsEnv * const cf = &jb->coeffs;
-+ if (cf->s[3].n + cf->s[2].n != 0)
-+ {
-+ const unsigned int csize = sizeof(cf->s[3].buf[0]);
-+ const unsigned int offset32 = ((cf->s[3].buf - cf->s[2].buf) - cf->s[3].n) * csize;
-+ unsigned int n16 = (cf->s[2].n >> 8);
-+ unsigned int n32 = (cf->s[3].n >> 10);
-+#if RPI_COMPRESS_COEFFS
-+ if (cf->s[2].packed) {
-+ n16 = n16 | (n16<<16);
-+ } else {
-+ const unsigned int npack16 = (cf->s[2].packed_n>>8);
-+ n16 = n16 | (npack16<<16);
-+ }
-+ if (cf->s[3].packed) {
-+ n32 = n32 | (n32<<16);
-+ } else {
-+ const unsigned int npack32 = (cf->s[3].packed_n>>10);
-+ n32 = n32 | (npack32<<16);
-+ }
-+#endif
-+ vpu_qpu_job_add_vpu(vqj,
-+ vpu_get_fn(s->ps.sps->bit_depth),
-+ vpu_get_constants(),
-+ cf->gptr.vc,
-+ n16,
-+ cf->gptr.vc + offset32,
-+ n32,
-+ 0);
-+
-+ rpi_cache_flush_add_gm_range(jb->rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, 0, cf->s[2].n * csize);
-+ rpi_cache_flush_add_gm_range(jb->rfe, &cf->gptr, RPI_CACHE_FLUSH_MODE_WB_INVALIDATE, offset32, cf->s[3].n * csize);
-+ }
-+ }
-+
-+ pred_c = mc_terminate_add_c(s, vqj, jb->rfe, &jb->chroma_ip);
-+
-+// We could take a sync here and try to locally overlap QPU processing with ARM
-+// but testing showed a slightly negative benefit with noticable extra complexity
-+
-+ pred_y = mc_terminate_add_y(s, vqj, jb->rfe, &jb->luma_ip);
-+
-+ // Returns 0 if nothing to do, 1 if sync added
-+#if RPI_WORKER_WAIT_PASS_0
-+ do_wait = vpu_qpu_job_add_sync_sem(vqj, &jb->sem);
-+#else
-+ if (vpu_qpu_job_add_sync_sem(vqj, &jb->sem) == 0)
-+ sem_post(&jb->sem);
-+#endif
-+
-+ rpi_cache_flush_execute(jb->rfe);
-+
-+ // Await progress as required
-+ // jb->waited will only be clear if we have already tested the progress values
-+ // (in worker_submit_job) and found we don't have to wait
-+ if (jb->waited)
-+ {
-+ unsigned int i;
-+ for (i = 0; i != FF_ARRAY_ELEMS(jb->progress_req); ++i) {
-+ if (jb->progress_req[i] >= 0) {
-+ ff_hevc_rpi_progress_wait_recon(s, jb, s->DPB + i, jb->progress_req[i]);
-+ }
-+ }
-+ }
-+
-+ vpu_qpu_job_finish(vqj);
-+
-+ // We always work on a rectangular block
-+ if (pred_y || pred_c)
-+ {
-+ rpi_cache_flush_add_frame_block(jb->rfe, s->frame, RPI_CACHE_FLUSH_MODE_INVALIDATE,
-+ jb->bounds.x, jb->bounds.y, jb->bounds.w, jb->bounds.h,
-+ ctx_vshift(s, 1), pred_y, pred_c);
-+ }
-+
-+ // If we have emulated VPU ops - do it here
-+#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
-+ if (av_rpi_is_sand8_frame(s->frame))
-+ {
-+#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C
-+ ff_hevc_rpi_shader_c8(s, &jb->luma_ip, &jb->chroma_ip);
-+#elif RPI_QPU_EMU_Y
-+ ff_hevc_rpi_shader_c8(s, &jb->luma_ip, NULL);
-+#else
-+ ff_hevc_rpi_shader_c8(s, NULL, &jb->chroma_ip);
-+#endif
-+ }
-+ else
-+ {
-+#if RPI_QPU_EMU_Y && RPI_QPU_EMU_C
-+ ff_hevc_rpi_shader_c16(s, &jb->luma_ip, &jb->chroma_ip);
-+#elif RPI_QPU_EMU_Y
-+ ff_hevc_rpi_shader_c16(s, &jb->luma_ip, NULL);
-+#else
-+ ff_hevc_rpi_shader_c16(s, NULL, &jb->chroma_ip);
-+#endif
-+ }
-+#endif
-+
-+#if RPI_WORKER_WAIT_PASS_0
-+ if (do_wait)
-+ rpi_sem_wait(&jb->sem);
-+ rpi_cache_flush_execute(jb->rfe);
-+#endif
-+}
-+
-+
-+static void rpi_free_inter_pred(HEVCRpiInterPredEnv * const ipe)
-+{
-+ av_freep(&ipe->q);
-+ gpu_free(&ipe->gptr);
-+}
-+
-+static HEVCRpiJob * job_new(void)
-+{
-+ HEVCRpiJob * const jb = av_mallocz(sizeof(HEVCRpiJob));
-+
-+ sem_init(&jb->sem, 0, 0);
-+ jb->rfe = rpi_cache_flush_init(&jb->flush_buf);
-+ ff_hevc_rpi_progress_init_wait(&jb->progress_wait);
-+
-+ jb->intra.n = 0;
-+ jb->intra.cmds = av_mallocz(sizeof(HEVCPredCmd) * RPI_MAX_PRED_CMDS);
-+
-+ // * Sizeof the union structure might be overkill but at the moment it
-+ // is correct (it certainly isn't going to be too small)
-+ // Set max fill to slack/2 from the end of the Q
-+ // If we exceed this in any Q then we will schedule by size (which should
-+ // mean that we never use that Q again part from syncs)
-+ // * Given how agressive the overflow resonse is we could maybe put the
-+ // threshold even nearer the end, but I don't expect us to ever hit
-+ // it on any real stream anyway.
-+
-+ rpi_inter_pred_alloc(&jb->chroma_ip,
-+ QPU_N_MAX, QPU_N_GRP,
-+ QPU_C_COMMANDS * sizeof(qpu_mc_pred_c_t) + QPU_C_SYNCS * sizeof(uint32_t),
-+ QPU_C_CMD_SLACK_PER_Q * sizeof(qpu_mc_pred_c_t) / 2);
-+ rpi_inter_pred_alloc(&jb->luma_ip,
-+ QPU_N_MAX, QPU_N_GRP,
-+ QPU_Y_COMMANDS * sizeof(qpu_mc_pred_y_t) + QPU_Y_SYNCS * sizeof(uint32_t),
-+ QPU_Y_CMD_SLACK_PER_Q * sizeof(qpu_mc_pred_y_t) / 2);
-+
-+ return jb;
-+}
-+
-+static void job_delete(HEVCRpiJob * const jb)
-+{
-+ worker_pic_free_one(jb);
-+ ff_hevc_rpi_progress_kill_wait(&jb->progress_wait);
-+ av_freep(&jb->intra.cmds);
-+ rpi_free_inter_pred(&jb->chroma_ip);
-+ rpi_free_inter_pred(&jb->luma_ip);
-+ rpi_cache_flush_finish(jb->rfe); // Not really needed - should do nothing
-+ sem_destroy(&jb->sem);
-+ av_free(jb);
-+}
-+
-+static void jbg_delete(HEVCRpiJobGlobal * const jbg)
-+{
-+ HEVCRpiJob * jb;
-+
-+ if (jbg == NULL)
-+ return;
-+
-+ jb = jbg->free1;
-+ while (jb != NULL)
-+ {
-+ HEVCRpiJob * const jb2 = jb;
-+ jb = jb2->next;
-+ job_delete(jb2);
-+ }
-+
-+ pthread_mutex_destroy(&jbg->lock);
-+ av_free(jbg);
-+}
-+
-+static HEVCRpiJobGlobal * jbg_new(unsigned int job_count)
-+{
-+ HEVCRpiJobGlobal * const jbg = av_mallocz(sizeof(HEVCRpiJobGlobal));
-+ if (jbg == NULL)
-+ return NULL;
-+
-+ pthread_mutex_init(&jbg->lock, NULL);
-+
-+ while (job_count-- != 0)
-+ {
-+ HEVCRpiJob * const jb = job_new();
-+ if (jb == NULL)
-+ goto fail;
-+
-+ jb->next = jbg->free1;
-+ jbg->free1 = jb;
-+ }
-+
-+ return jbg;
-+
-+fail:
-+ jbg_delete(jbg);
-+ return NULL;
-+}
-+
-+static void rpi_job_ctl_delete(HEVCRpiJobCtl * const jbc)
-+{
-+ HEVCRpiJobGlobal * jbg;
-+
-+ if (jbc == NULL)
-+ return;
-+
-+ jbg = jbc->jbg;
-+
-+ if (jbc->jb1 != NULL)
-+ job_delete(jbc->jb1);
-+
-+ pthread_mutex_destroy(&jbc->in_lock);
-+ sem_destroy(&jbc->sem_out);
-+ av_free(jbc);
-+
-+ // Deref the global job context
-+ if (jbg != NULL && atomic_fetch_add(&jbg->ref_count, -1) == 1)
-+ jbg_delete(jbg);
-+}
-+
-+static HEVCRpiJobCtl * rpi_job_ctl_new(HEVCRpiJobGlobal *const jbg)
-+{
-+ HEVCRpiJobCtl * const jbc = av_mallocz(sizeof(HEVCRpiJobCtl));
-+
-+ if (jbc == NULL)
-+ return NULL;
-+
-+ jbc->jbg = jbg;
-+ atomic_fetch_add(&jbg->ref_count, 1);
-+
-+ sem_init(&jbc->sem_out, 0, RPI_MAX_JOBS);
-+ pthread_mutex_init(&jbc->in_lock, NULL);
-+
-+ if ((jbc->jb1 = job_new()) == NULL)
-+ goto fail;
-+ jbc->jb1->jbc_local = jbc;
-+
-+ return jbc;
-+
-+fail:
-+ rpi_job_ctl_delete(jbc);
-+ return NULL;
-+}
-+
-+
-+
-+static av_cold void hevc_init_worker(HEVCRpiContext * const s)
-+{
-+#if RPI_PASSES == 2
-+ pass_queue_init(s->passq + 1, s, worker_core2, &s->jbc->sem_out, 1);
-+#elif RPI_PASSES == 3
-+ pass_queue_init(s->passq + 2, s, rpi_execute_dblk_cmds, &s->jbc->sem_out, 2);
-+ pass_queue_init(s->passq + 1, s, rpi_execute_pred_cmds, &s->passq[2].sem_in, 1);
-+#else
-+#error Passes confused
-+#endif
-+ pass_queue_init(s->passq + 0, s, worker_core, &s->passq[1].sem_in, 0);
-+
-+ pass_queues_start_all(s);
-+}
-+
-+static av_cold void hevc_exit_worker(HEVCRpiContext *s)
-+{
-+ pass_queues_term_all(s);
-+
-+ pass_queues_kill_all(s);
-+
-+ rpi_job_ctl_delete(s->jbc);
-+ s->jbc = NULL;
-+}
-+
-+
-+static int slice_start(const HEVCRpiContext * const s, HEVCRpiLocalContext *const lc)
-+{
-+ const int ctb_addr_ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
-+ const int tiles = s->ps.pps->num_tile_rows * s->ps.pps->num_tile_columns;
-+ const unsigned int tile_id = s->ps.pps->tile_id[ctb_addr_ts];
-+
-+ // Check for obvious disasters
-+ if (ctb_addr_ts == 0 && s->sh.dependent_slice_segment_flag) {
-+ av_log(s->avctx, AV_LOG_ERROR, "Impossible initial tile.\n");
-+ return AVERROR_INVALIDDATA;
-+ }
-+
-+ // If dependant then ctb_addr_ts != 0 from previous check
-+ if (s->sh.dependent_slice_segment_flag) {
-+ int prev_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts - 1];
-+ if (s->tab_slice_address[prev_rs] != s->sh.slice_addr) {
-+ av_log(s->avctx, AV_LOG_ERROR, "Previous slice segment missing\n");
-+ return AVERROR_INVALIDDATA;
-+ }
-+ }
-+
-+ if (!s->ps.pps->entropy_coding_sync_enabled_flag &&
-+ tile_id + s->sh.num_entry_point_offsets >= tiles)
-+ {
-+ av_log(s->avctx, AV_LOG_ERROR, "Entry points exceed tiles\n");
-+ return AVERROR_INVALIDDATA;
-+ }
-+
-+ // Tiled stuff must start at start of tile if it has multiple entry points
-+ if (!s->ps.pps->entropy_coding_sync_enabled_flag &&
-+ s->sh.num_entry_point_offsets != 0 &&
-+ ctb_addr_ts != s->ps.pps->tile_pos_ts[tile_id])
-+ {
-+ av_log(s->avctx, AV_LOG_ERROR, "Multiple tiles in slice; slice start != tile start\n");
-+ return AVERROR_INVALIDDATA;
-+ }
-+
-+ ff_hevc_rpi_cabac_init_decoder(lc);
-+
-+ // Setup any required decode vars
-+ lc->cabac_init_req = !s->sh.dependent_slice_segment_flag;
-+
-+// printf("SS: req=%d, sol=%d, sot=%d\n", lc->cabac_init_req, sol, sot);
-+ lc->qp_y = s->sh.slice_qp;
-+
-+ // General setup
-+ lc->bt_line_no = 0;
-+ lc->ts = ctb_addr_ts;
-+ return 0;
-+}
-+
-+static int gen_entry_points(HEVCRpiContext * const s, const H2645NAL * const nal)
-+{
-+ const GetBitContext * const gb = &s->HEVClc->gb;
-+ RpiSliceHeader * const sh = &s->sh;
-+ int i, j;
-+
-+ const unsigned int length = nal->size;
-+ unsigned int offset = ((gb->index) >> 3) + 1; // We have a bit & align still to come = +1 byte
-+ unsigned int cmpt;
-+ unsigned int startheader;
-+
-+ if (sh->num_entry_point_offsets == 0) {
-+ s->data = NULL;
-+ return 0;
-+ }
-+
-+ // offset in slice header includes emulation prevention bytes.
-+ // Unfortunately those have been removed by the time we get here so we
-+ // have to compensate. The nal layer keeps a track of where they were.
-+ for (j = 0, cmpt = 0, startheader = offset + sh->entry_point_offset[0]; j < nal->skipped_bytes; j++) {
-+ if (nal->skipped_bytes_pos[j] >= offset && nal->skipped_bytes_pos[j] < startheader) {
-+ startheader--;
-+ cmpt++;
-+ }
-+ }
-+
-+ for (i = 1; i < sh->num_entry_point_offsets; i++) {
-+ offset += (sh->entry_point_offset[i - 1] - cmpt);
-+ for (j = 0, cmpt = 0, startheader = offset + sh->entry_point_offset[i]; j < nal->skipped_bytes; j++) {
-+ if (nal->skipped_bytes_pos[j] >= offset && nal->skipped_bytes_pos[j] < startheader) {
-+ startheader--;
-+ cmpt++;
-+ }
-+ }
-+ if (sh->entry_point_offset[i] <= cmpt) {
-+ av_log(s->avctx, AV_LOG_ERROR, "entry point offset <= skipped bytes\n");
-+ return AVERROR_INVALIDDATA;
-+ }
-+ sh->size[i - 1] = sh->entry_point_offset[i] - cmpt;
-+ sh->offset[i - 1] = offset;
-+ }
-+
-+ offset += sh->entry_point_offset[sh->num_entry_point_offsets - 1] - cmpt;
-+ if (length < offset) {
-+ av_log(s->avctx, AV_LOG_ERROR, "entry_point_offset table is corrupted\n");
-+ return AVERROR_INVALIDDATA;
-+ }
-+ sh->size[sh->num_entry_point_offsets - 1] = length - offset;
-+ sh->offset[sh->num_entry_point_offsets - 1] = offset;
-+
-+ // Remember data start pointer as we won't have nal later
-+ s->data = nal->data;
-+ return 0;
-+}
-+
-+
-+// Return
-+// < 0 Error
-+// 0 OK
-+//
-+// jb->ctu_ts_last < 0 Job still filling
-+// jb->ctu_ts_last >= 0 Job ready
-+
-+static int fill_job(HEVCRpiContext * const s, HEVCRpiLocalContext *const lc, unsigned int max_blocks)
-+{
-+ const unsigned int log2_ctb_size = s->ps.sps->log2_ctb_size;
-+ const unsigned int ctb_size = (1 << log2_ctb_size);
-+ HEVCRpiJob * const jb = lc->jb0;
-+ int more_data = 1;
-+ unsigned int ctb_addr_ts = lc->ts;
-+ unsigned int ctb_addr_rs = s->ps.pps->ctb_addr_ts_to_rs[ctb_addr_ts];
-+ unsigned int x_ctb = (ctb_addr_rs % s->ps.sps->ctb_width) << log2_ctb_size;
-+ const unsigned int y_ctb = (ctb_addr_rs / s->ps.sps->ctb_width) << log2_ctb_size;
-+
-+ lc->unit_done = 0;
-+
-+ while (more_data && ctb_addr_ts < s->ps.sps->ctb_size)
-+ {
-+ int q_full;
-+ const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[ctb_addr_ts];
-+
-+ hls_decode_neighbour(s, lc, x_ctb, y_ctb, ctb_addr_ts);
-+
-+ ff_hevc_rpi_cabac_init(s, lc, ctb_flags);
-+
-+ hls_sao_param(s, lc, x_ctb >> log2_ctb_size, y_ctb >> log2_ctb_size);
-+
-+ s->deblock[ctb_addr_rs].beta_offset = s->sh.beta_offset;
-+ s->deblock[ctb_addr_rs].tc_offset = s->sh.tc_offset;
-+ s->filter_slice_edges[ctb_addr_rs] = s->sh.slice_loop_filter_across_slices_enabled_flag;
-+
-+ // Zap stashes if navail
-+ if ((lc->ctb_avail & AVAIL_U) == 0)
-+ zap_cabac_stash(s->cabac_stash_up + (x_ctb >> 3), log2_ctb_size - 3);
-+ if ((lc->ctb_avail & AVAIL_L) == 0)
-+ {
-+ memset(lc->ipm_left, INTRA_DC, IPM_TAB_SIZE);
-+ zap_cabac_stash(s->cabac_stash_left + (y_ctb >> 3), log2_ctb_size - 3);
-+ }
-+#if MVF_STASH_WIDTH > 64
-+ // Restore left mvf stash at start of tile if not at start of line
-+ if ((ctb_flags & CTB_TS_FLAGS_SOTL) != 0 && x_ctb != 0 && !s->is_irap)
-+ {
-+ unsigned int i;
-+ HEVCRpiMvField * dst = mvf_stash_ptr(s, lc, x_ctb - 1, 0);
-+ const HEVCRpiMvField * src = s->mvf_left + (y_ctb >> LOG2_MIN_PU_SIZE);
-+ for (i = 0; i != ctb_size >> LOG2_MIN_PU_SIZE; ++i)
-+ {
-+ *dst = *src++;
-+ dst += MVF_STASH_WIDTH_PU;
-+ }
-+ }
-+#endif
-+
-+ // Set initial tu states
-+ lc->tu.cu_qp_delta = 0;
-+ lc->tu.is_cu_qp_delta_wanted = 0;
-+ lc->tu.cu_chroma_qp_offset_wanted = 0;
-+
-+ // Decode
-+ more_data = hls_coding_quadtree(s, lc, x_ctb, y_ctb, log2_ctb_size, 0);
-+
-+ if (ff_hevc_rpi_cabac_overflow(lc))
-+ {
-+ av_log(s->avctx, AV_LOG_ERROR, "Quadtree bitstream overread\n ");
-+ more_data = AVERROR_INVALIDDATA;
-+ }
-+
-+ if (more_data < 0) {
-+ s->tab_slice_address[ctb_addr_rs] = TAB_SLICE_ADDR_BROKEN; // Mark slice as broken
-+ return more_data;
-+ }
-+
-+ if (more_data && ((ctb_flags & CTB_TS_FLAGS_EOT) != 0 ||
-+ (s->ps.pps->entropy_coding_sync_enabled_flag && (ctb_flags & CTB_TS_FLAGS_EOTL) != 0)))
-+ {
-+ if (ff_hevc_rpi_get_cabac_terminate(&lc->cc) < 0 ||
-+ ff_hevc_rpi_cabac_skip_bytes(&lc->cc, 0) == NULL)
-+ {
-+ av_log(s->avctx, AV_LOG_ERROR, "Error reading terminate el\n ");
-+ return -1;
-+ }
-+ }
-+
-+ // --- Post CTB processing
-+
-+ // Stash rpl top/left for deblock that needs to remember such things cross-slice
-+ s->rpl_up[x_ctb >> log2_ctb_size] = s->refPicList;
-+ s->rpl_left[y_ctb >> log2_ctb_size] = s->refPicList;
-+
-+ if (!s->is_irap)
-+ {
-+ // Copy MVF up to up-left & stash to up
-+ {
-+ const HEVCRpiMvField * src = mvf_stash_ptr(s, lc, x_ctb, ctb_size - 1);
-+ HEVCRpiMvField * dst = s->mvf_up + (x_ctb >> LOG2_MIN_PU_SIZE);
-+
-+ // printf("Stash: %d,%d, ctb_size=%d, %p->%p\n", x_ctb, y_ctb, ctb_size, src, dst);
-+
-+ lc->mvf_ul[0] = dst[(ctb_size - 1) >> LOG2_MIN_PU_SIZE];
-+ memcpy(dst, src, (sizeof(*src)*ctb_size) >> LOG2_MIN_PU_SIZE);
-+ }
-+ // Stash sideways if end of tile line but not end of line (no point)
-+ // ** Could/should do this @ end of fn
-+#if MVF_STASH_WIDTH > 64
-+ if ((ctb_flags & (CTB_TS_FLAGS_EOTL | CTB_TS_FLAGS_EOL)) == CTB_TS_FLAGS_EOTL)
-+#endif
-+ {
-+ unsigned int i;
-+ const HEVCRpiMvField * src = mvf_stash_ptr(s, lc, x_ctb + ctb_size - 1, 0);
-+ HEVCRpiMvField * dst = s->mvf_left + (y_ctb >> LOG2_MIN_PU_SIZE);
-+ for (i = 0; i != ctb_size >> LOG2_MIN_PU_SIZE; ++i)
-+ {
-+ *dst++ = *src;
-+ src += MVF_STASH_WIDTH_PU;
-+ }
-+ }
-+ }
-+
-+ if ((ctb_flags & CTB_TS_FLAGS_CSAVE) != 0)
-+ ff_hevc_rpi_save_states(s, lc);
-+
-+ // Report progress so we can use our MVs in other frames
-+ if ((ctb_flags & CTB_TS_FLAGS_EOL) != 0)
-+ ff_hevc_rpi_progress_signal_mv(s, y_ctb + ctb_size - 1);
-+
-+ // End of line || End of tile line || End of tile
-+ // (EoL covers end of frame for our purposes here)
-+ q_full = ((ctb_flags & CTB_TS_FLAGS_EOTL) != 0);
-+
-+ // Allocate QPU chunks on fixed size 64 pel boundries rather than
-+ // whatever ctb_size is today.
-+ // * We might quite like to continue to 64 pel vertical too but that
-+ // currently confuses WPP
-+ if (((x_ctb + ctb_size) & 63) == 0 || q_full)
-+ {
-+ int overflow = 0;
-+ if (rpi_inter_pred_next_ctu(&jb->luma_ip) != 0)
-+ overflow = 1;
-+ if (rpi_inter_pred_next_ctu(&jb->chroma_ip) != 0)
-+ overflow = 1;
-+ if (overflow)
-+ {
-+ // * This is very annoying (and slow) to cope with in WPP so
-+ // we treat it as an error there (no known stream triggers this
-+ // with the current buffer sizes). Non-wpp should cope fine.
-+ av_log(s->avctx, AV_LOG_WARNING, "%s: Q full before EoL\n", __func__);
-+ q_full = 1;
-+ }
-+ }
-+
-+ // Inc TS to next.
-+ ctb_addr_ts++;
-+ ctb_addr_rs++;
-+ x_ctb += ctb_size;
-+
-+ if (q_full)
-+ {
-+ // Do job
-+ // Prep for submission
-+ jb->ctu_ts_last = ctb_addr_ts - 1; // Was pre-inced
-+ job_gen_bounds(s, jb);
-+ break;
-+ }
-+
-+ // If max_blocks started as 0 then this will never be true
-+ if (--max_blocks == 0)
-+ break;
-+ }
-+
-+ lc->unit_done = (more_data <= 0);
-+ lc->ts = ctb_addr_ts;
-+ return 0;
-+}
-+
-+static void bt_lc_init(HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, const unsigned int n)
-+{
-+ lc->context = s;
-+ lc->jb0 = NULL;
-+ lc->lc_n = n;
-+ lc->bt_terminate = 0;
-+ lc->bt_psem_out = NULL;
-+ sem_init(&lc->bt_sem_in, 0, 0);
-+}
-+
-+#define TRACE_WPP 0
-+#if RPI_EXTRA_BIT_THREADS > 0
-+static inline unsigned int line_ts_width(const HEVCRpiContext * const s, unsigned int ts)
-+{
-+ unsigned int rs = s->ps.pps->ctb_addr_ts_to_rs[ts];
-+ return s->ps.pps->column_width[s->ps.pps->col_idxX[rs % s->ps.sps->ctb_width]];
-+}
-+
-+// Move local context parameters from an aux bit thread back to the main
-+// thread at the end of a slice as processing is going to continue there.
-+static void movlc(HEVCRpiLocalContext *const dst_lc, HEVCRpiLocalContext *const src_lc, const int is_dep)
-+{
-+ if (src_lc == dst_lc) {
-+ return;
-+ }
-+
-+ // Move the job
-+ // We will still have an active job if the final line terminates early
-+ // Dest should always be null by now
-+ av_assert1(dst_lc->jb0 == NULL);
-+ dst_lc->jb0 = src_lc->jb0;
-+ src_lc->jb0 = NULL;
-+
-+ // Always need to store where we are in the bitstream
-+ dst_lc->ts = src_lc->ts;
-+ dst_lc->gb = src_lc->gb;
-+ // Cabac init request will be built at start of next slice
-+
-+ // Need to store context if we might have a dependent seg
-+ if (is_dep)
-+ {
-+ dst_lc->qPy_pred = src_lc->qPy_pred;
-+ memcpy(dst_lc->ipm_left, src_lc->ipm_left, sizeof(src_lc->ipm_left));
-+ memcpy(dst_lc->cabac_state, src_lc->cabac_state, sizeof(src_lc->cabac_state));
-+ memcpy(dst_lc->stat_coeff, src_lc->stat_coeff, sizeof(src_lc->stat_coeff));
-+ }
-+}
-+
-+static inline int wait_bt_sem_in(HEVCRpiLocalContext * const lc)
-+{
-+ rpi_sem_wait(&lc->bt_sem_in);
-+ return lc->bt_terminate;
-+}
-+
-+// Do one WPP line
-+// Will not work correctly over horizontal tile boundries - vertical should be OK
-+static int rpi_run_one_line(HEVCRpiContext *const s, HEVCRpiLocalContext * const lc, const int is_first)
-+{
-+ const int is_tile = lc->bt_is_tile;
-+ const unsigned int tile_id = s->ps.pps->tile_id[lc->ts];
-+ const unsigned int line = lc->bt_line_no;
-+ const unsigned int line_inc = lc->bt_line_inc;
-+ const int is_last = (line >= lc->bt_last_line);
-+
-+ const unsigned int ts_eol = lc->ts + (is_tile ? s->ps.pps->tile_size[tile_id] : lc->bt_line_width);
-+ const unsigned int ts_next =
-+ line + line_inc > (unsigned int)s->sh.num_entry_point_offsets ?
-+ INT_MAX :
-+ is_tile ?
-+ s->ps.pps->tile_pos_ts[tile_id + line_inc] :
-+ lc->ts + lc->bt_line_width * line_inc;
-+ // Tile wants line, WPP a few CTUs (must be >= 2 for cabac context to work)
-+ const unsigned int partial_size = is_tile ? line_ts_width(s, lc->ts) : 2;
-+ unsigned int ts_prev;
-+ int loop_n = 0;
-+ int err = 0;
-+
-+ av_assert1(line <= s->sh.num_entry_point_offsets);
-+
-+#if TRACE_WPP
-+ printf("%s[%d]: Start %s: tile=%d, line=%d/%d/%d, ts=%d/%d/%d, width=%d, jb=%p\n", __func__,
-+ lc->lc_n, is_tile ? "Tile" : "WPP", tile_id,
-+ line, lc->bt_last_line, s->sh.num_entry_point_offsets,
-+ lc->ts, ts_eol, ts_next, partial_size, lc->jb0);
-+#endif
-+ if (line != 0)
-+ {
-+ const uint8_t * const data = s->data + s->sh.offset[line - 1];
-+ const unsigned int len = s->sh.size[line - 1];
-+ if ((err = init_get_bits8(&lc->gb, data, len)) < 0)
-+ return err;
-+
-+ ff_init_cabac_decoder(&lc->cc, data, len);
-+ }
-+
-+ // We should never be processing a dependent slice here so reset is good
-+ // ?? These probably shouldn't be needed (as they should be set by later
-+ // logic) but do seem to be required
-+ lc->qp_y = s->sh.slice_qp;
-+
-+ do
-+ {
-+ if (!is_last && loop_n > 1) {
-+#if TRACE_WPP
-+ printf("%s[%d]: %sPoke %p\n", __func__, lc->lc_n, err == 0 ? "" : "ERR: ", lc->bt_psem_out);
-+#endif
-+ sem_post(lc->bt_psem_out);
-+ }
-+ // The wait for loop_n == 0 has been done in bit_thread
-+ if (!is_first && loop_n != 0)
-+ {
-+#if TRACE_WPP
-+ printf("%s[%d]: %sWait %p\n", __func__, lc->lc_n, err == 0 ? "" : "ERR: ", &lc->bt_sem_in);
-+#endif
-+ if (wait_bt_sem_in(lc) != 0)
-+ return AVERROR_EXIT;
-+ }
-+
-+#if TRACE_WPP
-+ {
-+ int n;
-+ sem_getvalue(&lc->bt_sem_in, &n);
-+ printf("%s[%d]: ts=%d, sem=%d %p\n", __func__, lc->lc_n, lc->ts, n, &lc->bt_sem_in);
-+ }
-+#endif
-+
-+ ts_prev = lc->ts;
-+
-+ // If we have had an error - do no further decode but do continue
-+ // moving signals around so the other threads continue to operate
-+ // correctly (or at least as correctly as they can with this line missing)
-+ //
-+ // Errors in WPP/Tile are less fatal than normal as we have a good idea
-+ // of how to restart on the next line so there is no need to give up totally
-+ if (err != 0)
-+ {
-+ lc->unit_done = 0;
-+ lc->ts += partial_size;
-+ }
-+ else
-+ {
-+ worker_pass0_ready(s, lc);
-+
-+ if ((err = fill_job(s, lc, partial_size)) < 0 ||
-+ (lc->ts < ts_eol && !is_last && (lc->ts != ts_prev + partial_size || lc->unit_done)))
-+ {
-+ if (err == 0) {
-+ av_log(s->avctx, AV_LOG_ERROR, "Unexpected end of tile/wpp section\n");
-+ err = AVERROR_INVALIDDATA;
-+ }
-+ worker_free(s, lc);
-+ lc->ts = ts_prev + partial_size; // Pretend we did all that
-+ lc->unit_done = 0;
-+ }
-+ else if (is_tile)
-+ {
-+ worker_submit_job(s, lc);
-+ }
-+ }
-+
-+ ++loop_n;
-+ } while (lc->ts < ts_eol && !lc->unit_done);
-+
-+ // If we are on the last line & we didn't get a whole line we must wait for
-+ // and sink the sem_posts from the line above / tile to the left.
-+ while ((ts_prev += partial_size) < ts_eol)
-+ {
-+#if TRACE_WPP
-+ printf("%s[%d]: EOL Wait: ts=%d %p\n", __func__, lc->lc_n, ts_prev, &lc->bt_sem_in);
-+#endif
-+ if (wait_bt_sem_in(lc) != 0)
-+ return AVERROR_EXIT;
-+ }
-+
-+ lc->bt_line_no += line_inc;
-+
-+ if (!is_tile && err == 0)
-+ worker_submit_job(s, lc);
-+
-+ if (!is_last) {
-+ lc->ts = ts_next;
-+
-+#if TRACE_WPP
-+ printf("%s[%d]: Poke post submit %p\n", __func__, lc->lc_n, lc->bt_psem_out);
-+#endif
-+ sem_post(lc->bt_psem_out);
-+ if (loop_n > 1) {
-+#if TRACE_WPP
-+ printf("%s[%d]: Poke post submit2 %p\n", __func__, lc->lc_n, lc->bt_psem_out);
-+#endif
-+ sem_post(lc->bt_psem_out);
-+ }
-+ }
-+ else
-+ {
-+ movlc(s->HEVClcList[0], lc, s->ps.pps->dependent_slice_segments_enabled_flag); // * & not EoT
-+#if MVF_STASH_WIDTH > 64
-+ // Horrid calculations to work out what we want but luckily this should almost never execute
-+ // **** Move to movlc
-+ if (!s->is_irap)
-+ {
-+ const unsigned int ctb_flags = s->ps.pps->ctb_ts_flags[lc->ts];
-+ if ((ctb_flags & CTB_TS_FLAGS_EOTL) == 0) // If EOTL then we have already stashed mvf
-+ {
-+ const unsigned int x_ctb = ((s->ps.pps->ctb_addr_ts_to_rs[lc->ts] % s->ps.sps->ctb_width) << s->ps.sps->log2_ctb_size) - 1;
-+ unsigned int i;
-+ const HEVCRpiMvField *s_mvf = lc->mvf_stash + ((x_ctb >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1));
-+ HEVCRpiMvField *d_mvf = s->HEVClcList[0]->mvf_stash + ((x_ctb >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1));
-+
-+ for (i = 0; i != MVF_STASH_HEIGHT_PU; ++i)
-+ {
-+ *d_mvf = *s_mvf;
-+ d_mvf += MVF_STASH_WIDTH_PU;
-+ s_mvf += MVF_STASH_WIDTH_PU;
-+ }
-+
-+ }
-+ }
-+#endif
-+ // When all done poke the thread 0 sem_in one final time
-+#if TRACE_WPP
-+ printf("%s[%d]: Poke final %p\n", __func__, lc->lc_n, &s->HEVClcList[0]->bt_sem_in);
-+#endif
-+ sem_post(&s->HEVClcList[0]->bt_sem_in);
-+ }
-+
-+#if TRACE_WPP
-+ printf("%s[%d]: End. dep=%d\n", __func__, lc->lc_n, s->ps.pps->dependent_slice_segments_enabled_flag);
-+#endif
-+ return err;
-+}
-+
-+static void wpp_setup_lcs(HEVCRpiContext * const s)
-+{
-+ unsigned int ts = s->ps.pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
-+ const unsigned int line_width = line_ts_width(s, ts);
-+
-+ for (int i = 0; i <= s->sh.num_entry_point_offsets && i < RPI_BIT_THREADS; ++i)
-+ {
-+ HEVCRpiLocalContext * const lc = s->HEVClcList[i];
-+ lc->ts = ts;
-+ lc->bt_is_tile = 0;
-+ lc->bt_line_no = i;
-+ lc->bt_line_width = line_width;
-+ lc->bt_last_line = s->sh.num_entry_point_offsets;
-+ lc->bt_line_inc = RPI_BIT_THREADS;
-+ ts += line_width;
-+ }
-+}
-+
-+
-+// Can only process tile single row at once
-+static void tile_one_row_setup_lcs(HEVCRpiContext * const s, unsigned int slice_row)
-+{
-+ const HEVCRpiPPS * const pps = s->ps.pps;
-+ const unsigned int ts0 = pps->ctb_addr_rs_to_ts[s->sh.slice_segment_addr];
-+ const unsigned int tile0 = pps->tile_id[ts0];
-+ const unsigned int col0 = tile0 % pps->num_tile_columns;
-+
-+ const unsigned int col = (slice_row == 0) ? col0 : 0;
-+ unsigned int line = slice_row * pps->num_tile_columns - col0 + col;
-+ const unsigned int last_line = FFMIN(
-+ line + pps->num_tile_columns - 1 - col, s->sh.num_entry_point_offsets);
-+
-+ const unsigned int par =
-+ FFMIN(RPI_BIT_THREADS, last_line + 1 - line);
-+#if TRACE_WPP
-+ printf("ts0=%d, ents=%d, row=%d, tiles=%dx%d, col=%d, par=%d, line=%d/%d\n", ts0, s->sh.num_entry_point_offsets, slice_row,
-+ pps->num_tile_columns, pps->num_tile_rows, col, par, line, last_line);
-+#endif
-+ for (unsigned int i = 0; i != par; ++i, ++line)
-+ {
-+ HEVCRpiLocalContext * const lc = s->HEVClcList[i];
-+ const unsigned int tile = tile0 + line;
-+
-+ lc->ts = pps->tile_pos_ts[tile];
-+ lc->bt_line_no = line;
-+ lc->bt_is_tile = 1;
-+ lc->bt_line_width = line_ts_width(s, lc->ts);
-+ lc->bt_last_line = last_line;
-+ lc->bt_line_inc = par;
-+ }
-+}
-+
-+
-+static void * bit_thread(void * v)
-+{
-+ HEVCRpiLocalContext * const lc = v;
-+ HEVCRpiContext *const s = lc->context;
-+
-+ while (wait_bt_sem_in(lc) == 0)
-+ {
-+ int err;
-+
-+ if ((err = rpi_run_one_line(s, lc, 0)) < 0) { // Never first tile/wpp
-+ if (lc->bt_terminate) {
-+ av_log(s->avctx, AV_LOG_ERROR, "%s: Unexpected termination\n", __func__);
-+ break;
-+ }
-+ av_log(s->avctx, AV_LOG_WARNING, "%s: Decode failure: %d\n", __func__, err);
-+ }
-+ }
-+
-+ return NULL;
-+}
-+
-+static int bit_threads_start(HEVCRpiContext * const s)
-+{
-+ if (s->bt_started)
-+ return 0;
-+
-+ for (int i = 1; i < RPI_BIT_THREADS; ++i)
-+ {
-+ // lc[0] belongs to the main thread - this sets up lc[1..RPI_BIT_THREADS]
-+ if (s->HEVClcList[i] == NULL) {
-+ if ((s->HEVClcList[i] = av_mallocz(sizeof(*s->HEVClcList[0]))) == NULL)
-+ return -1;
-+ }
-+
-+ bt_lc_init(s, s->HEVClcList[i], i);
-+ job_lc_init(s->HEVClcList[i]);
-+ }
-+
-+ // Link the sems in a circle
-+ for (int i = 0; i < RPI_BIT_THREADS - 1; ++i)
-+ s->HEVClcList[i]->bt_psem_out = &s->HEVClcList[i + 1]->bt_sem_in;
-+ s->HEVClcList[RPI_BIT_THREADS - 1]->bt_psem_out = &s->HEVClcList[0]->bt_sem_in;
-+
-+ // Init all lc before starting any threads
-+ for (int i = 0; i < RPI_EXTRA_BIT_THREADS; ++i)
-+ {
-+ if (pthread_create(s->bit_threads + i, NULL, bit_thread, s->HEVClcList[i + 1]) < 0)
-+ return -1;
-+ }
-+
-+ s->bt_started = 1;
-+ return 0;
-+}
-+
-+static int bit_threads_kill(HEVCRpiContext * const s)
-+{
-+ if (!s->bt_started)
-+ return 0;
-+ s->bt_started = 0;
-+
-+ for (int i = 0; i < RPI_EXTRA_BIT_THREADS; ++i)
-+ {
-+ HEVCRpiLocalContext *const lc = s->HEVClcList[i + 1];
-+ if (lc == NULL)
-+ break;
-+
-+ lc->bt_terminate = 1;
-+ sem_post(&lc->bt_sem_in);
-+ pthread_join(s->bit_threads[i], NULL);
-+
-+ sem_destroy(&lc->bt_sem_in);
-+ job_lc_kill(lc);
-+ }
-+ return 0;
-+}
-+#endif
-+
-+
-+// If we are at EoT and the row is shorter than the number of jobs
-+// we can Q we have to wait for it finish otherwise we risk cache/QPU
-+// disasters
-+static inline int tile_needs_wait(const HEVCRpiContext * const s, const int n)
-+{
-+ return
-+ s->ps.pps->tile_wpp_inter_disable >= 2 &&
-+ s->sh.slice_type != HEVC_SLICE_I &&
-+ n >= 0 &&
-+ (s->ps.pps->ctb_ts_flags[n] & (CTB_TS_FLAGS_EOT | CTB_TS_FLAGS_EOL)) == CTB_TS_FLAGS_EOT;
-+}
-+
-+static int rpi_decode_entry(AVCodecContext *avctxt, void *isFilterThread)
-+{
-+ HEVCRpiContext * const s = avctxt->priv_data;
-+ HEVCRpiLocalContext * const lc = s->HEVClc;
-+ int err;
-+
-+ // Start of slice
-+ if ((err = slice_start(s, lc)) != 0)
-+ return err;
-+
-+#if RPI_EXTRA_BIT_THREADS > 0
-+
-+ if (s->sh.offload_tiles)
-+ {
-+ unsigned int slice_row = 0;
-+
-+#if TRACE_WPP
-+ printf("%s: Do Tiles\n", __func__);
-+#endif
-+ // Generate & start extra bit threads if they aren't already running
-+ bit_threads_start(s);
-+
-+ do
-+ {
-+ // Reset lc lines etc.
-+ tile_one_row_setup_lcs(s, slice_row);
-+
-+#if TRACE_WPP
-+ printf("%s: Row %d: Do 1st: line=%d/%d/%d\n",
-+ __func__, slice_row, lc->bt_line_no, lc->bt_last_line, s->sh.num_entry_point_offsets);
-+#endif
-+
-+ rpi_run_one_line(s, lc, 1); // Kicks off the other threads
-+#if TRACE_WPP
-+ printf("%s: Row %d: Done 1st: line=%d/%d/%d\n",
-+ __func__, slice_row, lc->bt_line_no, lc->bt_last_line, s->sh.num_entry_point_offsets);
-+#endif
-+
-+ while (lc->bt_line_no <= lc->bt_last_line) {
-+ rpi_sem_wait(&lc->bt_sem_in);
-+ rpi_run_one_line(s, lc, 0);
-+ }
-+#if TRACE_WPP
-+ printf("%s: Done body\n", __func__);
-+#endif
-+
-+ // Wait for everything else to finish
-+ rpi_sem_wait(&lc->bt_sem_in);
-+
-+ ++slice_row;
-+ } while (lc->bt_last_line < s->sh.num_entry_point_offsets);
-+
-+
-+#if TRACE_WPP
-+ printf("%s: Done wait: ts=%d\n", __func__, lc->ts);
-+#endif
-+ }
-+ else if (s->sh.offload_wpp)
-+ {
-+#if TRACE_WPP
-+ printf("%s: Do WPP\n", __func__);
-+#endif
-+ // Generate & start extra bit threads if they aren't already running
-+ bit_threads_start(s);
-+
-+ // Reset lc lines etc.
-+ wpp_setup_lcs(s);
-+
-+ rpi_run_one_line(s, lc, 1); // Kicks off the other threads
-+#if TRACE_WPP
-+ printf("%s: Done 1st\n", __func__);
-+#endif
-+
-+ while (lc->bt_line_no <= s->sh.num_entry_point_offsets) {
-+ rpi_sem_wait(&lc->bt_sem_in);
-+ rpi_run_one_line(s, lc, 0);
-+ }
-+#if TRACE_WPP
-+ printf("%s: Done body\n", __func__);
-+#endif
-+
-+ // Wait for everything else to finish
-+ rpi_sem_wait(&lc->bt_sem_in);
-+
-+#if TRACE_WPP
-+ printf("%s: Done wait: ts=%d\n", __func__, lc->ts);
-+#endif
-+ }
-+ else
-+#endif
-+ {
-+#if TRACE_WPP
-+ printf("%s: Single start: ts=%d\n", __func__, lc->ts);
-+#endif
-+ // Single bit thread
-+ do {
-+ // Make sure we have space to prepare the next job
-+ worker_pass0_ready(s, lc);
-+
-+ if ((err = fill_job(s, lc, 0)) < 0)
-+ goto fail;
-+
-+ worker_submit_job(s, lc);
-+
-+ if (tile_needs_wait(s, lc->ts - 1))
-+ worker_wait(s, lc);
-+
-+ } while (!lc->unit_done);
-+
-+#if TRACE_WPP
-+ printf("%s: Single end: ts=%d\n", __func__, lc->ts);
-+#endif
-+ }
-+
-+ // If we have reached the end of the frame or
-+ // then wait for the worker to finish all its jobs
-+ if (lc->ts >= s->ps.sps->ctb_size)
-+ worker_wait(s, lc);
-+
-+#if RPI_TSTATS
-+ {
-+ HEVCRpiStats *const ts = &s->tstats;
-+
-+ printf("=== P: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d w8gl:%5d/%5d y8m:%d\n B: xy00:%5d/%5d/%5d/%5d h16gl:%5d/%5d\n",
-+ ts->y_pred1_xy, ts->y_pred1_x0, ts->y_pred1_y0, ts->y_pred1_x0y0,
-+ ts->y_pred1_hgt16, ts->y_pred1_hle16, ts->y_pred1_wgt8, ts->y_pred1_wle8, ts->y_pred1_y8_merge,
-+ ts->y_pred2_xy, ts->y_pred2_x0, ts->y_pred2_y0, ts->y_pred2_x0y0,
-+ ts->y_pred2_hgt16, ts->y_pred2_hle16);
-+ memset(ts, 0, sizeof(*ts));
-+ }
-+#endif
-+
-+ return lc->ts;
-+
-+fail:
-+ // Cleanup
-+ av_log(s->avctx, AV_LOG_ERROR, "%s failed: err=%d\n", __func__, err);
-+ // Free our job & wait for temination
-+ worker_free(s, lc);
-+ worker_wait(s, lc);
-+ return err;
-+}
-+
-+
-+static void set_no_backward_pred(HEVCRpiContext * const s)
-+{
-+ int i, j;
-+ const RefPicList *const refPicList = s->refPicList;
-+
-+ s->no_backward_pred_flag = 0;
-+ if (s->sh.slice_type != HEVC_SLICE_B || !s->sh.slice_temporal_mvp_enabled_flag)
-+ return;
-+
-+ for (j = 0; j < 2; j++) {
-+ for (i = 0; i < refPicList[j].nb_refs; i++) {
-+ if (refPicList[j].list[i] > s->poc) {
-+ s->no_backward_pred_flag = 1;
-+ return;
-+ }
-+ }
-+ }
-+}
-+
-+static int hls_slice_data(HEVCRpiContext * const s, const H2645NAL * const nal)
-+{
-+ int err;
-+ if ((err = gen_entry_points(s, nal)) < 0)
-+ return err;
-+
-+ set_no_backward_pred(s);
-+
-+ return rpi_decode_entry(s->avctx, NULL);
-+}
-+
-+static int set_side_data(HEVCRpiContext *s)
-+{
-+ AVFrame *out = s->ref->frame;
-+
-+ if (s->sei.frame_packing.present &&
-+ s->sei.frame_packing.arrangement_type >= 3 &&
-+ s->sei.frame_packing.arrangement_type <= 5 &&
-+ s->sei.frame_packing.content_interpretation_type > 0 &&
-+ s->sei.frame_packing.content_interpretation_type < 3) {
-+ AVStereo3D *stereo = av_stereo3d_create_side_data(out);
-+ if (!stereo)
-+ return AVERROR(ENOMEM);
-+
-+ switch (s->sei.frame_packing.arrangement_type) {
-+ case 3:
-+ if (s->sei.frame_packing.quincunx_subsampling)
-+ stereo->type = AV_STEREO3D_SIDEBYSIDE_QUINCUNX;
-+ else
-+ stereo->type = AV_STEREO3D_SIDEBYSIDE;
-+ break;
-+ case 4:
-+ stereo->type = AV_STEREO3D_TOPBOTTOM;
-+ break;
-+ case 5:
-+ stereo->type = AV_STEREO3D_FRAMESEQUENCE;
-+ break;
-+ }
-+
-+ if (s->sei.frame_packing.content_interpretation_type == 2)
-+ stereo->flags = AV_STEREO3D_FLAG_INVERT;
-+
-+ if (s->sei.frame_packing.arrangement_type == 5) {
-+ if (s->sei.frame_packing.current_frame_is_frame0_flag)
-+ stereo->view = AV_STEREO3D_VIEW_LEFT;
-+ else
-+ stereo->view = AV_STEREO3D_VIEW_RIGHT;
-+ }
-+ }
-+
-+ if (s->sei.display_orientation.present &&
-+ (s->sei.display_orientation.anticlockwise_rotation ||
-+ s->sei.display_orientation.hflip || s->sei.display_orientation.vflip)) {
-+ double angle = s->sei.display_orientation.anticlockwise_rotation * 360 / (double) (1 << 16);
-+ AVFrameSideData *rotation = av_frame_new_side_data(out,
-+ AV_FRAME_DATA_DISPLAYMATRIX,
-+ sizeof(int32_t) * 9);
-+ if (!rotation)
-+ return AVERROR(ENOMEM);
-+
-+ av_display_rotation_set((int32_t *)rotation->data, angle);
-+ av_display_matrix_flip((int32_t *)rotation->data,
-+ s->sei.display_orientation.hflip,
-+ s->sei.display_orientation.vflip);
-+ }
-+
-+ // Decrement the mastering display flag when IRAP frame has no_rasl_output_flag=1
-+ // so the side data persists for the entire coded video sequence.
-+ if (s->sei.mastering_display.present > 0 &&
-+ IS_IRAP(s) && s->no_rasl_output_flag) {
-+ s->sei.mastering_display.present--;
-+ }
-+ if (s->sei.mastering_display.present) {
-+ // HEVC uses a g,b,r ordering, which we convert to a more natural r,g,b
-+ const int mapping[3] = {2, 0, 1};
-+ const int chroma_den = 50000;
-+ const int luma_den = 10000;
-+ int i;
-+ AVMasteringDisplayMetadata *metadata =
-+ av_mastering_display_metadata_create_side_data(out);
-+ if (!metadata)
-+ return AVERROR(ENOMEM);
-+
-+ for (i = 0; i < 3; i++) {
-+ const int j = mapping[i];
-+ metadata->display_primaries[i][0].num = s->sei.mastering_display.display_primaries[j][0];
-+ metadata->display_primaries[i][0].den = chroma_den;
-+ metadata->display_primaries[i][1].num = s->sei.mastering_display.display_primaries[j][1];
-+ metadata->display_primaries[i][1].den = chroma_den;
-+ }
-+ metadata->white_point[0].num = s->sei.mastering_display.white_point[0];
-+ metadata->white_point[0].den = chroma_den;
-+ metadata->white_point[1].num = s->sei.mastering_display.white_point[1];
-+ metadata->white_point[1].den = chroma_den;
-+
-+ metadata->max_luminance.num = s->sei.mastering_display.max_luminance;
-+ metadata->max_luminance.den = luma_den;
-+ metadata->min_luminance.num = s->sei.mastering_display.min_luminance;
-+ metadata->min_luminance.den = luma_den;
-+ metadata->has_luminance = 1;
-+ metadata->has_primaries = 1;
-+
-+ av_log(s->avctx, AV_LOG_DEBUG, "Mastering Display Metadata:\n");
-+ av_log(s->avctx, AV_LOG_DEBUG,
-+ "r(%5.4f,%5.4f) g(%5.4f,%5.4f) b(%5.4f %5.4f) wp(%5.4f, %5.4f)\n",
-+ av_q2d(metadata->display_primaries[0][0]),
-+ av_q2d(metadata->display_primaries[0][1]),
-+ av_q2d(metadata->display_primaries[1][0]),
-+ av_q2d(metadata->display_primaries[1][1]),
-+ av_q2d(metadata->display_primaries[2][0]),
-+ av_q2d(metadata->display_primaries[2][1]),
-+ av_q2d(metadata->white_point[0]), av_q2d(metadata->white_point[1]));
-+ av_log(s->avctx, AV_LOG_DEBUG,
-+ "min_luminance=%f, max_luminance=%f\n",
-+ av_q2d(metadata->min_luminance), av_q2d(metadata->max_luminance));
-+ }
-+ // Decrement the mastering display flag when IRAP frame has no_rasl_output_flag=1
-+ // so the side data persists for the entire coded video sequence.
-+ if (s->sei.content_light.present > 0 &&
-+ IS_IRAP(s) && s->no_rasl_output_flag) {
-+ s->sei.content_light.present--;
-+ }
-+ if (s->sei.content_light.present) {
-+ AVContentLightMetadata *metadata =
-+ av_content_light_metadata_create_side_data(out);
-+ if (!metadata)
-+ return AVERROR(ENOMEM);
-+ metadata->MaxCLL = s->sei.content_light.max_content_light_level;
-+ metadata->MaxFALL = s->sei.content_light.max_pic_average_light_level;
-+
-+ av_log(s->avctx, AV_LOG_DEBUG, "Content Light Level Metadata:\n");
-+ av_log(s->avctx, AV_LOG_DEBUG, "MaxCLL=%d, MaxFALL=%d\n",
-+ metadata->MaxCLL, metadata->MaxFALL);
-+ }
-+
-+ if (s->sei.a53_caption.a53_caption) {
-+ AVFrameSideData* sd = av_frame_new_side_data(out,
-+ AV_FRAME_DATA_A53_CC,
-+ s->sei.a53_caption.a53_caption_size);
-+ if (sd)
-+ memcpy(sd->data, s->sei.a53_caption.a53_caption, s->sei.a53_caption.a53_caption_size);
-+ av_freep(&s->sei.a53_caption.a53_caption);
-+ s->sei.a53_caption.a53_caption_size = 0;
-+ s->avctx->properties |= FF_CODEC_PROPERTY_CLOSED_CAPTIONS;
-+ }
-+
-+ if (s->sei.alternative_transfer.present &&
-+ av_color_transfer_name(s->sei.alternative_transfer.preferred_transfer_characteristics) &&
-+ s->sei.alternative_transfer.preferred_transfer_characteristics != AVCOL_TRC_UNSPECIFIED) {
-+ s->avctx->color_trc = out->color_trc = s->sei.alternative_transfer.preferred_transfer_characteristics;
-+ }
-+
-+ return 0;
-+}
-+
-+static int hevc_frame_start(HEVCRpiContext * const s)
-+{
-+ int ret;
-+
-+ memset(s->bs_horizontal, 0, s->bs_size * 2); // Does V too
-+ memset(s->is_pcm, 0, s->ps.sps->pcm_width * s->ps.sps->pcm_height);
-+ memset(s->tab_slice_address, -1, s->ps.sps->ctb_size * sizeof(*s->tab_slice_address));
-+
-+ // Only need to remember intra for CIP
-+ if (!s->ps.pps->constrained_intra_pred_flag || s->is_irap)
-+ s->is_intra = NULL;
-+ else
-+ {
-+ s->is_intra = s->is_intra_store;
-+ memset(s->is_intra, 0, s->ps.sps->pcm_width * s->ps.sps->pcm_height);
-+ }
-+
-+ s->is_decoded = 0;
-+ s->first_nal_type = s->nal_unit_type;
-+
-+ s->no_rasl_output_flag = IS_IDR(s) || IS_BLA(s) || (s->nal_unit_type == HEVC_NAL_CRA_NUT && s->last_eos);
-+
-+ if (s->pkt.nb_nals > s->rpl_tab_size)
-+ {
-+ // In most cases it will be faster to free & realloc as that doesn't
-+ // require (an unwanted) copy
-+ av_freep(&s->rpl_tab);
-+ s->rpl_tab_size = 0;
-+ if ((s->rpl_tab = av_malloc(s->pkt.nb_nals * sizeof(*s->rpl_tab))) == NULL)
-+ goto fail;
-+ s->rpl_tab_size = s->pkt.nb_nals;
-+ }
-+ memset(s->rpl_tab, 0, s->pkt.nb_nals * sizeof(*s->rpl_tab));
-+
-+ ret = ff_hevc_rpi_set_new_ref(s, &s->frame, s->poc);
-+ if (ret < 0)
-+ goto fail;
-+
-+ // Resize rpl_tab to max that we might want
-+ ret = ff_hevc_rpi_frame_rps(s);
-+ if (ret < 0) {
-+ av_log(s->avctx, AV_LOG_ERROR, "Error constructing the frame RPS.\n");
-+ goto fail;
-+ }
-+
-+ s->ref->frame->key_frame = IS_IRAP(s);
-+
-+ ret = set_side_data(s);
-+ if (ret < 0)
-+ goto fail;
-+
-+ s->frame->pict_type = 3 - s->sh.slice_type;
-+
-+ if (!IS_IRAP(s))
-+ ff_hevc_rpi_bump_frame(s);
-+
-+ av_frame_unref(s->output_frame);
-+ ret = ff_hevc_rpi_output_frame(s, s->output_frame, 0);
-+ if (ret < 0)
-+ goto fail;
-+
-+ ff_thread_finish_setup(s->avctx);
-+
-+ return 0;
-+
-+fail:
-+ if (s->ref)
-+ ff_hevc_rpi_unref_frame(s, s->ref, ~0);
-+ s->ref = NULL;
-+ return ret;
-+}
-+
-+static int decode_nal_unit(HEVCRpiContext *s, const H2645NAL *nal)
-+{
-+ GetBitContext * const gb = &s->HEVClc->gb;
-+ int ctb_addr_ts, ret;
-+
-+ *gb = nal->gb;
-+ s->nal_unit_type = nal->type;
-+ s->temporal_id = nal->temporal_id;
-+
-+ switch (s->nal_unit_type) {
-+ case HEVC_NAL_VPS:
-+ ret = ff_hevc_rpi_decode_nal_vps(gb, s->avctx, &s->ps);
-+ if (ret < 0)
-+ goto fail;
-+ break;
-+ case HEVC_NAL_SPS:
-+ ret = ff_hevc_rpi_decode_nal_sps(gb, s->avctx, &s->ps,
-+ s->apply_defdispwin);
-+ if (ret < 0)
-+ goto fail;
-+ break;
-+ case HEVC_NAL_PPS:
-+ ret = ff_hevc_rpi_decode_nal_pps(gb, s->avctx, &s->ps);
-+ if (ret < 0)
-+ goto fail;
-+ break;
-+ case HEVC_NAL_SEI_PREFIX:
-+ case HEVC_NAL_SEI_SUFFIX:
-+ ret = ff_hevc_rpi_decode_nal_sei(gb, s->avctx, &s->sei, &s->ps, s->nal_unit_type);
-+ if (ret < 0)
-+ goto fail;
-+ break;
-+ case HEVC_NAL_TRAIL_R:
-+ case HEVC_NAL_TRAIL_N:
-+ case HEVC_NAL_TSA_N:
-+ case HEVC_NAL_TSA_R:
-+ case HEVC_NAL_STSA_N:
-+ case HEVC_NAL_STSA_R:
-+ case HEVC_NAL_BLA_W_LP:
-+ case HEVC_NAL_BLA_W_RADL:
-+ case HEVC_NAL_BLA_N_LP:
-+ case HEVC_NAL_IDR_W_RADL:
-+ case HEVC_NAL_IDR_N_LP:
-+ case HEVC_NAL_CRA_NUT:
-+ case HEVC_NAL_RADL_N:
-+ case HEVC_NAL_RADL_R:
-+ case HEVC_NAL_RASL_N:
-+ case HEVC_NAL_RASL_R:
-+ ret = hls_slice_header(s);
-+ if (ret < 0)
-+ return ret;
-+
-+ // The definition of _N unit types is "non-reference for other frames
-+ // with the same temporal_id" so they may/will be ref frames for pics
-+ // with a higher temporal_id.
-+ s->used_for_ref = s->ps.sps->max_sub_layers > s->temporal_id + 1 ||
-+ !(s->nal_unit_type == HEVC_NAL_TRAIL_N ||
-+ s->nal_unit_type == HEVC_NAL_TSA_N ||
-+ s->nal_unit_type == HEVC_NAL_STSA_N ||
-+ s->nal_unit_type == HEVC_NAL_RADL_N ||
-+ s->nal_unit_type == HEVC_NAL_RASL_N);
-+ s->offload_recon = s->threads_type != 0 && s->used_for_ref;
-+ s->is_irap = IS_IRAP(s);
-+
-+#if DEBUG_DECODE_N
-+ {
-+ static int z = 0;
-+ if (IS_IDR(s)) {
-+ z = 1;
-+ }
-+ if (z != 0 && z++ > DEBUG_DECODE_N) {
-+ s->is_decoded = 0;
-+ break;
-+ }
-+ }
-+#endif
-+ if (
-+ (s->avctx->skip_frame >= AVDISCARD_NONREF && !s->used_for_ref) ||
-+ (s->avctx->skip_frame >= AVDISCARD_BIDIR && s->sh.slice_type == HEVC_SLICE_B) ||
-+ (s->avctx->skip_frame >= AVDISCARD_NONINTRA && s->sh.slice_type != HEVC_SLICE_I) ||
-+ (s->avctx->skip_frame >= AVDISCARD_NONKEY && !IS_IDR(s)))
-+ {
-+ s->is_decoded = 0;
-+ break;
-+ }
-+
-+ if (s->sh.first_slice_in_pic_flag) {
-+ if (s->max_ra == INT_MAX) {
-+ if (s->nal_unit_type == HEVC_NAL_CRA_NUT || IS_BLA(s)) {
-+ s->max_ra = s->poc;
-+ } else {
-+ if (IS_IDR(s))
-+ s->max_ra = INT_MIN;
-+ }
-+ }
-+
-+ if ((s->nal_unit_type == HEVC_NAL_RASL_R || s->nal_unit_type == HEVC_NAL_RASL_N) &&
-+ s->poc <= s->max_ra) {
-+ s->is_decoded = 0;
-+ break;
-+ } else {
-+ if (s->nal_unit_type == HEVC_NAL_RASL_R && s->poc > s->max_ra)
-+ s->max_ra = INT_MIN;
-+ }
-+
-+ ret = hevc_frame_start(s);
-+ if (ret < 0)
-+ return ret;
-+ } else if (!s->ref) {
-+ av_log(s->avctx, AV_LOG_ERROR, "First slice in a frame missing.\n");
-+ goto fail;
-+ }
-+
-+ if (s->nal_unit_type != s->first_nal_type) {
-+ av_log(s->avctx, AV_LOG_ERROR,
-+ "Non-matching NAL types of the VCL NALUs: %d %d\n",
-+ s->first_nal_type, s->nal_unit_type);
-+ return AVERROR_INVALIDDATA;
-+ }
-+
-+ if (!s->sh.dependent_slice_segment_flag &&
-+ s->sh.slice_type != HEVC_SLICE_I) {
-+ ret = ff_hevc_rpi_slice_rpl(s);
-+ if (ret < 0) {
-+ av_log(s->avctx, AV_LOG_WARNING,
-+ "Error constructing the reference lists for the current slice.\n");
-+ goto fail;
-+ }
-+ }
-+
-+ ctb_addr_ts = hls_slice_data(s, nal);
-+ if (ctb_addr_ts >= s->ps.sps->ctb_size) {
-+ s->is_decoded = 1;
-+ }
-+
-+ if (ctb_addr_ts < 0) {
-+ ret = ctb_addr_ts;
-+ goto fail;
-+ }
-+ break;
-+ case HEVC_NAL_EOS_NUT:
-+ case HEVC_NAL_EOB_NUT:
-+ s->seq_decode = (s->seq_decode + 1) & 0xff;
-+ s->max_ra = INT_MAX;
-+ break;
-+ case HEVC_NAL_AUD:
-+ case HEVC_NAL_FD_NUT:
-+ break;
-+ default:
-+ av_log(s->avctx, AV_LOG_INFO,
-+ "Skipping NAL unit %d\n", s->nal_unit_type);
-+ }
-+
-+ return 0;
-+fail:
-+ if (s->avctx->err_recognition & AV_EF_EXPLODE)
-+ return ret;
-+ return 0;
-+}
-+
-+static int decode_nal_units(HEVCRpiContext *s, const uint8_t *buf, int length)
-+{
-+ int i, ret = 0;
-+ int eos_at_start = 1;
-+
-+ s->ref = NULL;
-+ s->last_eos = s->eos;
-+ s->eos = 0;
-+
-+ /* split the input packet into NAL units, so we know the upper bound on the
-+ * number of slices in the frame */
-+ ret = ff_h2645_packet_split(&s->pkt, buf, length, s->avctx, s->is_nalff,
-+ s->nal_length_size, s->avctx->codec_id, 0);
-+ if (ret < 0) {
-+ av_log(s->avctx, AV_LOG_ERROR,
-+ "Error splitting the input into NAL units.\n");
-+ return ret;
-+ }
-+
-+ for (i = 0; i < s->pkt.nb_nals; i++) {
-+ if (s->pkt.nals[i].type == HEVC_NAL_EOB_NUT ||
-+ s->pkt.nals[i].type == HEVC_NAL_EOS_NUT) {
-+ if (eos_at_start) {
-+ s->last_eos = 1;
-+ } else {
-+ s->eos = 1;
-+ }
-+ } else {
-+ eos_at_start = 0;
-+ }
-+ }
-+
-+ /* decode the NAL units */
-+ for (i = 0; i < s->pkt.nb_nals; i++) {
-+ ret = decode_nal_unit(s, &s->pkt.nals[i]);
-+ if (ret < 0) {
-+ av_log(s->avctx, AV_LOG_WARNING,
-+ "Error parsing NAL unit #%d.\n", i);
-+ goto fail;
-+ }
-+ }
-+
-+fail: // Also success path
-+ if (s->ref != NULL) {
-+ if (s->used_for_ref && s->threads_type != 0) {
-+ ff_hevc_rpi_progress_signal_all_done(s);
-+ }
-+ else {
-+ // Flush frame to real memory as we expect to be able to pass
-+ // it straight on to mmal
-+ flush_frame(s, s->frame);
-+ }
-+ }
-+ return ret;
-+}
-+
-+static void print_md5(void *log_ctx, int level, uint8_t md5[16])
-+{
-+ int i;
-+ for (i = 0; i < 16; i++)
-+ av_log(log_ctx, level, "%02"PRIx8, md5[i]);
-+}
-+
-+static int verify_md5(HEVCRpiContext *s, AVFrame *frame)
-+{
-+ const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(frame->format);
-+ int pixel_shift;
-+ int i, j;
-+
-+ if (!desc)
-+ return AVERROR(EINVAL);
-+
-+ pixel_shift = desc->comp[0].depth > 8;
-+
-+ av_log(s->avctx, AV_LOG_DEBUG, "Verifying checksum for frame with POC %d: ",
-+ s->poc);
-+
-+ /* the checksums are LE, so we have to byteswap for >8bpp formats
-+ * on BE arches */
-+#if HAVE_BIGENDIAN
-+ if (pixel_shift && !s->checksum_buf) {
-+ av_fast_malloc(&s->checksum_buf, &s->checksum_buf_size,
-+ FFMAX3(frame->linesize[0], frame->linesize[1],
-+ frame->linesize[2]));
-+ if (!s->checksum_buf)
-+ return AVERROR(ENOMEM);
-+ }
-+#endif
-+
-+ for (i = 0; frame->data[i]; i++) {
-+ int width = s->avctx->coded_width;
-+ int height = s->avctx->coded_height;
-+ int w = (i == 1 || i == 2) ? (width >> desc->log2_chroma_w) : width;
-+ int h = (i == 1 || i == 2) ? (height >> desc->log2_chroma_h) : height;
-+ uint8_t md5[16];
-+
-+ av_md5_init(s->md5_ctx);
-+ for (j = 0; j < h; j++) {
-+ const uint8_t *src = frame->data[i] + j * frame_stride1(frame, 1);
-+#if HAVE_BIGENDIAN
-+ if (pixel_shift) {
-+ s->bdsp.bswap16_buf((uint16_t *) s->checksum_buf,
-+ (const uint16_t *) src, w);
-+ src = s->checksum_buf;
-+ }
-+#endif
-+ av_md5_update(s->md5_ctx, src, w << pixel_shift);
-+ }
-+ av_md5_final(s->md5_ctx, md5);
-+
-+ if (!memcmp(md5, s->sei.picture_hash.md5[i], 16)) {
-+ av_log (s->avctx, AV_LOG_DEBUG, "plane %d - correct ", i);
-+ print_md5(s->avctx, AV_LOG_DEBUG, md5);
-+ av_log (s->avctx, AV_LOG_DEBUG, "; ");
-+ } else {
-+ av_log (s->avctx, AV_LOG_ERROR, "mismatching checksum of plane %d - ", i);
-+ print_md5(s->avctx, AV_LOG_ERROR, md5);
-+ av_log (s->avctx, AV_LOG_ERROR, " != ");
-+ print_md5(s->avctx, AV_LOG_ERROR, s->sei.picture_hash.md5[i]);
-+ av_log (s->avctx, AV_LOG_ERROR, "\n");
-+ return AVERROR_INVALIDDATA;
-+ }
-+ }
-+
-+ av_log(s->avctx, AV_LOG_DEBUG, "\n");
-+
-+ return 0;
-+}
-+
-+static int all_sps_supported(const HEVCRpiContext * const s)
-+{
-+ for (unsigned int i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) {
-+ if (s->ps.sps_list[i] != NULL)
-+ {
-+ const HEVCRpiSPS * const sps = (const HEVCRpiSPS*)s->ps.sps_list[i]->data;
-+ if (!is_sps_supported(sps))
-+ return 0;
-+ }
-+ }
-+ return 1;
-+}
-+
-+static int hevc_rpi_decode_extradata(HEVCRpiContext *s, uint8_t *buf, int length, int first)
-+{
-+ int ret, i;
-+
-+ ret = ff_hevc_rpi_decode_extradata(buf, length, &s->ps, &s->sei, &s->is_nalff,
-+ &s->nal_length_size, s->avctx->err_recognition,
-+ s->apply_defdispwin, s->avctx);
-+ if (ret < 0)
-+ return ret;
-+
-+ /* export stream parameters from the first SPS */
-+ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) {
-+ if (first && s->ps.sps_list[i]) {
-+ const HEVCRpiSPS *sps = (const HEVCRpiSPS*)s->ps.sps_list[i]->data;
-+ export_stream_params(s->avctx, &s->ps, sps);
-+ break;
-+ }
-+ }
-+
-+ return 0;
-+}
-+
-+static int hevc_rpi_decode_frame(AVCodecContext *avctx, void *data, int *got_output,
-+ AVPacket *avpkt)
-+{
-+ int ret;
-+ int new_extradata_size;
-+ uint8_t *new_extradata;
-+ HEVCRpiContext *s = avctx->priv_data;
-+
-+ if (!avpkt->size) {
-+ ret = ff_hevc_rpi_output_frame(s, data, 1);
-+ if (ret < 0)
-+ return ret;
-+
-+ *got_output = ret;
-+ return 0;
-+ }
-+
-+ new_extradata = av_packet_get_side_data(avpkt, AV_PKT_DATA_NEW_EXTRADATA,
-+ &new_extradata_size);
-+ if (new_extradata && new_extradata_size > 0) {
-+ ret = hevc_rpi_decode_extradata(s, new_extradata, new_extradata_size, 0);
-+ if (ret < 0)
-+ return ret;
-+ }
-+
-+ s->ref = NULL;
-+ ret = decode_nal_units(s, avpkt->data, avpkt->size);
-+ if (ret < 0)
-+ return ret;
-+
-+ /* verify the SEI checksum */
-+ if (avctx->err_recognition & AV_EF_CRCCHECK && s->is_decoded &&
-+ s->sei.picture_hash.is_md5) {
-+ ret = verify_md5(s, s->ref->frame);
-+ if (ret < 0 && avctx->err_recognition & AV_EF_EXPLODE) {
-+ ff_hevc_rpi_unref_frame(s, s->ref, ~0);
-+ return ret;
-+ }
-+ }
-+ s->sei.picture_hash.is_md5 = 0;
-+
-+ if (s->is_decoded) {
-+ av_log(avctx, AV_LOG_DEBUG, "Decoded frame with POC %d.\n", s->poc);
-+ s->is_decoded = 0;
-+ }
-+
-+ if (s->output_frame->buf[0]) {
-+ av_frame_move_ref(data, s->output_frame);
-+ *got_output = 1;
-+ }
-+
-+ return avpkt->size;
-+}
-+
-+static int hevc_ref_frame(HEVCRpiContext *s, HEVCRpiFrame *dst, HEVCRpiFrame *src)
-+{
-+ int ret;
-+
-+ ret = ff_thread_ref_frame(&dst->tf, &src->tf);
-+ if (ret < 0)
-+ return ret;
-+
-+ if (src->col_mvf_buf != NULL)
-+ {
-+ dst->col_mvf_buf = av_buffer_ref(src->col_mvf_buf);
-+ if (!dst->col_mvf_buf)
-+ goto fail;
-+ }
-+ dst->col_mvf = src->col_mvf;
-+
-+ dst->poc = src->poc;
-+ dst->flags = src->flags;
-+ dst->sequence = src->sequence;
-+ return 0;
-+
-+fail:
-+ ff_hevc_rpi_unref_frame(s, dst, ~0);
-+ return AVERROR(ENOMEM);
-+}
-+
-+
-+static av_cold int hevc_decode_free(AVCodecContext *avctx)
-+{
-+ HEVCRpiContext * const s = avctx->priv_data;
-+ int i;
-+
-+ pic_arrays_free(s);
-+
-+ av_freep(&s->md5_ctx);
-+
-+ av_freep(&s->cabac_save);
-+
-+#if RPI_EXTRA_BIT_THREADS
-+ bit_threads_kill(s);
-+#endif
-+
-+ hevc_exit_worker(s);
-+ vpu_qpu_term();
-+ for (i = 0; i != 2; ++i) {
-+ ff_hevc_rpi_progress_kill_state(s->progress_states + i);
-+ }
-+ job_lc_kill(s->HEVClc);
-+
-+ av_freep(&s->sao_pixel_buffer_h[0]); // [1] & [2] allocated with [0]
-+ av_freep(&s->sao_pixel_buffer_v[0]);
-+ av_frame_free(&s->output_frame);
-+
-+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
-+ ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0);
-+ av_frame_free(&s->DPB[i].frame);
-+ }
-+
-+ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.vps_list); i++)
-+ av_buffer_unref(&s->ps.vps_list[i]);
-+ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++)
-+ av_buffer_unref(&s->ps.sps_list[i]);
-+ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.pps_list); i++)
-+ av_buffer_unref(&s->ps.pps_list[i]);
-+ s->ps.sps = NULL;
-+ s->ps.pps = NULL;
-+ s->ps.vps = NULL;
-+
-+ // Free separately from sLists as used that way by RPI WPP
-+ for (i = 0; i < MAX_NB_THREADS && s->HEVClcList[i] != NULL; ++i) {
-+ av_freep(s->HEVClcList + i);
-+ }
-+ s->HEVClc = NULL; // Allocated as part of HEVClcList
-+
-+ ff_h2645_packet_uninit(&s->pkt);
-+
-+ // This must be after we free off the DPB
-+ // * If the outer code is still holding any frames hopefully it will
-+ // have its own ref to zc
-+ av_rpi_zc_uninit(avctx);
-+
-+ return 0;
-+}
-+
-+
-+static av_cold int hevc_init_context(AVCodecContext *avctx)
-+{
-+ HEVCRpiContext *s = avctx->priv_data;
-+ int i;
-+
-+ s->avctx = avctx;
-+
-+ s->HEVClc = av_mallocz(sizeof(HEVCRpiLocalContext));
-+ if (!s->HEVClc)
-+ goto fail;
-+ s->HEVClcList[0] = s->HEVClc;
-+
-+ // Whilst FFmpegs init fn is only called once the close fn is called as
-+ // many times as we have threads (init_thread_copy is called for the
-+ // threads). So to match init & term put the init here where it will be
-+ // called by both init & copy
-+ av_rpi_zc_init(avctx);
-+
-+ if (vpu_qpu_init() != 0)
-+ goto fail;
-+
-+#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
-+ {
-+ static const uint32_t dframe[1] = {0x80808080};
-+ s->qpu_dummy_frame_emu = (const uint8_t *)dframe;
-+ }
-+#endif
-+#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C
-+ s->qpu_dummy_frame_qpu = qpu_dummy();
-+#endif
-+
-+ bt_lc_init(s, s->HEVClc, 0);
-+ job_lc_init(s->HEVClc);
-+
-+ for (i = 0; i != 2; ++i) {
-+ ff_hevc_rpi_progress_init_state(s->progress_states + i);
-+ }
-+
-+ if ((s->cabac_save = av_malloc(sizeof(*s->cabac_save))) == NULL)
-+ goto fail;
-+
-+ if ((s->output_frame = av_frame_alloc()) == NULL)
-+ goto fail;
-+
-+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
-+ s->DPB[i].frame = av_frame_alloc();
-+ if (!s->DPB[i].frame)
-+ goto fail;
-+ s->DPB[i].tf.f = s->DPB[i].frame;
-+ s->DPB[i].dpb_no = i;
-+ }
-+
-+ s->max_ra = INT_MAX;
-+
-+ if ((s->md5_ctx = av_md5_alloc()) == NULL)
-+ goto fail;
-+
-+ s->context_initialized = 1;
-+ s->eos = 0;
-+
-+ ff_hevc_rpi_reset_sei(&s->sei);
-+
-+ return 0;
-+
-+fail:
-+ av_log(s->avctx, AV_LOG_ERROR, "%s: Failed\n", __func__);
-+ hevc_decode_free(avctx);
-+ return AVERROR(ENOMEM);
-+}
-+
-+static int hevc_update_thread_context(AVCodecContext *dst,
-+ const AVCodecContext *src)
-+{
-+ HEVCRpiContext *s = dst->priv_data;
-+ HEVCRpiContext *s0 = src->priv_data;
-+ int i, ret;
-+
-+ if (!s->context_initialized) {
-+ ret = hevc_init_context(dst);
-+ if (ret < 0)
-+ return ret;
-+ }
-+
-+ for (i = 0; i < FF_ARRAY_ELEMS(s->DPB); i++) {
-+ ff_hevc_rpi_unref_frame(s, &s->DPB[i], ~0);
-+ if (s0->DPB[i].frame->buf[0]) {
-+ ret = hevc_ref_frame(s, &s->DPB[i], &s0->DPB[i]);
-+ if (ret < 0)
-+ return ret;
-+ }
-+ }
-+
-+ if (s->ps.sps != s0->ps.sps)
-+ s->ps.sps = NULL;
-+ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.vps_list); i++) {
-+ av_buffer_unref(&s->ps.vps_list[i]);
-+ if (s0->ps.vps_list[i]) {
-+ s->ps.vps_list[i] = av_buffer_ref(s0->ps.vps_list[i]);
-+ if (!s->ps.vps_list[i])
-+ return AVERROR(ENOMEM);
-+ }
-+ }
-+
-+ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.sps_list); i++) {
-+ av_buffer_unref(&s->ps.sps_list[i]);
-+ if (s0->ps.sps_list[i]) {
-+ s->ps.sps_list[i] = av_buffer_ref(s0->ps.sps_list[i]);
-+ if (!s->ps.sps_list[i])
-+ return AVERROR(ENOMEM);
-+ }
-+ }
-+
-+ for (i = 0; i < FF_ARRAY_ELEMS(s->ps.pps_list); i++) {
-+ av_buffer_unref(&s->ps.pps_list[i]);
-+ if (s0->ps.pps_list[i]) {
-+ s->ps.pps_list[i] = av_buffer_ref(s0->ps.pps_list[i]);
-+ if (!s->ps.pps_list[i])
-+ return AVERROR(ENOMEM);
-+ }
-+ }
-+
-+ if (s->ps.sps != s0->ps.sps)
-+ if ((ret = set_sps(s, s0->ps.sps, src->pix_fmt)) < 0)
-+ return ret;
-+
-+ s->seq_decode = s0->seq_decode;
-+ s->seq_output = s0->seq_output;
-+ s->pocTid0 = s0->pocTid0;
-+ s->max_ra = s0->max_ra;
-+ s->eos = s0->eos;
-+ s->no_rasl_output_flag = s0->no_rasl_output_flag;
-+
-+ s->is_nalff = s0->is_nalff;
-+ s->nal_length_size = s0->nal_length_size;
-+
-+ s->threads_type = s0->threads_type;
-+
-+ if (s0->eos) {
-+ s->seq_decode = (s->seq_decode + 1) & 0xff;
-+ s->max_ra = INT_MAX;
-+ }
-+
-+ s->sei.frame_packing = s0->sei.frame_packing;
-+ s->sei.display_orientation = s0->sei.display_orientation;
-+ s->sei.mastering_display = s0->sei.mastering_display;
-+ s->sei.content_light = s0->sei.content_light;
-+ s->sei.alternative_transfer = s0->sei.alternative_transfer;
-+
-+ // * We do this here as it allows us to easily locate our parents
-+ // global job pool, but there really should be a less nasty way
-+ if (s->jbc == NULL)
-+ {
-+ av_assert0((s->jbc = rpi_job_ctl_new(s0->jbc->jbg)) != NULL);
-+ hevc_init_worker(s);
-+ }
-+
-+ return 0;
-+}
-+
-+static av_cold int hevc_decode_init(AVCodecContext *avctx)
-+{
-+ HEVCRpiContext *s = avctx->priv_data;
-+ int ret;
-+
-+ avctx->internal->allocate_progress = 1;
-+
-+ {
-+ HEVCRpiJobGlobal * const jbg = jbg_new(FFMAX(avctx->thread_count * 3, 5));
-+ if (jbg == NULL)
-+ {
-+ av_log(s->avctx, AV_LOG_ERROR, "%s: Job global init failed\n", __func__);
-+ return -1;
-+ }
-+
-+ if ((s->jbc = rpi_job_ctl_new(jbg)) == NULL)
-+ {
-+ av_log(s->avctx, AV_LOG_ERROR, "%s: Job ctl init failed\n", __func__);
-+ return -1;
-+ }
-+ }
-+
-+ ret = hevc_init_context(avctx);
-+ if (ret < 0)
-+ return ret;
-+
-+ hevc_init_worker(s);
-+
-+ s->sei.picture_timing.picture_struct = 0;
-+ s->eos = 1;
-+
-+ atomic_init(&s->wpp_err, 0);
-+
-+ if (avctx->extradata_size > 0 && avctx->extradata) {
-+ ret = hevc_rpi_decode_extradata(s, avctx->extradata, avctx->extradata_size, 1);
-+
-+ if (ret == 0 && !all_sps_supported(s))
-+ ret = AVERROR_DECODER_NOT_FOUND;
-+
-+ if (ret < 0)
-+ {
-+ hevc_decode_free(avctx);
-+ return ret;
-+ }
-+ }
-+
-+ if((avctx->active_thread_type & FF_THREAD_FRAME) && avctx->thread_count > 1)
-+ s->threads_type = FF_THREAD_FRAME;
-+ else
-+ s->threads_type = 0;
-+
-+ return 0;
-+}
-+
-+static av_cold int hevc_init_thread_copy(AVCodecContext *avctx)
-+{
-+ HEVCRpiContext *s = avctx->priv_data;
-+ int ret;
-+
-+ memset(s, 0, sizeof(*s));
-+
-+ ret = hevc_init_context(avctx);
-+ if (ret < 0)
-+ return ret;
-+
-+ return 0;
-+}
-+
-+static void hevc_decode_flush(AVCodecContext *avctx)
-+{
-+ HEVCRpiContext *s = avctx->priv_data;
-+ ff_hevc_rpi_flush_dpb(s);
-+ s->max_ra = INT_MAX;
-+ s->eos = 1;
-+}
-+
-+#define OFFSET(x) offsetof(HEVCRpiContext, x)
-+#define PAR (AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
-+
-+
-+static const AVOption options[] = {
-+ { "apply_defdispwin", "Apply default display window from VUI", OFFSET(apply_defdispwin),
-+ AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, PAR },
-+ { "strict-displaywin", "stricly apply default display window size", OFFSET(apply_defdispwin),
-+ AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, PAR },
-+ { NULL },
-+};
-+
-+static const AVClass hevc_rpi_decoder_class = {
-+ .class_name = "HEVC RPI decoder",
-+ .item_name = av_default_item_name,
-+ .option = options,
-+ .version = LIBAVUTIL_VERSION_INT,
-+};
-+
-+static const enum AVPixelFormat hevc_rpi_pix_fmts[] = {
-+ AV_PIX_FMT_SAND128,
-+ AV_PIX_FMT_SAND64_10,
-+ AV_PIX_FMT_NONE
-+};
-+
-+//static const AVCodecHWConfigInternal *hevc_rpi_hw_configs[] = {
-+// HW_CONFIG_INTERNAL(HEVC_RPI),
-+// NULL
-+//};
-+
-+
-+AVCodec ff_hevc_rpi_decoder = {
-+ .name = "hevc_rpi",
-+ .long_name = NULL_IF_CONFIG_SMALL("HEVC (rpi)"),
-+ .type = AVMEDIA_TYPE_VIDEO,
-+ .id = AV_CODEC_ID_HEVC,
-+ .priv_data_size = sizeof(HEVCRpiContext),
-+ .priv_class = &hevc_rpi_decoder_class,
-+ .init = hevc_decode_init,
-+ .close = hevc_decode_free,
-+ .decode = hevc_rpi_decode_frame,
-+ .flush = hevc_decode_flush,
-+ .update_thread_context = hevc_update_thread_context,
-+ .init_thread_copy = hevc_init_thread_copy,
-+ .capabilities = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY |
-+// AV_CODEC_CAP_HARDWARE |
-+#if 0
-+ // Debugging is often easier without threads getting in the way
-+ 0,
-+#warning H265 threading turned off
-+#else
-+ // We only have decent optimisation for frame - so only admit to that
-+ AV_CODEC_CAP_FRAME_THREADS,
-+#endif
-+ .caps_internal = FF_CODEC_CAP_INIT_THREADSAFE | FF_CODEC_CAP_EXPORTS_CROPPING,
-+ .pix_fmts = hevc_rpi_pix_fmts,
-+ .profiles = NULL_IF_CONFIG_SMALL(ff_hevc_profiles),
-+// .hw_configs = hevc_rpi_hw_configs,
-+// .wrapper_name = "hevc_rpi",
-+};
-+
-diff --git a/libavcodec/rpi_hevcdec.h b/libavcodec/rpi_hevcdec.h
-new file mode 100644
-index 0000000000..d324aa273c
---- /dev/null
-+++ b/libavcodec/rpi_hevcdec.h
-@@ -0,0 +1,1087 @@
-+/*
-+ * HEVC video decoder
-+ *
-+ * Copyright (C) 2012 - 2013 Guillaume Martres
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#ifndef AVCODEC_RPI_HEVCDEC_H
-+#define AVCODEC_RPI_HEVCDEC_H
-+
-+#include "config.h"
-+
-+#include <stdatomic.h>
-+
-+#include "libavutil/buffer.h"
-+
-+#include "avcodec.h"
-+#include "bswapdsp.h"
-+#include "cabac.h"
-+#include "get_bits.h"
-+#include "rpi_hevcpred.h"
-+#include "h2645_parse.h"
-+#include "hevc.h"
-+#include "rpi_hevc_mv.h"
-+#include "rpi_hevc_ps.h"
-+#include "rpi_hevc_sei.h"
-+#include "rpi_hevcdsp.h"
-+#include "internal.h"
-+#include "thread.h"
-+#include "videodsp.h"
-+
-+#if ARCH_ARM
-+#include "arm/rpi_hevc_misc_neon.h"
-+#endif
-+
-+#define MAX_NB_THREADS 16
-+#define SHIFT_CTB_WPP 2
-+
-+//TODO: check if this is really the maximum
-+#define MAX_TRANSFORM_DEPTH 5
-+
-+#define MAX_TB_SIZE 32
-+#define MAX_QP 51
-+#define DEFAULT_INTRA_TC_OFFSET 2
-+
-+#define HEVC_CONTEXTS 199
-+
-+#define MRG_MAX_NUM_CANDS 5
-+
-+#define HEVC_MAX_CTB_SIZE (1 << HEVC_MAX_LOG2_CTB_SIZE) // 64
-+
-+// Size of DPB array
-+#define HEVC_DPB_ELS 32
-+
-+#define L0 0
-+#define L1 1
-+
-+#define EPEL_EXTRA_BEFORE 1
-+#define EPEL_EXTRA_AFTER 2
-+#define EPEL_EXTRA 3
-+#define QPEL_EXTRA_BEFORE 3
-+#define QPEL_EXTRA_AFTER 4
-+#define QPEL_EXTRA 7
-+
-+#define EDGE_EMU_BUFFER_STRIDE 80
-+
-+#include <semaphore.h>
-+#include "rpi_qpu.h"
-+
-+// Max jobs per frame thread. Actual usage will be limited by the size
-+// of the global job pool
-+// ?? Limits
-+#define RPI_MAX_JOBS 8
-+
-+// This is the number of _extra_ bit threads - we will have
-+// RPI_EXTRA_BIT_THREADS+1 threads actually doing the processing
-+//
-+// 0 is legitimate and will disable our WPP processing
-+//#define RPI_EXTRA_BIT_THREADS 0
-+#define RPI_EXTRA_BIT_THREADS 2
-+
-+// Number of separate threads/passes in worker
-+// 2 and 3 are the currently valid numbers
-+// At the moment 3 seems fractionally faster
-+//#define RPI_PASSES 2
-+#define RPI_PASSES 3
-+
-+// Print out various usage stats
-+#define RPI_TSTATS 0
-+
-+// Define RPI_COMPRESS_COEFFS to 1 to send coefficients in compressed form
-+#define RPI_COMPRESS_COEFFS 1
-+
-+// Wait for VPU/QPU to finish in worker pass 0
-+// If 0 then the wait is in pass 1
-+//
-+// One might expect the better place to wait would be in pass 1 however
-+// testing shows that pass 0 produces overall faster decode.
-+// Interestingly it is QPU/VPU limited streams that seem to suffer
-+// from pass 1 waits, CPU limited ones tend to show a very mild gain.
-+// This define exists so it is easy to test this.
-+#define RPI_WORKER_WAIT_PASS_0 1
-+
-+// Use ARM emulation of QPU pred
-+// These are for debug only as the emulation makes only limited
-+// effort to be fast
-+#define RPI_QPU_EMU_Y 0
-+#define RPI_QPU_EMU_C 0
-+
-+// Max width & height we are prepared to consider
-+// Sand frame shape calc becomes confused with large frames
-+// Some buffer alloc also depends on this
-+#define HEVC_RPI_MAX_WIDTH 2048
-+#define HEVC_RPI_MAX_HEIGHT 1088
-+
-+
-+// Min CTB size is 16
-+#define HEVC_RPI_MAX_CTBS ((HEVC_RPI_MAX_WIDTH + 15) / 16) * ((HEVC_RPI_MAX_HEIGHT + 15) / 16)
-+
-+/**
-+ * Value of the luma sample at position (x, y) in the 2D array tab.
-+ */
-+#define SAMPLE(tab, x, y) ((tab)[(y) * s->sps->width + (x)])
-+#define SAMPLE_CTB(tab, x, y) ((tab)[(y) * min_cb_width + (x)])
-+
-+#define IS_IDR(s) ((s)->nal_unit_type == HEVC_NAL_IDR_W_RADL || (s)->nal_unit_type == HEVC_NAL_IDR_N_LP)
-+#define IS_BLA(s) ((s)->nal_unit_type == HEVC_NAL_BLA_W_RADL || (s)->nal_unit_type == HEVC_NAL_BLA_W_LP || \
-+ (s)->nal_unit_type == HEVC_NAL_BLA_N_LP)
-+#define IS_IRAP(s) ((s)->nal_unit_type >= 16 && (s)->nal_unit_type <= 23)
-+
-+enum RPSType {
-+ ST_CURR_BEF = 0,
-+ ST_CURR_AFT,
-+ ST_FOLL,
-+ LT_CURR,
-+ LT_FOLL,
-+ NB_RPS_TYPE,
-+};
-+
-+enum SyntaxElement {
-+ SAO_MERGE_FLAG = 0,
-+ SAO_TYPE_IDX,
-+ SAO_EO_CLASS,
-+ SAO_BAND_POSITION,
-+ SAO_OFFSET_ABS,
-+ SAO_OFFSET_SIGN,
-+ END_OF_SLICE_FLAG,
-+ SPLIT_CODING_UNIT_FLAG,
-+ CU_TRANSQUANT_BYPASS_FLAG,
-+ SKIP_FLAG,
-+ CU_QP_DELTA,
-+ PRED_MODE_FLAG,
-+ PART_MODE,
-+ PCM_FLAG,
-+ PREV_INTRA_LUMA_PRED_FLAG,
-+ MPM_IDX,
-+ REM_INTRA_LUMA_PRED_MODE,
-+ INTRA_CHROMA_PRED_MODE,
-+ MERGE_FLAG,
-+ MERGE_IDX,
-+ INTER_PRED_IDC,
-+ REF_IDX_L0,
-+ REF_IDX_L1,
-+ ABS_MVD_GREATER0_FLAG,
-+ ABS_MVD_GREATER1_FLAG,
-+ ABS_MVD_MINUS2,
-+ MVD_SIGN_FLAG,
-+ MVP_LX_FLAG,
-+ NO_RESIDUAL_DATA_FLAG,
-+ SPLIT_TRANSFORM_FLAG,
-+ CBF_LUMA,
-+ CBF_CB_CR,
-+ TRANSFORM_SKIP_FLAG,
-+ EXPLICIT_RDPCM_FLAG,
-+ EXPLICIT_RDPCM_DIR_FLAG,
-+ LAST_SIGNIFICANT_COEFF_X_PREFIX,
-+ LAST_SIGNIFICANT_COEFF_Y_PREFIX,
-+ LAST_SIGNIFICANT_COEFF_X_SUFFIX,
-+ LAST_SIGNIFICANT_COEFF_Y_SUFFIX,
-+ SIGNIFICANT_COEFF_GROUP_FLAG,
-+ SIGNIFICANT_COEFF_FLAG,
-+ COEFF_ABS_LEVEL_GREATER1_FLAG,
-+ COEFF_ABS_LEVEL_GREATER2_FLAG,
-+ COEFF_ABS_LEVEL_REMAINING,
-+ COEFF_SIGN_FLAG,
-+ LOG2_RES_SCALE_ABS,
-+ RES_SCALE_SIGN_FLAG,
-+ CU_CHROMA_QP_OFFSET_FLAG,
-+ CU_CHROMA_QP_OFFSET_IDX,
-+};
-+
-+enum PartMode {
-+ PART_2Nx2N = 0,
-+ PART_2NxN = 1,
-+ PART_Nx2N = 2,
-+ PART_NxN = 3,
-+ PART_2NxnU = 4,
-+ PART_2NxnD = 5,
-+ PART_nLx2N = 6,
-+ PART_nRx2N = 7,
-+};
-+
-+enum PredMode {
-+ MODE_INTER = 0,
-+ MODE_INTRA,
-+ MODE_SKIP,
-+};
-+
-+enum InterPredIdc {
-+ PRED_L0 = 0,
-+ PRED_L1,
-+ PRED_BI,
-+};
-+
-+enum PredFlag {
-+ PF_INTRA = 0,
-+ PF_L0,
-+ PF_L1,
-+ PF_BI,
-+};
-+
-+enum SAOType {
-+ SAO_NOT_APPLIED = 0,
-+ SAO_BAND,
-+ SAO_EDGE,
-+ SAO_APPLIED
-+};
-+
-+enum SAOEOClass {
-+ SAO_EO_HORIZ = 0,
-+ SAO_EO_VERT,
-+ SAO_EO_135D,
-+ SAO_EO_45D,
-+};
-+
-+enum ScanType {
-+ SCAN_DIAG = 0,
-+ SCAN_HORIZ,
-+ SCAN_VERT,
-+};
-+
-+typedef struct RefPicList {
-+ struct HEVCRpiFrame *ref[HEVC_MAX_REFS];
-+ int list[HEVC_MAX_REFS];
-+ uint8_t isLongTerm[HEVC_MAX_REFS];
-+ int nb_refs;
-+} RefPicList;
-+
-+typedef struct RefPicListTab {
-+ RefPicList refPicList[2];
-+} RefPicListTab;
-+
-+typedef struct RpiCodingUnit {
-+ unsigned int x; // Passed to deblock
-+ unsigned int y;
-+ unsigned int x_split;
-+ unsigned int y_split;
-+
-+ enum PredMode pred_mode; ///< PredMode
-+ enum PartMode part_mode; ///< PartMode
-+
-+ // Inferred parameters
-+ uint8_t intra_split_flag; ///< IntraSplitFlag
-+ uint8_t max_trafo_depth; ///< MaxTrafoDepth
-+ uint8_t cu_transquant_bypass_flag;
-+} RpiCodingUnit;
-+
-+typedef struct RpiPredictionUnit {
-+ uint8_t intra_pred_mode[4];
-+ uint8_t intra_pred_mode_c[4];
-+ uint8_t chroma_mode_c[4];
-+ uint8_t merge_flag;
-+} RpiPredictionUnit;
-+
-+typedef struct HEVCRpiTransformUnit {
-+ int8_t cu_qp_delta;
-+
-+ // Inferred parameters;
-+ uint8_t intra_pred_mode;
-+ uint8_t intra_pred_mode_c;
-+ uint8_t chroma_mode_c;
-+ uint8_t is_cu_qp_delta_wanted;
-+ uint8_t cu_chroma_qp_offset_wanted;
-+ const int8_t * qp_divmod6[3];
-+} HEVCRpiTransformUnit;
-+
-+typedef struct DBParams {
-+ int8_t beta_offset; // -12 to +12
-+ int8_t tc_offset; // -12 to +12
-+} DBParams;
-+
-+#define HEVC_FRAME_FLAG_OUTPUT (1 << 0)
-+#define HEVC_FRAME_FLAG_SHORT_REF (1 << 1)
-+#define HEVC_FRAME_FLAG_LONG_REF (1 << 2)
-+#define HEVC_FRAME_FLAG_BUMPING (1 << 3)
-+
-+struct HEVCRpiJob;
-+
-+typedef struct HEVCRpiFrame {
-+ AVFrame *frame;
-+ ThreadFrame tf;
-+ ColMvField *col_mvf;
-+ int poc;
-+ struct HEVCRpiFrame *collocated_ref;
-+
-+ AVBufferRef *col_mvf_buf;
-+
-+ /**
-+ * A sequence counter, so that old frames are output first
-+ * after a POC reset
-+ */
-+ uint16_t sequence;
-+
-+ /**
-+ * A combination of HEVC_FRAME_FLAG_*
-+ */
-+ uint8_t flags;
-+
-+ // Entry no in DPB - can be used as a small unique
-+ // frame identifier (within the current thread)
-+ uint8_t dpb_no;
-+} HEVCRpiFrame;
-+
-+typedef struct HEVCRpiLocalContext {
-+ HEVCRpiTransformUnit tu;
-+
-+ CABACContext cc;
-+
-+ // Vars that allow us to locate everything from just an lc
-+ struct HEVCRpiContext * context; // ??? make const ???
-+ unsigned int lc_n; // lc list el no
-+
-+ // Job wait links
-+ struct HEVCRpiLocalContext * jw_next;
-+ struct HEVCRpiLocalContext * jw_prev;
-+ struct HEVCRpiLocalContext * ljw_next;
-+ struct HEVCRpiLocalContext * ljw_prev;
-+ struct HEVCRpiJob * volatile jw_job;
-+ sem_t jw_sem;
-+
-+ // ?? Wrap in structure ??
-+ sem_t bt_sem_in;
-+ sem_t * bt_psem_out;
-+ volatile int bt_terminate;
-+ unsigned int ts;
-+ unsigned int bt_last_line; // Last line in this bit_thread chunk
-+ unsigned int bt_line_no;
-+ unsigned int bt_line_width;
-+ unsigned int bt_line_inc;
-+
-+ struct HEVCRpiJob * jb0;
-+ char unit_done; // Set once we have dealt with this slice
-+ char bt_is_tile;
-+ char last_progress_good;
-+ char cabac_init_req;
-+
-+ uint8_t cabac_state[HEVC_CONTEXTS];
-+ uint8_t stat_coeff[4];
-+ GetBitContext gb;
-+
-+ uint8_t ct_depth;
-+ int8_t qp_y;
-+ int8_t curr_qp_y;
-+ int8_t qPy_pred;
-+
-+// N.B. Used by asm (neon) - do not change
-+#define AVAIL_S_UR 0
-+#define AVAIL_S_U 1
-+#define AVAIL_S_UL 2
-+#define AVAIL_S_L 3
-+#define AVAIL_S_DL 4
-+
-+#define AVAIL_U (1 << AVAIL_S_U)
-+#define AVAIL_L (1 << AVAIL_S_L)
-+#define AVAIL_UL (1 << AVAIL_S_UL)
-+#define AVAIL_UR (1 << AVAIL_S_UR)
-+#define AVAIL_DL (1 << AVAIL_S_DL)
-+
-+ uint8_t ctb_avail;
-+ int end_of_ctb_x;
-+ int end_of_ctb_y;
-+
-+ RpiCodingUnit cu;
-+ RpiPredictionUnit pu;
-+
-+#define BOUNDARY_LEFT_SLICE (1 << 0)
-+#define BOUNDARY_LEFT_TILE (1 << 1)
-+#define BOUNDARY_UPPER_SLICE (1 << 2)
-+#define BOUNDARY_UPPER_TILE (1 << 3)
-+ /* properties of the boundary of the current CTB for the purposes
-+ * of the deblocking filter */
-+ unsigned int boundary_flags;
-+
-+#define IPM_TAB_SIZE (HEVC_MAX_CTB_SIZE >> LOG2_MIN_PU_SIZE)
-+ uint8_t ipm_left[IPM_TAB_SIZE];
-+ uint8_t ipm_up[IPM_TAB_SIZE];
-+
-+//#define MVF_STASH_WIDTH 128
-+#define MVF_STASH_WIDTH 64
-+#define MVF_STASH_HEIGHT 64
-+#define MVF_STASH_WIDTH_PU (MVF_STASH_WIDTH >> LOG2_MIN_PU_SIZE)
-+#define MVF_STASH_HEIGHT_PU (MVF_STASH_HEIGHT >> LOG2_MIN_PU_SIZE)
-+ HEVCRpiMvField mvf_ul[1];
-+ HEVCRpiMvField mvf_stash[MVF_STASH_WIDTH_PU * MVF_STASH_HEIGHT_PU];
-+
-+ /* +7 is for subpixel interpolation, *2 for high bit depths */
-+// DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2];
-+ /* The extended size between the new edge emu buffer is abused by SAO */
-+// DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer2)[(MAX_PB_SIZE + 7) * EDGE_EMU_BUFFER_STRIDE * 2];
-+// DECLARE_ALIGNED(32, int16_t, tmp [MAX_PB_SIZE * MAX_PB_SIZE]);
-+
-+} HEVCRpiLocalContext;
-+
-+// Each block can have an intra prediction and an add_residual command
-+// noof-cmds(2) * max-ctu height(64) / min-transform(4) * planes(3) * MAX_WIDTH
-+
-+// Sand only has 2 planes (Y/C)
-+#define RPI_MAX_PRED_CMDS (2*(HEVC_MAX_CTB_SIZE/4)*2*(HEVC_RPI_MAX_WIDTH/4))
-+
-+// Command for intra prediction and transform_add of predictions to coefficients
-+enum rpi_pred_cmd_e
-+{
-+ RPI_PRED_ADD_RESIDUAL,
-+ RPI_PRED_ADD_RESIDUAL_U, // = RPI_PRED_TRANSFORM_ADD + c_idx
-+ RPI_PRED_ADD_RESIDUAL_V, // = RPI_PRED_TRANSFORM_ADD + c_idx
-+ RPI_PRED_ADD_RESIDUAL_C, // Merged U+V
-+ RPI_PRED_ADD_DC,
-+ RPI_PRED_ADD_DC_U, // Both U & V are effectively C
-+ RPI_PRED_ADD_DC_V,
-+ RPI_PRED_INTRA,
-+ RPI_PRED_INTRA_C,
-+ RPI_PRED_I_PCM,
-+ RPI_PRED_CMD_MAX
-+};
-+
-+typedef struct HEVCPredCmd {
-+ uint8_t type;
-+ uint8_t size; // log2 "size" used by all variants
-+ uint8_t avail; // i_pred - but left here as they pack well
-+ uint8_t dummy;
-+ union {
-+ struct { // TRANSFORM_ADD
-+ uint8_t * dst;
-+ const int16_t * buf;
-+ uint16_t stride; // Should be good enough for all pic fmts we use
-+ int16_t dc;
-+ } ta;
-+ struct {
-+ uint8_t * dst;
-+ uint32_t stride;
-+ int dc;
-+ } dc;
-+ struct { // INTRA
-+ uint16_t x;
-+ uint16_t y;
-+ enum IntraPredMode mode;
-+ } i_pred;
-+ struct { // I_PCM
-+ uint16_t x;
-+ uint16_t y;
-+ const void * src;
-+ uint32_t src_len;
-+ } i_pcm;
-+ };
-+} HEVCPredCmd;
-+
-+union qpu_mc_pred_cmd_s;
-+struct qpu_mc_pred_y_p_s;
-+struct qpu_mc_src_s;
-+
-+typedef struct HEVCRpiInterPredQ
-+{
-+ union qpu_mc_pred_cmd_u *qpu_mc_base;
-+ union qpu_mc_pred_cmd_u *qpu_mc_curr;
-+ struct qpu_mc_src_s *last_l0;
-+ struct qpu_mc_src_s *last_l1;
-+ unsigned int load;
-+ uint32_t code_setup;
-+ uint32_t code_sync;
-+ uint32_t code_exit;
-+} HEVCRpiInterPredQ;
-+
-+typedef struct HEVCRpiInterPredEnv
-+{
-+ HEVCRpiInterPredQ * q;
-+ uint8_t n; // Number of Qs
-+ uint8_t n_grp; // Number of Q in a group
-+ uint8_t curr; // Current Q number (0..n-1)
-+ uint8_t used; // 0 if nothing in any Q, 1 otherwise
-+ uint8_t used_grp; // 0 if nothing in any Q in the current group
-+ unsigned int max_fill;
-+ unsigned int min_gap;
-+ GPU_MEM_PTR_T gptr;
-+} HEVCRpiInterPredEnv;
-+
-+typedef struct HEVCRpiIntraPredEnv {
-+ unsigned int n; // Number of commands
-+ HEVCPredCmd * cmds;
-+} HEVCRpiIntraPredEnv;
-+
-+typedef struct HEVCRpiCoeffEnv {
-+ unsigned int n;
-+#if RPI_COMPRESS_COEFFS
-+ unsigned int packed; // Equal to 1 if coefficients should be being packed
-+ unsigned int packed_n; // Value of n when packed was set equal to 0 (i.e. the amount that is sent compressed). Only valid if packed==0
-+#endif
-+ int16_t * buf;
-+} HEVCRpiCoeffEnv;
-+
-+typedef struct HEVCRpiCoeffsEnv {
-+ HEVCRpiCoeffEnv s[4];
-+ GPU_MEM_PTR_T gptr;
-+ void * mptr;
-+} HEVCRpiCoeffsEnv;
-+
-+typedef struct HEVCRpiFrameProgressWait {
-+ int req;
-+ struct HEVCRpiFrameProgressWait * next;
-+ sem_t sem;
-+} HEVCRpiFrameProgressWait;
-+
-+typedef struct HEVCRpiFrameProgressState {
-+ struct HEVCRpiFrameProgressWait * first;
-+ struct HEVCRpiFrameProgressWait * last;
-+ pthread_mutex_t lock;
-+} HEVCRpiFrameProgressState;
-+
-+typedef struct RpiBlk
-+{
-+ unsigned int x;
-+ unsigned int y;
-+ unsigned int w;
-+ unsigned int h;
-+} RpiBlk;
-+
-+typedef struct HEVCRpiJob {
-+ struct HEVCRpiJob * next; // Free chain
-+ struct HEVCRpiJobCtl * jbc_local;
-+ const HEVCRpiSPS * sps; // sps used to set up this job
-+
-+ int waited;
-+ int ctu_ts_first;
-+ int ctu_ts_last;
-+ RpiBlk bounds; // Bounding box of job
-+
-+ struct qpu_mc_pred_y_p_s * last_y8_p;
-+ struct qpu_mc_src_s * last_y8_l1;
-+ rpi_cache_flush_env_t * rfe;
-+
-+ HEVCRpiInterPredEnv chroma_ip;
-+ HEVCRpiInterPredEnv luma_ip;
-+ int16_t progress_req[HEVC_DPB_ELS]; // index by dpb_no
-+ HEVCRpiIntraPredEnv intra;
-+ HEVCRpiCoeffsEnv coeffs;
-+ HEVCRpiFrameProgressWait progress_wait;
-+ sem_t sem;
-+ rpi_cache_buf_t flush_buf;
-+} HEVCRpiJob;
-+
-+struct HEVCRpiContext;
-+
-+typedef void HEVCRpiWorkerFn(const struct HEVCRpiContext * const s, HEVCRpiJob * const jb);
-+
-+typedef struct HEVCRpiPassQueue
-+{
-+// int pending;
-+ volatile int terminate;
-+ sem_t sem_in;
-+ sem_t * psem_out;
-+ unsigned int job_n;
-+ struct HEVCRpiContext * context; // Context pointer as we get to pass a single "void * this" to the thread
-+ HEVCRpiWorkerFn * worker;
-+ pthread_t thread;
-+ uint8_t pass_n; // Pass number - debug
-+ uint8_t started;
-+} HEVCRpiPassQueue;
-+
-+
-+struct HEVCRpiJobGlobal;
-+
-+typedef struct HEVCRpiJobCtl
-+{
-+ sem_t sem_out;
-+
-+ HEVCRpiJob * volatile jb1; // The job associated with this frame if unallocated - NULL if allocated
-+ struct HEVCRpiJobGlobal * jbg;
-+
-+ HEVCRpiLocalContext * lcw_head;
-+ HEVCRpiLocalContext * lcw_tail;
-+
-+ pthread_mutex_t in_lock;
-+ int offload_in;
-+
-+ HEVCRpiJob *offloadq[RPI_MAX_JOBS];
-+} HEVCRpiJobCtl;
-+
-+
-+typedef struct HEVCRpiJobGlobal
-+{
-+ intptr_t ref_count;
-+ pthread_mutex_t lock;
-+ HEVCRpiJob * free1; // Singly linked list of free jobs
-+ HEVCRpiLocalContext * wait_head; // Double linked list of lcs waiting for a job
-+ HEVCRpiLocalContext * wait_good; // Last good tail
-+ HEVCRpiLocalContext * wait_tail;
-+
-+} HEVCRpiJobGlobal;
-+
-+#define RPI_BIT_THREADS (RPI_EXTRA_BIT_THREADS + 1)
-+
-+#if RPI_TSTATS
-+typedef struct HEVCRpiStats {
-+ int y_pred1_y8_merge;
-+ int y_pred1_xy;
-+ int y_pred1_x0;
-+ int y_pred1_y0;
-+ int y_pred1_x0y0;
-+ int y_pred1_wle8;
-+ int y_pred1_wgt8;
-+ int y_pred1_hle16;
-+ int y_pred1_hgt16;
-+ int y_pred2_xy;
-+ int y_pred2_x0;
-+ int y_pred2_y0;
-+ int y_pred2_x0y0;
-+ int y_pred2_hle16;
-+ int y_pred2_hgt16;
-+} HEVCRpiStats;
-+#endif
-+
-+typedef struct HEVCRpiCabacState
-+{
-+ uint8_t rice[4];
-+ uint8_t state[HEVC_CONTEXTS];
-+} HEVCRpiCabacState;
-+
-+#define HEVC_RPI_BS_STRIDE1_PEL_SHIFT 6 // 64 pels
-+#define HEVC_RPI_BS_STRIDE1_PELS (1U << HEVC_RPI_BS_STRIDE1_PEL_SHIFT)
-+#define HEVC_RPI_BS_STRIDE1_PEL_MASK (HEVC_RPI_BS_STRIDE1_PELS - 1)
-+#define HEVC_RPI_BS_ELS_PER_BYTE_SHIFT 2 // 4 els per byte
-+#define HEVC_RPI_BS_PELS_PER_EL_SHIFT 2 // 4 pels per el
-+#define HEVC_RPI_BS_PELS_PER_BYTE_SHIFT (HEVC_RPI_BS_PELS_PER_EL_SHIFT + HEVC_RPI_BS_ELS_PER_BYTE_SHIFT)
-+#define HEVC_RPI_BS_STRIDE1_BYTE_SHIFT (HEVC_RPI_BS_STRIDE1_PEL_SHIFT - HEVC_RPI_BS_PELS_PER_BYTE_SHIFT)
-+#define HEVC_RPI_BS_STRIDE1_BYTES (1U << HEVC_RPI_BS_STRIDE1_BYTE_SHIFT)
-+#define HEVC_RPI_BS_Y_SHR 3 // 8 vertical pels per row
-+#define HEVC_RPI_BS_COL_BYTES_SHR (HEVC_RPI_BS_Y_SHR - HEVC_RPI_BS_STRIDE1_BYTE_SHIFT)
-+
-+typedef struct HEVCRpiContext {
-+ const AVClass *c; // needed by private avoptions
-+ AVCodecContext *avctx;
-+
-+ uint8_t threads_type;
-+
-+ /** 1 if the independent slice segment header was successfully parsed */
-+ uint8_t slice_initialized;
-+ char used_for_ref; // rpi
-+ char is_irap;
-+ char offload_recon;
-+ uint8_t eos; ///< current packet contains an EOS/EOB NAL
-+ uint8_t last_eos; ///< last packet contains an EOS/EOB NAL
-+ uint8_t no_backward_pred_flag;
-+ uint8_t is_decoded;
-+ uint8_t no_rasl_output_flag;
-+
-+
-+ /**
-+ * Sequence counters for decoded and output frames, so that old
-+ * frames are output first after a POC reset
-+ */
-+ uint16_t seq_decode;
-+ uint16_t seq_output;
-+
-+ int width;
-+ int height;
-+
-+ HEVCRpiJobCtl * jbc;
-+ // cabac stash
-+ // b0 skip flag
-+ // b1+ ct_depth
-+ uint8_t * cabac_stash_left;
-+ uint8_t * cabac_stash_up;
-+
-+ // Function pointers
-+#if RPI_QPU_EMU_Y || RPI_QPU_EMU_C
-+ const uint8_t * qpu_dummy_frame_emu;
-+#endif
-+#if !RPI_QPU_EMU_Y || !RPI_QPU_EMU_C
-+ uint32_t qpu_dummy_frame_qpu; // Not a frame - just a bit of memory
-+#endif
-+ HEVCRpiQpu qpu;
-+
-+ HEVCRpiFrameProgressState progress_states[2];
-+
-+ HEVCRpiCabacState *cabac_save;
-+
-+ AVFrame *frame;
-+ AVFrame *output_frame;
-+ uint8_t *sao_pixel_buffer_h[3];
-+ uint8_t *sao_pixel_buffer_v[3];
-+
-+ unsigned int col_mvf_stride;
-+ AVBufferPool *col_mvf_pool;
-+
-+ RpiSAOParams *sao;
-+ DBParams *deblock;
-+ enum HEVCNALUnitType nal_unit_type;
-+ int temporal_id; ///< temporal_id_plus1 - 1
-+ HEVCRpiFrame *ref;
-+ int poc;
-+ int pocTid0;
-+ int slice_idx; ///< number of the slice being currently decoded
-+ int max_ra;
-+
-+ int8_t *qp_y_tab;
-+
-+ // Deblocking block strength bitmaps
-+ unsigned int bs_stride2;
-+ unsigned int bs_size;
-+ uint8_t *bs_horizontal;
-+ uint8_t *bs_vertical;
-+ uint8_t *bsf_stash_up;
-+ uint8_t *bsf_stash_left;
-+
-+#if HEVC_RPI_MAX_CTBS >= 0xffff
-+#define TAB_SLICE_ADDR_BROKEN ~(uint32_t)0
-+ uint32_t *tab_slice_address;
-+#else
-+#define TAB_SLICE_ADDR_BROKEN ~(uint16_t)0
-+ uint16_t *tab_slice_address;
-+#endif
-+
-+ // Bitfield 1 bit per 8 pels (min pcm size)
-+ uint8_t *is_pcm;
-+ // Bitfield 1 bit per 8 pels (min cb size)
-+ // Only needed for CIP as CIP processing is async to the main thread
-+ uint8_t *is_intra;
-+
-+ // PU
-+ HEVCRpiMvField *mvf_up;
-+ HEVCRpiMvField *mvf_left;
-+
-+ const RefPicList **rpl_up;
-+ const RefPicList **rpl_left;
-+ RefPicList * refPicList;
-+
-+ // CTB-level flags affecting loop filter operation
-+ uint8_t *filter_slice_edges;
-+
-+ /** used on BE to byteswap the lines for checksumming */
-+ uint8_t *checksum_buf;
-+ int checksum_buf_size;
-+
-+ atomic_int wpp_err;
-+
-+ const uint8_t *data;
-+
-+ H2645Packet pkt;
-+ // type of the first VCL NAL of the current frame
-+ enum HEVCNALUnitType first_nal_type;
-+
-+ uint8_t context_initialized;
-+ int is_nalff; ///< this flag is != 0 if bitstream is encapsulated
-+ ///< as a format defined in 14496-15
-+ int apply_defdispwin;
-+
-+ int nal_length_size; ///< Number of bytes used for nal length (1, 2 or 4)
-+ int nuh_layer_id;
-+
-+ struct AVMD5 *md5_ctx;
-+
-+ RefPicListTab * rpl_tab;
-+ unsigned int rpl_tab_size;
-+
-+ uint8_t *is_intra_store;
-+
-+ RpiSliceHeader sh;
-+
-+ HEVCRpiParamSets ps;
-+
-+ HEVCRpiLocalContext *HEVClc;
-+ HEVCRpiLocalContext *HEVClcList[MAX_NB_THREADS];
-+
-+ HEVCRpiFrame DPB[HEVC_DPB_ELS];
-+
-+ ///< candidate references for the current frame
-+ RefPicList rps[5];
-+
-+ HEVCRpiPredContext hpc;
-+ HEVCDSPContext hevcdsp;
-+
-+ HEVCSEIContext sei;
-+
-+ // Put structures that allocate non-trivial storage at the end
-+ // These are mostly used indirectly so position in the structure doesn't matter
-+ HEVCRpiPassQueue passq[RPI_PASSES];
-+#if RPI_EXTRA_BIT_THREADS > 0
-+ int bt_started;
-+ // This simply contains thread descriptors - task setup is held elsewhere
-+ pthread_t bit_threads[RPI_EXTRA_BIT_THREADS];
-+#endif
-+#if RPI_TSTATS
-+ HEVCRpiStats tstats;
-+#endif
-+} HEVCRpiContext;
-+
-+/**
-+ * Mark all frames in DPB as unused for reference.
-+ */
-+void ff_hevc_rpi_clear_refs(HEVCRpiContext *s);
-+
-+/**
-+ * Drop all frames currently in DPB.
-+ */
-+void ff_hevc_rpi_flush_dpb(HEVCRpiContext *s);
-+
-+/**
-+ * Construct the reference picture sets for the current frame.
-+ */
-+int ff_hevc_rpi_frame_rps(HEVCRpiContext *s);
-+
-+/**
-+ * Construct the reference picture list(s) for the current slice.
-+ */
-+int ff_hevc_rpi_slice_rpl(HEVCRpiContext *s);
-+
-+
-+/**
-+ * Get the number of candidate references for the current frame.
-+ */
-+int ff_hevc_rpi_frame_nb_refs(HEVCRpiContext *s);
-+
-+int ff_hevc_rpi_set_new_ref(HEVCRpiContext *s, AVFrame **frame, int poc);
-+
-+/**
-+ * Find next frame in output order and put a reference to it in frame.
-+ * @return 1 if a frame was output, 0 otherwise
-+ */
-+int ff_hevc_rpi_output_frame(HEVCRpiContext *s, AVFrame *frame, int flush);
-+
-+void ff_hevc_rpi_bump_frame(HEVCRpiContext *s);
-+
-+void ff_hevc_rpi_unref_frame(HEVCRpiContext *s, HEVCRpiFrame *frame, int flags);
-+
-+unsigned int ff_hevc_rpi_tb_avail_flags(
-+ const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc,
-+ const unsigned int x, const unsigned int y, const unsigned int w, const unsigned int h);
-+
-+void ff_hevc_rpi_luma_mv_merge_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int x0, int y0, int nPbW,
-+ int nPbH, int log2_cb_size, int part_idx,
-+ int merge_idx, HEVCRpiMvField * const mv);
-+void ff_hevc_rpi_luma_mv_mvp_mode(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc,
-+ const unsigned int x0, const unsigned int y0,
-+ const unsigned int nPbW, const unsigned int nPbH,
-+ const unsigned int avail,
-+ HEVCRpiMvField * const mv,
-+ const unsigned int mvp_lx_flag, const unsigned int LX);
-+void ff_hevc_rpi_set_qPy(const HEVCRpiContext * const s, HEVCRpiLocalContext * const lc, int xBase, int yBase);
-+void ff_hevc_rpi_deblocking_boundary_strengths(const HEVCRpiContext * const s, const HEVCRpiLocalContext * const lc,
-+ const unsigned int x0, const unsigned int y0,
-+ const unsigned int log2_trafo_size, const int is_coded_block);
-+int ff_hevc_rpi_hls_filter_blk(const HEVCRpiContext * const s, const RpiBlk bounds, const int eot);
-+
-+extern const uint8_t ff_hevc_rpi_qpel_extra_before[4];
-+extern const uint8_t ff_hevc_rpi_qpel_extra_after[4];
-+extern const uint8_t ff_hevc_rpi_qpel_extra[4];
-+
-+int16_t * rpi_alloc_coeff_buf(HEVCRpiJob * const jb, const int buf_no, const int n);
-+
-+// arm/hevc_misc_neon.S
-+// Neon coeff zap fn
-+#if HAVE_NEON
-+extern void rpi_zap_coeff_vals_neon(int16_t * dst, unsigned int l2ts_m2);
-+#endif
-+
-+void ff_hevc_rpi_progress_wait_field(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
-+ const HEVCRpiFrame * const ref, const int val, const int field);
-+
-+void ff_hevc_rpi_progress_signal_field(HEVCRpiContext * const s, const int val, const int field);
-+
-+// All of these expect that s->threads_type == FF_THREAD_FRAME
-+
-+static inline void ff_hevc_rpi_progress_wait_mv(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
-+ const HEVCRpiFrame * const ref, const int y)
-+{
-+ if (s->threads_type != 0)
-+ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 1);
-+}
-+
-+static inline void ff_hevc_rpi_progress_signal_mv(HEVCRpiContext * const s, const int y)
-+{
-+ if (s->used_for_ref && s->threads_type != 0)
-+ ff_hevc_rpi_progress_signal_field(s, y, 1);
-+}
-+
-+static inline void ff_hevc_rpi_progress_wait_recon(const HEVCRpiContext * const s, HEVCRpiJob * const jb,
-+ const HEVCRpiFrame * const ref, const int y)
-+{
-+ ff_hevc_rpi_progress_wait_field(s, jb, ref, y, 0);
-+}
-+
-+static inline void ff_hevc_rpi_progress_signal_recon(HEVCRpiContext * const s, const int y)
-+{
-+ if (s->used_for_ref && s->threads_type != 0)
-+ {
-+ ff_hevc_rpi_progress_signal_field(s, y, 0);
-+ }
-+}
-+
-+static inline void ff_hevc_rpi_progress_signal_all_done(HEVCRpiContext * const s)
-+{
-+ ff_hevc_rpi_progress_signal_field(s, INT_MAX, 0);
-+ ff_hevc_rpi_progress_signal_field(s, INT_MAX, 1);
-+}
-+
-+
-+// Set all done - signal nothing (used in missing refs)
-+// Works for both rpi & non-rpi
-+static inline void ff_hevc_rpi_progress_set_all_done(HEVCRpiFrame * const ref)
-+{
-+ if (ref->tf.progress != NULL)
-+ {
-+ int * const p = (int *)ref->tf.progress->data;
-+ p[0] = INT_MAX;
-+ p[1] = INT_MAX;
-+ }
-+}
-+
-+#define HEVC_RPI_420_ONLY 1
-+#define HEVC_RPI_SAND128_ONLY 1
-+
-+static inline unsigned int ctx_hshift(const HEVCRpiContext * const s, const int cidx)
-+{
-+#if HEVC_RPI_420_ONLY
-+ return cidx == 0 ? 0 : 1;
-+#else
-+ return s->ps.sps->hshift[cidx];
-+#endif
-+}
-+
-+static inline unsigned int ctx_vshift(const HEVCRpiContext * const s, const int cidx)
-+{
-+#if HEVC_RPI_420_ONLY
-+ return cidx == 0 ? 0 : 1;
-+#else
-+ return s->ps.sps->vshift[cidx];
-+#endif
-+}
-+
-+static inline int ctx_cfmt(const HEVCRpiContext * const s)
-+{
-+#if HEVC_RPI_420_ONLY
-+ return 1;
-+#else
-+ return s->ps.sps->chroma_format_idc;
-+#endif
-+}
-+
-+static inline int frame_stride1(const AVFrame * const frame, const int c_idx)
-+{
-+#if HEVC_RPI_SAND128_ONLY
-+ return 128;
-+#else
-+ return frame->linesize[c_idx];
-+#endif
-+}
-+
-+#if HEVC_RPI_SAND128_ONLY
-+// Propagate this decision to later zc includes
-+#define RPI_ZC_SAND128_ONLY 1
-+#endif
-+
-+#ifndef ff_hevc_rpi_copy_vert
-+static inline void ff_hevc_rpi_copy_vert(uint8_t *dst, const uint8_t *src,
-+ int pixel_shift, int height,
-+ ptrdiff_t stride_dst, ptrdiff_t stride_src)
-+{
-+ int i;
-+ switch (pixel_shift)
-+ {
-+ case 2:
-+ for (i = 0; i < height; i++) {
-+ *(uint32_t *)dst = *(uint32_t *)src;
-+ dst += stride_dst;
-+ src += stride_src;
-+ }
-+ break;
-+ case 1:
-+ for (i = 0; i < height; i++) {
-+ *(uint16_t *)dst = *(uint16_t *)src;
-+ dst += stride_dst;
-+ src += stride_src;
-+ }
-+ break;
-+ default:
-+ for (i = 0; i < height; i++) {
-+ *dst = *src;
-+ dst += stride_dst;
-+ src += stride_src;
-+ }
-+ break;
-+ }
-+}
-+#endif
-+
-+
-+#if MVF_STASH_WIDTH == 64
-+static inline HEVCRpiMvField* mvf_stash_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc,
-+ const unsigned int x, const unsigned int y)
-+{
-+ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size);
-+ return (HEVCRpiMvField*)(lc->mvf_stash + ((y & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU + ((x & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE));
-+}
-+
-+static inline HEVCRpiMvField* mvf_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc,
-+ const unsigned int x0, const unsigned int y0,
-+ const unsigned int x, const unsigned int y)
-+{
-+ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size);
-+ const unsigned int x0_ctb = x0 & mask_cs_hi;
-+ const unsigned int y0_ctb = y0 & mask_cs_hi;
-+
-+ return (HEVCRpiMvField *)((y < y0_ctb) ?
-+ (x < x0_ctb ? lc->mvf_ul : s->mvf_up + (x >> LOG2_MIN_PU_SIZE)) :
-+ (x < x0_ctb ? s->mvf_left + (y >> LOG2_MIN_PU_SIZE) :
-+ lc->mvf_stash +
-+ ((y & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU +
-+ ((x & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE)));
-+}
-+
-+static inline unsigned int mvf_left_stride(const HEVCRpiContext *const s,
-+ const unsigned int x0,
-+ const unsigned int x)
-+{
-+ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size);
-+ const unsigned int x0_ctb = x0 & mask_cs_hi;
-+ return x < x0_ctb ? 1 : MVF_STASH_WIDTH_PU;
-+}
-+
-+#else
-+static inline HEVCRpiMvField* mvf_stash_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc,
-+ const unsigned int x, const unsigned int y)
-+{
-+ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size);
-+ return (HEVCRpiMvField*)(lc->mvf_stash + ((y & ~mask_cs_hi) >> LOG2_MIN_PU_SIZE) * MVF_STASH_WIDTH_PU + ((x >> LOG2_MIN_PU_SIZE) & (MVF_STASH_WIDTH_PU - 1)));
-+}
-+
-+static inline HEVCRpiMvField* mvf_ptr(const HEVCRpiContext *const s, const HEVCRpiLocalContext * const lc,
-+ const unsigned int x0, const unsigned int y0,
-+ const unsigned int x, const unsigned int y)
-+{
-+ const unsigned int mask_cs_hi = (~0U << s->ps.sps->log2_ctb_size);
-+
-+ const unsigned int x0_ctb = x0 & mask_cs_hi;
-+ const unsigned int y0_ctb = y0 & mask_cs_hi;
-+
-+ // If not in the same CTB for Y assume up
-+ if (y < y0_ctb) {
-+ // If not in the same CTB for X too assume up-left
-+ return (HEVCRpiMvField *)(x < x0_ctb ? lc->mvf_ul : s->mvf_up + (x >> LOG2_MIN_PU_SIZE));
-+ }
-+ return mvf_stash_ptr(s, lc, x, y);
-+}
-+
-+static inline unsigned int mvf_left_stride(const HEVCRpiContext *const s,
-+ const unsigned int x0,
-+ const unsigned int x)
-+{
-+ return MVF_STASH_WIDTH_PU;
-+}
-+#endif
-+
-+#endif /* AVCODEC_RPI_HEVCDEC_H */
-diff --git a/libavcodec/rpi_hevcdsp.c b/libavcodec/rpi_hevcdsp.c
-new file mode 100644
-index 0000000000..ac29789e7f
---- /dev/null
-+++ b/libavcodec/rpi_hevcdsp.c
-@@ -0,0 +1,450 @@
-+/*
-+ * HEVC video decoder
-+ *
-+ * Copyright (C) 2012 - 2013 Guillaume Martres
-+ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere
-+ *
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "rpi_hevcdsp.h"
-+#include "rpi_hevc_mv.h"
-+
-+static const int8_t transform[32][32] = {
-+ { 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
-+ 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
-+ { 90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4,
-+ -4, -13, -22, -31, -38, -46, -54, -61, -67, -73, -78, -82, -85, -88, -90, -90 },
-+ { 90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90,
-+ -90, -87, -80, -70, -57, -43, -25, -9, 9, 25, 43, 57, 70, 80, 87, 90 },
-+ { 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13,
-+ 13, 38, 61, 78, 88, 90, 85, 73, 54, 31, 4, -22, -46, -67, -82, -90 },
-+ { 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89,
-+ 89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89 },
-+ { 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22,
-+ -22, -61, -85, -90, -73, -38, 4, 46, 78, 90, 82, 54, 13, -31, -67, -88 },
-+ { 87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87,
-+ -87, -57, -9, 43, 80, 90, 70, 25, -25, -70, -90, -80, -43, 9, 57, 87 },
-+ { 85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31,
-+ 31, 78, 90, 61, 4, -54, -88, -82, -38, 22, 73, 90, 67, 13, -46, -85 },
-+ { 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83,
-+ 83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83 },
-+ { 82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38,
-+ -38, -88, -73, -4, 67, 90, 46, -31, -85, -78, -13, 61, 90, 54, -22, -82 },
-+ { 80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80,
-+ -80, -9, 70, 87, 25, -57, -90, -43, 43, 90, 57, -25, -87, -70, 9, 80 },
-+ { 78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46,
-+ 46, 90, 38, -54, -90, -31, 61, 88, 22, -67, -85, -13, 73, 82, 4, -78 },
-+ { 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75,
-+ 75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75 },
-+ { 73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54,
-+ -54, -85, 4, 88, 46, -61, -82, 13, 90, 38, -67, -78, 22, 90, 31, -73 },
-+ { 70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70,
-+ -70, 43, 87, -9, -90, -25, 80, 57, -57, -80, 25, 90, 9, -87, -43, 70 },
-+ { 67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61,
-+ 61, 73, -46, -82, 31, 88, -13, -90, -4, 90, 22, -85, -38, 78, 54, -67 },
-+ { 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64,
-+ 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64 },
-+ { 61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67,
-+ -67, -54, 78, 38, -85, -22, 90, 4, -90, 13, 88, -31, -82, 46, 73, -61 },
-+ { 57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57,
-+ -57, 80, 25, -90, 9, 87, -43, -70, 70, 43, -87, -9, 90, -25, -80, 57 },
-+ { 54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73,
-+ 73, 31, -90, 22, 78, -67, -38, 90, -13, -82, 61, 46, -88, 4, 85, -54 },
-+ { 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50,
-+ 50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50 },
-+ { 46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78,
-+ -78, -4, 82, -73, -13, 85, -67, -22, 88, -61, -31, 90, -54, -38, 90, -46 },
-+ { 43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43,
-+ -43, 90, -57, -25, 87, -70, -9, 80, -80, 9, 70, -87, 25, 57, -90, 43 },
-+ { 38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82,
-+ 82, -22, -54, 90, -61, -13, 78, -85, 31, 46, -90, 67, 4, -73, 88, -38 },
-+ { 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36,
-+ 36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36 },
-+ { 31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85,
-+ -85, 46, 13, -67, 90, -73, 22, 38, -82, 88, -54, -4, 61, -90, 78, -31 },
-+ { 25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25,
-+ -25, 70, -90, 80, -43, -9, 57, -87, 87, -57, 9, 43, -80, 90, -70, 25 },
-+ { 22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88,
-+ 88, -67, 31, 13, -54, 82, -90, 78, -46, 4, 38, -73, 90, -85, 61, -22 },
-+ { 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18,
-+ 18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18 },
-+ { 13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90,
-+ -90, 82, -67, 46, -22, -4, 31, -54, 73, -85, 90, -88, 78, -61, 38, -13 },
-+ { 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9,
-+ -9, 25, -43, 57, -70, 80, -87, 90, -90, 87, -80, 70, -57, 43, -25, 9 },
-+ { 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90,
-+ 90, -90, 88, -85, 82, -78, 73, -67, 61, -54, 46, -38, 31, -22, 13, -4 },
-+};
-+
-+DECLARE_ALIGNED(16, const int8_t, ff_hevc_rpi_epel_filters[7][4]) = {
-+ { -2, 58, 10, -2},
-+ { -4, 54, 16, -2},
-+ { -6, 46, 28, -4},
-+ { -4, 36, 36, -4},
-+ { -4, 28, 46, -6},
-+ { -2, 16, 54, -4},
-+ { -2, 10, 58, -2},
-+};
-+
-+DECLARE_ALIGNED(16, const int8_t, ff_hevc_rpi_qpel_filters[3][16]) = {
-+ { -1, 4,-10, 58, 17, -5, 1, 0, -1, 4,-10, 58, 17, -5, 1, 0},
-+ { -1, 4,-11, 40, 40,-11, 4, -1, -1, 4,-11, 40, 40,-11, 4, -1},
-+ { 0, 1, -5, 17, 58,-10, 4, -1, 0, 1, -5, 17, 58,-10, 4, -1}
-+};
-+
-+#define BIT_DEPTH 8
-+#include "rpi_hevcdsp_template.c"
-+#undef BIT_DEPTH
-+
-+#define BIT_DEPTH 9
-+#include "rpi_hevcdsp_template.c"
-+#undef BIT_DEPTH
-+
-+#define BIT_DEPTH 10
-+#include "rpi_hevcdsp_template.c"
-+#undef BIT_DEPTH
-+
-+#define BIT_DEPTH 12
-+#include "rpi_hevcdsp_template.c"
-+#undef BIT_DEPTH
-+
-+static uint32_t hevc_deblocking_boundary_strengths(int pus, int dup, const HEVCRpiMvField *curr, const HEVCRpiMvField *neigh,
-+ const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
-+ int in_inc0, int in_inc1)
-+{
-+ int shift = 32;
-+ uint32_t bs = 0;
-+ for (; pus > 0; pus--) {
-+ int strength, out;
-+ int curr_refL0 = curr_rpl0[curr->ref_idx[0]];
-+ int curr_refL1 = curr_rpl1[curr->ref_idx[1]];
-+ int nr_idx0 = neigh->ref_idx[0];
-+ int nr_idx1 = neigh->ref_idx[1];
-+ int neigh_refL0 = neigh_rpl0[nr_idx0];
-+ int neigh_refL1 = neigh_rpl1[nr_idx1];
-+
-+ av_assert0(nr_idx0 >= 0 && nr_idx0 <=31);
-+ av_assert0(nr_idx1 >= 0 && nr_idx1 <=31);
-+
-+#if 1 // This more directly matches the original implementation
-+ if (curr->pred_flag == PF_BI && neigh->pred_flag == PF_BI) {
-+ // same L0 and L1
-+ if (curr_refL0 == neigh_refL0 &&
-+ curr_refL0 == curr_refL1 &&
-+ neigh_refL0 == neigh_refL1) {
-+ if ((FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[0])) >= 4 ||
-+ FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[1])) >= 4) &&
-+ (FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[0])) >= 4 ||
-+ FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[1])) >= 4))
-+ strength = 1;
-+ else
-+ strength = 0;
-+ } else if (neigh_refL0 == curr_refL0 &&
-+ neigh_refL1 == curr_refL1) {
-+ if (FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[0])) >= 4 ||
-+ FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[1])) >= 4)
-+ strength = 1;
-+ else
-+ strength = 0;
-+ } else if (neigh_refL1 == curr_refL0 &&
-+ neigh_refL0 == curr_refL1) {
-+ if (FFABS(MV_X(neigh->xy[1]) - MV_X(curr->xy[0])) >= 4 || FFABS(MV_Y(neigh->xy[1]) - MV_Y(curr->xy[0])) >= 4 ||
-+ FFABS(MV_X(neigh->xy[0]) - MV_X(curr->xy[1])) >= 4 || FFABS(MV_Y(neigh->xy[0]) - MV_Y(curr->xy[1])) >= 4)
-+ strength = 1;
-+ else
-+ strength = 0;
-+ } else {
-+ strength = 1;
-+ }
-+ } else if ((curr->pred_flag != PF_BI) && (neigh->pred_flag != PF_BI)){ // 1 MV
-+ MvXY curr_mv0, neigh_mv0;
-+
-+ if (curr->pred_flag & 1) {
-+ curr_mv0 = curr->xy[0];
-+ } else {
-+ curr_mv0 = curr->xy[1];
-+ curr_refL0 = curr_refL1;
-+ }
-+
-+ if (neigh->pred_flag & 1) {
-+ neigh_mv0 = neigh->xy[0];
-+ } else {
-+ neigh_mv0 = neigh->xy[1];
-+ neigh_refL0 = neigh_refL1;
-+ }
-+
-+ if (curr_refL0 == neigh_refL0) {
-+ if (FFABS(MV_X(curr_mv0) - MV_X(neigh_mv0)) >= 4 || FFABS(MV_Y(curr_mv0) - MV_Y(neigh_mv0)) >= 4)
-+ strength = 1;
-+ else
-+ strength = 0;
-+ } else
-+ strength = 1;
-+ } else
-+ strength = 1;
-+#else // This has exactly the same effect, but is more suitable for vectorisation
-+ MvXY curr_mv[2];
-+ MvXY neigh_mv[2];
-+ memcpy(curr_mv, curr->xy, sizeof curr_mv);
-+ memcpy(neigh_mv, neigh->xy, sizeof neigh_mv);
-+
-+ if (!(curr->pred_flag & 2)) {
-+ curr_mv[1] = curr_mv[0];
-+ curr_refL1 = curr_refL0;
-+ }
-+ if (!(neigh->pred_flag & 2)) {
-+ neigh_mv[1] = neigh_mv[0];
-+ neigh_refL1 = neigh_refL0;
-+ }
-+ if (!(curr->pred_flag & 1)) {
-+ curr_mv[0] = curr_mv[1];
-+ curr_refL0 = curr_refL1;
-+ }
-+ if (!(neigh->pred_flag & 1)) {
-+ neigh_mv[0] = neigh_mv[1];
-+ neigh_refL0 = neigh_refL1;
-+ }
-+
-+ strength = 1;
-+
-+ strength &= (neigh_refL0 != curr_refL0) | (neigh_refL1 != curr_refL1) |
-+ (FFABS(MV_X(neigh_mv[0]) - MV_X(curr_mv[0])) >= 4) | (FFABS(MV_Y(neigh_mv[0]) - MV_Y(curr_mv[0])) >= 4) |
-+ (FFABS(MV_X(neigh_mv[1]) - MV_X(curr_mv[1])) >= 4) | (FFABS(MV_Y(neigh_mv[1]) - MV_Y(curr_mv[1])) >= 4);
-+
-+ strength &= (neigh_refL1 != curr_refL0) | (neigh_refL0 != curr_refL1) |
-+ (FFABS(MV_X(neigh_mv[1]) - MV_X(curr_mv[0])) >= 4) | (FFABS(MV_Y(neigh_mv[1]) - MV_Y(curr_mv[0])) >= 4) |
-+ (FFABS(MV_X(neigh_mv[0]) - MV_X(curr_mv[1])) >= 4) | (FFABS(MV_Y(neigh_mv[0]) - MV_Y(curr_mv[1])) >= 4);
-+
-+ strength |= (((curr->pred_flag + 1) ^ (neigh->pred_flag + 1)) >> 2);
-+#endif
-+
-+ curr += in_inc0 / sizeof (HEVCRpiMvField);
-+ neigh += in_inc1 / sizeof (HEVCRpiMvField);
-+
-+ for (out = dup; out > 0; out--)
-+ {
-+ bs = (bs >> 2) | (strength << 30);
-+ shift -= 2;
-+ }
-+ }
-+ return bs >> shift;
-+}
-+
-+
-+static void cpy_blk(uint8_t *dst, unsigned int stride_dst, const uint8_t *src, unsigned stride_src, unsigned int width, unsigned int height)
-+{
-+ unsigned int i, j;
-+
-+ if (((intptr_t)dst | (intptr_t)src | stride_dst | stride_src) & 15) {
-+ for (i = 0; i < height; i++) {
-+ for (j = 0; j < width; j+=8)
-+ AV_COPY64U(dst+j, src+j);
-+ dst += stride_dst;
-+ src += stride_src;
-+ }
-+ } else {
-+ for (i = 0; i < height; i++) {
-+ for (j = 0; j < width; j+=16)
-+ AV_COPY128(dst+j, src+j);
-+ dst += stride_dst;
-+ src += stride_src;
-+ }
-+ }
-+}
-+
-+
-+
-+void ff_hevc_rpi_dsp_init(HEVCDSPContext *hevcdsp, int bit_depth)
-+{
-+#undef FUNC
-+#define FUNC(a, depth) a ## _ ## depth
-+
-+#undef PEL_FUNC
-+#define PEL_FUNC(dst1, idx1, idx2, a, depth) \
-+ for(i = 0 ; i < 10 ; i++) \
-+{ \
-+ hevcdsp->dst1[i][idx1][idx2] = a ## _ ## depth; \
-+}
-+
-+#undef EPEL_FUNCS
-+#define EPEL_FUNCS(depth) \
-+ PEL_FUNC(put_hevc_epel, 0, 0, put_hevc_pel_pixels, depth); \
-+ PEL_FUNC(put_hevc_epel, 0, 1, put_hevc_epel_h, depth); \
-+ PEL_FUNC(put_hevc_epel, 1, 0, put_hevc_epel_v, depth); \
-+ PEL_FUNC(put_hevc_epel, 1, 1, put_hevc_epel_hv, depth)
-+
-+#undef EPEL_UNI_FUNCS
-+#define EPEL_UNI_FUNCS(depth) \
-+ PEL_FUNC(put_hevc_epel_uni, 0, 0, put_hevc_pel_uni_pixels, depth); \
-+ PEL_FUNC(put_hevc_epel_uni, 0, 1, put_hevc_epel_uni_h, depth); \
-+ PEL_FUNC(put_hevc_epel_uni, 1, 0, put_hevc_epel_uni_v, depth); \
-+ PEL_FUNC(put_hevc_epel_uni, 1, 1, put_hevc_epel_uni_hv, depth); \
-+ PEL_FUNC(put_hevc_epel_uni_w, 0, 0, put_hevc_pel_uni_w_pixels, depth); \
-+ PEL_FUNC(put_hevc_epel_uni_w, 0, 1, put_hevc_epel_uni_w_h, depth); \
-+ PEL_FUNC(put_hevc_epel_uni_w, 1, 0, put_hevc_epel_uni_w_v, depth); \
-+ PEL_FUNC(put_hevc_epel_uni_w, 1, 1, put_hevc_epel_uni_w_hv, depth)
-+
-+#undef EPEL_BI_FUNCS
-+#define EPEL_BI_FUNCS(depth) \
-+ PEL_FUNC(put_hevc_epel_bi, 0, 0, put_hevc_pel_bi_pixels, depth); \
-+ PEL_FUNC(put_hevc_epel_bi, 0, 1, put_hevc_epel_bi_h, depth); \
-+ PEL_FUNC(put_hevc_epel_bi, 1, 0, put_hevc_epel_bi_v, depth); \
-+ PEL_FUNC(put_hevc_epel_bi, 1, 1, put_hevc_epel_bi_hv, depth); \
-+ PEL_FUNC(put_hevc_epel_bi_w, 0, 0, put_hevc_pel_bi_w_pixels, depth); \
-+ PEL_FUNC(put_hevc_epel_bi_w, 0, 1, put_hevc_epel_bi_w_h, depth); \
-+ PEL_FUNC(put_hevc_epel_bi_w, 1, 0, put_hevc_epel_bi_w_v, depth); \
-+ PEL_FUNC(put_hevc_epel_bi_w, 1, 1, put_hevc_epel_bi_w_hv, depth)
-+
-+#undef QPEL_FUNCS
-+#define QPEL_FUNCS(depth) \
-+ PEL_FUNC(put_hevc_qpel, 0, 0, put_hevc_pel_pixels, depth); \
-+ PEL_FUNC(put_hevc_qpel, 0, 1, put_hevc_qpel_h, depth); \
-+ PEL_FUNC(put_hevc_qpel, 1, 0, put_hevc_qpel_v, depth); \
-+ PEL_FUNC(put_hevc_qpel, 1, 1, put_hevc_qpel_hv, depth)
-+
-+#undef QPEL_UNI_FUNCS
-+#define QPEL_UNI_FUNCS(depth) \
-+ PEL_FUNC(put_hevc_qpel_uni, 0, 0, put_hevc_pel_uni_pixels, depth); \
-+ PEL_FUNC(put_hevc_qpel_uni, 0, 1, put_hevc_qpel_uni_h, depth); \
-+ PEL_FUNC(put_hevc_qpel_uni, 1, 0, put_hevc_qpel_uni_v, depth); \
-+ PEL_FUNC(put_hevc_qpel_uni, 1, 1, put_hevc_qpel_uni_hv, depth); \
-+ PEL_FUNC(put_hevc_qpel_uni_w, 0, 0, put_hevc_pel_uni_w_pixels, depth); \
-+ PEL_FUNC(put_hevc_qpel_uni_w, 0, 1, put_hevc_qpel_uni_w_h, depth); \
-+ PEL_FUNC(put_hevc_qpel_uni_w, 1, 0, put_hevc_qpel_uni_w_v, depth); \
-+ PEL_FUNC(put_hevc_qpel_uni_w, 1, 1, put_hevc_qpel_uni_w_hv, depth)
-+
-+#undef QPEL_BI_FUNCS
-+#define QPEL_BI_FUNCS(depth) \
-+ PEL_FUNC(put_hevc_qpel_bi, 0, 0, put_hevc_pel_bi_pixels, depth); \
-+ PEL_FUNC(put_hevc_qpel_bi, 0, 1, put_hevc_qpel_bi_h, depth); \
-+ PEL_FUNC(put_hevc_qpel_bi, 1, 0, put_hevc_qpel_bi_v, depth); \
-+ PEL_FUNC(put_hevc_qpel_bi, 1, 1, put_hevc_qpel_bi_hv, depth); \
-+ PEL_FUNC(put_hevc_qpel_bi_w, 0, 0, put_hevc_pel_bi_w_pixels, depth); \
-+ PEL_FUNC(put_hevc_qpel_bi_w, 0, 1, put_hevc_qpel_bi_w_h, depth); \
-+ PEL_FUNC(put_hevc_qpel_bi_w, 1, 0, put_hevc_qpel_bi_w_v, depth); \
-+ PEL_FUNC(put_hevc_qpel_bi_w, 1, 1, put_hevc_qpel_bi_w_hv, depth)
-+
-+#define SLICED_ADD_RESIDUAL(depth)\
-+ hevcdsp->add_residual_u[0] = FUNC(add_residual4x4_u, depth); \
-+ hevcdsp->add_residual_u[1] = FUNC(add_residual8x8_u, depth); \
-+ hevcdsp->add_residual_u[2] = FUNC(add_residual16x16_u, depth); \
-+ hevcdsp->add_residual_u[3] = FUNC(add_residual32x32_u, depth); \
-+ hevcdsp->add_residual_v[0] = FUNC(add_residual4x4_v, depth); \
-+ hevcdsp->add_residual_v[1] = FUNC(add_residual8x8_v, depth); \
-+ hevcdsp->add_residual_v[2] = FUNC(add_residual16x16_v, depth); \
-+ hevcdsp->add_residual_v[3] = FUNC(add_residual32x32_v, depth); \
-+ hevcdsp->add_residual_c[0] = FUNC(add_residual4x4_c, depth); \
-+ hevcdsp->add_residual_c[1] = FUNC(add_residual8x8_c, depth); \
-+ hevcdsp->add_residual_c[2] = FUNC(add_residual16x16_c, depth); \
-+ hevcdsp->add_residual_c[3] = FUNC(add_residual32x32_c, depth); \
-+ hevcdsp->add_residual_dc_c[0] = FUNC(add_residual4x4_dc_c, depth); \
-+ hevcdsp->add_residual_dc_c[1] = FUNC(add_residual8x8_dc_c, depth); \
-+ hevcdsp->add_residual_dc_c[2] = FUNC(add_residual16x16_dc_c, depth); \
-+ hevcdsp->add_residual_dc_c[3] = FUNC(add_residual32x32_dc_c, depth); \
-+ hevcdsp->put_pcm_c = FUNC(put_pcm_c, depth)
-+#define SLICED_LOOP_FILTERS(depth)\
-+ hevcdsp->hevc_h_loop_filter_luma2 = FUNC(hevc_h_loop_filter_luma2, depth); \
-+ hevcdsp->hevc_v_loop_filter_luma2 = FUNC(hevc_v_loop_filter_luma2, depth); \
-+ hevcdsp->hevc_h_loop_filter_uv = FUNC(hevc_h_loop_filter_uv, depth); \
-+ hevcdsp->hevc_v_loop_filter_uv2 = FUNC(hevc_v_loop_filter_uv2, depth)
-+#define SLICED_SAO(depth)\
-+ for (i = 0; i != SAO_FILTER_N; ++i) { \
-+ hevcdsp->sao_band_filter_c[i] = FUNC(sao_band_filter_c, depth); \
-+ hevcdsp->sao_edge_filter_c[i] = FUNC(sao_edge_filter_c, depth); \
-+ } \
-+ hevcdsp->sao_edge_restore_c[0] = FUNC(sao_edge_restore_c_0, depth); \
-+ hevcdsp->sao_edge_restore_c[1] = FUNC(sao_edge_restore_c_1, depth)
-+
-+#define HEVC_DSP(depth) \
-+ hevcdsp->put_pcm = FUNC(put_pcm, depth); \
-+ hevcdsp->add_residual[0] = FUNC(add_residual4x4, depth); \
-+ hevcdsp->add_residual[1] = FUNC(add_residual8x8, depth); \
-+ hevcdsp->add_residual[2] = FUNC(add_residual16x16, depth); \
-+ hevcdsp->add_residual[3] = FUNC(add_residual32x32, depth); \
-+ hevcdsp->add_residual_dc[0] = FUNC(add_residual4x4_dc, depth); \
-+ hevcdsp->add_residual_dc[1] = FUNC(add_residual8x8_dc, depth); \
-+ hevcdsp->add_residual_dc[2] = FUNC(add_residual16x16_dc, depth); \
-+ hevcdsp->add_residual_dc[3] = FUNC(add_residual32x32_dc, depth); \
-+ SLICED_ADD_RESIDUAL(depth); \
-+ hevcdsp->dequant = FUNC(dequant, depth); \
-+ hevcdsp->transform_rdpcm = FUNC(transform_rdpcm, depth); \
-+ hevcdsp->transform_4x4_luma = FUNC(transform_4x4_luma, depth); \
-+ hevcdsp->idct[0] = FUNC(idct_4x4, depth); \
-+ hevcdsp->idct[1] = FUNC(idct_8x8, depth); \
-+ hevcdsp->idct[2] = FUNC(idct_16x16, depth); \
-+ hevcdsp->idct[3] = FUNC(idct_32x32, depth); \
-+ \
-+ hevcdsp->idct_dc[0] = FUNC(idct_4x4_dc, depth); \
-+ hevcdsp->idct_dc[1] = FUNC(idct_8x8_dc, depth); \
-+ hevcdsp->idct_dc[2] = FUNC(idct_16x16_dc, depth); \
-+ hevcdsp->idct_dc[3] = FUNC(idct_32x32_dc, depth); \
-+ \
-+ for (i = 0; i != SAO_FILTER_N; ++i) { \
-+ hevcdsp->sao_band_filter[i] = FUNC(sao_band_filter, depth); \
-+ hevcdsp->sao_edge_filter[i] = FUNC(sao_edge_filter, depth); \
-+ } \
-+ hevcdsp->sao_edge_restore[0] = FUNC(sao_edge_restore_0, depth); \
-+ hevcdsp->sao_edge_restore[1] = FUNC(sao_edge_restore_1, depth); \
-+ SLICED_SAO(depth); \
-+ \
-+ QPEL_FUNCS(depth); \
-+ QPEL_UNI_FUNCS(depth); \
-+ QPEL_BI_FUNCS(depth); \
-+ EPEL_FUNCS(depth); \
-+ EPEL_UNI_FUNCS(depth); \
-+ EPEL_BI_FUNCS(depth); \
-+ \
-+ SLICED_LOOP_FILTERS(depth); \
-+ hevcdsp->hevc_h_loop_filter_luma = FUNC(hevc_h_loop_filter_luma, depth); \
-+ hevcdsp->hevc_v_loop_filter_luma = FUNC(hevc_v_loop_filter_luma, depth); \
-+ hevcdsp->hevc_h_loop_filter_chroma = FUNC(hevc_h_loop_filter_chroma, depth); \
-+ hevcdsp->hevc_v_loop_filter_chroma = FUNC(hevc_v_loop_filter_chroma, depth); \
-+ hevcdsp->hevc_h_loop_filter_luma_c = FUNC(hevc_h_loop_filter_luma, depth); \
-+ hevcdsp->hevc_v_loop_filter_luma_c = FUNC(hevc_v_loop_filter_luma, depth); \
-+ hevcdsp->hevc_h_loop_filter_chroma_c = FUNC(hevc_h_loop_filter_chroma, depth); \
-+ hevcdsp->hevc_v_loop_filter_chroma_c = FUNC(hevc_v_loop_filter_chroma, depth)
-+int i = 0;
-+
-+ switch (bit_depth) {
-+ case 9:
-+ HEVC_DSP(9);
-+ break;
-+ case 10:
-+ HEVC_DSP(10);
-+ break;
-+ case 12:
-+ HEVC_DSP(12);
-+ break;
-+ default:
-+ HEVC_DSP(8);
-+ break;
-+ }
-+
-+ hevcdsp->hevc_deblocking_boundary_strengths = hevc_deblocking_boundary_strengths;
-+ hevcdsp->cpy_blk = cpy_blk;
-+
-+ if (ARCH_PPC)
-+ ff_hevc_rpi_dsp_init_ppc(hevcdsp, bit_depth);
-+ if (ARCH_X86)
-+ ff_hevc_rpi_dsp_init_x86(hevcdsp, bit_depth);
-+ if (ARCH_ARM)
-+ ff_hevcdsp_rpi_init_arm(hevcdsp, bit_depth);
-+ if (ARCH_MIPS)
-+ ff_hevc_rpi_dsp_init_mips(hevcdsp, bit_depth);
-+}
-diff --git a/libavcodec/rpi_hevcdsp.h b/libavcodec/rpi_hevcdsp.h
-new file mode 100644
-index 0000000000..5a7cdeeb66
---- /dev/null
-+++ b/libavcodec/rpi_hevcdsp.h
-@@ -0,0 +1,177 @@
-+/*
-+ * HEVC video decoder
-+ *
-+ * Copyright (C) 2012 - 2013 Guillaume Martres
-+ * Copyright (C) 2013 - 2014 Pierre-Edouard Lepere
-+ *
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#ifndef AVCODEC_RPI_HEVCDSP_H
-+#define AVCODEC_RPI_HEVCDSP_H
-+
-+#include "hevc.h"
-+#include "get_bits.h"
-+
-+struct HEVCRpiMvField;
-+
-+#define MAX_PB_SIZE 64
-+
-+#define RPI_HEVC_SAO_BUF_STRIDE 160
-+
-+
-+typedef struct RpiSAOParams {
-+ uint8_t band_position[3]; ///< sao_band_position (Y,U,V)
-+ uint8_t eo_class[3]; ///< sao_eo_class (Y,U=V)
-+ uint8_t type_idx[3]; ///< sao_type_idx (Y,U=V)
-+
-+ int16_t offset_val[3][5]; ///<SaoOffsetVal (Y,U,V)
-+
-+} RpiSAOParams;
-+
-+
-+// This controls how many sao dsp functions there are
-+// N=5 has width = 8, 16, 32, 48, 64
-+// N=6 adds a function for width=24 (in fn array el 5 so existing code should
-+// still work)
-+#define SAO_FILTER_N 6
-+
-+
-+typedef struct HEVCDSPContext {
-+ void (*put_pcm)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
-+ struct GetBitContext *gb, int pcm_bit_depth);
-+
-+ void (*add_residual[4])(uint8_t *dst, int16_t *res, ptrdiff_t stride);
-+ void (*add_residual_dc[4])(uint8_t *dst, ptrdiff_t stride, int dc);
-+ void (*add_residual_u[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride, int dc_v);
-+ void (*add_residual_v[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride, int dc_u);
-+
-+ void (*add_residual_c[4])(uint8_t *dst, const int16_t *res, ptrdiff_t stride);
-+ void (*add_residual_dc_c[4])(uint8_t *dst, ptrdiff_t stride, int32_t dc_uv);
-+ void (*put_pcm_c)(uint8_t *_dst, ptrdiff_t _stride, int width, int height,
-+ struct GetBitContext *gb, int pcm_bit_depth);
-+
-+ void (*dequant)(int16_t *coeffs, int16_t log2_size);
-+
-+ void (*transform_rdpcm)(int16_t *coeffs, int16_t log2_size, int mode);
-+
-+ void (*transform_4x4_luma)(int16_t *coeffs);
-+
-+ void (*idct[4])(int16_t *coeffs, int col_limit);
-+
-+ void (*idct_dc[4])(int16_t *coeffs);
-+
-+ void (*sao_band_filter[SAO_FILTER_N])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
-+ int16_t *sao_offset_val, int sao_left_class, int width, int height);
-+ void (*sao_band_filter_c[SAO_FILTER_N])(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
-+ const int16_t *sao_offset_val_u, int sao_left_class_u,
-+ const int16_t *sao_offset_val_v, int sao_left_class_v,
-+ int width, int height);
-+
-+ /* implicit stride_src parameter has value of 2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE */
-+ void (*sao_edge_filter[SAO_FILTER_N])(uint8_t *_dst /* align 16 */, uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
-+ int16_t *sao_offset_val, int sao_eo_class, int width, int height);
-+ void (*sao_edge_filter_c[SAO_FILTER_N])(uint8_t *_dst /* align 16 */, const uint8_t *_src /* align 32 */, ptrdiff_t stride_dst,
-+ const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v, int sao_eo_class, int width, int height);
-+
-+ void (*sao_edge_restore[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
-+ struct RpiSAOParams *sao, int *borders, int _width, int _height, int c_idx,
-+ uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge);
-+ void (*sao_edge_restore_c[2])(uint8_t *_dst, uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src,
-+ struct RpiSAOParams *sao, int *borders, int _width, int _height, int c_idx,
-+ uint8_t *vert_edge, uint8_t *horiz_edge, uint8_t *diag_edge);
-+
-+ void (*put_hevc_qpel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
-+ int height, intptr_t mx, intptr_t my, int width);
-+ void (*put_hevc_qpel_uni[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *src, ptrdiff_t srcstride,
-+ int height, intptr_t mx, intptr_t my, int width);
-+ void (*put_hevc_qpel_uni_w[10][2][2])(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width);
-+
-+ void (*put_hevc_qpel_bi[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+ int16_t *src2,
-+ int height, intptr_t mx, intptr_t my, int width);
-+ void (*put_hevc_qpel_bi_w[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+ int16_t *src2,
-+ int height, int denom, int wx0, int wx1,
-+ int ox0, int ox1, intptr_t mx, intptr_t my, int width);
-+ void (*put_hevc_epel[10][2][2])(int16_t *dst, uint8_t *src, ptrdiff_t srcstride,
-+ int height, intptr_t mx, intptr_t my, int width);
-+
-+ void (*put_hevc_epel_uni[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+ int height, intptr_t mx, intptr_t my, int width);
-+ void (*put_hevc_epel_uni_w[10][2][2])(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width);
-+ void (*put_hevc_epel_bi[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+ int16_t *src2,
-+ int height, intptr_t mx, intptr_t my, int width);
-+ void (*put_hevc_epel_bi_w[10][2][2])(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+ int16_t *src2,
-+ int height, int denom, int wx0, int ox0, int wx1,
-+ int ox1, intptr_t mx, intptr_t my, int width);
-+
-+ void (*hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
-+ int beta, int32_t *tc,
-+ uint8_t *no_p, uint8_t *no_q);
-+ void (*hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
-+ int beta, int32_t *tc,
-+ uint8_t *no_p, uint8_t *no_q);
-+ void (*hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
-+ int32_t *tc, uint8_t *no_p, uint8_t *no_q);
-+ void (*hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
-+ int32_t *tc, uint8_t *no_p, uint8_t *no_q);
-+ void (*hevc_h_loop_filter_luma_c)(uint8_t *pix, ptrdiff_t stride,
-+ int beta, int32_t *tc,
-+ uint8_t *no_p, uint8_t *no_q);
-+ void (*hevc_v_loop_filter_luma_c)(uint8_t *pix, ptrdiff_t stride,
-+ int beta, int32_t *tc,
-+ uint8_t *no_p, uint8_t *no_q);
-+ void (*hevc_h_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
-+ int32_t *tc, uint8_t *no_p,
-+ uint8_t *no_q);
-+ void (*hevc_v_loop_filter_chroma_c)(uint8_t *pix, ptrdiff_t stride,
-+ int32_t *tc, uint8_t *no_p,
-+ uint8_t *no_q);
-+ void (*hevc_h_loop_filter_luma2)(uint8_t * _pix_r,
-+ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f);
-+ void (*hevc_v_loop_filter_luma2)(uint8_t * _pix_r,
-+ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f,
-+ uint8_t * _pix_l);
-+ void (*hevc_h_loop_filter_uv)(uint8_t * src, unsigned int stride, uint32_t tc4,
-+ unsigned int no_f);
-+ void (*hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4,
-+ uint8_t * src_l,
-+ unsigned int no_f);
-+
-+ uint32_t (*hevc_deblocking_boundary_strengths)(int pus, int dup, const struct HEVCRpiMvField *curr, const struct HEVCRpiMvField *neigh,
-+ const int *curr_rpl0, const int *curr_rpl1, const int *neigh_rpl0, const int *neigh_rpl1,
-+ int in_inc0, int inc_inc1);
-+
-+ void (* cpy_blk)(uint8_t * dst, unsigned int dst_stride, const uint8_t * src, unsigned int src_stride, unsigned int width, unsigned int height);
-+} HEVCDSPContext;
-+
-+void ff_hevc_rpi_dsp_init(HEVCDSPContext *hpc, int bit_depth);
-+
-+extern const int8_t ff_hevc_rpi_epel_filters[7][4];
-+extern const int8_t ff_hevc_rpi_qpel_filters[3][16];
-+
-+void ff_hevc_rpi_dsp_init_ppc(HEVCDSPContext *c, const int bit_depth);
-+void ff_hevc_rpi_dsp_init_x86(HEVCDSPContext *c, const int bit_depth);
-+void ff_hevcdsp_rpi_init_arm(HEVCDSPContext *c, const int bit_depth);
-+void ff_hevc_rpi_dsp_init_mips(HEVCDSPContext *c, const int bit_depth);
-+#endif /* AVCODEC_RPI_HEVCDSP_H */
-diff --git a/libavcodec/rpi_hevcdsp_template.c b/libavcodec/rpi_hevcdsp_template.c
-new file mode 100644
-index 0000000000..d1196a4440
---- /dev/null
-+++ b/libavcodec/rpi_hevcdsp_template.c
-@@ -0,0 +1,2278 @@
-+/*
-+ * HEVC video decoder
-+ *
-+ * Copyright (C) 2012 - 2013 Guillaume Martres
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "get_bits.h"
-+#include "rpi_hevcdec.h"
-+
-+#include "bit_depth_template.c"
-+#include "rpi_hevcdsp.h"
-+
-+#include "rpi_hevc_shader_template.h"
-+
-+static void FUNC(put_pcm)(uint8_t *_dst, ptrdiff_t stride, int width, int height,
-+ GetBitContext *gb, int pcm_bit_depth)
-+{
-+ int x, y;
-+ pixel *dst = (pixel *)_dst;
-+
-+ stride /= sizeof(pixel);
-+
-+ for (y = 0; y < height; y++) {
-+ for (x = 0; x < width; x++)
-+ dst[x] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth);
-+ dst += stride;
-+ }
-+}
-+
-+static void FUNC(put_pcm_c)(uint8_t *_dst, ptrdiff_t stride, int width, int height,
-+ GetBitContext *gb, int pcm_bit_depth)
-+{
-+ int x, y;
-+ pixel *dst = (pixel *)_dst;
-+
-+ stride /= sizeof(pixel);
-+
-+ for (y = 0; y < height; y++) {
-+ for (x = 0; x < width; x++)
-+ dst[x*2] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth);
-+ dst += stride;
-+ }
-+
-+ dst = (pixel *)_dst + 1;
-+ for (y = 0; y < height; y++) {
-+ for (x = 0; x < width; x++)
-+ dst[x*2] = get_bits(gb, pcm_bit_depth) << (BIT_DEPTH - pcm_bit_depth);
-+ dst += stride;
-+ }
-+}
-+
-+static av_always_inline void FUNC(add_residual)(uint8_t *_dst, int16_t *res,
-+ ptrdiff_t stride, int size)
-+{
-+ int x, y;
-+ pixel *dst = (pixel *)_dst;
-+
-+ stride /= sizeof(pixel);
-+
-+ for (y = 0; y < size; y++) {
-+ for (x = 0; x < size; x++) {
-+ dst[x] = av_clip_pixel(dst[x] + *res);
-+ res++;
-+ }
-+ dst += stride;
-+ }
-+}
-+
-+static av_always_inline void FUNC(add_residual_dc)(uint8_t *_dst, ptrdiff_t stride, const int dc, int size)
-+{
-+ int x, y;
-+ pixel *dst = (pixel *)_dst;
-+
-+ stride /= sizeof(pixel);
-+
-+ for (y = 0; y < size; y++) {
-+ for (x = 0; x < size; x++) {
-+ dst[x] = av_clip_pixel(dst[x] + dc);
-+ }
-+ dst += stride;
-+ }
-+}
-+
-+
-+static av_always_inline void FUNC(add_residual_u)(uint8_t *_dst, const int16_t *res,
-+ ptrdiff_t stride, const int dc_v, int size)
-+{
-+ int x, y;
-+ pixel *dst = (pixel *)_dst;
-+
-+ stride /= sizeof(pixel);
-+
-+ for (y = 0; y < size; y++) {
-+ for (x = 0; x < size * 2; x += 2) {
-+ dst[x] = av_clip_pixel(dst[x] + *res);
-+ dst[x + 1] = av_clip_pixel(dst[x + 1] + dc_v);
-+ res++;
-+ }
-+ dst += stride;
-+ }
-+}
-+
-+static av_always_inline void FUNC(add_residual_v)(uint8_t *_dst, const int16_t *res,
-+ ptrdiff_t stride, const int dc_u, int size)
-+{
-+ int x, y;
-+ pixel *dst = (pixel *)_dst;
-+
-+ stride /= sizeof(pixel);
-+
-+ for (y = 0; y < size; y++) {
-+ for (x = 0; x < size * 2; x += 2) {
-+ dst[x] = av_clip_pixel(dst[x] + dc_u);
-+ dst[x + 1] = av_clip_pixel(dst[x + 1] + *res);
-+ res++;
-+ }
-+ dst += stride;
-+ }
-+}
-+
-+static av_always_inline void FUNC(add_residual_c)(uint8_t *_dst, const int16_t *res,
-+ ptrdiff_t stride, unsigned int size)
-+{
-+ unsigned int x, y;
-+ pixel *dst = (pixel *)_dst;
-+ const int16_t * ru = res;
-+ const int16_t * rv = res + size * size;
-+
-+// rpi_sand_dump16("ARC In Pred", _dst, stride, 0, 0, 0, size, size, 1);
-+// rpi_sand_dump16("ARC In RU", ru, size * 2, 0, 0, 0, size, size, 0);
-+// rpi_sand_dump16("ARC In RV", rv, size * 2, 0, 0, 0, size, size, 0);
-+
-+ stride /= sizeof(pixel);
-+
-+ for (y = 0; y < size; y++) {
-+ for (x = 0; x < size * 2; x += 2) {
-+ dst[x + 0] = av_clip_pixel(dst[x + 0] + *ru++);
-+ dst[x + 1] = av_clip_pixel(dst[x + 1] + *rv++);
-+ }
-+ dst += stride;
-+ }
-+
-+// rpi_sand_dump16("ARC Out", _dst, stride * 2, 0, 0, 0, size, size, 1);
-+}
-+
-+
-+static av_always_inline void FUNC(add_residual_dc_c)(uint8_t *_dst, ptrdiff_t stride, const int32_t dc, int size)
-+{
-+ int x, y;
-+ pixel *dst = (pixel *)_dst;
-+ const int dc_v = dc >> 16;
-+ const int dc_u = (dc << 16) >> 16;
-+
-+ stride /= sizeof(pixel);
-+
-+ for (y = 0; y < size; y++) {
-+ for (x = 0; x < size * 2; x += 2) {
-+ dst[x] = av_clip_pixel(dst[x] + dc_u);
-+ dst[x + 1] = av_clip_pixel(dst[x + 1] + dc_v);
-+ }
-+ dst += stride;
-+ }
-+}
-+
-+
-+static void FUNC(add_residual4x4)(uint8_t *_dst, int16_t *res,
-+ ptrdiff_t stride)
-+{
-+ FUNC(add_residual)(_dst, res, stride, 4);
-+}
-+
-+static void FUNC(add_residual8x8)(uint8_t *_dst, int16_t *res,
-+ ptrdiff_t stride)
-+{
-+ FUNC(add_residual)(_dst, res, stride, 8);
-+}
-+
-+static void FUNC(add_residual16x16)(uint8_t *_dst, int16_t *res,
-+ ptrdiff_t stride)
-+{
-+ FUNC(add_residual)(_dst, res, stride, 16);
-+}
-+
-+static void FUNC(add_residual32x32)(uint8_t *_dst, int16_t *res,
-+ ptrdiff_t stride)
-+{
-+ FUNC(add_residual)(_dst, res, stride, 32);
-+}
-+
-+static void FUNC(add_residual4x4_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
-+{
-+ FUNC(add_residual_dc)(_dst, stride, dc, 4);
-+}
-+
-+static void FUNC(add_residual8x8_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
-+{
-+ FUNC(add_residual_dc)(_dst, stride, dc, 8);
-+}
-+
-+static void FUNC(add_residual16x16_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
-+{
-+ FUNC(add_residual_dc)(_dst, stride, dc, 16);
-+}
-+
-+static void FUNC(add_residual32x32_dc)(uint8_t *_dst, ptrdiff_t stride, int dc)
-+{
-+ FUNC(add_residual_dc)(_dst, stride, dc, 32);
-+}
-+
-+// -- U -- (plaited)
-+
-+static void FUNC(add_residual4x4_u)(uint8_t *_dst, const int16_t * res,
-+ ptrdiff_t stride, int dc_u)
-+{
-+ FUNC(add_residual_u)(_dst, res, stride, dc_u, 4);
-+}
-+
-+static void FUNC(add_residual8x8_u)(uint8_t *_dst, const int16_t * res,
-+ ptrdiff_t stride, int dc_u)
-+{
-+ FUNC(add_residual_u)(_dst, res, stride, dc_u, 8);
-+}
-+
-+static void FUNC(add_residual16x16_u)(uint8_t *_dst, const int16_t * res,
-+ ptrdiff_t stride, int dc_u)
-+{
-+ FUNC(add_residual_u)(_dst, res, stride, dc_u, 16);
-+}
-+
-+static void FUNC(add_residual32x32_u)(uint8_t *_dst, const int16_t * res,
-+ ptrdiff_t stride, int dc_u)
-+{
-+ // Should never occur for 420, which is all that sand supports
-+ av_assert0(0);
-+}
-+
-+// -- V -- (plaited)
-+
-+static void FUNC(add_residual4x4_v)(uint8_t *_dst, const int16_t * res,
-+ ptrdiff_t stride, int dc_v)
-+{
-+ FUNC(add_residual_v)(_dst, res, stride, dc_v, 4);
-+}
-+
-+static void FUNC(add_residual8x8_v)(uint8_t *_dst, const int16_t * res,
-+ ptrdiff_t stride, int dc_v)
-+{
-+ FUNC(add_residual_v)(_dst, res, stride, dc_v, 8);
-+}
-+
-+static void FUNC(add_residual16x16_v)(uint8_t *_dst, const int16_t * res,
-+ ptrdiff_t stride, int dc_v)
-+{
-+ FUNC(add_residual_v)(_dst, res, stride, dc_v, 16);
-+}
-+
-+static void FUNC(add_residual32x32_v)(uint8_t *_dst, const int16_t * res,
-+ ptrdiff_t stride, int dc_v)
-+{
-+ // Should never occur for 420, which is all that sand supports
-+ av_assert0(0);
-+}
-+
-+// -- C -- (plaited - both U & V)
-+
-+static void FUNC(add_residual4x4_c)(uint8_t *_dst, const int16_t * res,
-+ ptrdiff_t stride)
-+{
-+ FUNC(add_residual_c)(_dst, res, stride, 4);
-+}
-+
-+static void FUNC(add_residual8x8_c)(uint8_t *_dst, const int16_t * res,
-+ ptrdiff_t stride)
-+{
-+ FUNC(add_residual_c)(_dst, res, stride, 8);
-+}
-+
-+static void FUNC(add_residual16x16_c)(uint8_t *_dst, const int16_t * res,
-+ ptrdiff_t stride)
-+{
-+ FUNC(add_residual_c)(_dst, res, stride, 16);
-+}
-+
-+static void FUNC(add_residual32x32_c)(uint8_t *_dst, const int16_t * res,
-+ ptrdiff_t stride)
-+{
-+ // Should never occur for 420, which is all that sand supports
-+ av_assert0(0);
-+}
-+
-+static void FUNC(add_residual4x4_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
-+{
-+ FUNC(add_residual_dc_c)(_dst, stride, dc, 4);
-+}
-+
-+static void FUNC(add_residual8x8_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
-+{
-+ FUNC(add_residual_dc_c)(_dst, stride, dc, 8);
-+}
-+
-+static void FUNC(add_residual16x16_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
-+{
-+ FUNC(add_residual_dc_c)(_dst, stride, dc, 16);
-+}
-+
-+static void FUNC(add_residual32x32_dc_c)(uint8_t *_dst, ptrdiff_t stride, int32_t dc)
-+{
-+ // Should never occur for 420, which is all that sand supports
-+ av_assert0(0);
-+}
-+
-+
-+static void FUNC(transform_rdpcm)(int16_t *_coeffs, int16_t log2_size, int mode)
-+{
-+ int16_t *coeffs = (int16_t *) _coeffs;
-+ int x, y;
-+ int size = 1 << log2_size;
-+
-+ if (mode) {
-+ coeffs += size;
-+ for (y = 0; y < size - 1; y++) {
-+ for (x = 0; x < size; x++)
-+ coeffs[x] += coeffs[x - size];
-+ coeffs += size;
-+ }
-+ } else {
-+ for (y = 0; y < size; y++) {
-+ for (x = 1; x < size; x++)
-+ coeffs[x] += coeffs[x - 1];
-+ coeffs += size;
-+ }
-+ }
-+}
-+
-+static void FUNC(dequant)(int16_t *coeffs, int16_t log2_size)
-+{
-+ int shift = 15 - BIT_DEPTH - log2_size;
-+ int x, y;
-+ int size = 1 << log2_size;
-+
-+ if (shift > 0) {
-+ int offset = 1 << (shift - 1);
-+ for (y = 0; y < size; y++) {
-+ for (x = 0; x < size; x++) {
-+ *coeffs = (*coeffs + offset) >> shift;
-+ coeffs++;
-+ }
-+ }
-+ } else {
-+ for (y = 0; y < size; y++) {
-+ for (x = 0; x < size; x++) {
-+ *coeffs = *coeffs << -shift;
-+ coeffs++;
-+ }
-+ }
-+ }
-+}
-+
-+#define SET(dst, x) (dst) = (x)
-+#define SCALE(dst, x) (dst) = av_clip_int16(((x) + add) >> shift)
-+
-+#define TR_4x4_LUMA(dst, src, step, assign) \
-+ do { \
-+ int c0 = src[0 * step] + src[2 * step]; \
-+ int c1 = src[2 * step] + src[3 * step]; \
-+ int c2 = src[0 * step] - src[3 * step]; \
-+ int c3 = 74 * src[1 * step]; \
-+ \
-+ assign(dst[2 * step], 74 * (src[0 * step] - \
-+ src[2 * step] + \
-+ src[3 * step])); \
-+ assign(dst[0 * step], 29 * c0 + 55 * c1 + c3); \
-+ assign(dst[1 * step], 55 * c2 - 29 * c1 + c3); \
-+ assign(dst[3 * step], 55 * c0 + 29 * c2 - c3); \
-+ } while (0)
-+
-+static void FUNC(transform_4x4_luma)(int16_t *coeffs)
-+{
-+ int i;
-+ int shift = 7;
-+ int add = 1 << (shift - 1);
-+ int16_t *src = coeffs;
-+
-+ for (i = 0; i < 4; i++) {
-+ TR_4x4_LUMA(src, src, 4, SCALE);
-+ src++;
-+ }
-+
-+ shift = 20 - BIT_DEPTH;
-+ add = 1 << (shift - 1);
-+ for (i = 0; i < 4; i++) {
-+ TR_4x4_LUMA(coeffs, coeffs, 1, SCALE);
-+ coeffs += 4;
-+ }
-+}
-+
-+#undef TR_4x4_LUMA
-+
-+#define TR_4(dst, src, dstep, sstep, assign, end) \
-+ do { \
-+ const int e0 = 64 * src[0 * sstep] + 64 * src[2 * sstep]; \
-+ const int e1 = 64 * src[0 * sstep] - 64 * src[2 * sstep]; \
-+ const int o0 = 83 * src[1 * sstep] + 36 * src[3 * sstep]; \
-+ const int o1 = 36 * src[1 * sstep] - 83 * src[3 * sstep]; \
-+ \
-+ assign(dst[0 * dstep], e0 + o0); \
-+ assign(dst[1 * dstep], e1 + o1); \
-+ assign(dst[2 * dstep], e1 - o1); \
-+ assign(dst[3 * dstep], e0 - o0); \
-+ } while (0)
-+
-+#define TR_8(dst, src, dstep, sstep, assign, end) \
-+ do { \
-+ int i, j; \
-+ int e_8[4]; \
-+ int o_8[4] = { 0 }; \
-+ for (i = 0; i < 4; i++) \
-+ for (j = 1; j < end; j += 2) \
-+ o_8[i] += transform[4 * j][i] * src[j * sstep]; \
-+ TR_4(e_8, src, 1, 2 * sstep, SET, 4); \
-+ \
-+ for (i = 0; i < 4; i++) { \
-+ assign(dst[i * dstep], e_8[i] + o_8[i]); \
-+ assign(dst[(7 - i) * dstep], e_8[i] - o_8[i]); \
-+ } \
-+ } while (0)
-+
-+#define TR_16(dst, src, dstep, sstep, assign, end) \
-+ do { \
-+ int i, j; \
-+ int e_16[8]; \
-+ int o_16[8] = { 0 }; \
-+ for (i = 0; i < 8; i++) \
-+ for (j = 1; j < end; j += 2) \
-+ o_16[i] += transform[2 * j][i] * src[j * sstep]; \
-+ TR_8(e_16, src, 1, 2 * sstep, SET, 8); \
-+ \
-+ for (i = 0; i < 8; i++) { \
-+ assign(dst[i * dstep], e_16[i] + o_16[i]); \
-+ assign(dst[(15 - i) * dstep], e_16[i] - o_16[i]); \
-+ } \
-+ } while (0)
-+
-+#define TR_32(dst, src, dstep, sstep, assign, end) \
-+ do { \
-+ int i, j; \
-+ int e_32[16]; \
-+ int o_32[16] = { 0 }; \
-+ for (i = 0; i < 16; i++) \
-+ for (j = 1; j < end; j += 2) \
-+ o_32[i] += transform[j][i] * src[j * sstep]; \
-+ TR_16(e_32, src, 1, 2 * sstep, SET, end / 2); \
-+ \
-+ for (i = 0; i < 16; i++) { \
-+ assign(dst[i * dstep], e_32[i] + o_32[i]); \
-+ assign(dst[(31 - i) * dstep], e_32[i] - o_32[i]); \
-+ } \
-+ } while (0)
-+
-+#define IDCT_VAR4(H) \
-+ int limit2 = FFMIN(col_limit + 4, H)
-+#define IDCT_VAR8(H) \
-+ int limit = FFMIN(col_limit, H); \
-+ int limit2 = FFMIN(col_limit + 4, H)
-+#define IDCT_VAR16(H) IDCT_VAR8(H)
-+#define IDCT_VAR32(H) IDCT_VAR8(H)
-+
-+#define IDCT(H) \
-+static void FUNC(idct_ ## H ## x ## H )(int16_t *coeffs, \
-+ int col_limit) \
-+{ \
-+ int i; \
-+ int shift = 7; \
-+ int add = 1 << (shift - 1); \
-+ int16_t *src = coeffs; \
-+ IDCT_VAR ## H(H); \
-+ \
-+ for (i = 0; i < H; i++) { \
-+ TR_ ## H(src, src, H, H, SCALE, limit2); \
-+ if (limit2 < H && i%4 == 0 && !!i) \
-+ limit2 -= 4; \
-+ src++; \
-+ } \
-+ \
-+ shift = 20 - BIT_DEPTH; \
-+ add = 1 << (shift - 1); \
-+ for (i = 0; i < H; i++) { \
-+ TR_ ## H(coeffs, coeffs, 1, 1, SCALE, limit); \
-+ coeffs += H; \
-+ } \
-+}
-+
-+#define IDCT_DC(H) \
-+static void FUNC(idct_ ## H ## x ## H ## _dc)(int16_t *coeffs) \
-+{ \
-+ int i, j; \
-+ int shift = 14 - BIT_DEPTH; \
-+ int add = 1 << (shift - 1); \
-+ int coeff = (((coeffs[0] + 1) >> 1) + add) >> shift; \
-+ \
-+ for (j = 0; j < H; j++) { \
-+ for (i = 0; i < H; i++) { \
-+ coeffs[i + j * H] = coeff; \
-+ } \
-+ } \
-+}
-+
-+IDCT( 4)
-+IDCT( 8)
-+IDCT(16)
-+IDCT(32)
-+
-+IDCT_DC( 4)
-+IDCT_DC( 8)
-+IDCT_DC(16)
-+IDCT_DC(32)
-+
-+#undef TR_4
-+#undef TR_8
-+#undef TR_16
-+#undef TR_32
-+
-+#undef SET
-+#undef SCALE
-+
-+static void FUNC(sao_band_filter)(uint8_t *_dst, uint8_t *_src,
-+ ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+ int16_t *sao_offset_val, int sao_left_class,
-+ int width, int height)
-+{
-+ pixel *dst = (pixel *)_dst;
-+ pixel *src = (pixel *)_src;
-+ int offset_table[32] = { 0 };
-+ int k, y, x;
-+ int shift = BIT_DEPTH - 5;
-+
-+ stride_dst /= sizeof(pixel);
-+ stride_src /= sizeof(pixel);
-+
-+ for (k = 0; k < 4; k++)
-+ offset_table[(k + sao_left_class) & 31] = sao_offset_val[k + 1];
-+ for (y = 0; y < height; y++) {
-+ for (x = 0; x < width; x++)
-+ dst[x] = av_clip_pixel(src[x] + offset_table[src[x] >> shift]);
-+ dst += stride_dst;
-+ src += stride_src;
-+ }
-+}
-+
-+#define CMP(a, b) (((a) > (b)) - ((a) < (b)))
-+
-+static void FUNC(sao_edge_filter)(uint8_t *_dst, uint8_t *_src, ptrdiff_t stride_dst, int16_t *sao_offset_val,
-+ int eo, int width, int height) {
-+
-+ static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
-+ static const int8_t pos[4][2][2] = {
-+ { { -1, 0 }, { 1, 0 } }, // horizontal
-+ { { 0, -1 }, { 0, 1 } }, // vertical
-+ { { -1, -1 }, { 1, 1 } }, // 45 degree
-+ { { 1, -1 }, { -1, 1 } }, // 135 degree
-+ };
-+ pixel *dst = (pixel *)_dst;
-+ pixel *src = (pixel *)_src;
-+ int a_stride, b_stride;
-+ int x, y;
-+ const ptrdiff_t stride_src = RPI_HEVC_SAO_BUF_STRIDE / sizeof(pixel);
-+ stride_dst /= sizeof(pixel);
-+
-+ a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src;
-+ b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src;
-+ for (y = 0; y < height; y++) {
-+ for (x = 0; x < width; x++) {
-+ int diff0 = CMP(src[x], src[x + a_stride]);
-+ int diff1 = CMP(src[x], src[x + b_stride]);
-+ int offset_val = edge_idx[2 + diff0 + diff1];
-+ dst[x] = av_clip_pixel(src[x] + sao_offset_val[offset_val]);
-+ }
-+ src += stride_src;
-+ dst += stride_dst;
-+ }
-+}
-+
-+
-+#if BIT_DEPTH == 10
-+// We need a 32 bit variation for the _c restores so hijack bit depth 10
-+#undef pixel
-+#undef BIT_DEPTH
-+#define pixel uint32_t
-+#define BIT_DEPTH 32
-+// All 16 bit variations are the same
-+#define sao_edge_restore_0_10 sao_edge_restore_0_9
-+#define sao_edge_restore_1_10 sao_edge_restore_1_9
-+#define sao_edge_restore_0_11 sao_edge_restore_0_9
-+#define sao_edge_restore_1_11 sao_edge_restore_1_9
-+#define sao_edge_restore_0_12 sao_edge_restore_0_9
-+#define sao_edge_restore_1_12 sao_edge_restore_1_9
-+#define sao_edge_restore_0_13 sao_edge_restore_0_9
-+#define sao_edge_restore_1_13 sao_edge_restore_1_9
-+#define sao_edge_restore_0_14 sao_edge_restore_0_9
-+#define sao_edge_restore_1_14 sao_edge_restore_1_9
-+#define sao_edge_restore_0_15 sao_edge_restore_0_9
-+#define sao_edge_restore_1_15 sao_edge_restore_1_9
-+#define sao_edge_restore_0_16 sao_edge_restore_0_9
-+#define sao_edge_restore_1_16 sao_edge_restore_1_9
-+#endif
-+#if BIT_DEPTH <= 9 || BIT_DEPTH == 32
-+static void FUNC(sao_edge_restore_0)(uint8_t *_dst, uint8_t *_src,
-+ ptrdiff_t stride_dst, ptrdiff_t stride_src, RpiSAOParams *sao,
-+ int *borders, int _width, int _height,
-+ int c_idx, uint8_t *vert_edge,
-+ uint8_t *horiz_edge, uint8_t *diag_edge)
-+{
-+ int x, y;
-+ pixel *dst = (pixel *)_dst;
-+ pixel *src = (pixel *)_src;
-+ int sao_eo_class = sao->eo_class[c_idx];
-+ int init_x = 0, width = _width, height = _height;
-+
-+ stride_dst /= sizeof(pixel);
-+ stride_src /= sizeof(pixel);
-+
-+ if (sao_eo_class != SAO_EO_VERT) {
-+ if (borders[0]) {
-+ for (y = 0; y < height; y++) {
-+ dst[y * stride_dst] = src[y * stride_src];
-+ }
-+ init_x = 1;
-+ }
-+ if (borders[2]) {
-+ int offset = width - 1;
-+ for (x = 0; x < height; x++) {
-+ dst[x * stride_dst + offset] = src[x * stride_src + offset];
-+ }
-+ width--;
-+ }
-+ }
-+ if (sao_eo_class != SAO_EO_HORIZ) {
-+ if (borders[1]) {
-+ for (x = init_x; x < width; x++)
-+ dst[x] = src[x];
-+ }
-+ if (borders[3]) {
-+ ptrdiff_t y_stride_dst = stride_dst * (height - 1);
-+ ptrdiff_t y_stride_src = stride_src * (height - 1);
-+ for (x = init_x; x < width; x++)
-+ dst[x + y_stride_dst] = src[x + y_stride_src];
-+ height--;
-+ }
-+ }
-+}
-+
-+// Copy back from _src to _dst the edge pixels of a SAO edge-offset block.
-+// As read from the code below:
-+//   borders[0..3]  - non-zero => restore left column / top row / right column / bottom row
-+//   vert_edge[0/1] - non-zero => restore left / right column
-+//   horiz_edge[0/1]- non-zero => restore top / bottom row
-+//   diag_edge[0..3]- non-zero => restore upper-left / upper-right /
-+//                    lower-right / lower-left corner pixel
-+// stride_dst and stride_src arrive in bytes and are converted to pixel units.
-+// The edge-offset class is read from sao->eo_class[c_idx].
-+static void FUNC(sao_edge_restore_1)(uint8_t *_dst, uint8_t *_src,
-+ ptrdiff_t stride_dst, ptrdiff_t stride_src, RpiSAOParams *sao,
-+ int *borders, int _width, int _height,
-+ int c_idx, uint8_t *vert_edge,
-+ uint8_t *horiz_edge, uint8_t *diag_edge)
-+{
-+ int x, y;
-+ pixel *dst = (pixel *)_dst;
-+ pixel *src = (pixel *)_src;
-+ int sao_eo_class = sao->eo_class[c_idx];
-+ int init_x = 0, init_y = 0, width = _width, height = _height;
-+
-+ stride_dst /= sizeof(pixel);
-+ stride_src /= sizeof(pixel);
-+
-+ // Left/right columns are only restored when the class is not purely vertical
-+ if (sao_eo_class != SAO_EO_VERT) {
-+ if (borders[0]) {
-+ for (y = 0; y < height; y++) {
-+ dst[y * stride_dst] = src[y * stride_src];
-+ }
-+ // later row copies start at column 1
-+ init_x = 1;
-+ }
-+ if (borders[2]) {
-+ int offset = width - 1;
-+ // note: x is used as the row counter in this loop
-+ for (x = 0; x < height; x++) {
-+ dst[x * stride_dst + offset] = src[x * stride_src + offset];
-+ }
-+ // everything below sees the reduced width
-+ width--;
-+ }
-+ }
-+ // Top/bottom rows are only restored when the class is not purely horizontal
-+ if (sao_eo_class != SAO_EO_HORIZ) {
-+ if (borders[1]) {
-+ for (x = init_x; x < width; x++)
-+ dst[x] = src[x];
-+ // later column copies start at row 1
-+ init_y = 1;
-+ }
-+ if (borders[3]) {
-+ ptrdiff_t y_stride_dst = stride_dst * (height - 1);
-+ ptrdiff_t y_stride_src = stride_src * (height - 1);
-+ for (x = init_x; x < width; x++)
-+ dst[x + y_stride_dst] = src[x + y_stride_src];
-+ // everything below sees the reduced height
-+ height--;
-+ }
-+ }
-+
-+ {
-+ // Each save_* flag is 0 or 1 and shortens the matching end of the
-+ // column/row copies below by one pixel, leaving that corner untouched.
-+ int save_upper_left = !diag_edge[0] && sao_eo_class == SAO_EO_135D && !borders[0] && !borders[1];
-+ int save_upper_right = !diag_edge[1] && sao_eo_class == SAO_EO_45D && !borders[1] && !borders[2];
-+ int save_lower_right = !diag_edge[2] && sao_eo_class == SAO_EO_135D && !borders[2] && !borders[3];
-+ int save_lower_left = !diag_edge[3] && sao_eo_class == SAO_EO_45D && !borders[0] && !borders[3];
-+
-+ // Restore pixels that can't be modified
-+ if(vert_edge[0] && sao_eo_class != SAO_EO_VERT) {
-+ for(y = init_y+save_upper_left; y< height-save_lower_left; y++)
-+ dst[y*stride_dst] = src[y*stride_src];
-+ }
-+ if(vert_edge[1] && sao_eo_class != SAO_EO_VERT) {
-+ for(y = init_y+save_upper_right; y< height-save_lower_right; y++)
-+ dst[y*stride_dst+width-1] = src[y*stride_src+width-1];
-+ }
-+
-+ if(horiz_edge[0] && sao_eo_class != SAO_EO_HORIZ) {
-+ for(x = init_x+save_upper_left; x < width-save_upper_right; x++)
-+ dst[x] = src[x];
-+ }
-+ if(horiz_edge[1] && sao_eo_class != SAO_EO_HORIZ) {
-+ for(x = init_x+save_lower_left; x < width-save_lower_right; x++)
-+ dst[(height-1)*stride_dst+x] = src[(height-1)*stride_src+x];
-+ }
-+ // Single corner pixels, restored only for the matching diagonal class
-+ if(diag_edge[0] && sao_eo_class == SAO_EO_135D)
-+ dst[0] = src[0];
-+ if(diag_edge[1] && sao_eo_class == SAO_EO_45D)
-+ dst[width-1] = src[width-1];
-+ if(diag_edge[2] && sao_eo_class == SAO_EO_135D)
-+ dst[stride_dst*(height-1)+width-1] = src[stride_src*(height-1)+width-1];
-+ if(diag_edge[3] && sao_eo_class == SAO_EO_45D)
-+ dst[stride_dst*(height-1)] = src[stride_src*(height-1)];
-+
-+ }
-+}
-+#endif
-+#if BIT_DEPTH == 32
-+#undef BIT_DEPTH
-+#undef pixel
-+#define BIT_DEPTH 10
-+#define pixel uint16_t
-+#endif
-+
-+// --- Plaited chroma versions
-+
-+/*
-+ * SAO band-offset filter for plaited chroma: even samples of each row use the
-+ * U offset table, odd samples use the V offset table.
-+ *
-+ * _dst, _src            destination / source rows (strides are in bytes)
-+ * sao_offset_val_u/v    offsets; entries [1..4] are applied to the four bands
-+ *                       starting at sao_left_class_u/v (band index wraps at 32)
-+ * width                 number of U/V pairs per row (doubled internally)
-+ * height                number of rows
-+ *
-+ * The source is only read, so it is accessed through a const pointer.
-+ */
-+static void FUNC(sao_band_filter_c)(uint8_t *_dst, const uint8_t *_src,
-+                                    ptrdiff_t stride_dst, ptrdiff_t stride_src,
-+                                    const int16_t *sao_offset_val_u, int sao_left_class_u,
-+                                    const int16_t *sao_offset_val_v, int sao_left_class_v,
-+                                    int width, int height)
-+{
-+    pixel *dst = (pixel *)_dst;
-+    const pixel *src = (const pixel *)_src;
-+    int offset_table_u[32] = { 0 };
-+    int offset_table_v[32] = { 0 };
-+    int k, y, x;
-+    int shift = BIT_DEPTH - 5;
-+
-+    stride_dst /= sizeof(pixel);
-+    stride_src /= sizeof(pixel);
-+    width *= 2;
-+
-+    for (k = 0; k < 4; k++)
-+    {
-+        offset_table_u[(k + sao_left_class_u) & 31] = sao_offset_val_u[k + 1];
-+        offset_table_v[(k + sao_left_class_v) & 31] = sao_offset_val_v[k + 1];
-+    }
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x += 2)
-+        {
-+            // *** & 31 shouldn't be wanted but just now we generate broken input that
-+            // crashes us in 10-bit world
-+            dst[x + 0] = av_clip_pixel(src[x + 0] + offset_table_u[(src[x + 0] >> shift) & 31]);
-+            dst[x + 1] = av_clip_pixel(src[x + 1] + offset_table_v[(src[x + 1] >> shift) & 31]);
-+        }
-+        dst += stride_dst;
-+        src += stride_src;
-+    }
-+}
-+
-+/*
-+ * SAO edge-offset filter for plaited chroma: even samples of each row are
-+ * classified and offset with sao_offset_val_u, odd samples with
-+ * sao_offset_val_v.
-+ *
-+ * Each sample is compared (CMP) with its two neighbours along direction eo;
-+ * the two comparison results select, through edge_idx, which offset is added.
-+ * Horizontal neighbour steps are doubled so they land on the same component.
-+ *
-+ * The source pitch is fixed at RPI_HEVC_SAO_BUF_STRIDE bytes; stride_dst is in
-+ * bytes. width is the number of U/V pairs per row and is doubled internally;
-+ * the doubled width must not exceed 64 (asserted). The source is only read,
-+ * so it is accessed through a const pointer.
-+ */
-+static void FUNC(sao_edge_filter_c)(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst,
-+                                    const int16_t *sao_offset_val_u, const int16_t *sao_offset_val_v,
-+                                    int eo, int width, int height) {
-+
-+    static const uint8_t edge_idx[] = { 1, 2, 0, 3, 4 };
-+    // Neighbour displacements { dx, dy } for the two compared samples.
-+    // NOTE(review): the diagonal labels are kept from the original; the restore
-+    // code associates the upper-left corner with SAO_EO_135D, so the labels on
-+    // the last two rows may be swapped - confirm against the SAO_EO_* enum.
-+    static const int8_t pos[4][2][2] = {
-+        { { -1,  0 }, {  1, 0 } }, // horizontal
-+        { {  0, -1 }, {  0, 1 } }, // vertical
-+        { { -1, -1 }, {  1, 1 } }, // 45 degree
-+        { {  1, -1 }, { -1, 1 } }, // 135 degree
-+    };
-+    pixel *dst = (pixel *)_dst;
-+    const pixel *src = (const pixel *)_src;
-+    int a_stride, b_stride;
-+    int x, y;
-+    const ptrdiff_t stride_src = RPI_HEVC_SAO_BUF_STRIDE / sizeof(pixel);
-+
-+    stride_dst /= sizeof(pixel);
-+    width *= 2;
-+
-+    av_assert0(width <= 64);
-+
-+    // dx is scaled by 2 because U and V samples alternate within a row
-+    a_stride = pos[eo][0][0] * 2 + pos[eo][0][1] * stride_src;
-+    b_stride = pos[eo][1][0] * 2 + pos[eo][1][1] * stride_src;
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x += 2) {
-+            int diff0u = CMP(src[x], src[x + a_stride]);
-+            int diff1u = CMP(src[x], src[x + b_stride]);
-+            int offset_valu = edge_idx[2 + diff0u + diff1u];
-+            int diff0v = CMP(src[x+1], src[x+1 + a_stride]);
-+            int diff1v = CMP(src[x+1], src[x+1 + b_stride]);
-+            int offset_valv = edge_idx[2 + diff0v + diff1v];
-+            dst[x] = av_clip_pixel(src[x] + sao_offset_val_u[offset_valu]);
-+            dst[x+1] = av_clip_pixel(src[x+1] + sao_offset_val_v[offset_valv]);
-+        }
-+        src += stride_src;
-+        dst += stride_dst;
-+    }
-+}
-+
-+// Do once
-+// These aliases are only defined in the BIT_DEPTH == 8 pass of this template.
-+// They map the chroma (_c) edge-restore names for every bit depth 8..16 onto
-+// the plain restore functions of a wider element size.
-+#if BIT_DEPTH == 8
-+// Any old 2 byte 'normal' restore will work for these
-+// (presumably because one plaited U/V pair of 8-bit samples is 2 bytes - confirm)
-+#define sao_edge_restore_c_0_8 sao_edge_restore_0_16
-+#define sao_edge_restore_c_1_8 sao_edge_restore_1_16
-+// We need 32 bit for 9 bit+
-+// (presumably one U/V pair of 16-bit samples, i.e. 4 bytes - confirm)
-+#define sao_edge_restore_c_0_9 sao_edge_restore_0_32
-+#define sao_edge_restore_c_1_9 sao_edge_restore_1_32
-+#define sao_edge_restore_c_0_10 sao_edge_restore_0_32
-+#define sao_edge_restore_c_1_10 sao_edge_restore_1_32
-+#define sao_edge_restore_c_0_11 sao_edge_restore_0_32
-+#define sao_edge_restore_c_1_11 sao_edge_restore_1_32
-+#define sao_edge_restore_c_0_12 sao_edge_restore_0_32
-+#define sao_edge_restore_c_1_12 sao_edge_restore_1_32
-+#define sao_edge_restore_c_0_13 sao_edge_restore_0_32
-+#define sao_edge_restore_c_1_13 sao_edge_restore_1_32
-+#define sao_edge_restore_c_0_14 sao_edge_restore_0_32
-+#define sao_edge_restore_c_1_14 sao_edge_restore_1_32
-+#define sao_edge_restore_c_0_15 sao_edge_restore_0_32
-+#define sao_edge_restore_c_1_15 sao_edge_restore_1_32
-+#define sao_edge_restore_c_0_16 sao_edge_restore_0_32
-+#define sao_edge_restore_c_1_16 sao_edge_restore_1_32
-+#endif
-+
-+#undef CMP
-+
-+////////////////////////////////////////////////////////////////////////////////
-+//
-+////////////////////////////////////////////////////////////////////////////////
-+/*
-+ * Unfiltered copy into the 16-bit intermediate buffer: each source sample is
-+ * shifted left by (14 - BIT_DEPTH). dst rows are MAX_PB_SIZE elements apart,
-+ * _srcstride is in bytes. mx and my are not used.
-+ */
-+static void FUNC(put_hevc_pel_pixels)(int16_t *dst,
-+                                      uint8_t *_src, ptrdiff_t _srcstride,
-+                                      int height, intptr_t mx, intptr_t my, int width)
-+{
-+    const ptrdiff_t pitch = _srcstride / sizeof(pixel);
-+    const pixel *line = (const pixel *)_src;
-+    int row, col;
-+
-+    for (row = 0; row < height; row++, line += pitch, dst += MAX_PB_SIZE) {
-+        for (col = 0; col < width; col++)
-+            dst[col] = line[col] << (14 - BIT_DEPTH);
-+    }
-+}
-+
-+/*
-+ * Straight block copy: height rows of width pixels are memcpy'd from _src to
-+ * _dst. Both strides are given in bytes. mx and my are not used.
-+ */
-+static void FUNC(put_hevc_pel_uni_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                          int height, intptr_t mx, intptr_t my, int width)
-+{
-+    const ptrdiff_t in_pitch  = _srcstride / sizeof(pixel);
-+    const ptrdiff_t out_pitch = _dststride / sizeof(pixel);
-+    const pixel *line = (const pixel *)_src;
-+    pixel *out = (pixel *)_dst;
-+    int row;
-+
-+    for (row = 0; row < height; row++, line += in_pitch, out += out_pitch)
-+        memcpy(out, line, width * sizeof(pixel));
-+}
-+
-+/*
-+ * Combine unfiltered source samples with a second 16-bit prediction (src2):
-+ * (src << (14 - BIT_DEPTH)) + src2 + rounding, shifted right by
-+ * (15 - BIT_DEPTH) and clipped. src2 rows are MAX_PB_SIZE elements apart;
-+ * both pixel strides are in bytes. mx and my are not used.
-+ */
-+static void FUNC(put_hevc_pel_bi_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                         int16_t *src2,
-+                                         int height, intptr_t mx, intptr_t my, int width)
-+{
-+    const ptrdiff_t in_pitch  = _srcstride / sizeof(pixel);
-+    const ptrdiff_t out_pitch = _dststride / sizeof(pixel);
-+    const pixel *line = (const pixel *)_src;
-+    pixel *out = (pixel *)_dst;
-+    const int shift = 14 + 1 - BIT_DEPTH;
-+#if BIT_DEPTH < 14
-+    const int offset = 1 << (shift - 1);
-+#else
-+    const int offset = 0;
-+#endif
-+    int row, col;
-+
-+    for (row = 0; row < height; row++) {
-+        for (col = 0; col < width; col++) {
-+            const int sum = (line[col] << (14 - BIT_DEPTH)) + src2[col] + offset;
-+            out[col] = av_clip_pixel(sum >> shift);
-+        }
-+        line += in_pitch;
-+        out  += out_pitch;
-+        src2 += MAX_PB_SIZE;
-+    }
-+}
-+
-+/*
-+ * Weighted copy of unfiltered samples: the sample is scaled to 14 bits,
-+ * multiplied by wx, rounded and shifted right by (denom + 14 - BIT_DEPTH),
-+ * then ox (scaled by 1 << (BIT_DEPTH - 8)) is added before clipping.
-+ * Strides are in bytes. mx and my are not used.
-+ */
-+static void FUNC(put_hevc_pel_uni_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                            int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
-+{
-+    const ptrdiff_t in_pitch  = _srcstride / sizeof(pixel);
-+    const ptrdiff_t out_pitch = _dststride / sizeof(pixel);
-+    const pixel *line = (const pixel *)_src;
-+    pixel *out = (pixel *)_dst;
-+    const int shift = denom + 14 - BIT_DEPTH;
-+#if BIT_DEPTH < 14
-+    const int offset = 1 << (shift - 1);
-+#else
-+    const int offset = 0;
-+#endif
-+    int row, col;
-+
-+    ox *= 1 << (BIT_DEPTH - 8);
-+    for (row = 0; row < height; row++, line += in_pitch, out += out_pitch) {
-+        for (col = 0; col < width; col++) {
-+            const int weighted = (line[col] << (14 - BIT_DEPTH)) * wx + offset;
-+            out[col] = av_clip_pixel((weighted >> shift) + ox);
-+        }
-+    }
-+}
-+
-+/*
-+ * Weighted combination of unfiltered samples (weight wx1) with a second
-+ * 16-bit prediction src2 (weight wx0). The rounding/offset term is
-+ * (ox0 + ox1 + 1) * 2^log2Wd, and the sum is shifted right by (log2Wd + 1)
-+ * and clipped. src2 rows are MAX_PB_SIZE elements apart; pixel strides are in
-+ * bytes. mx and my are not used.
-+ */
-+static void FUNC(put_hevc_pel_bi_w_pixels)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                           int16_t *src2,
-+                                           int height, int denom, int wx0, int wx1,
-+                                           int ox0, int ox1, intptr_t mx, intptr_t my, int width)
-+{
-+    const ptrdiff_t in_pitch  = _srcstride / sizeof(pixel);
-+    const ptrdiff_t out_pitch = _dststride / sizeof(pixel);
-+    const pixel *line = (const pixel *)_src;
-+    pixel *out = (pixel *)_dst;
-+    const int shift  = 14 + 1 - BIT_DEPTH;
-+    const int log2Wd = denom + shift - 1;
-+    int rnd;
-+    int row, col;
-+
-+    ox0 *= 1 << (BIT_DEPTH - 8);
-+    ox1 *= 1 << (BIT_DEPTH - 8);
-+    rnd  = (ox0 + ox1 + 1) * (1 << log2Wd);
-+
-+    for (row = 0; row < height; row++) {
-+        for (col = 0; col < width; col++)
-+            out[col] = av_clip_pixel(((line[col] << (14 - BIT_DEPTH)) * wx1 + src2[col] * wx0 + rnd) >> (log2Wd + 1));
-+        line += in_pitch;
-+        out  += out_pitch;
-+        src2 += MAX_PB_SIZE;
-+    }
-+}
-+
-+////////////////////////////////////////////////////////////////////////////////
-+//
-+////////////////////////////////////////////////////////////////////////////////
-+// 8-tap filter centred on src[x]: taps cover src[x - 3*stride] .. src[x + 4*stride].
-+// The macro is not self-contained: it reads `filter` (8 coefficients) and `x`
-+// from the scope where it is expanded, so callers must use exactly those names.
-+// Arguments are not parenthesised; pass plain identifiers or constants only.
-+#define QPEL_FILTER(src, stride) \
-+ (filter[0] * src[x - 3 * stride] + \
-+ filter[1] * src[x - 2 * stride] + \
-+ filter[2] * src[x - stride] + \
-+ filter[3] * src[x ] + \
-+ filter[4] * src[x + stride] + \
-+ filter[5] * src[x + 2 * stride] + \
-+ filter[6] * src[x + 3 * stride] + \
-+ filter[7] * src[x + 4 * stride])
-+
-+/*
-+ * Horizontal 8-tap filter (coefficients ff_hevc_rpi_qpel_filters[mx - 1])
-+ * into the 16-bit intermediate buffer; results are shifted right by
-+ * (BIT_DEPTH - 8). dst rows are MAX_PB_SIZE elements apart, _srcstride is in
-+ * bytes. my is not used.
-+ */
-+static void FUNC(put_hevc_qpel_h)(int16_t *dst,
-+                                  uint8_t *_src, ptrdiff_t _srcstride,
-+                                  int height, intptr_t mx, intptr_t my, int width)
-+{
-+    const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1];
-+    const ptrdiff_t pitch = _srcstride / sizeof(pixel);
-+    const pixel *line = (const pixel *)_src;
-+    int row;
-+    int x; /* name is required by QPEL_FILTER */
-+
-+    for (row = 0; row < height; row++, line += pitch, dst += MAX_PB_SIZE) {
-+        for (x = 0; x < width; x++)
-+            dst[x] = QPEL_FILTER(line, 1) >> (BIT_DEPTH - 8);
-+    }
-+}
-+
-+/*
-+ * Vertical 8-tap filter (coefficients ff_hevc_rpi_qpel_filters[my - 1])
-+ * into the 16-bit intermediate buffer; results are shifted right by
-+ * (BIT_DEPTH - 8). dst rows are MAX_PB_SIZE elements apart, _srcstride is in
-+ * bytes. mx is not used.
-+ */
-+static void FUNC(put_hevc_qpel_v)(int16_t *dst,
-+                                  uint8_t *_src, ptrdiff_t _srcstride,
-+                                  int height, intptr_t mx, intptr_t my, int width)
-+{
-+    const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1];
-+    const ptrdiff_t pitch = _srcstride / sizeof(pixel);
-+    const pixel *line = (const pixel *)_src;
-+    int row;
-+    int x; /* name is required by QPEL_FILTER */
-+
-+    for (row = 0; row < height; row++, line += pitch, dst += MAX_PB_SIZE) {
-+        for (x = 0; x < width; x++)
-+            dst[x] = QPEL_FILTER(line, pitch) >> (BIT_DEPTH - 8);
-+    }
-+}
-+
-+/*
-+ * Separable 8-tap filter into the 16-bit intermediate buffer.
-+ * Pass 1 filters horizontally (mx) into a local buffer, starting
-+ * QPEL_EXTRA_BEFORE rows above the block and covering height + QPEL_EXTRA
-+ * rows; pass 2 filters that buffer vertically (my) and shifts right by 6.
-+ * dst rows are MAX_PB_SIZE elements apart, _srcstride is in bytes.
-+ */
-+static void FUNC(put_hevc_qpel_hv)(int16_t *dst,
-+                                   uint8_t *_src,
-+                                   ptrdiff_t _srcstride,
-+                                   int height, intptr_t mx,
-+                                   intptr_t my, int width)
-+{
-+    int16_t mid_buf[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
-+    int16_t *mid = mid_buf;
-+    const ptrdiff_t pitch = _srcstride / sizeof(pixel);
-+    const pixel *line = (const pixel *)_src - QPEL_EXTRA_BEFORE * pitch;
-+    const int8_t *filter;
-+    int row;
-+    int x; /* name is required by QPEL_FILTER */
-+
-+    /* horizontal pass */
-+    filter = ff_hevc_rpi_qpel_filters[mx - 1];
-+    for (row = 0; row < height + QPEL_EXTRA; row++, line += pitch, mid += MAX_PB_SIZE) {
-+        for (x = 0; x < width; x++)
-+            mid[x] = QPEL_FILTER(line, 1) >> (BIT_DEPTH - 8);
-+    }
-+
-+    /* vertical pass */
-+    filter = ff_hevc_rpi_qpel_filters[my - 1];
-+    mid = mid_buf + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
-+    for (row = 0; row < height; row++, mid += MAX_PB_SIZE, dst += MAX_PB_SIZE) {
-+        for (x = 0; x < width; x++)
-+            dst[x] = QPEL_FILTER(mid, MAX_PB_SIZE) >> 6;
-+    }
-+}
-+
-+/*
-+ * Horizontal 8-tap filter (mx) written directly as pixels: the filtered
-+ * value is rounded, shifted right by (14 - BIT_DEPTH) and clipped.
-+ * Strides are in bytes. my is not used.
-+ */
-+static void FUNC(put_hevc_qpel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride,
-+                                      uint8_t *_src, ptrdiff_t _srcstride,
-+                                      int height, intptr_t mx, intptr_t my, int width)
-+{
-+    const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1];
-+    const ptrdiff_t in_pitch  = _srcstride / sizeof(pixel);
-+    const ptrdiff_t out_pitch = _dststride / sizeof(pixel);
-+    const pixel *line = (const pixel *)_src;
-+    pixel *out = (pixel *)_dst;
-+    const int shift = 14 - BIT_DEPTH;
-+#if BIT_DEPTH < 14
-+    const int offset = 1 << (shift - 1);
-+#else
-+    const int offset = 0;
-+#endif
-+    int row;
-+    int x; /* name is required by QPEL_FILTER */
-+
-+    for (row = 0; row < height; row++, line += in_pitch, out += out_pitch) {
-+        for (x = 0; x < width; x++) {
-+            const int v = QPEL_FILTER(line, 1) >> (BIT_DEPTH - 8);
-+            out[x] = av_clip_pixel((v + offset) >> shift);
-+        }
-+    }
-+}
-+
-+/*
-+ * Horizontal 8-tap filter (mx) combined with a second 16-bit prediction:
-+ * filtered value + src2 + rounding, shifted right by (15 - BIT_DEPTH) and
-+ * clipped. src2 rows are MAX_PB_SIZE elements apart; pixel strides are in
-+ * bytes. my is not used.
-+ */
-+static void FUNC(put_hevc_qpel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                     int16_t *src2,
-+                                     int height, intptr_t mx, intptr_t my, int width)
-+{
-+    const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1];
-+    const ptrdiff_t in_pitch  = _srcstride / sizeof(pixel);
-+    const ptrdiff_t out_pitch = _dststride / sizeof(pixel);
-+    const pixel *line = (const pixel *)_src;
-+    pixel *out = (pixel *)_dst;
-+    const int shift = 14 + 1 - BIT_DEPTH;
-+#if BIT_DEPTH < 14
-+    const int offset = 1 << (shift - 1);
-+#else
-+    const int offset = 0;
-+#endif
-+    int row;
-+    int x; /* name is required by QPEL_FILTER */
-+
-+    for (row = 0; row < height; row++) {
-+        for (x = 0; x < width; x++) {
-+            const int v = QPEL_FILTER(line, 1) >> (BIT_DEPTH - 8);
-+            out[x] = av_clip_pixel((v + src2[x] + offset) >> shift);
-+        }
-+        line += in_pitch;
-+        out  += out_pitch;
-+        src2 += MAX_PB_SIZE;
-+    }
-+}
-+
-+/*
-+ * Vertical 8-tap filter (my) written directly as pixels: the filtered value
-+ * is rounded, shifted right by (14 - BIT_DEPTH) and clipped.
-+ * Strides are in bytes. mx is not used.
-+ */
-+static void FUNC(put_hevc_qpel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride,
-+                                      uint8_t *_src, ptrdiff_t _srcstride,
-+                                      int height, intptr_t mx, intptr_t my, int width)
-+{
-+    const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1];
-+    const ptrdiff_t in_pitch  = _srcstride / sizeof(pixel);
-+    const ptrdiff_t out_pitch = _dststride / sizeof(pixel);
-+    const pixel *line = (const pixel *)_src;
-+    pixel *out = (pixel *)_dst;
-+    const int shift = 14 - BIT_DEPTH;
-+#if BIT_DEPTH < 14
-+    const int offset = 1 << (shift - 1);
-+#else
-+    const int offset = 0;
-+#endif
-+    int row;
-+    int x; /* name is required by QPEL_FILTER */
-+
-+    for (row = 0; row < height; row++, line += in_pitch, out += out_pitch) {
-+        for (x = 0; x < width; x++) {
-+            const int v = QPEL_FILTER(line, in_pitch) >> (BIT_DEPTH - 8);
-+            out[x] = av_clip_pixel((v + offset) >> shift);
-+        }
-+    }
-+}
-+
-+
-+/*
-+ * Vertical 8-tap filter (my) combined with a second 16-bit prediction:
-+ * filtered value + src2 + rounding, shifted right by (15 - BIT_DEPTH) and
-+ * clipped. src2 rows are MAX_PB_SIZE elements apart; pixel strides are in
-+ * bytes. mx is not used.
-+ */
-+static void FUNC(put_hevc_qpel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                     int16_t *src2,
-+                                     int height, intptr_t mx, intptr_t my, int width)
-+{
-+    const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1];
-+    const ptrdiff_t in_pitch  = _srcstride / sizeof(pixel);
-+    const ptrdiff_t out_pitch = _dststride / sizeof(pixel);
-+    const pixel *line = (const pixel *)_src;
-+    pixel *out = (pixel *)_dst;
-+    const int shift = 14 + 1 - BIT_DEPTH;
-+#if BIT_DEPTH < 14
-+    const int offset = 1 << (shift - 1);
-+#else
-+    const int offset = 0;
-+#endif
-+    int row;
-+    int x; /* name is required by QPEL_FILTER */
-+
-+    for (row = 0; row < height; row++) {
-+        for (x = 0; x < width; x++) {
-+            const int v = QPEL_FILTER(line, in_pitch) >> (BIT_DEPTH - 8);
-+            out[x] = av_clip_pixel((v + src2[x] + offset) >> shift);
-+        }
-+        line += in_pitch;
-+        out  += out_pitch;
-+        src2 += MAX_PB_SIZE;
-+    }
-+}
-+
-+/*
-+ * Separable 8-tap filter written directly as pixels.
-+ * Pass 1 filters horizontally (mx) into a local 16-bit buffer, starting
-+ * QPEL_EXTRA_BEFORE rows above the block and covering height + QPEL_EXTRA
-+ * rows; pass 2 filters vertically (my), shifts right by 6, rounds, shifts
-+ * right by (14 - BIT_DEPTH) and clips. Strides are in bytes.
-+ */
-+static void FUNC(put_hevc_qpel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride,
-+                                       uint8_t *_src, ptrdiff_t _srcstride,
-+                                       int height, intptr_t mx, intptr_t my, int width)
-+{
-+    int16_t mid_buf[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
-+    int16_t *mid = mid_buf;
-+    const ptrdiff_t in_pitch  = _srcstride / sizeof(pixel);
-+    const ptrdiff_t out_pitch = _dststride / sizeof(pixel);
-+    const pixel *line = (const pixel *)_src - QPEL_EXTRA_BEFORE * in_pitch;
-+    pixel *out = (pixel *)_dst;
-+    const int8_t *filter;
-+    const int shift = 14 - BIT_DEPTH;
-+#if BIT_DEPTH < 14
-+    const int offset = 1 << (shift - 1);
-+#else
-+    const int offset = 0;
-+#endif
-+    int row;
-+    int x; /* name is required by QPEL_FILTER */
-+
-+    /* horizontal pass */
-+    filter = ff_hevc_rpi_qpel_filters[mx - 1];
-+    for (row = 0; row < height + QPEL_EXTRA; row++, line += in_pitch, mid += MAX_PB_SIZE) {
-+        for (x = 0; x < width; x++)
-+            mid[x] = QPEL_FILTER(line, 1) >> (BIT_DEPTH - 8);
-+    }
-+
-+    /* vertical pass */
-+    filter = ff_hevc_rpi_qpel_filters[my - 1];
-+    mid = mid_buf + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
-+    for (row = 0; row < height; row++, mid += MAX_PB_SIZE, out += out_pitch) {
-+        for (x = 0; x < width; x++) {
-+            const int v = QPEL_FILTER(mid, MAX_PB_SIZE) >> 6;
-+            out[x] = av_clip_pixel((v + offset) >> shift);
-+        }
-+    }
-+}
-+
-+/*
-+ * Separable 8-tap filter combined with a second 16-bit prediction.
-+ * Pass 1 filters horizontally (mx) into a local buffer, starting
-+ * QPEL_EXTRA_BEFORE rows above the block and covering height + QPEL_EXTRA
-+ * rows; pass 2 filters vertically (my), shifts right by 6, adds src2 and
-+ * rounding, shifts right by (15 - BIT_DEPTH) and clips. src2 rows are
-+ * MAX_PB_SIZE elements apart; pixel strides are in bytes.
-+ */
-+static void FUNC(put_hevc_qpel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                      int16_t *src2,
-+                                      int height, intptr_t mx, intptr_t my, int width)
-+{
-+    int16_t mid_buf[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
-+    int16_t *mid = mid_buf;
-+    const ptrdiff_t in_pitch  = _srcstride / sizeof(pixel);
-+    const ptrdiff_t out_pitch = _dststride / sizeof(pixel);
-+    const pixel *line = (const pixel *)_src - QPEL_EXTRA_BEFORE * in_pitch;
-+    pixel *out = (pixel *)_dst;
-+    const int8_t *filter;
-+    const int shift = 14 + 1 - BIT_DEPTH;
-+#if BIT_DEPTH < 14
-+    const int offset = 1 << (shift - 1);
-+#else
-+    const int offset = 0;
-+#endif
-+    int row;
-+    int x; /* name is required by QPEL_FILTER */
-+
-+    /* horizontal pass */
-+    filter = ff_hevc_rpi_qpel_filters[mx - 1];
-+    for (row = 0; row < height + QPEL_EXTRA; row++, line += in_pitch, mid += MAX_PB_SIZE) {
-+        for (x = 0; x < width; x++)
-+            mid[x] = QPEL_FILTER(line, 1) >> (BIT_DEPTH - 8);
-+    }
-+
-+    /* vertical pass */
-+    filter = ff_hevc_rpi_qpel_filters[my - 1];
-+    mid = mid_buf + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
-+    for (row = 0; row < height; row++) {
-+        for (x = 0; x < width; x++) {
-+            const int v = QPEL_FILTER(mid, MAX_PB_SIZE) >> 6;
-+            out[x] = av_clip_pixel((v + src2[x] + offset) >> shift);
-+        }
-+        mid  += MAX_PB_SIZE;
-+        out  += out_pitch;
-+        src2 += MAX_PB_SIZE;
-+    }
-+}
-+
-+/*
-+ * Weighted horizontal 8-tap filter (mx): the filtered value is multiplied by
-+ * wx, rounded and shifted right by (denom + 14 - BIT_DEPTH), then ox (scaled
-+ * by 1 << (BIT_DEPTH - 8)) is added before clipping. Strides are in bytes.
-+ * my is not used.
-+ */
-+static void FUNC(put_hevc_qpel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride,
-+                                        uint8_t *_src, ptrdiff_t _srcstride,
-+                                        int height, int denom, int wx, int ox,
-+                                        intptr_t mx, intptr_t my, int width)
-+{
-+    const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1];
-+    const ptrdiff_t in_pitch  = _srcstride / sizeof(pixel);
-+    const ptrdiff_t out_pitch = _dststride / sizeof(pixel);
-+    const pixel *line = (const pixel *)_src;
-+    pixel *out = (pixel *)_dst;
-+    const int shift = denom + 14 - BIT_DEPTH;
-+#if BIT_DEPTH < 14
-+    const int offset = 1 << (shift - 1);
-+#else
-+    const int offset = 0;
-+#endif
-+    int row;
-+    int x; /* name is required by QPEL_FILTER */
-+
-+    ox *= 1 << (BIT_DEPTH - 8);
-+    for (row = 0; row < height; row++, line += in_pitch, out += out_pitch) {
-+        for (x = 0; x < width; x++) {
-+            const int v = QPEL_FILTER(line, 1) >> (BIT_DEPTH - 8);
-+            out[x] = av_clip_pixel(((v * wx + offset) >> shift) + ox);
-+        }
-+    }
-+}
-+
-+/*
-+ * Weighted bi-prediction with a horizontal 8-tap filter (mx).
-+ * Per pixel: (filtered * wx1 + src2 * wx0 + (ox0 + ox1 + 1) * 2^log2Wd)
-+ * >> (log2Wd + 1), clipped. ox0/ox1 are first scaled by
-+ * 1 << (BIT_DEPTH - 8). src2 rows are MAX_PB_SIZE elements apart; pixel
-+ * strides are in bytes. my is not used.
-+ *
-+ * The rounding term is formed with a multiplication rather than a left shift:
-+ * (ox0 + ox1 + 1) can be negative and left-shifting a negative value is
-+ * undefined behaviour. This matches put_hevc_pel_bi_w_pixels.
-+ */
-+static void FUNC(put_hevc_qpel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                       int16_t *src2,
-+                                       int height, int denom, int wx0, int wx1,
-+                                       int ox0, int ox1, intptr_t mx, intptr_t my, int width)
-+{
-+    int x, y;
-+    pixel *src = (pixel*)_src;
-+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-+    pixel *dst = (pixel *)_dst;
-+    ptrdiff_t dststride = _dststride / sizeof(pixel);
-+
-+    const int8_t *filter = ff_hevc_rpi_qpel_filters[mx - 1];
-+
-+    int shift = 14 + 1 - BIT_DEPTH;
-+    int log2Wd = denom + shift - 1;
-+
-+    ox0 = ox0 * (1 << (BIT_DEPTH - 8));
-+    ox1 = ox1 * (1 << (BIT_DEPTH - 8));
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x++)
-+            dst[x] = av_clip_pixel(((QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
-+                                    (ox0 + ox1 + 1) * (1 << log2Wd)) >> (log2Wd + 1));
-+        src  += srcstride;
-+        dst  += dststride;
-+        src2 += MAX_PB_SIZE;
-+    }
-+}
-+
-+/*
-+ * Weighted vertical 8-tap filter (my): the filtered value is multiplied by
-+ * wx, rounded and shifted right by (denom + 14 - BIT_DEPTH), then ox (scaled
-+ * by 1 << (BIT_DEPTH - 8)) is added before clipping. Strides are in bytes.
-+ * mx is not used.
-+ */
-+static void FUNC(put_hevc_qpel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride,
-+                                        uint8_t *_src, ptrdiff_t _srcstride,
-+                                        int height, int denom, int wx, int ox,
-+                                        intptr_t mx, intptr_t my, int width)
-+{
-+    const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1];
-+    const ptrdiff_t in_pitch  = _srcstride / sizeof(pixel);
-+    const ptrdiff_t out_pitch = _dststride / sizeof(pixel);
-+    const pixel *line = (const pixel *)_src;
-+    pixel *out = (pixel *)_dst;
-+    const int shift = denom + 14 - BIT_DEPTH;
-+#if BIT_DEPTH < 14
-+    const int offset = 1 << (shift - 1);
-+#else
-+    const int offset = 0;
-+#endif
-+    int row;
-+    int x; /* name is required by QPEL_FILTER */
-+
-+    ox *= 1 << (BIT_DEPTH - 8);
-+    for (row = 0; row < height; row++, line += in_pitch, out += out_pitch) {
-+        for (x = 0; x < width; x++) {
-+            const int v = QPEL_FILTER(line, in_pitch) >> (BIT_DEPTH - 8);
-+            out[x] = av_clip_pixel(((v * wx + offset) >> shift) + ox);
-+        }
-+    }
-+}
-+
-+/*
-+ * Weighted bi-prediction with a vertical 8-tap filter (my).
-+ * Per pixel: (filtered * wx1 + src2 * wx0 + (ox0 + ox1 + 1) * 2^log2Wd)
-+ * >> (log2Wd + 1), clipped. ox0/ox1 are first scaled by
-+ * 1 << (BIT_DEPTH - 8). src2 rows are MAX_PB_SIZE elements apart; pixel
-+ * strides are in bytes. mx is not used.
-+ *
-+ * The rounding term is formed with a multiplication rather than a left shift:
-+ * (ox0 + ox1 + 1) can be negative and left-shifting a negative value is
-+ * undefined behaviour. This matches put_hevc_pel_bi_w_pixels.
-+ */
-+static void FUNC(put_hevc_qpel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                       int16_t *src2,
-+                                       int height, int denom, int wx0, int wx1,
-+                                       int ox0, int ox1, intptr_t mx, intptr_t my, int width)
-+{
-+    int x, y;
-+    pixel *src = (pixel*)_src;
-+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-+    pixel *dst = (pixel *)_dst;
-+    ptrdiff_t dststride = _dststride / sizeof(pixel);
-+
-+    const int8_t *filter = ff_hevc_rpi_qpel_filters[my - 1];
-+
-+    int shift = 14 + 1 - BIT_DEPTH;
-+    int log2Wd = denom + shift - 1;
-+
-+    ox0 = ox0 * (1 << (BIT_DEPTH - 8));
-+    ox1 = ox1 * (1 << (BIT_DEPTH - 8));
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x++)
-+            dst[x] = av_clip_pixel(((QPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
-+                                    (ox0 + ox1 + 1) * (1 << log2Wd)) >> (log2Wd + 1));
-+        src  += srcstride;
-+        dst  += dststride;
-+        src2 += MAX_PB_SIZE;
-+    }
-+}
-+
-+/*
-+ * Weighted separable 8-tap filter.
-+ * Pass 1 filters horizontally (mx) into a local 16-bit buffer, starting
-+ * QPEL_EXTRA_BEFORE rows above the block and covering height + QPEL_EXTRA
-+ * rows; pass 2 filters vertically (my), shifts right by 6, multiplies by wx,
-+ * rounds, shifts right by (denom + 14 - BIT_DEPTH), adds the scaled ox and
-+ * clips. Strides are in bytes.
-+ */
-+static void FUNC(put_hevc_qpel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride,
-+                                         uint8_t *_src, ptrdiff_t _srcstride,
-+                                         int height, int denom, int wx, int ox,
-+                                         intptr_t mx, intptr_t my, int width)
-+{
-+    int16_t mid_buf[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
-+    int16_t *mid = mid_buf;
-+    const ptrdiff_t in_pitch  = _srcstride / sizeof(pixel);
-+    const ptrdiff_t out_pitch = _dststride / sizeof(pixel);
-+    const pixel *line = (const pixel *)_src - QPEL_EXTRA_BEFORE * in_pitch;
-+    pixel *out = (pixel *)_dst;
-+    const int8_t *filter;
-+    const int shift = denom + 14 - BIT_DEPTH;
-+#if BIT_DEPTH < 14
-+    const int offset = 1 << (shift - 1);
-+#else
-+    const int offset = 0;
-+#endif
-+    int row;
-+    int x; /* name is required by QPEL_FILTER */
-+
-+    /* horizontal pass */
-+    filter = ff_hevc_rpi_qpel_filters[mx - 1];
-+    for (row = 0; row < height + QPEL_EXTRA; row++, line += in_pitch, mid += MAX_PB_SIZE) {
-+        for (x = 0; x < width; x++)
-+            mid[x] = QPEL_FILTER(line, 1) >> (BIT_DEPTH - 8);
-+    }
-+
-+    /* vertical pass */
-+    filter = ff_hevc_rpi_qpel_filters[my - 1];
-+    mid = mid_buf + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
-+    ox *= 1 << (BIT_DEPTH - 8);
-+    for (row = 0; row < height; row++, mid += MAX_PB_SIZE, out += out_pitch) {
-+        for (x = 0; x < width; x++) {
-+            const int v = QPEL_FILTER(mid, MAX_PB_SIZE) >> 6;
-+            out[x] = av_clip_pixel(((v * wx + offset) >> shift) + ox);
-+        }
-+    }
-+}
-+
-+/*
-+ * Weighted bi-prediction with a separable 8-tap filter.
-+ * Pass 1 filters horizontally (mx) into a local 16-bit buffer, starting
-+ * QPEL_EXTRA_BEFORE rows above the block and covering height + QPEL_EXTRA
-+ * rows. Pass 2 filters vertically (my), shifts right by 6 and computes
-+ * (filtered * wx1 + src2 * wx0 + (ox0 + ox1 + 1) * 2^log2Wd) >> (log2Wd + 1),
-+ * clipped. src2 rows are MAX_PB_SIZE elements apart; pixel strides are in
-+ * bytes.
-+ *
-+ * The rounding term is formed with a multiplication rather than a left shift:
-+ * (ox0 + ox1 + 1) can be negative and left-shifting a negative value is
-+ * undefined behaviour. This matches put_hevc_pel_bi_w_pixels.
-+ */
-+static void FUNC(put_hevc_qpel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                        int16_t *src2,
-+                                        int height, int denom, int wx0, int wx1,
-+                                        int ox0, int ox1, intptr_t mx, intptr_t my, int width)
-+{
-+    int x, y;
-+    const int8_t *filter;
-+    pixel *src = (pixel*)_src;
-+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-+    pixel *dst = (pixel *)_dst;
-+    ptrdiff_t dststride = _dststride / sizeof(pixel);
-+    int16_t tmp_array[(MAX_PB_SIZE + QPEL_EXTRA) * MAX_PB_SIZE];
-+    int16_t *tmp = tmp_array;
-+    int shift = 14 + 1 - BIT_DEPTH;
-+    int log2Wd = denom + shift - 1;
-+
-+    /* horizontal pass, including the extra rows the vertical taps need */
-+    src -= QPEL_EXTRA_BEFORE * srcstride;
-+    filter = ff_hevc_rpi_qpel_filters[mx - 1];
-+    for (y = 0; y < height + QPEL_EXTRA; y++) {
-+        for (x = 0; x < width; x++)
-+            tmp[x] = QPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
-+        src += srcstride;
-+        tmp += MAX_PB_SIZE;
-+    }
-+
-+    /* vertical pass and weighting */
-+    tmp = tmp_array + QPEL_EXTRA_BEFORE * MAX_PB_SIZE;
-+    filter = ff_hevc_rpi_qpel_filters[my - 1];
-+
-+    ox0 = ox0 * (1 << (BIT_DEPTH - 8));
-+    ox1 = ox1 * (1 << (BIT_DEPTH - 8));
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x++)
-+            dst[x] = av_clip_pixel(((QPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 +
-+                                    (ox0 + ox1 + 1) * (1 << log2Wd)) >> (log2Wd + 1));
-+        tmp  += MAX_PB_SIZE;
-+        dst  += dststride;
-+        src2 += MAX_PB_SIZE;
-+    }
-+}
-+
-+////////////////////////////////////////////////////////////////////////////////
-+//
-+////////////////////////////////////////////////////////////////////////////////
-+// 4-tap filter: taps cover src[x - stride] .. src[x + 2*stride].
-+// The macro is not self-contained: it reads `filter` (4 coefficients) and `x`
-+// from the scope where it is expanded, so callers must use exactly those names.
-+// Arguments are not parenthesised; pass plain identifiers or constants only.
-+#define EPEL_FILTER(src, stride) \
-+ (filter[0] * src[x - stride] + \
-+ filter[1] * src[x] + \
-+ filter[2] * src[x + stride] + \
-+ filter[3] * src[x + 2 * stride])
-+
-+/*
-+ * Horizontal 4-tap filter (coefficients ff_hevc_rpi_epel_filters[mx - 1])
-+ * into the 16-bit intermediate buffer; results are shifted right by
-+ * (BIT_DEPTH - 8). dst rows are MAX_PB_SIZE elements apart, _srcstride is in
-+ * bytes. my is not used.
-+ */
-+static void FUNC(put_hevc_epel_h)(int16_t *dst,
-+                                  uint8_t *_src, ptrdiff_t _srcstride,
-+                                  int height, intptr_t mx, intptr_t my, int width)
-+{
-+    const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
-+    const ptrdiff_t pitch = _srcstride / sizeof(pixel);
-+    const pixel *line = (const pixel *)_src;
-+    int row;
-+    int x; /* name is required by EPEL_FILTER */
-+
-+    for (row = 0; row < height; row++, line += pitch, dst += MAX_PB_SIZE) {
-+        for (x = 0; x < width; x++)
-+            dst[x] = EPEL_FILTER(line, 1) >> (BIT_DEPTH - 8);
-+    }
-+}
-+
-+/*
-+ * Vertical 4-tap filter (coefficients ff_hevc_rpi_epel_filters[my - 1])
-+ * into the 16-bit intermediate buffer; results are shifted right by
-+ * (BIT_DEPTH - 8). dst rows are MAX_PB_SIZE elements apart, _srcstride is in
-+ * bytes. mx is not used.
-+ */
-+static void FUNC(put_hevc_epel_v)(int16_t *dst,
-+                                  uint8_t *_src, ptrdiff_t _srcstride,
-+                                  int height, intptr_t mx, intptr_t my, int width)
-+{
-+    const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1];
-+    const ptrdiff_t pitch = _srcstride / sizeof(pixel);
-+    const pixel *line = (const pixel *)_src;
-+    int row;
-+    int x; /* name is required by EPEL_FILTER */
-+
-+    for (row = 0; row < height; row++, line += pitch, dst += MAX_PB_SIZE) {
-+        for (x = 0; x < width; x++)
-+            dst[x] = EPEL_FILTER(line, pitch) >> (BIT_DEPTH - 8);
-+    }
-+}
-+
-+/*
-+ * Separable 4-tap filter into the 16-bit intermediate buffer.
-+ * Pass 1 filters horizontally (mx) into a local buffer, starting
-+ * EPEL_EXTRA_BEFORE rows above the block and covering height + EPEL_EXTRA
-+ * rows; pass 2 filters that buffer vertically (my) and shifts right by 6.
-+ * dst rows are MAX_PB_SIZE elements apart, _srcstride is in bytes.
-+ */
-+static void FUNC(put_hevc_epel_hv)(int16_t *dst,
-+                                   uint8_t *_src, ptrdiff_t _srcstride,
-+                                   int height, intptr_t mx, intptr_t my, int width)
-+{
-+    int16_t mid_buf[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
-+    int16_t *mid = mid_buf;
-+    const ptrdiff_t pitch = _srcstride / sizeof(pixel);
-+    const pixel *line = (const pixel *)_src - EPEL_EXTRA_BEFORE * pitch;
-+    const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
-+    int row;
-+    int x; /* name is required by EPEL_FILTER */
-+
-+    /* horizontal pass */
-+    for (row = 0; row < height + EPEL_EXTRA; row++, line += pitch, mid += MAX_PB_SIZE) {
-+        for (x = 0; x < width; x++)
-+            mid[x] = EPEL_FILTER(line, 1) >> (BIT_DEPTH - 8);
-+    }
-+
-+    /* vertical pass */
-+    filter = ff_hevc_rpi_epel_filters[my - 1];
-+    mid = mid_buf + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
-+    for (row = 0; row < height; row++, mid += MAX_PB_SIZE, dst += MAX_PB_SIZE) {
-+        for (x = 0; x < width; x++)
-+            dst[x] = EPEL_FILTER(mid, MAX_PB_SIZE) >> 6;
-+    }
-+}
-+
-+/*
-+ * Horizontal 4-tap filter (mx) written directly as pixels: the filtered
-+ * value is rounded, shifted right by (14 - BIT_DEPTH) and clipped.
-+ * Strides are in bytes. my is not used.
-+ */
-+static void FUNC(put_hevc_epel_uni_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                      int height, intptr_t mx, intptr_t my, int width)
-+{
-+    const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
-+    const ptrdiff_t in_pitch  = _srcstride / sizeof(pixel);
-+    const ptrdiff_t out_pitch = _dststride / sizeof(pixel);
-+    const pixel *line = (const pixel *)_src;
-+    pixel *out = (pixel *)_dst;
-+    const int shift = 14 - BIT_DEPTH;
-+#if BIT_DEPTH < 14
-+    const int offset = 1 << (shift - 1);
-+#else
-+    const int offset = 0;
-+#endif
-+    int row;
-+    int x; /* name is required by EPEL_FILTER */
-+
-+    for (row = 0; row < height; row++, line += in_pitch, out += out_pitch) {
-+        for (x = 0; x < width; x++) {
-+            const int v = EPEL_FILTER(line, 1) >> (BIT_DEPTH - 8);
-+            out[x] = av_clip_pixel((v + offset) >> shift);
-+        }
-+    }
-+}
-+
-+/*
-+ * Horizontal 4-tap filter (mx) combined with a second 16-bit prediction:
-+ * filtered value + src2 + rounding, shifted right by (15 - BIT_DEPTH) and
-+ * clipped. src2 rows are MAX_PB_SIZE elements apart; pixel strides are in
-+ * bytes. my is not used.
-+ */
-+static void FUNC(put_hevc_epel_bi_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                     int16_t *src2,
-+                                     int height, intptr_t mx, intptr_t my, int width)
-+{
-+    const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
-+    const ptrdiff_t in_pitch  = _srcstride / sizeof(pixel);
-+    const ptrdiff_t out_pitch = _dststride / sizeof(pixel);
-+    const pixel *line = (const pixel *)_src;
-+    pixel *out = (pixel *)_dst;
-+    const int shift = 14 + 1 - BIT_DEPTH;
-+#if BIT_DEPTH < 14
-+    const int offset = 1 << (shift - 1);
-+#else
-+    const int offset = 0;
-+#endif
-+    int row;
-+    int x; /* name is required by EPEL_FILTER */
-+
-+    for (row = 0; row < height; row++) {
-+        for (x = 0; x < width; x++) {
-+            const int v = EPEL_FILTER(line, 1) >> (BIT_DEPTH - 8);
-+            out[x] = av_clip_pixel((v + src2[x] + offset) >> shift);
-+        }
-+        out  += out_pitch;
-+        line += in_pitch;
-+        src2 += MAX_PB_SIZE;
-+    }
-+}
-+
-+/*
-+ * Vertical 4-tap filter (my) written directly as pixels: the filtered value
-+ * is rounded, shifted right by (14 - BIT_DEPTH) and clipped.
-+ * Strides are in bytes. mx is not used.
-+ */
-+static void FUNC(put_hevc_epel_uni_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                      int height, intptr_t mx, intptr_t my, int width)
-+{
-+    const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1];
-+    const ptrdiff_t in_pitch  = _srcstride / sizeof(pixel);
-+    const ptrdiff_t out_pitch = _dststride / sizeof(pixel);
-+    const pixel *line = (const pixel *)_src;
-+    pixel *out = (pixel *)_dst;
-+    const int shift = 14 - BIT_DEPTH;
-+#if BIT_DEPTH < 14
-+    const int offset = 1 << (shift - 1);
-+#else
-+    const int offset = 0;
-+#endif
-+    int row;
-+    int x; /* name is required by EPEL_FILTER */
-+
-+    for (row = 0; row < height; row++, line += in_pitch, out += out_pitch) {
-+        for (x = 0; x < width; x++) {
-+            const int v = EPEL_FILTER(line, in_pitch) >> (BIT_DEPTH - 8);
-+            out[x] = av_clip_pixel((v + offset) >> shift);
-+        }
-+    }
-+}
-+
-+/*
-+ * Vertical 4-tap filter (my) combined with a second 16-bit prediction:
-+ * filtered value + src2 + rounding, shifted right by (15 - BIT_DEPTH) and
-+ * clipped. src2 rows are MAX_PB_SIZE elements apart; pixel strides are in
-+ * bytes. mx is not used.
-+ */
-+static void FUNC(put_hevc_epel_bi_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                     int16_t *src2,
-+                                     int height, intptr_t mx, intptr_t my, int width)
-+{
-+    const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1];
-+    const ptrdiff_t in_pitch  = _srcstride / sizeof(pixel);
-+    const ptrdiff_t out_pitch = _dststride / sizeof(pixel);
-+    const pixel *line = (const pixel *)_src;
-+    pixel *out = (pixel *)_dst;
-+    const int shift = 14 + 1 - BIT_DEPTH;
-+#if BIT_DEPTH < 14
-+    const int offset = 1 << (shift - 1);
-+#else
-+    const int offset = 0;
-+#endif
-+    int row;
-+    int x; /* name is required by EPEL_FILTER */
-+
-+    for (row = 0; row < height; row++) {
-+        for (x = 0; x < width; x++) {
-+            const int v = EPEL_FILTER(line, in_pitch) >> (BIT_DEPTH - 8);
-+            out[x] = av_clip_pixel((v + src2[x] + offset) >> shift);
-+        }
-+        out  += out_pitch;
-+        line += in_pitch;
-+        src2 += MAX_PB_SIZE;
-+    }
-+}
-+
-+/*
-+ * Separable 4-tap filter written directly as pixels.
-+ * Pass 1 filters horizontally (mx) into a local 16-bit buffer, starting
-+ * EPEL_EXTRA_BEFORE rows above the block and covering height + EPEL_EXTRA
-+ * rows; pass 2 filters vertically (my), shifts right by 6, rounds, shifts
-+ * right by (14 - BIT_DEPTH) and clips. Strides are in bytes.
-+ */
-+static void FUNC(put_hevc_epel_uni_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                       int height, intptr_t mx, intptr_t my, int width)
-+{
-+    int16_t mid_buf[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
-+    int16_t *mid = mid_buf;
-+    const ptrdiff_t in_pitch  = _srcstride / sizeof(pixel);
-+    const ptrdiff_t out_pitch = _dststride / sizeof(pixel);
-+    const pixel *line = (const pixel *)_src - EPEL_EXTRA_BEFORE * in_pitch;
-+    pixel *out = (pixel *)_dst;
-+    const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
-+    const int shift = 14 - BIT_DEPTH;
-+#if BIT_DEPTH < 14
-+    const int offset = 1 << (shift - 1);
-+#else
-+    const int offset = 0;
-+#endif
-+    int row;
-+    int x; /* name is required by EPEL_FILTER */
-+
-+    /* horizontal pass */
-+    for (row = 0; row < height + EPEL_EXTRA; row++, line += in_pitch, mid += MAX_PB_SIZE) {
-+        for (x = 0; x < width; x++)
-+            mid[x] = EPEL_FILTER(line, 1) >> (BIT_DEPTH - 8);
-+    }
-+
-+    /* vertical pass */
-+    filter = ff_hevc_rpi_epel_filters[my - 1];
-+    mid = mid_buf + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
-+    for (row = 0; row < height; row++, mid += MAX_PB_SIZE, out += out_pitch) {
-+        for (x = 0; x < width; x++) {
-+            const int v = EPEL_FILTER(mid, MAX_PB_SIZE) >> 6;
-+            out[x] = av_clip_pixel((v + offset) >> shift);
-+        }
-+    }
-+}
-+
-+/*
-+ * Separable 4-tap filter combined with a second 16-bit prediction.
-+ * Pass 1 filters horizontally (mx) into a local buffer, starting
-+ * EPEL_EXTRA_BEFORE rows above the block and covering height + EPEL_EXTRA
-+ * rows; pass 2 filters vertically (my), shifts right by 6, adds src2 and
-+ * rounding, shifts right by (15 - BIT_DEPTH) and clips. src2 rows are
-+ * MAX_PB_SIZE elements apart; pixel strides are in bytes.
-+ */
-+static void FUNC(put_hevc_epel_bi_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                      int16_t *src2,
-+                                      int height, intptr_t mx, intptr_t my, int width)
-+{
-+    int16_t mid_buf[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE];
-+    int16_t *mid = mid_buf;
-+    const ptrdiff_t in_pitch  = _srcstride / sizeof(pixel);
-+    const ptrdiff_t out_pitch = _dststride / sizeof(pixel);
-+    const pixel *line = (const pixel *)_src - EPEL_EXTRA_BEFORE * in_pitch;
-+    pixel *out = (pixel *)_dst;
-+    const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
-+    const int shift = 14 + 1 - BIT_DEPTH;
-+#if BIT_DEPTH < 14
-+    const int offset = 1 << (shift - 1);
-+#else
-+    const int offset = 0;
-+#endif
-+    int row;
-+    int x; /* name is required by EPEL_FILTER */
-+
-+    /* horizontal pass */
-+    for (row = 0; row < height + EPEL_EXTRA; row++, line += in_pitch, mid += MAX_PB_SIZE) {
-+        for (x = 0; x < width; x++)
-+            mid[x] = EPEL_FILTER(line, 1) >> (BIT_DEPTH - 8);
-+    }
-+
-+    /* vertical pass */
-+    filter = ff_hevc_rpi_epel_filters[my - 1];
-+    mid = mid_buf + EPEL_EXTRA_BEFORE * MAX_PB_SIZE;
-+    for (row = 0; row < height; row++) {
-+        for (x = 0; x < width; x++) {
-+            const int v = EPEL_FILTER(mid, MAX_PB_SIZE) >> 6;
-+            out[x] = av_clip_pixel((v + src2[x] + offset) >> shift);
-+        }
-+        mid  += MAX_PB_SIZE;
-+        out  += out_pitch;
-+        src2 += MAX_PB_SIZE;
-+    }
-+}
-+
-+/*
-+ * Weighted horizontal 4-tap filter (mx): the filtered value is multiplied by
-+ * wx, rounded and shifted right by (denom + 14 - BIT_DEPTH), then ox (scaled
-+ * by 1 << (BIT_DEPTH - 8)) is added before clipping. Strides are in bytes.
-+ * my is not used.
-+ */
-+static void FUNC(put_hevc_epel_uni_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                        int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
-+{
-+    const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
-+    const ptrdiff_t in_pitch  = _srcstride / sizeof(pixel);
-+    const ptrdiff_t out_pitch = _dststride / sizeof(pixel);
-+    const pixel *line = (const pixel *)_src;
-+    pixel *out = (pixel *)_dst;
-+    const int shift = denom + 14 - BIT_DEPTH;
-+#if BIT_DEPTH < 14
-+    const int offset = 1 << (shift - 1);
-+#else
-+    const int offset = 0;
-+#endif
-+    int row;
-+    int x; /* name is required by EPEL_FILTER */
-+
-+    ox *= 1 << (BIT_DEPTH - 8);
-+    for (row = 0; row < height; row++, out += out_pitch, line += in_pitch) {
-+        for (x = 0; x < width; x++) {
-+            const int v = EPEL_FILTER(line, 1) >> (BIT_DEPTH - 8);
-+            out[x] = av_clip_pixel(((v * wx + offset) >> shift) + ox);
-+        }
-+    }
-+}
-+
-+/*
-+ * Weighted bi-prediction with a horizontal 4-tap filter (mx).
-+ * Per pixel: (filtered * wx1 + src2 * wx0 + (ox0 + ox1 + 1) * 2^log2Wd)
-+ * >> (log2Wd + 1), clipped. ox0/ox1 are first scaled by
-+ * 1 << (BIT_DEPTH - 8). src2 rows are MAX_PB_SIZE elements apart; pixel
-+ * strides are in bytes. my is not used.
-+ *
-+ * The rounding term is formed with a multiplication rather than a left shift:
-+ * (ox0 + ox1 + 1) can be negative and left-shifting a negative value is
-+ * undefined behaviour. This matches put_hevc_pel_bi_w_pixels.
-+ */
-+static void FUNC(put_hevc_epel_bi_w_h)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+                                       int16_t *src2,
-+                                       int height, int denom, int wx0, int wx1,
-+                                       int ox0, int ox1, intptr_t mx, intptr_t my, int width)
-+{
-+    int x, y;
-+    pixel *src = (pixel *)_src;
-+    ptrdiff_t srcstride = _srcstride / sizeof(pixel);
-+    pixel *dst = (pixel *)_dst;
-+    ptrdiff_t dststride = _dststride / sizeof(pixel);
-+    const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1];
-+    int shift = 14 + 1 - BIT_DEPTH;
-+    int log2Wd = denom + shift - 1;
-+
-+    ox0 = ox0 * (1 << (BIT_DEPTH - 8));
-+    ox1 = ox1 * (1 << (BIT_DEPTH - 8));
-+    for (y = 0; y < height; y++) {
-+        for (x = 0; x < width; x++)
-+            dst[x] = av_clip_pixel(((EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
-+                                    (ox0 + ox1 + 1) * (1 << log2Wd)) >> (log2Wd + 1));
-+        src  += srcstride;
-+        dst  += dststride;
-+        src2 += MAX_PB_SIZE;
-+    }
-+}
-+
-+static void FUNC(put_hevc_epel_uni_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
-+{ // Weighted uni-prediction: vertical EPEL filter (chosen by my), then weight wx and offset ox; mx is unused here.
-+ int x, y;
-+ pixel *src = (pixel *)_src;
-+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); // byte stride -> stride in pixel units
-+ pixel *dst = (pixel *)_dst;
-+ ptrdiff_t dststride = _dststride / sizeof(pixel); // byte stride -> stride in pixel units
-+ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1];
-+ int shift = denom + 14 - BIT_DEPTH;
-+#if BIT_DEPTH < 14
-+ int offset = 1 << (shift - 1); // rounding term: half of (1 << shift)
-+#else
-+ int offset = 0;
-+#endif
-+
-+ ox = ox * (1 << (BIT_DEPTH - 8)); // scale the offset up to the working bit depth
-+ for (y = 0; y < height; y++) {
-+ for (x = 0; x < width; x++) {
-+ dst[x] = av_clip_pixel((((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx + offset) >> shift) + ox); // filter taps step by srcstride, i.e. down the column
-+ }
-+ dst += dststride;
-+ src += srcstride;
-+ }
-+}
-+
-+// Weighted bi-prediction using the vertical EPEL filter selected by my.
-+// Each output pixel combines the filtered source sample (weight wx1) with the
-+// matching intermediate sample from src2 (weight wx0, row pitch MAX_PB_SIZE)
-+// plus the combined offsets ox0/ox1, then rounds and clips. mx is unused here.
-+static void FUNC(put_hevc_epel_bi_w_v)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+ int16_t *src2,
-+ int height, int denom, int wx0, int wx1,
-+ int ox0, int ox1, intptr_t mx, intptr_t my, int width)
-+{
-+ int x, y;
-+ pixel *src = (pixel *)_src;
-+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); // byte stride -> stride in pixel units
-+ const int8_t *filter = ff_hevc_rpi_epel_filters[my - 1];
-+ pixel *dst = (pixel *)_dst;
-+ ptrdiff_t dststride = _dststride / sizeof(pixel); // byte stride -> stride in pixel units
-+ int shift = 14 + 1 - BIT_DEPTH;
-+ int log2Wd = denom + shift - 1;
-+
-+ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); // scale offsets up to the working bit depth
-+ ox1 = ox1 * (1 << (BIT_DEPTH - 8));
-+ for (y = 0; y < height; y++) {
-+ for (x = 0; x < width; x++)
-+ // Multiply rather than left-shift: ox0 + ox1 + 1 may be negative and
-+ // shifting a negative int left is undefined behaviour in C.
-+ dst[x] = av_clip_pixel(((EPEL_FILTER(src, srcstride) >> (BIT_DEPTH - 8)) * wx1 + src2[x] * wx0 +
-+ ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1));
-+ src += srcstride;
-+ dst += dststride;
-+ src2 += MAX_PB_SIZE;
-+ }
-+}
-+
-+static void FUNC(put_hevc_epel_uni_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+ int height, int denom, int wx, int ox, intptr_t mx, intptr_t my, int width)
-+{ // Weighted uni-prediction with a two-pass EPEL filter: horizontal (mx) into tmp_array, then vertical (my) over it.
-+ int x, y;
-+ pixel *src = (pixel *)_src;
-+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); // byte stride -> stride in pixel units
-+ pixel *dst = (pixel *)_dst;
-+ ptrdiff_t dststride = _dststride / sizeof(pixel); // byte stride -> stride in pixel units
-+ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; // horizontal taps for the first pass
-+ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; // intermediate rows, pitch MAX_PB_SIZE
-+ int16_t *tmp = tmp_array;
-+ int shift = denom + 14 - BIT_DEPTH;
-+#if BIT_DEPTH < 14
-+ int offset = 1 << (shift - 1); // rounding term: half of (1 << shift)
-+#else
-+ int offset = 0;
-+#endif
-+
-+ src -= EPEL_EXTRA_BEFORE * srcstride; // start above the block so the vertical pass has its leading rows
-+
-+ for (y = 0; y < height + EPEL_EXTRA; y++) { // pass 1: horizontal filter over height + EPEL_EXTRA rows
-+ for (x = 0; x < width; x++)
-+ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
-+ src += srcstride;
-+ tmp += MAX_PB_SIZE;
-+ }
-+
-+ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; // rewind to the row matching the first output line
-+ filter = ff_hevc_rpi_epel_filters[my - 1]; // switch to vertical taps for the second pass
-+
-+ ox = ox * (1 << (BIT_DEPTH - 8)); // scale the offset up to the working bit depth
-+ for (y = 0; y < height; y++) { // pass 2: vertical filter over tmp, then weight/round/offset/clip
-+ for (x = 0; x < width; x++)
-+ dst[x] = av_clip_pixel((((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx + offset) >> shift) + ox);
-+ tmp += MAX_PB_SIZE;
-+ dst += dststride;
-+ }
-+}
-+
-+static void FUNC(put_hevc_epel_bi_w_hv)(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride,
-+ int16_t *src2,
-+ int height, int denom, int wx0, int wx1,
-+ int ox0, int ox1, intptr_t mx, intptr_t my, int width)
-+{ // Weighted bi-prediction with a two-pass EPEL filter (horizontal by mx, vertical by my), blended with src2.
-+ int x, y;
-+ pixel *src = (pixel *)_src;
-+ ptrdiff_t srcstride = _srcstride / sizeof(pixel); // byte stride -> stride in pixel units
-+ pixel *dst = (pixel *)_dst;
-+ ptrdiff_t dststride = _dststride / sizeof(pixel); // byte stride -> stride in pixel units
-+ const int8_t *filter = ff_hevc_rpi_epel_filters[mx - 1]; // horizontal taps for the first pass
-+ int16_t tmp_array[(MAX_PB_SIZE + EPEL_EXTRA) * MAX_PB_SIZE]; // intermediate rows, pitch MAX_PB_SIZE
-+ int16_t *tmp = tmp_array;
-+ int shift = 14 + 1 - BIT_DEPTH;
-+ int log2Wd = denom + shift - 1;
-+
-+ src -= EPEL_EXTRA_BEFORE * srcstride; // start above the block so the vertical pass has its leading rows
-+
-+ for (y = 0; y < height + EPEL_EXTRA; y++) { // pass 1: horizontal filter over height + EPEL_EXTRA rows
-+ for (x = 0; x < width; x++)
-+ tmp[x] = EPEL_FILTER(src, 1) >> (BIT_DEPTH - 8);
-+ src += srcstride;
-+ tmp += MAX_PB_SIZE;
-+ }
-+
-+ tmp = tmp_array + EPEL_EXTRA_BEFORE * MAX_PB_SIZE; // rewind to the row matching the first output line
-+ filter = ff_hevc_rpi_epel_filters[my - 1]; // switch to vertical taps for the second pass
-+
-+ ox0 = ox0 * (1 << (BIT_DEPTH - 8)); // scale offsets up to the working bit depth
-+ ox1 = ox1 * (1 << (BIT_DEPTH - 8));
-+ for (y = 0; y < height; y++) { // pass 2: vertical filter, weight with wx1, add src2 * wx0 and offsets
-+ for (x = 0; x < width; x++)
-+ dst[x] = av_clip_pixel(((EPEL_FILTER(tmp, MAX_PB_SIZE) >> 6) * wx1 + src2[x] * wx0 +
-+ ((ox0 + ox1 + 1) * (1 << log2Wd))) >> (log2Wd + 1)); // multiplication (not <<) keeps a negative offset sum well-defined
-+ tmp += MAX_PB_SIZE;
-+ dst += dststride;
-+ src2 += MAX_PB_SIZE; // src2 rows are MAX_PB_SIZE apart
-+ }
-+}
-+
-+// line zero
-+#define P3 pix[-4 * xstride]
-+#define P2 pix[-3 * xstride]
-+#define P1 pix[-2 * xstride]
-+#define P0 pix[-1 * xstride]
-+#define Q0 pix[0 * xstride]
-+#define Q1 pix[1 * xstride]
-+#define Q2 pix[2 * xstride]
-+#define Q3 pix[3 * xstride]
-+
-+// line three. used only for deblocking decision
-+#define TP3 pix[-4 * xstride + 3 * ystride]
-+#define TP2 pix[-3 * xstride + 3 * ystride]
-+#define TP1 pix[-2 * xstride + 3 * ystride]
-+#define TP0 pix[-1 * xstride + 3 * ystride]
-+#define TQ0 pix[0 * xstride + 3 * ystride]
-+#define TQ1 pix[1 * xstride + 3 * ystride]
-+#define TQ2 pix[2 * xstride + 3 * ystride]
-+#define TQ3 pix[3 * xstride + 3 * ystride]
-+
-+static void FUNC(hevc_loop_filter_luma)(uint8_t *_pix,
-+ ptrdiff_t _xstride, ptrdiff_t _ystride,
-+ int beta, int *_tc,
-+ uint8_t *_no_p, uint8_t *_no_q)
-+{ // Luma deblocking of two 4-line segments; xstride steps across the edge (P side at negative offsets), ystride steps along it.
-+ int d, j;
-+ pixel *pix = (pixel *)_pix;
-+ ptrdiff_t xstride = _xstride / sizeof(pixel); // byte strides -> strides in pixel units
-+ ptrdiff_t ystride = _ystride / sizeof(pixel);
-+
-+ beta <<= BIT_DEPTH - 8; // scale the threshold up to the working bit depth
-+
-+ for (j = 0; j < 2; j++) { // one iteration per 4-line segment; each has its own tc / no_p / no_q
-+ const int dp0 = abs(P2 - 2 * P1 + P0); // second-difference activity on line 0 (P side)
-+ const int dq0 = abs(Q2 - 2 * Q1 + Q0); // ... line 0 (Q side)
-+ const int dp3 = abs(TP2 - 2 * TP1 + TP0); // ... line 3 (P side)
-+ const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0); // ... line 3 (Q side)
-+ const int d0 = dp0 + dq0;
-+ const int d3 = dp3 + dq3;
-+ const int tc = _tc[j] << (BIT_DEPTH - 8);
-+ const int no_p = _no_p[j]; // non-zero: leave the P side untouched
-+ const int no_q = _no_q[j]; // non-zero: leave the Q side untouched
-+
-+ if (d0 + d3 >= beta) { // too much activity: skip this segment unfiltered
-+ pix += 4 * ystride;
-+ continue;
-+ } else {
-+ const int beta_3 = beta >> 3;
-+ const int beta_2 = beta >> 2;
-+ const int tc25 = ((tc * 5 + 1) >> 1);
-+
-+ if (abs(P3 - P0) + abs(Q3 - Q0) < beta_3 && abs(P0 - Q0) < tc25 &&
-+ abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 &&
-+ (d0 << 1) < beta_2 && (d3 << 1) < beta_2) {
-+ // strong filtering
-+ const int tc2 = tc << 1;
-+ for (d = 0; d < 4; d++) { // modifies up to three pixels on each side, each change clipped to +/- tc2
-+ const int p3 = P3;
-+ const int p2 = P2;
-+ const int p1 = P1;
-+ const int p0 = P0;
-+ const int q0 = Q0;
-+ const int q1 = Q1;
-+ const int q2 = Q2;
-+ const int q3 = Q3;
-+ if (!no_p) {
-+ P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2);
-+ P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2);
-+ P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2);
-+ }
-+ if (!no_q) {
-+ Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2);
-+ Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2);
-+ Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2);
-+ }
-+ pix += ystride;
-+ }
-+ } else { // normal filtering
-+ int nd_p = 1; // number of P-side pixels to modify (1 or 2)
-+ int nd_q = 1; // number of Q-side pixels to modify (1 or 2)
-+ const int tc_2 = tc >> 1;
-+ if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3))
-+ nd_p = 2;
-+ if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3))
-+ nd_q = 2;
-+
-+ for (d = 0; d < 4; d++) {
-+ const int p2 = P2;
-+ const int p1 = P1;
-+ const int p0 = P0;
-+ const int q0 = Q0;
-+ const int q1 = Q1;
-+ const int q2 = Q2;
-+ int delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4;
-+ if (abs(delta0) < 10 * tc) { // lines with a step of 10 * tc or more are left unmodified
-+ delta0 = av_clip(delta0, -tc, tc);
-+ if (!no_p)
-+ P0 = av_clip_pixel(p0 + delta0);
-+ if (!no_q)
-+ Q0 = av_clip_pixel(q0 - delta0);
-+ if (!no_p && nd_p > 1) {
-+ const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2);
-+ P1 = av_clip_pixel(p1 + deltap1);
-+ }
-+ if (!no_q && nd_q > 1) {
-+ const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2);
-+ Q1 = av_clip_pixel(q1 + deltaq1);
-+ }
-+ }
-+ pix += ystride;
-+ }
-+ }
-+ }
-+ }
-+}
-+
-+static void FUNC(hevc_loop_filter_chroma)(uint8_t *_pix, ptrdiff_t _xstride,
-+ ptrdiff_t _ystride, int *_tc,
-+ uint8_t *_no_p, uint8_t *_no_q)
-+{ // Chroma deblocking of two 4-line segments; only P0 and Q0 are modified.
-+ int d, j, no_p, no_q;
-+ pixel *pix = (pixel *)_pix;
-+ ptrdiff_t xstride = _xstride / sizeof(pixel); // byte strides -> strides in pixel units
-+ ptrdiff_t ystride = _ystride / sizeof(pixel);
-+
-+ for (j = 0; j < 2; j++) { // one iteration per 4-line segment
-+ const int tc = _tc[j] << (BIT_DEPTH - 8);
-+ if (tc <= 0) { // non-positive tc: skip this segment unfiltered
-+ pix += 4 * ystride;
-+ continue;
-+ }
-+ no_p = _no_p[j]; // non-zero: leave the P side untouched
-+ no_q = _no_q[j]; // non-zero: leave the Q side untouched
-+
-+ for (d = 0; d < 4; d++) {
-+ int delta0;
-+ const int p1 = P1;
-+ const int p0 = P0;
-+ const int q0 = Q0;
-+ const int q1 = Q1;
-+ delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc);
-+ if (!no_p)
-+ P0 = av_clip_pixel(p0 + delta0);
-+ if (!no_q)
-+ Q0 = av_clip_pixel(q0 - delta0);
-+ pix += ystride;
-+ }
-+ }
-+}
-+
-+static void FUNC(hevc_h_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
-+ int32_t *tc, uint8_t *no_p,
-+ uint8_t *no_q)
-+{ // Wrapper: P/Q taps are `stride` bytes apart, successive filtered positions one pixel apart.
-+ FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel), tc, no_p, no_q);
-+}
-+
-+static void FUNC(hevc_v_loop_filter_chroma)(uint8_t *pix, ptrdiff_t stride,
-+ int32_t *tc, uint8_t *no_p,
-+ uint8_t *no_q)
-+{ // Wrapper: P/Q taps are one pixel apart, successive filtered positions `stride` bytes apart.
-+ FUNC(hevc_loop_filter_chroma)(pix, sizeof(pixel), stride, tc, no_p, no_q);
-+}
-+
-+static void FUNC(hevc_h_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
-+ int beta, int32_t *tc, uint8_t *no_p,
-+ uint8_t *no_q)
-+{ // Wrapper: P/Q taps are `stride` bytes apart, successive filtered positions one pixel apart.
-+ FUNC(hevc_loop_filter_luma)(pix, stride, sizeof(pixel),
-+ beta, tc, no_p, no_q);
-+}
-+
-+static void FUNC(hevc_v_loop_filter_luma)(uint8_t *pix, ptrdiff_t stride,
-+ int beta, int32_t *tc, uint8_t *no_p,
-+ uint8_t *no_q)
-+{ // Wrapper: P/Q taps are one pixel apart, successive filtered positions `stride` bytes apart.
-+ FUNC(hevc_loop_filter_luma)(pix, sizeof(pixel), stride,
-+ beta, tc, no_p, no_q);
-+}
-+
-+#undef P3
-+#undef P2
-+#undef P1
-+#undef P0
-+#undef Q0
-+#undef Q1
-+#undef Q2
-+#undef Q3
-+
-+#undef TP3
-+#undef TP2
-+#undef TP1
-+#undef TP0
-+#undef TQ0
-+#undef TQ1
-+#undef TQ2
-+#undef TQ3
-+
-+// line zero
-+#define P3 pix_l[0 * xstride]
-+#define P2 pix_l[1 * xstride]
-+#define P1 pix_l[2 * xstride]
-+#define P0 pix_l[3 * xstride]
-+#define Q0 pix_r[0 * xstride]
-+#define Q1 pix_r[1 * xstride]
-+#define Q2 pix_r[2 * xstride]
-+#define Q3 pix_r[3 * xstride]
-+
-+// line three. used only for deblocking decision
-+#define TP3 pix_l[0 * xstride + 3 * ystride]
-+#define TP2 pix_l[1 * xstride + 3 * ystride]
-+#define TP1 pix_l[2 * xstride + 3 * ystride]
-+#define TP0 pix_l[3 * xstride + 3 * ystride]
-+#define TQ0 pix_r[0 * xstride + 3 * ystride]
-+#define TQ1 pix_r[1 * xstride + 3 * ystride]
-+#define TQ2 pix_r[2 * xstride + 3 * ystride]
-+#define TQ3 pix_r[3 * xstride + 3 * ystride]
-+
-+// This is identical to hevc_loop_filter_luma except that the P/Q
-+// components are on separate pointers
-+static void FUNC(hevc_v_loop_filter_luma2)(uint8_t * _pix_r,
-+ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f,
-+ uint8_t * _pix_l)
-+{ // Luma deblocking with P pixels read from _pix_l (P3 at index 0 .. P0 at index 3) and Q pixels from _pix_r (Q0 at index 0).
-+ int d, j;
-+ pixel *pix_l = (pixel *)_pix_l;
-+ pixel *pix_r = (pixel *)_pix_r;
-+ const ptrdiff_t xstride = 1; // taps are adjacent pixels
-+ const ptrdiff_t ystride = _stride / sizeof(pixel); // byte stride -> stride in pixel units
-+
-+ beta <<= BIT_DEPTH - 8; // scale the threshold up to the working bit depth
-+
-+ for (j = 0; j < 2; j++) { // one iteration per 4-line segment
-+ const int dp0 = abs(P2 - 2 * P1 + P0); // second-difference activity on line 0 (P side)
-+ const int dq0 = abs(Q2 - 2 * Q1 + Q0); // ... line 0 (Q side)
-+ const int dp3 = abs(TP2 - 2 * TP1 + TP0); // ... line 3 (P side)
-+ const int dq3 = abs(TQ2 - 2 * TQ1 + TQ0); // ... line 3 (Q side)
-+ const int d0 = dp0 + dq0;
-+ const int d3 = dp3 + dq3;
-+ const int tc = ((tc2 >> (j << 4)) & 0xffff) << (BIT_DEPTH - 8); // tc2 packs two 16-bit tc values: low half for j == 0, high half for j == 1
-+ const int no_p = no_f & 1; // bit 0 of no_f: leave the P side untouched
-+ const int no_q = no_f & 2; // bit 1 of no_f: leave the Q side untouched
-+
-+ if (d0 + d3 >= beta) { // too much activity: skip this segment unfiltered
-+ pix_l += 4 * ystride;
-+ pix_r += 4 * ystride;
-+ continue;
-+ } else {
-+ const int beta_3 = beta >> 3;
-+ const int beta_2 = beta >> 2;
-+ const int tc25 = ((tc * 5 + 1) >> 1);
-+
-+ if (abs(P3 - P0) + abs(Q3 - Q0) < beta_3 && abs(P0 - Q0) < tc25 &&
-+ abs(TP3 - TP0) + abs(TQ3 - TQ0) < beta_3 && abs(TP0 - TQ0) < tc25 &&
-+ (d0 << 1) < beta_2 && (d3 << 1) < beta_2) {
-+ // strong filtering
-+ const int tc2 = tc << 1; // NOTE: shadows the tc2 parameter within this block only
-+ for (d = 0; d < 4; d++) { // modifies up to three pixels on each side, each change clipped to +/- tc2
-+ const int p3 = P3;
-+ const int p2 = P2;
-+ const int p1 = P1;
-+ const int p0 = P0;
-+ const int q0 = Q0;
-+ const int q1 = Q1;
-+ const int q2 = Q2;
-+ const int q3 = Q3;
-+ if (!no_p) {
-+ P0 = p0 + av_clip(((p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3) - p0, -tc2, tc2);
-+ P1 = p1 + av_clip(((p2 + p1 + p0 + q0 + 2) >> 2) - p1, -tc2, tc2);
-+ P2 = p2 + av_clip(((2 * p3 + 3 * p2 + p1 + p0 + q0 + 4) >> 3) - p2, -tc2, tc2);
-+ }
-+ if (!no_q) {
-+ Q0 = q0 + av_clip(((p1 + 2 * p0 + 2 * q0 + 2 * q1 + q2 + 4) >> 3) - q0, -tc2, tc2);
-+ Q1 = q1 + av_clip(((p0 + q0 + q1 + q2 + 2) >> 2) - q1, -tc2, tc2);
-+ Q2 = q2 + av_clip(((2 * q3 + 3 * q2 + q1 + q0 + p0 + 4) >> 3) - q2, -tc2, tc2);
-+ }
-+ pix_l += ystride;
-+ pix_r += ystride;
-+ }
-+ } else { // normal filtering
-+ int nd_p = 1; // number of P-side pixels to modify (1 or 2)
-+ int nd_q = 1; // number of Q-side pixels to modify (1 or 2)
-+ const int tc_2 = tc >> 1;
-+ if (dp0 + dp3 < ((beta + (beta >> 1)) >> 3))
-+ nd_p = 2;
-+ if (dq0 + dq3 < ((beta + (beta >> 1)) >> 3))
-+ nd_q = 2;
-+
-+ for (d = 0; d < 4; d++) {
-+ const int p2 = P2;
-+ const int p1 = P1;
-+ const int p0 = P0;
-+ const int q0 = Q0;
-+ const int q1 = Q1;
-+ const int q2 = Q2;
-+ int delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4;
-+ if (abs(delta0) < 10 * tc) { // lines with a step of 10 * tc or more are left unmodified
-+ delta0 = av_clip(delta0, -tc, tc);
-+ if (!no_p)
-+ P0 = av_clip_pixel(p0 + delta0);
-+ if (!no_q)
-+ Q0 = av_clip_pixel(q0 - delta0);
-+ if (!no_p && nd_p > 1) {
-+ const int deltap1 = av_clip((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1, -tc_2, tc_2);
-+ P1 = av_clip_pixel(p1 + deltap1);
-+ }
-+ if (!no_q && nd_q > 1) {
-+ const int deltaq1 = av_clip((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1, -tc_2, tc_2);
-+ Q1 = av_clip_pixel(q1 + deltaq1);
-+ }
-+ }
-+ pix_l += ystride;
-+ pix_r += ystride;
-+ }
-+ }
-+ }
-+ }
-+}
-+
-+static void FUNC(hevc_h_loop_filter_luma2)(uint8_t * _pix_r,
-+ unsigned int _stride, unsigned int beta, unsigned int tc2, unsigned int no_f)
-+{ // Unpacks the packed tc2 / no_f arguments into arrays and forwards to hevc_h_loop_filter_luma.
-+ // Just call the non-2 function having massaged the parameters
-+ int32_t tc[2] = {tc2 & 0xffff, tc2 >> 16}; // low 16 bits -> segment 0, high 16 bits -> segment 1
-+ uint8_t no_p[2] = {no_f & 1, no_f & 1}; // same flag for both segments; callee only tests for non-zero
-+ uint8_t no_q[2] = {no_f & 2, no_f & 2}; // stored value is 0 or 2
-+ FUNC(hevc_h_loop_filter_luma)(_pix_r, _stride, beta, tc, no_p, no_q);
-+}
-+
-+#undef TP3
-+#undef TP2
-+#undef TP1
-+#undef TP0
-+#undef TQ0
-+#undef TQ1
-+#undef TQ2
-+#undef TQ3
-+
-+#undef P3
-+#undef P2
-+#undef P1
-+#undef P0
-+#undef Q0
-+#undef Q1
-+#undef Q2
-+#undef Q3
-+
-+#define P1 pix_l[0 * xstride]
-+#define P0 pix_l[1 * xstride]
-+#define Q0 pix_r[0 * xstride]
-+#define Q1 pix_r[1 * xstride]
-+
-+static void FUNC(hevc_loop_filter_uv2)(uint8_t *_pix_l, ptrdiff_t _xstride,
-+ ptrdiff_t _ystride, const int32_t *_tc,
-+ const uint8_t *_no_p, const uint8_t *_no_q, uint8_t *_pix_r)
-+{ // Chroma deblocking with P pixels read from _pix_l (P1, then P0) and Q pixels from _pix_r (Q0, then Q1).
-+ int d, j, no_p, no_q;
-+ pixel *pix_l = (pixel *)_pix_l;
-+ pixel *pix_r = (pixel *)_pix_r;
-+ ptrdiff_t xstride = _xstride / sizeof(pixel); // byte strides -> strides in pixel units
-+ ptrdiff_t ystride = _ystride / sizeof(pixel);
-+
-+ for (j = 0; j < 2; j++) { // one iteration per 4-line segment
-+ const int tc = _tc[j] << (BIT_DEPTH - 8);
-+ if (tc <= 0) { // non-positive tc: skip this segment unfiltered
-+ pix_l += 4 * ystride;
-+ pix_r += 4 * ystride;
-+ continue;
-+ }
-+ no_p = _no_p[j]; // non-zero: leave the P side untouched
-+ no_q = _no_q[j]; // non-zero: leave the Q side untouched
-+
-+ for (d = 0; d < 4; d++) {
-+ int delta0;
-+ const int p1 = P1;
-+ const int p0 = P0;
-+ const int q0 = Q0;
-+ const int q1 = Q1;
-+ delta0 = av_clip((((q0 - p0) * 4) + p1 - q1 + 4) >> 3, -tc, tc);
-+ if (!no_p)
-+ P0 = av_clip_pixel(p0 + delta0);
-+ if (!no_q)
-+ Q0 = av_clip_pixel(q0 - delta0);
-+ pix_l += ystride;
-+ pix_r += ystride;
-+ }
-+ }
-+}
-+
-+static void FUNC(hevc_h_loop_filter_uv)(uint8_t * pix, unsigned int stride, uint32_t tc4,
-+ unsigned int no_f)
-+{ // Filters two interleaved chroma components: one call at pix, one at pix + sizeof(pixel), each stepping two pixels along the edge.
-+ uint8_t no_p[2] = {no_f & 1, no_f & 2}; // bits 0 and 1 of no_f: per-segment "skip P side" flags
-+ uint8_t no_q[2] = {no_f & 4, no_f & 8}; // bits 2 and 3 of no_f: per-segment "skip Q side" flags
-+ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24}; // tc4 packs four 8-bit tc values, lowest byte first
-+ FUNC(hevc_loop_filter_chroma)(pix, stride, sizeof(pixel) * 2, tc, no_p, no_q); // first component uses tc[0..1]
-+ FUNC(hevc_loop_filter_chroma)(pix + sizeof(pixel), stride, sizeof(pixel) * 2, tc + 2, no_p, no_q); // second component uses tc[2..3], same no_p/no_q
-+}
-+
-+static void FUNC(hevc_v_loop_filter_uv2)(uint8_t * src_r, unsigned int stride, uint32_t tc4,
-+ uint8_t * src_l,
-+ unsigned int no_f)
-+{ // Filters two interleaved chroma components with split P (src_l) and Q (src_r) pointers; taps are two pixels apart.
-+ uint8_t no_p[2] = {no_f & 1, no_f & 2}; // bits 0 and 1 of no_f: per-segment "skip P side" flags
-+ uint8_t no_q[2] = {no_f & 4, no_f & 8}; // bits 2 and 3 of no_f: per-segment "skip Q side" flags
-+ int32_t tc[4] = {tc4 & 0xff, (tc4 >> 8) & 0xff, (tc4 >> 16) & 0xff, tc4 >> 24}; // tc4 packs four 8-bit tc values, lowest byte first
-+ FUNC(hevc_loop_filter_uv2)(src_l, sizeof(pixel) * 2, stride, tc, no_p, no_q, src_r); // first component uses tc[0..1]
-+ FUNC(hevc_loop_filter_uv2)(src_l + sizeof(pixel), sizeof(pixel) * 2, stride, tc + 2, no_p, no_q, src_r + sizeof(pixel)); // second component uses tc[2..3], same no_p/no_q
-+}
-+
-+#undef P1
-+#undef P0
-+#undef Q0
-+#undef Q1
-+
-diff --git a/libavcodec/rpi_hevcpred.c b/libavcodec/rpi_hevcpred.c
-new file mode 100644
-index 0000000000..62135b83c2
---- /dev/null
-+++ b/libavcodec/rpi_hevcpred.c
-@@ -0,0 +1,166 @@
-+/*
-+ * HEVC video Decoder
-+ *
-+ * Copyright (C) 2012 - 2013 Guillaume Martres
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "rpi_hevcdec.h"
-+
-+#include "rpi_hevcpred.h"
-+#if (ARCH_ARM)
-+#include "arm/rpi_hevcpred_arm.h"
-+#endif
-+
-+#define PRED_C 0
-+#define BIT_DEPTH 8
-+#include "rpi_hevcpred_template.c"
-+#undef BIT_DEPTH
-+
-+#define BIT_DEPTH 9
-+#include "rpi_hevcpred_template.c"
-+#undef BIT_DEPTH
-+
-+#define BIT_DEPTH 10
-+#include "rpi_hevcpred_template.c"
-+#undef BIT_DEPTH
-+
-+#define BIT_DEPTH 12
-+#include "rpi_hevcpred_template.c"
-+#undef BIT_DEPTH
-+#undef PRED_C
-+
-+#define PRED_C 1
-+#define BIT_DEPTH 8
-+#include "rpi_hevcpred_template.c"
-+#undef BIT_DEPTH
-+
-+#define BIT_DEPTH 9
-+#include "rpi_hevcpred_template.c"
-+#undef BIT_DEPTH
-+
-+#define BIT_DEPTH 10
-+#include "rpi_hevcpred_template.c"
-+#undef BIT_DEPTH
-+
-+#define BIT_DEPTH 12
-+#include "rpi_hevcpred_template.c"
-+#undef BIT_DEPTH
-+#undef PRED_C
-+
-+void ff_hevc_rpi_pred_init(HEVCRpiPredContext *hpc, int bit_depth)
-+{ // Fills every function-pointer table in *hpc with the template instances for bit_depth, then applies arch-specific overrides.
-+#undef FUNC
-+#define FUNC(a, depth) a ## _ ## depth
-+
-+#undef FUNCC
-+#define FUNCC(a, depth) a ## _ ## depth ## _c
-+
-+#define HEVC_PRED_Y(depth) \
-+ hpc->intra_pred[0] = FUNC(intra_pred_2, depth); \
-+ hpc->intra_pred[1] = FUNC(intra_pred_3, depth); \
-+ hpc->intra_pred[2] = FUNC(intra_pred_4, depth); \
-+ hpc->intra_pred[3] = FUNC(intra_pred_5, depth); \
-+ hpc->intra_filter[0] = FUNC(intra_filter_2, depth); \
-+ hpc->intra_filter[1] = FUNC(intra_filter_3, depth); \
-+ hpc->intra_filter[2] = FUNC(intra_filter_4, depth); \
-+ hpc->intra_filter[3] = FUNC(intra_filter_5, depth); \
-+ hpc->pred_planar[0] = FUNC(pred_planar_0, depth); \
-+ hpc->pred_planar[1] = FUNC(pred_planar_1, depth); \
-+ hpc->pred_planar[2] = FUNC(pred_planar_2, depth); \
-+ hpc->pred_planar[3] = FUNC(pred_planar_3, depth); \
-+ hpc->pred_dc[0] = FUNC(pred_dc_0, depth); \
-+ hpc->pred_dc[1] = FUNC(pred_dc_1, depth); \
-+ hpc->pred_dc[2] = FUNC(pred_dc_2, depth); \
-+ hpc->pred_dc[3] = FUNC(pred_dc_3, depth); \
-+ hpc->pred_vertical[0] = FUNC(pred_angular_0, depth); \
-+ hpc->pred_vertical[1] = FUNC(pred_angular_1, depth); \
-+ hpc->pred_vertical[2] = FUNC(pred_angular_2, depth); \
-+ hpc->pred_vertical[3] = FUNC(pred_angular_3, depth); \
-+ hpc->pred_horizontal[0] = FUNC(pred_angular_0, depth); \
-+ hpc->pred_horizontal[1] = FUNC(pred_angular_1, depth); \
-+ hpc->pred_horizontal[2] = FUNC(pred_angular_2, depth); \
-+ hpc->pred_horizontal[3] = FUNC(pred_angular_3, depth); \
-+ hpc->pred_angular[0] = FUNC(pred_angular_0, depth); \
-+ hpc->pred_angular[1] = FUNC(pred_angular_1, depth); \
-+ hpc->pred_angular[2] = FUNC(pred_angular_2, depth); \
-+ hpc->pred_angular[3] = FUNC(pred_angular_3, depth); \
-+ hpc->pred_dc0[0] = FUNC(pred_dc0_0, depth); \
-+ hpc->pred_dc0[1] = FUNC(pred_dc0_1, depth); \
-+ hpc->pred_dc0[2] = FUNC(pred_dc0_2, depth); \
-+ hpc->pred_dc0[3] = FUNC(pred_dc0_3, depth);
-+
-+#define HEVC_PRED_C(depth) \
-+ hpc->intra_pred_c[0] = FUNCC(intra_pred_2, depth); \
-+ hpc->intra_pred_c[1] = FUNCC(intra_pred_3, depth); \
-+ hpc->intra_pred_c[2] = FUNCC(intra_pred_4, depth); \
-+ hpc->intra_pred_c[3] = FUNCC(intra_pred_5, depth); \
-+ hpc->intra_filter_c[0] = FUNCC(intra_filter_2, depth); \
-+ hpc->intra_filter_c[1] = FUNCC(intra_filter_3, depth); \
-+ hpc->intra_filter_c[2] = FUNCC(intra_filter_4, depth); \
-+ hpc->intra_filter_c[3] = FUNCC(intra_filter_5, depth); \
-+ hpc->pred_planar_c[0] = FUNCC(pred_planar_0, depth); \
-+ hpc->pred_planar_c[1] = FUNCC(pred_planar_1, depth); \
-+ hpc->pred_planar_c[2] = FUNCC(pred_planar_2, depth); \
-+ hpc->pred_planar_c[3] = FUNCC(pred_planar_3, depth); \
-+ hpc->pred_dc_c[0] = FUNCC(pred_dc_0, depth); \
-+ hpc->pred_dc_c[1] = FUNCC(pred_dc_1, depth); \
-+ hpc->pred_dc_c[2] = FUNCC(pred_dc_2, depth); \
-+ hpc->pred_dc_c[3] = FUNCC(pred_dc_3, depth); \
-+ hpc->pred_vertical_c[0] = FUNCC(pred_angular_0, depth); \
-+ hpc->pred_vertical_c[1] = FUNCC(pred_angular_1, depth); \
-+ hpc->pred_vertical_c[2] = FUNCC(pred_angular_2, depth); \
-+ hpc->pred_vertical_c[3] = FUNCC(pred_angular_3, depth); \
-+ hpc->pred_horizontal_c[0] = FUNCC(pred_angular_0, depth); \
-+ hpc->pred_horizontal_c[1] = FUNCC(pred_angular_1, depth); \
-+ hpc->pred_horizontal_c[2] = FUNCC(pred_angular_2, depth); \
-+ hpc->pred_horizontal_c[3] = FUNCC(pred_angular_3, depth); \
-+ hpc->pred_angular_c[0] = FUNCC(pred_angular_0, depth); \
-+ hpc->pred_angular_c[1] = FUNCC(pred_angular_1, depth); \
-+ hpc->pred_angular_c[2] = FUNCC(pred_angular_2, depth); \
-+ hpc->pred_angular_c[3] = FUNCC(pred_angular_3, depth); \
-+ hpc->pred_dc0_c[0] = FUNCC(pred_dc0_0, depth); \
-+ hpc->pred_dc0_c[1] = FUNCC(pred_dc0_1, depth); \
-+ hpc->pred_dc0_c[2] = FUNCC(pred_dc0_2, depth); \
-+ hpc->pred_dc0_c[3] = FUNCC(pred_dc0_3, depth);
-+
-+#define HEVC_PRED(depth) \
-+ HEVC_PRED_Y(depth); \
-+ HEVC_PRED_C(depth);
-+
-+ switch (bit_depth) { // depths 9, 10 and 12 have dedicated instances; anything else uses the 8-bit set
-+ case 9:
-+ HEVC_PRED(9);
-+ break;
-+ case 10:
-+ HEVC_PRED(10);
-+ break;
-+ case 12:
-+ HEVC_PRED(12);
-+ break;
-+ default:
-+ HEVC_PRED(8);
-+ break;
-+ }
-+
-+#if (ARCH_ARM)
-+ ff_hevc_rpi_pred_init_arm(hpc, bit_depth); // runs after the C defaults are set, so it can replace entries
-+#elif (ARCH_MIPS)
-+ ff_hevc_rpi_pred_init_mips(hpc, bit_depth); // NOTE(review): no declaration of this function is visible in this chunk -- confirm it exists
-+#endif
-+}
-diff --git a/libavcodec/rpi_hevcpred.h b/libavcodec/rpi_hevcpred.h
-new file mode 100644
-index 0000000000..6e594277c0
---- /dev/null
-+++ b/libavcodec/rpi_hevcpred.h
-@@ -0,0 +1,121 @@
-+/*
-+ * HEVC video Decoder
-+ *
-+ * Copyright (C) 2012 - 2013 Guillaume Martres
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#ifndef AVCODEC_RPI_HEVCPRED_H
-+#define AVCODEC_RPI_HEVCPRED_H
-+
-+#include <stddef.h>
-+#include <stdint.h>
-+#include "config.h"
-+
-+struct HEVCRpiContext;
-+struct HEVCRpiLocalContext;
-+
-+enum IntraPredMode { // Intra prediction modes; enumerators are sequential, so INTRA_ANGULAR_n has the value n.
-+ INTRA_PLANAR = 0,
-+ INTRA_DC, // 1
-+ INTRA_ANGULAR_2,
-+ INTRA_ANGULAR_3,
-+ INTRA_ANGULAR_4,
-+ INTRA_ANGULAR_5,
-+ INTRA_ANGULAR_6,
-+ INTRA_ANGULAR_7,
-+ INTRA_ANGULAR_8,
-+ INTRA_ANGULAR_9,
-+ INTRA_ANGULAR_10, // aliased below as INTRA_ANGULAR_HORIZONTAL
-+ INTRA_ANGULAR_11,
-+ INTRA_ANGULAR_12,
-+ INTRA_ANGULAR_13,
-+ INTRA_ANGULAR_14,
-+ INTRA_ANGULAR_15,
-+ INTRA_ANGULAR_16,
-+ INTRA_ANGULAR_17,
-+ INTRA_ANGULAR_18,
-+ INTRA_ANGULAR_19,
-+ INTRA_ANGULAR_20,
-+ INTRA_ANGULAR_21,
-+ INTRA_ANGULAR_22,
-+ INTRA_ANGULAR_23,
-+ INTRA_ANGULAR_24,
-+ INTRA_ANGULAR_25,
-+ INTRA_ANGULAR_26, // aliased below as INTRA_ANGULAR_VERTICAL
-+ INTRA_ANGULAR_27,
-+ INTRA_ANGULAR_28,
-+ INTRA_ANGULAR_29,
-+ INTRA_ANGULAR_30,
-+ INTRA_ANGULAR_31,
-+ INTRA_ANGULAR_32,
-+ INTRA_ANGULAR_33,
-+ INTRA_ANGULAR_34,
-+};
-+#define INTRA_ANGULAR_HORIZONTAL INTRA_ANGULAR_10
-+#define INTRA_ANGULAR_VERTICAL INTRA_ANGULAR_26
-+
-+typedef void intra_filter_fn_t( // Function type stored in HEVCRpiPredContext.intra_filter[] / intra_filter_c[]
-+ uint8_t * const left, uint8_t * const top, // writable buffers; presumably receive the prepared left/top reference samples -- confirm against the template
-+ const unsigned int req, const unsigned int avail,
-+ const uint8_t * const src_l, const uint8_t * const src_u, const uint8_t * const src_ur, // read-only source pointers
-+ const unsigned int stride,
-+ const unsigned int top_right_size, const unsigned int down_left_size);
-+
-+typedef struct HEVCRpiPredContext { // Intra-prediction dispatch tables, populated by ff_hevc_rpi_pred_init(); each array has 4 entries.
-+ void (*intra_pred[4])(const struct HEVCRpiContext * const s,
-+ const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, const unsigned int avail);
-+
-+ intra_filter_fn_t *intra_filter[4];
-+ void (*pred_planar[4])(uint8_t *src, const uint8_t *top,
-+ const uint8_t *left, ptrdiff_t stride);
-+ void (*pred_dc[4])(uint8_t *src, const uint8_t *top, const uint8_t *left,
-+ ptrdiff_t stride);
-+ void (*pred_angular[4])(uint8_t *src, const uint8_t *top,
-+ const uint8_t *left, ptrdiff_t stride,
-+ int mode);
-+ void (*pred_vertical[4])(uint8_t *src, const uint8_t *top, // same signature as pred_angular
-+ const uint8_t *left, ptrdiff_t stride,
-+ int mode);
-+ void (*pred_horizontal[4])(uint8_t *src, const uint8_t *top, // same signature as pred_angular
-+ const uint8_t *left, ptrdiff_t stride,
-+ int mode);
-+ void (*pred_dc0[4])(uint8_t *src, ptrdiff_t stride); // DC variant that takes no top/left reference pointers
-+
-+ void (*intra_pred_c[4])(const struct HEVCRpiContext * const s, // the *_c members mirror the tables above; filled from the PRED_C (chroma) template instances
-+ const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, const unsigned int avail);
-+ intra_filter_fn_t *intra_filter_c[4];
-+ void (*pred_planar_c[4])(uint8_t *src, const uint8_t *top,
-+ const uint8_t *left, ptrdiff_t stride);
-+ void (*pred_dc_c[4])(uint8_t *src, const uint8_t *top, const uint8_t *left,
-+ ptrdiff_t stride);
-+ void (*pred_angular_c[4])(uint8_t *src, const uint8_t *top,
-+ const uint8_t *left, ptrdiff_t stride,
-+ int mode);
-+ void (*pred_vertical_c[4])(uint8_t *src, const uint8_t *top,
-+ const uint8_t *left, ptrdiff_t stride,
-+ int mode);
-+ void (*pred_horizontal_c[4])(uint8_t *src, const uint8_t *top,
-+ const uint8_t *left, ptrdiff_t stride,
-+ int mode);
-+ void (*pred_dc0_c[4])(uint8_t *src, ptrdiff_t stride);
-+} HEVCRpiPredContext;
-+
-+void ff_hevc_rpi_pred_init(HEVCRpiPredContext *hpc, int bit_depth);
-+
-+#endif /* AVCODEC_RPI_HEVCPRED_H */
-diff --git a/libavcodec/rpi_hevcpred_template.c b/libavcodec/rpi_hevcpred_template.c
-new file mode 100644
-index 0000000000..2f710626cf
---- /dev/null
-+++ b/libavcodec/rpi_hevcpred_template.c
-@@ -0,0 +1,1522 @@
-+/*
-+ * HEVC video decoder
-+ *
-+ * Copyright (C) 2012 - 2013 Guillaume Martres
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include "config.h"
-+#include "libavutil/pixdesc.h"
-+#include "libavutil/rpi_sand_fns.h"
-+#include "bit_depth_template.c"
-+
-+#include "rpi_hevcdec.h"
-+#include "rpi_hevcpred.h"
-+
-+#define DUMP_PRED 0
-+
-+#define POS(x, y) src[(x) + stride * (y)]
-+
-+// INCLUDED_ONCE defined at EOF
-+#ifndef INCLUDED_ONCE // the template is included once per bit depth; these depth-independent helpers must only be defined the first time
-+typedef uint8_t (* c8_dst_ptr_t)[2]; // pointer to pairs of 8-bit samples
-+typedef const uint8_t (* c8_src_ptr_t)[2]; // read-only variant
-+typedef uint16_t (* c16_dst_ptr_t)[2]; // pointer to pairs of 16-bit samples
-+typedef const uint16_t (* c16_src_ptr_t)[2]; // read-only variant
-+
-+// *** On ARM make these NEON registers
-+typedef struct pixel4_16 {
-+ uint16_t x[4];
-+} pixel4_16;
-+typedef struct pixel4_32 {
-+ uint32_t x[4];
-+} pixel4_32;
-+static inline pixel4_16 PIXEL_SPLAT_X4_16(const uint16_t x)
-+{ // returns a pixel4_16 with all four lanes set to x
-+ pixel4_16 t = {{x, x, x, x}};
-+ return t;
-+}
-+static inline pixel4_32 PIXEL_SPLAT_X4_32(const uint32_t x)
-+{ // returns a pixel4_32 with all four lanes set to x
-+ pixel4_32 t = {{x, x, x, x}};
-+ return t;
-+}
-+#endif
-+
-+#if PRED_C
-+// For chroma we double pixel size so we copy pairs
-+// Drop the definitions made by bit_depth_template.c so that they can be
-+// replaced by pair-sized equivalents below.
-+#undef pixel
-+#undef pixel2
-+#undef pixel4
-+#undef dctcoef
-+#undef INIT_CLIP
-+#undef no_rnd_avg_pixel4
-+#undef rnd_avg_pixel4
-+#undef AV_RN2P
-+#undef AV_RN4P
-+#undef AV_RN4PA
-+#undef AV_WN2P
-+#undef AV_WN4P
-+#undef AV_WN4PA
-+#undef CLIP
-+#undef FUNC
-+#undef FUNCC
-+#undef av_clip_pixel
-+#undef PIXEL_SPLAT_X4
-+
-+#if BIT_DEPTH == 8
-+#define pixel uint16_t
-+#define pixel4 pixel4_16
-+#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_16
-+#define cpel uint8_t
-+#define c_src_ptr_t c8_src_ptr_t
-+#define c_dst_ptr_t c8_dst_ptr_t
-+#else
-+#define pixel uint32_t
-+#define pixel4 pixel4_32
-+#define PIXEL_SPLAT_X4 PIXEL_SPLAT_X4_32
-+#define cpel uint16_t
-+// Source pointers use the const-qualified typedef, matching the 8-bit branch.
-+#define c_src_ptr_t c16_src_ptr_t
-+#define c_dst_ptr_t c16_dst_ptr_t
-+#endif
-+#define AV_RN4P(p) (*(pixel4*)(p))
-+#define AV_WN4P(p,x) (*(pixel4*)(p) = (x))
-+// Chroma instances get a "_c" suffix so they do not clash with the luma ones.
-+#define FUNC(a) FUNC2(a, BIT_DEPTH, _c)
-+#endif
-+
-+
-+// Get PW prior to horrid PRED_C trickery
-+#if BIT_DEPTH == 8
-+#define PW 1
-+#else
-+#define PW 2
-+#endif
-+
-+
-+#if DUMP_PRED && !defined(INCLUDED_ONCE)
-+static void dump_pred_uv(const uint8_t * data, const unsigned int stride, const unsigned int size)
-+{ // Debug helper (only built when DUMP_PRED is non-zero): prints a size x size grid taken from every second byte of each row.
-+ for (unsigned int y = 0; y != size; y++, data += stride * 2) { // rows are stride * 2 bytes apart
-+ for (unsigned int x = 0; x != size; x++) {
-+ printf("%4d", data[x * 2]); // even byte offsets only, i.e. one of the two interleaved components
-+ }
-+ printf("\n");
-+ }
-+ printf("\n");
-+}
-+#endif
-+
-+#ifndef INCLUDED_ONCE
-+static inline void extend_8(void * ptr, const unsigned int v, unsigned int n)
-+{ // Fills n / 4 32-bit words at ptr with v replicated into all four bytes (assuming v fits in 8 bits); does nothing when n < 4.
-+ if ((n >>= 2) != 0) { // n now counts 32-bit words; any remainder of n % 4 elements is not written
-+ uint32_t v4 = v | (v << 8);
-+ uint32_t * p = (uint32_t *)ptr; // NOTE(review): assumes ptr is suitably aligned for 32-bit stores -- confirm at call sites
-+ v4 = v4 | (v4 << 16);
-+ do {
-+ *p++ = v4;
-+ } while (--n != 0);
-+ }
-+}
-+
-+static inline void extend_16(void * ptr, const unsigned int v, unsigned int n)
-+{ // Fills groups of four 16-bit elements at ptr with v (assuming v fits in 16 bits); does nothing when n < 4.
-+ if ((n >>= 2) != 0) { // n now counts groups of four elements; any remainder of n % 4 elements is not written
-+ uint32_t v2 = v | (v << 16);
-+ uint32_t * p = (uint32_t *)ptr; // NOTE(review): assumes ptr is suitably aligned for 32-bit stores -- confirm at call sites
-+ do {
-+ *p++ = v2; // two 32-bit stores = four 16-bit elements per iteration
-+ *p++ = v2;
-+ } while (--n != 0);
-+ }
-+}
-+
-+static inline void extend_32(void * ptr, const unsigned int v, unsigned int n)
-+{ // Fills groups of four 32-bit elements at ptr with v; does nothing when n < 4.
-+ if ((n >>= 2) != 0) { // n now counts groups of four elements; any remainder of n % 4 elements is not written
-+ uint32_t * p = (uint32_t *)ptr; // NOTE(review): assumes ptr is suitably aligned for 32-bit stores -- confirm at call sites
-+ do {
-+ *p++ = v; // four stores per iteration
-+ *p++ = v;
-+ *p++ = v;
-+ *p++ = v;
-+ } while (--n != 0);
-+ }
-+}
-+
-+// Beware that this inverts the avail ordering
-+// For CIP it seems easier this way round
-+static unsigned int cip_avail_l(const uint8_t * is_intra, const int i_stride, const unsigned int i_mask,
-+ const unsigned int log2_intra_bits, const unsigned int avail, unsigned int size,
-+ unsigned int s0, unsigned int odd_s)
-+{ // Returns a bitmask (one bit per 4-pel unit) of left-side neighbours that are flagged in avail and marked in is_intra.
-+ const unsigned int n = 1 << log2_intra_bits; // number of 4-pel units covered by one is_intra entry
-+ unsigned int fa = 0;
-+ unsigned int i;
-+
-+ size >>= 2; // Now in 4-pel units
-+ s0 >>= 2;
-+
-+ if ((avail & AVAIL_DL) != 0)
-+ fa |= ((1 << s0) - 1) << (size - s0); // s0 bits ending just below bit `size`
-+ if ((avail & AVAIL_L) != 0)
-+ fa |= ((1 << size) - 1) << size; // bits size .. 2*size-1
-+ if ((avail & AVAIL_UL) != 0)
-+ fa |= 1 << (size << 1); // single bit at 2*size
-+
-+ if (odd_s) { // handle bit 0 on its own with the first is_intra entry, then advance
-+ if ((fa & 1) != 0 && (*is_intra & i_mask) == 0)
-+ fa &= ~1;
-+ is_intra += i_stride;
-+ }
-+
-+ for (i = odd_s; (fa >> i) != 0; i += n, is_intra += i_stride) { // walk upwards n bits at a time until no set bits remain
-+ const unsigned int m = ((1 << n) - 1) << i;
-+ if ((fa & m) != 0 && (*is_intra & i_mask) == 0) // entry not marked under i_mask: clear its whole group of bits
-+ fa &= ~m;
-+ }
-+
-+ return fa;
-+}
-+
-+// Build the constrained-intra availability mask for the top edge, one bit
-+// per 4-pel unit: bit 0 is the left-most unit of the row above and the
-+// up-right units follow the up units. The intra flags are taken from the two
-+// bytes at is_intra, shifted right by i_shift, consuming one flag bit per
-+// (1 << log2_intra_bits) units; units whose flag bit is 0 are cleared.
-+// size and s1 (number of up-right pels offered) are given in pels.
-+// Returns 0 straight away when neither AVAIL_U nor AVAIL_UR is set.
-+static unsigned int cip_avail_u(const uint8_t * is_intra, unsigned int i_shift,
-+    const unsigned int log2_intra_bits, const unsigned int avail, unsigned int size,
-+    unsigned int s1, unsigned int odd_s)
-+{
-+    unsigned int units_per_flag;
-+    unsigned int flags = 0;
-+    unsigned int intra_bits;
-+    unsigned int bit;
-+
-+    if ((avail & (AVAIL_U | AVAIL_UR)) == 0)
-+        return 0;
-+
-+    units_per_flag = 1 << log2_intra_bits;
-+    intra_bits = ((is_intra[1] << 8) | (is_intra[0])) >> i_shift;
-+
-+    size >>= 2; // Now in 4-pel units
-+    s1 >>= 2;
-+
-+    if (avail & AVAIL_U)
-+        flags |= ((1 << size) - 1);
-+    if (avail & AVAIL_UR)
-+        flags |= ((1 << s1) - 1) << size;
-+
-+    if (odd_s) {
-+        // First unit is checked on its own against the lowest flag bit
-+        flags &= intra_bits | ~1;
-+        intra_bits >>= 1;
-+    }
-+
-+    for (bit = odd_s; (flags >> bit) != 0; bit += units_per_flag) {
-+        const unsigned int group = ((1 << units_per_flag) - 1) << bit;
-+
-+        if ((intra_bits & 1) == 0)
-+            flags &= ~group;
-+        intra_bits >>= 1;
-+    }
-+
-+    return flags;
-+}
-+
-+
-+
-+// Return the index of the lowest set bit of x.
-+// x must be non-zero: the builtin used here has no defined result for 0.
-+static inline unsigned int rmbd(unsigned int x)
-+{
-+#if 1
-+    return __builtin_ctz(x);
-+#else
-+    // Portable fallback (compiled out): skip all-zero low chunks of
-+    // 16, 8, 4 and 2 bits, then look at the final bit.
-+    unsigned int pos = 0;
-+    unsigned int width;
-+
-+    for (width = 16; width >= 2; width >>= 1) {
-+        if ((x & ((1U << width) - 1)) == 0) {
-+            x >>= width;
-+            pos += width;
-+        }
-+    }
-+
-+    return pos + ((x & 1) == 0 ? 1 : 0);
-+#endif
-+}
-+#endif
-+
-+
-+// Build the left and top reference arrays for constrained intra pred (CIP).
-+// avail_l / avail_u carry one bit per group of 4 pels. For avail_l bit 0 is
-+// the bottom-most group of the left column and later bits move upwards
-+// (the bit after the last left group is UL); for avail_u bit 0 is the
-+// left-most group of src_u and the bits continue into src_ur.
-+// Groups whose bit is clear are filled with the most recently copied pel
-+// (a), so values propagate bottom-left -> up-left -> top-right.
-+// Writes left[-1 .. 2*size-1] and top[0 .. 2*size-1]; stride is in pixels.
-+static void FUNC(cip_fill)(pixel * const left, pixel * const top,
-+ const unsigned int avail_l, const unsigned int avail_u,
-+ const pixel * const src_l, const pixel * const src_u, const pixel * const src_ur,
-+ const unsigned int stride,
-+ const unsigned int size)
-+{
-+ pixel a;
-+ unsigned int i;
-+
-+ // 1st find DL value
-+ // Only needed when the bottom-most group is missing; otherwise a is set
-+ // by the first iteration of the L loop before it is ever read
-+ if ((avail_l & 1) == 0) {
-+ if (avail_l != 0)
-+ a = src_l[((int)size * 2 - 1 - (int)rmbd(avail_l)*4) * (int)stride];
-+ else
-+ {
-+ // (avail_l | avail_u) != 0 so this must be good
-+ const unsigned int n = rmbd(avail_u)*4;
-+ a = (n >= size) ? src_ur[n - size] : src_u[n];
-+ }
-+ }
-+
-+ // L
-+ // Walks from the bottom of the down-left column up to the top of the left
-+ // column, 4 pels per availability bit
-+ {
-+ pixel * d = left + size * 2 - 1;
-+ const pixel * s = src_l + (size * 2 - 1) * stride;
-+ unsigned int x = avail_l;
-+ for (i = 0; i < size * 2; i += 4, x >>= 1)
-+ {
-+ if ((x & 1) != 0) {
-+ // Avail
-+ *d-- = *s;
-+ s -= stride;
-+ *d-- = *s;
-+ s -= stride;
-+ *d-- = *s;
-+ s -= stride;
-+ *d-- = a = *s;
-+ s -= stride;
-+ }
-+ else
-+ {
-+ *d-- = a;
-+ *d-- = a;
-+ *d-- = a;
-+ *d-- = a;
-+ s -= stride * 4;
-+ }
-+ }
-+ // UL
-+ // d now points at left[-1]
-+ *d = a = (x & 1) != 0 ? *s : a;
-+ }
-+
-+ // U
-+ {
-+ pixel * d = top;
-+ const pixel * s = src_u;
-+ unsigned int x = avail_u;
-+
-+ for (i = 0; i < size; i += 4, x >>= 1)
-+ {
-+ if ((x & 1) != 0) {
-+ // Avail
-+ *d++ = *s++;
-+ *d++ = *s++;
-+ *d++ = *s++;
-+ *d++ = a = *s++;
-+ }
-+ else
-+ {
-+ *d++ = a;
-+ *d++ = a;
-+ *d++ = a;
-+ *d++ = a;
-+ s += 4;
-+ }
-+ }
-+
-+ // UR
-+ // Same walk, but reading from src_ur; x carries on from the U bits
-+ s = src_ur;
-+ for (i = 0; i < size; i += 4, x >>= 1)
-+ {
-+ if ((x & 1) != 0) {
-+ // Avail
-+ *d++ = *s++;
-+ *d++ = *s++;
-+ *d++ = *s++;
-+ *d++ = a = *s++;
-+ }
-+ else
-+ {
-+ *d++ = a;
-+ *d++ = a;
-+ *d++ = a;
-+ *d++ = a;
-+ s += 4;
-+ }
-+ }
-+ }
-+}
-+
-+
-+// EXTEND(ptr, val, len): replicate val over len elements starting at ptr.
-+// The helper is chosen by PRED_C and PW: extend_8 for !PRED_C with PW == 1,
-+// extend_16 for !PRED_C with PW == 2 or PRED_C with PW == 1, extend_32
-+// otherwise (presumably because a PRED_C element holds two components).
-+#if !PRED_C && PW == 1
-+#define EXTEND(ptr, val, len) extend_8(ptr, val, len)
-+#elif (!PRED_C && PW == 2) || (PRED_C && PW == 1)
-+#define EXTEND(ptr, val, len) extend_16(ptr, val, len)
-+#else
-+#define EXTEND(ptr, val, len) extend_32(ptr, val, len)
-+#endif
-+
-+// Reqs:
-+//
-+// Planar: DL[0], L, ul, U, UR[0]
-+// DC: dl, L, ul, U, ur
-+// A2-9: DL, L, ul, u, ur
-+// A10: dl, L, ul, u, ur
-+// A11-17 dl, L, UL, U, ur
-+// A18-25 dl, L, Ul, U, ur
-+// A26 dl, l, ul, U, ur
-+// A27-34 dl, l, ul, U, UR
-+
-+#ifndef INCLUDED_ONCE
-+
-+// Declarations of intra filter functions defined elsewhere (the names
-+// suggest NEON assembly implementations - confirm against the arm sources)
-+intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_8;
-+intra_filter_fn_t ff_hevc_rpi_intra_filter_4_neon_16;
-+intra_filter_fn_t ff_hevc_rpi_intra_filter_8_neon_16;
-+
-+// Filter request bits, OR-ed into the req_avail[] entries next to the
-+// AVAIL_* neighbour bits
-+#define FILTER_LIGHT 0x40
-+#define FILTER_STRONG 0x80
-+#define FILTER_EITHER (FILTER_LIGHT | FILTER_STRONG)
-+
-+// Neighbours (AVAIL_* bits) read by each of the 35 intra prediction modes
-+// when PRED_C is set; indexed by mode. No entry carries a FILTER_* bit.
-+static const uint8_t req_avail_c[35] =
-+{
-+ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR, // Planar (DL[0] & UR[0] only needed)
-+ AVAIL_L | 0 | AVAIL_U, // DC
-+ AVAIL_DL | AVAIL_L, // 2
-+ AVAIL_DL | AVAIL_L, // 3
-+ AVAIL_DL | AVAIL_L, // 4
-+ AVAIL_DL | AVAIL_L, // 5
-+ AVAIL_DL | AVAIL_L, // 6
-+ AVAIL_DL | AVAIL_L, // 7
-+ AVAIL_DL | AVAIL_L, // 8
-+ AVAIL_DL | AVAIL_L, // 9
-+ AVAIL_L, // 10 (H)
-+ AVAIL_L | AVAIL_UL | AVAIL_U, // 11
-+ AVAIL_L | AVAIL_UL | AVAIL_U, // 12
-+ AVAIL_L | AVAIL_UL | AVAIL_U, // 13
-+ AVAIL_L | AVAIL_UL | AVAIL_U, // 14
-+ AVAIL_L | AVAIL_UL | AVAIL_U, // 15
-+ AVAIL_L | AVAIL_UL | AVAIL_U, // 16
-+ AVAIL_L | AVAIL_UL | AVAIL_U, // 17
-+ AVAIL_L | AVAIL_UL | AVAIL_U, // 18
-+ AVAIL_L | AVAIL_UL | AVAIL_U, // 19
-+ AVAIL_L | AVAIL_UL | AVAIL_U, // 20
-+ AVAIL_L | AVAIL_UL | AVAIL_U, // 21
-+ AVAIL_L | AVAIL_UL | AVAIL_U, // 22
-+ AVAIL_L | AVAIL_UL | AVAIL_U, // 23
-+ AVAIL_L | AVAIL_UL | AVAIL_U, // 24
-+ AVAIL_L | AVAIL_UL | AVAIL_U, // 25
-+ AVAIL_U, // 26 (V)
-+ AVAIL_U | AVAIL_UR, // 27
-+ AVAIL_U | AVAIL_UR, // 28
-+ AVAIL_U | AVAIL_UR, // 29
-+ AVAIL_U | AVAIL_UR, // 30
-+ AVAIL_U | AVAIL_UR, // 31
-+ AVAIL_U | AVAIL_UR, // 32
-+ AVAIL_U | AVAIL_UR, // 33
-+ AVAIL_U | AVAIL_UR // 34
-+};
-+
-+// Neighbours (AVAIL_* bits) and smoothing requests (FILTER_* bits) for the
-+// !PRED_C case, indexed as req_avail[log2_size - 2][mode].
-+// The first row (log2_size 2) requests no filtering; only the last row
-+// (log2_size 5) ever requests FILTER_STRONG (via FILTER_EITHER).
-+static const uint8_t req_avail[4][35] = {
-+{ // 2
-+ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR, // Planar (DL[0] & UR[0] only needed)
-+ AVAIL_L | 0 | AVAIL_U, // DC
-+ AVAIL_DL | AVAIL_L, // 2
-+ AVAIL_DL | AVAIL_L, // 3
-+ AVAIL_DL | AVAIL_L, // 4
-+ AVAIL_DL | AVAIL_L, // 5
-+ AVAIL_DL | AVAIL_L, // 6
-+ AVAIL_DL | AVAIL_L, // 7
-+ AVAIL_DL | AVAIL_L, // 8
-+ AVAIL_DL | AVAIL_L, // 9
-+ AVAIL_L | AVAIL_UL | AVAIL_U, // 10 (H)
-+ AVAIL_L | AVAIL_UL | AVAIL_U, // 11
-+ AVAIL_L | AVAIL_UL | AVAIL_U, // 12
-+ AVAIL_L | AVAIL_UL | AVAIL_U, // 13
-+ AVAIL_L | AVAIL_UL | AVAIL_U, // 14
-+ AVAIL_L | AVAIL_UL | AVAIL_U, // 15
-+ AVAIL_L | AVAIL_UL | AVAIL_U, // 16
-+ AVAIL_L | AVAIL_UL | AVAIL_U, // 17
-+ AVAIL_L | AVAIL_UL | AVAIL_U, // 18
-+ AVAIL_L | AVAIL_UL | AVAIL_U, // 19
-+ AVAIL_L | AVAIL_UL | AVAIL_U, // 20
-+ AVAIL_L | AVAIL_UL | AVAIL_U, // 21
-+ AVAIL_L | AVAIL_UL | AVAIL_U, // 22
-+ AVAIL_L | AVAIL_UL | AVAIL_U, // 23
-+ AVAIL_L | AVAIL_UL | AVAIL_U, // 24
-+ AVAIL_L | AVAIL_UL | AVAIL_U, // 25
-+ AVAIL_L | AVAIL_UL | AVAIL_U, // 26 (V)
-+ AVAIL_U | AVAIL_UR, // 27
-+ AVAIL_U | AVAIL_UR, // 28
-+ AVAIL_U | AVAIL_UR, // 29
-+ AVAIL_U | AVAIL_UR, // 30
-+ AVAIL_U | AVAIL_UR, // 31
-+ AVAIL_U | AVAIL_UR, // 32
-+ AVAIL_U | AVAIL_UR, // 33
-+ AVAIL_U | AVAIL_UR // 34
-+},
-+{ // 3
-+ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR | FILTER_LIGHT, // Planar (DL[0] & UR[0] only needed)
-+ AVAIL_L | 0 | AVAIL_U, // DC
-+ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 2
-+ AVAIL_DL | AVAIL_L | 0, // 3
-+ AVAIL_DL | AVAIL_L | 0, // 4
-+ AVAIL_DL | AVAIL_L | 0, // 5
-+ AVAIL_DL | AVAIL_L | 0, // 6
-+ AVAIL_DL | AVAIL_L | 0, // 7
-+ AVAIL_DL | AVAIL_L | 0, // 8
-+ AVAIL_DL | AVAIL_L | 0, // 9
-+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 10 (H)
-+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 11
-+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 12
-+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 13
-+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 14
-+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 15
-+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 16
-+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 17
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 18
-+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 19
-+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 20
-+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 21
-+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 22
-+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 23
-+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 24
-+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 25
-+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 26 (V)
-+ AVAIL_U | AVAIL_UR | 0, // 27
-+ AVAIL_U | AVAIL_UR | 0, // 28
-+ AVAIL_U | AVAIL_UR | 0, // 29
-+ AVAIL_U | AVAIL_UR | 0, // 30
-+ AVAIL_U | AVAIL_UR | 0, // 31
-+ AVAIL_U | AVAIL_UR | 0, // 32
-+ AVAIL_U | AVAIL_UR | 0, // 33
-+ AVAIL_U | AVAIL_UR | FILTER_LIGHT // 34
-+},
-+{ // 4
-+ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR | FILTER_LIGHT, // Planar (DL[0] & UR[0] only needed)
-+ AVAIL_L | 0 | AVAIL_U, // DC
-+ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 2
-+ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 3
-+ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 4
-+ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 5
-+ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 6
-+ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 7
-+ AVAIL_DL | AVAIL_L | FILTER_LIGHT, // 8
-+ AVAIL_DL | AVAIL_L | 0, // 9
-+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 10 (H)
-+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 11
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 12
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 13
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 14
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 15
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 16
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 17
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 18
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 19
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 20
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 21
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 22
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 23
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_LIGHT, // 24
-+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 25
-+ AVAIL_L | AVAIL_UL | AVAIL_U | 0, // 26 (V)
-+ AVAIL_U | AVAIL_UR | 0, // 27
-+ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 28
-+ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 29
-+ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 30
-+ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 31
-+ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 32
-+ AVAIL_U | AVAIL_UR | FILTER_LIGHT, // 33
-+ AVAIL_U | AVAIL_UR | FILTER_LIGHT // 34
-+},
-+{ // 5
-+ AVAIL_DL | AVAIL_L | 0 | AVAIL_U | AVAIL_UR | FILTER_EITHER, // Planar (DL[0] & UR[0] only needed)
-+ AVAIL_L | 0 | AVAIL_U, // DC
-+ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 2
-+ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 3
-+ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 4
-+ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 5
-+ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 6
-+ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 7
-+ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 8
-+ AVAIL_DL | AVAIL_L | FILTER_EITHER, // 9
-+ AVAIL_L | 0, // 10 (H)
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 11
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 12
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 13
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 14
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 15
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 16
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 17
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 18
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 19
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 20
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 21
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 22
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 23
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 24
-+ AVAIL_L | AVAIL_UL | AVAIL_U | FILTER_EITHER, // 25
-+ AVAIL_U | 0, // 26 (V)
-+ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 27
-+ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 28
-+ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 29
-+ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 30
-+ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 31
-+ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 32
-+ AVAIL_U | AVAIL_UR | FILTER_EITHER, // 33
-+ AVAIL_U | AVAIL_UR | FILTER_EITHER // 34
-+}
-+};
-+
-+
-+#endif
-+
-+#define filter_light1 FUNC(filter_light1)
-+// 3-tap [1 2 1] / 4 smoothing of b with its two neighbours a and c,
-+// rounded to nearest.
-+static inline pixel filter_light1(pixel a, pixel b, pixel c)
-+{
-+    return (a + c + 2 + (b << 1)) >> 2;
-+}
-+
-+#define filter_light FUNC(filter_light)
-+// Light ([1 2 1] / 4) smoothing of a run of n pels.
-+//   dst     - receives the n filtered pels; every input pel is read before
-+//             the output pel at the same index is written
-+//   p1      - pel immediately before src[0]
-+//   src     - first input pel; following input pels are sstride apart
-+//   pn      - pel immediately after the last input pel
-+//   sstride - distance between input pels, in pixels
-+//   n       - number of output pels
-+// n == 0 writes nothing and n == 1 writes the single pel
-+// filter_light1(p1, src[0], pn); the loop counter can no longer wrap for
-+// those sizes.
-+static inline void filter_light(pixel * dst, pixel p1, const pixel * src, const pixel pn, const int sstride, const unsigned int n)
-+{
-+    pixel p0;
-+    pixel p2;
-+    unsigned int i;
-+
-+    if (n == 0)
-+        return;
-+
-+    p2 = *src;
-+
-+    // All but the final pel take their right-hand neighbour from src - it is
-+    // just clearer to have the call take the actual number of output pels
-+    for (i = 1; i < n; i++)
-+    {
-+        src += sstride;
-+        p0 = p1;
-+        p1 = p2;
-+        p2 = *src;
-+        *dst++ = filter_light1(p0, p1, p2);
-+    }
-+
-+    // Final pel uses pn as its right-hand neighbour
-+    *dst = filter_light1(p1, p2, pn);
-+}
-+
-+#define filter_strong FUNC(filter_strong)
-+// Strong smoothing: write n pels to dst forming a linear ramp that starts
-+// from p0 and moves towards p1 in steps of (p1 - p0) / 64, rounded.
-+// n must be at least 1.
-+static inline void filter_strong(pixel * dst, const unsigned int p0, const unsigned int p1, unsigned int n)
-+{
-+    const int step = p1 - p0;
-+    unsigned int acc = 64 * p0 + 32;    // 6 fractional bits + rounding offset
-+
-+    do
-+    {
-+        acc += step;
-+        *dst++ = acc >> 6;
-+    } while (--n != 0);
-+}
-+
-+#define intra_filter FUNC(intra_filter)
-+// Build the left[] and top[] reference arrays for one block.
-+// req: AVAIL_* bits for the neighbours the prediction mode reads, plus
-+// FILTER_LIGHT / FILTER_STRONG requests
-+// avail: AVAIL_* bits for the neighbours that actually exist
-+// src_l, src_u, src_ur: first pel of the left column, of the row above and
-+// of the row above-right; stride is in pixels
-+// top_right_size, down_left_size: number of pels taken from UR / DL; the
-+// remainder up to size is padded with the last pel
-+// Output layout: left[-1] = UL, left[0..size-1] = L, left[size..] = DL,
-+// top[0..size-1] = U, top[size..] = UR.
-+static av_always_inline void intra_filter(
-+ pixel * const left, pixel * const top,
-+ const unsigned int req, const unsigned int avail,
-+ const pixel * const src_l, const pixel * const src_u, const pixel * const src_ur,
-+ const unsigned int stride,
-+ const unsigned int top_right_size, const unsigned int down_left_size,
-+ const unsigned int log2_size)
-+{
-+ const unsigned int strong_threshold = 1 << (BIT_DEPTH - 5);
-+ const unsigned int size = 1 << log2_size;
-+
-+ // a_ is the first pel in a section working round dl -> ur
-+ // b_ is the last
-+ // Beware that top & left work out from UL so usage of a_ & b_ may
-+ // swap between them. It is a bad naming scheme but I have found no
-+ // better
-+ const pixel * a_dl = src_l + (down_left_size + size - 1) * stride;
-+ const pixel * b_dl = src_l + size * stride;
-+ const pixel * a_l = src_l + (size - 1) * stride;
-+ const pixel * b_l = src_l;
-+ const pixel * ab_ul = src_l - stride;
-+ const pixel * a_u = src_u;
-+ const pixel * b_u = src_u + size - 1;
-+ const pixel * a_ur = src_ur;
-+ const pixel * b_ur = src_ur + top_right_size - 1;
-+
-+ // want = needed but missing (must be padded), have = needed and present
-+ const unsigned int want = req & ~avail;
-+ const unsigned int have = req & avail;
-+ unsigned int i;
-+
-+ // Redirect the pointers of every missing section to the nearest pel that
-+ // does exist. DL searches L, UL, U, UR in that order of preference (later
-+ // tests override earlier ones); each following section falls back to
-+ // the end of the section before it.
-+ if ((avail & AVAIL_DL) == 0)
-+ {
-+ a_dl = a_ur;
-+ if ((avail & AVAIL_U) != 0)
-+ a_dl = a_u;
-+ if ((avail & AVAIL_UL) != 0)
-+ a_dl = ab_ul;
-+ if ((avail & AVAIL_L) != 0)
-+ a_dl = a_l;
-+ b_dl = a_dl;
-+ }
-+
-+ if ((avail & AVAIL_L) == 0)
-+ {
-+ a_l = b_dl;
-+ b_l = b_dl;
-+ }
-+ if ((avail & AVAIL_UL) == 0)
-+ {
-+ ab_ul = b_l;
-+ }
-+ if ((avail & AVAIL_U) == 0)
-+ {
-+ a_u = ab_ul;
-+ b_u = ab_ul;
-+ }
-+ if ((avail & AVAIL_UR) == 0)
-+ {
-+ a_ur = b_u;
-+ b_ur = b_u;
-+ }
-+
-+ // No filtering: pad the missing sections and copy the present ones
-+ if ((req & FILTER_LIGHT) == 0 || PRED_C || log2_size == 2) // PRED_C, log2_size compiler opt hints
-+ {
-+ if ((req & AVAIL_UL) != 0)
-+ left[-1] = *ab_ul;
-+
-+ if ((want & AVAIL_L) != 0)
-+ EXTEND(left, *a_l, size);
-+ if ((want & AVAIL_DL) != 0)
-+ EXTEND(left + size, *a_dl, size);
-+ if ((want & AVAIL_U) != 0)
-+ EXTEND(top, *a_u, size);
-+ if ((want & AVAIL_UR) != 0)
-+ EXTEND(top + size, *a_ur, size);
-+
-+ if ((have & AVAIL_U) != 0)
-+ // Always good - even with sand
-+ memcpy(top, a_u, size * sizeof(pixel));
-+ if ((have & AVAIL_UR) != 0)
-+ {
-+ memcpy(top + size, a_ur, top_right_size * sizeof(pixel));
-+ EXTEND(top + size + top_right_size, *b_ur,
-+ size - top_right_size);
-+ }
-+ if ((have & AVAIL_L) != 0)
-+ {
-+ for (i = 0; i < size; i++)
-+ left[i] = b_l[stride * i];
-+ }
-+ if ((have & AVAIL_DL) != 0)
-+ {
-+ for (i = 0; i < down_left_size; i++)
-+ left[i + size] = b_dl[stride * i];
-+ EXTEND(left + size + down_left_size, *a_dl,
-+ size - down_left_size);
-+ }
-+ }
-+ // Strong filter: both edges must be close to linear between their end pels
-+ else if ((req & FILTER_STRONG) != 0 && log2_size == 5 && // log2_size compiler opt hint
-+ FFABS((int)(*a_dl - *a_l * 2 + *ab_ul)) < strong_threshold &&
-+ FFABS((int)(*ab_ul - *b_u * 2 + *b_ur)) < strong_threshold)
-+ {
-+ if ((req & (AVAIL_U | AVAIL_UR)) != 0)
-+ filter_strong(top, *ab_ul, *b_ur, size * 2);
-+ left[-1] = *ab_ul;
-+ if ((req & (AVAIL_L | AVAIL_DL)) != 0)
-+ filter_strong(left, *ab_ul, *a_dl, size*2);
-+ }
-+ // Light filter
-+ else
-+ {
-+ // Same code for both have & want for UL
-+ if ((req & AVAIL_UL) != 0)
-+ {
-+ left[-1] = filter_light1(*b_l, *ab_ul, *a_u);
-+ }
-+
-+ if ((want & AVAIL_L) != 0)
-+ {
-+ EXTEND(left, *a_l, size);
-+ left[0] = (*a_l * 3 + *ab_ul + 2) >> 2;
-+ }
-+ if ((want & AVAIL_DL) != 0)
-+ {
-+ // If we want DL then it cannot be avail so a_dl = a_l so no edge rounding
-+ EXTEND(left + size, *a_l, size);
-+ }
-+ if ((want & AVAIL_U) != 0)
-+ {
-+ EXTEND(top, *a_u, size);
-+ top[size - 1] = (*a_u * 3 + *a_ur + 2) >> 2;
-+ }
-+ if ((want & AVAIL_UR) != 0)
-+ {
-+ // If we want UR then it cannot be avail so a_ur = b_u so no edge rounding
-+ EXTEND(top + size, *a_ur, size);
-+ }
-+
-+ if ((have & AVAIL_U) != 0)
-+ {
-+ filter_light(top, *ab_ul, a_u, *a_ur, 1, size);
-+ }
-+ if ((have & AVAIL_UR) != 0) {
-+ filter_light(top + size, *b_u, a_ur, *b_ur, 1, top_right_size);
-+ top[size*2 - 1] = *b_ur;
-+ EXTEND(top + size + top_right_size, *b_ur, size - top_right_size);
-+ }
-+ if ((have & AVAIL_L) != 0)
-+ {
-+ filter_light(left, *ab_ul, b_l, *b_dl, stride, size);
-+ }
-+ if ((have & AVAIL_DL) != 0)
-+ {
-+ filter_light(left + size, *a_l, b_dl, *a_dl, stride, down_left_size);
-+ left[size*2 - 1] = *a_dl;
-+ EXTEND(left + size + down_left_size, *a_dl, size - down_left_size);
-+ }
-+ }
-+}
-+
-+// Instantiate one wrapper per log2 block size with a uint8_t-pointer
-+// signature. The wrapper receives stride in bytes and passes it on divided
-+// by sizeof(pixel).
-+#define INTRA_FILTER(log2_size) \
-+static void FUNC(intra_filter_ ## log2_size)( \
-+ uint8_t * const left, uint8_t * const top, \
-+ const unsigned int req, const unsigned int avail, \
-+ const uint8_t * const src_l, const uint8_t * const src_u, const uint8_t * const src_ur, \
-+ const unsigned int stride, \
-+ const unsigned int top_right_size, const unsigned int down_left_size) \
-+{ \
-+ intra_filter((pixel *)left, (pixel *)top, req, avail, \
-+ (const pixel *)src_l, (const pixel *)src_u, (const pixel *)src_ur, stride / sizeof(pixel), top_right_size, down_left_size, log2_size); \
-+}
-+
-+INTRA_FILTER(2)
-+INTRA_FILTER(3)
-+INTRA_FILTER(4)
-+INTRA_FILTER(5)
-+
-+#undef intra_filter
-+#undef INTRA_FILTER
-+
-+// Predict one intra block of the current frame in place.
-+// mode: intra prediction mode
-+// x0, y0: block position; shifted right by the plane's h/v shift before use
-+// avail: AVAIL_* bits of the neighbours that may be used
-+// log2_size: log2 of the block size in this plane
-+// Steps: look up the neighbours / filtering the mode needs, build the left
-+// and top reference arrays (CIP path, intra_filter path, or the frame row
-+// used directly), then dispatch to the per-mode predictor in s->hpc.
-+// With avail == 0, or when CIP leaves nothing usable, the block is filled
-+// by the pred_dc0 function instead.
-+static av_always_inline void FUNC(intra_pred)(const HEVCRpiContext * const s,
-+ const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, const unsigned int avail,
-+ const unsigned int log2_size)
-+{
-+ // c_idx will always be 1 for _c versions and 0 for y
-+ const unsigned int c_idx = PRED_C;
-+ const unsigned int hshift = ctx_hshift(s, c_idx);
-+ const unsigned int vshift = ctx_vshift(s, c_idx);
-+ const unsigned int size = (1 << log2_size);
-+ const unsigned int x = x0 >> hshift;
-+ const unsigned int y = y0 >> vshift;
-+
-+ const ptrdiff_t stride = frame_stride1(s->frame, c_idx) / sizeof(pixel);
-+ pixel *const src = c_idx == 0 ?
-+ (pixel *)av_rpi_sand_frame_pos_y(s->frame, x, y) :
-+ (pixel *)av_rpi_sand_frame_pos_c(s->frame, x, y);
-+
-+ // Align so we can do multiple loads in the asm
-+ // Padded to 16 byte boundary so as not to confuse anything
-+ DECLARE_ALIGNED(16, pixel, left_array[2 * MAX_TB_SIZE + 16 / sizeof(pixel)]);
-+ DECLARE_ALIGNED(16, pixel, top_array[2 * MAX_TB_SIZE + 16 / sizeof(pixel)]);
-+
-+ // The 16 bytes in front of left / top leave room for the [-1] (UL) entry
-+ pixel * const left = left_array + 16 / sizeof(pixel);
-+ pixel * const top = top_array + 16 / sizeof(pixel);
-+ const pixel * top_pred = top;
-+
-+ const pixel * src_l = src - 1;
-+ const pixel * src_u = src - stride;
-+ const pixel * src_ur = src_u + size;
-+#if !PRED_C
-+ unsigned int req = req_avail[log2_size - 2][mode];
-+#else
-+ unsigned int req = req_avail_c[mode];
-+#endif
-+
-+ // If we have nothing to pred from then fill with grey
-+ // This isn't a common case but dealing with it here means we don't have to
-+ // test for it later
-+ if (avail == 0)
-+ {
-+dc_only:
-+#if !PRED_C
-+ s->hpc.pred_dc0[log2_size - 2]((uint8_t *)src, stride);
-+#else
-+ s->hpc.pred_dc0_c[log2_size - 2]((uint8_t *)src, stride);
-+#endif
-+ return;
-+ }
-+
-+ // There will be no filtering on C so no point worrying about disabling it
-+#if !PRED_C
-+ if (s->ps.sps->intra_smoothing_disabled_flag)
-+ req &= ~FILTER_EITHER;
-+ if (!s->ps.sps->sps_strong_intra_smoothing_enable_flag)
-+ req &= ~FILTER_STRONG;
-+#endif
-+
-+ // When the block touches the left or right edge of a stripe the left /
-+ // up-right neighbours are found in the adjacent stripe
-+ {
-+ // N.B. stride is in pixels (not bytes) or in the case of chroma pixel-pairs
-+ const AVFrame * const frame = s->frame;
-+ const unsigned int mask = stride - 1; // For chroma pixel=uint16 so stride_c is stride_y / 2
-+ const unsigned int stripe_adj = (av_rpi_sand_frame_stride2(frame) - 1) * stride;
-+ if ((x & mask) == 0)
-+ src_l -= stripe_adj;
-+ if (((x + size) & mask) == 0)
-+ src_ur += stripe_adj;
-+ }
-+
-+ // Can deal with I-slices in 'normal' code even if CIP
-+ // This also means that we don't need to generate (elsewhere) is_intra
-+ // for IRAP frames
-+ if (s->ps.pps->constrained_intra_pred_flag == 1 &&
-+ s->sh.slice_type != HEVC_SLICE_I)
-+ {
-+ // * If we ever actually care about CIP performance then we should
-+ // special case out size 4 stuff (can be done by 'normal') and
-+ // have 8-pel avail masks
-+ unsigned int avail_l = cip_avail_l(s->is_intra + ((y + size * 2 - 1) >> (3 - vshift)) * s->ps.sps->pcm_width + ((x - 1) >> (6 - hshift)),
-+ -(int)(s->ps.sps->pcm_width),
-+ 1 << (((x - 1) >> (3 - hshift)) & 7),
-+ 1 - hshift,
-+ avail,
-+ size,
-+ FFMIN(size, ((s->ps.sps->height - y0) >> vshift) - size),
-+ vshift != 0 ? 0 : (y >> 2) & 1);
-+
-+ unsigned int avail_u = cip_avail_u(s->is_intra + ((y - 1) >> (3 - vshift)) * s->ps.sps->pcm_width + (x >> (6 - hshift)),
-+ (x >> (3 - hshift)) & 7,
-+ 1 - hshift,
-+ avail,
-+ size,
-+ FFMIN(size, ((s->ps.sps->width - x0) >> hshift) - size),
-+ hshift != 0 ? 0 : (x >> 2) & 1);
-+
-+ // Anything left?
-+ if ((avail_l | avail_u) == 0)
-+ goto dc_only;
-+
-+ FUNC(cip_fill)(left, top, avail_l, avail_u, src_l, src_u, src_ur, stride, size);
-+
-+#if !PRED_C
-+ // cip_fill has produced complete arrays so filtering runs over all of
-+ // left[-1 .. 2*size-1] and top[0 .. 2*size-1]
-+ if ((req & FILTER_LIGHT) != 0)
-+ {
-+ const unsigned threshold = 1 << (BIT_DEPTH - 5);
-+ // The fixed 31 / 63 / 64 below rely on FILTER_STRONG only being
-+ // requested by the req_avail row for log2_size 5
-+ if ((req & FILTER_STRONG) != 0 &&
-+ (int)(FFABS(left[-1] + top[63] - 2 * top[31])) < threshold &&
-+ (int)(FFABS(left[-1] + left[63] - 2 * left[31])) < threshold)
-+ {
-+ filter_strong(top, left[-1], top[63], 64);
-+ filter_strong(left, left[-1], left[63], 64);
-+ } else
-+ {
-+ // LHS writes UL too so copy for top
-+ const pixel p_ul = left[-1];
-+ filter_light(left - 1, top[0], left - 1, left[2*size - 1], 1, 2*size);
-+ filter_light(top, p_ul, top, top[2*size - 1], 1, 2*size - 1);
-+ }
-+ }
-+#endif
-+ }
-+ else
-+ {
-+ const unsigned int ur_size = FFMIN(size, ((s->ps.sps->width - x0) >> hshift) - size);
-+ // If the mode needs nothing beyond available U / UR pels (so no
-+ // filter bits either) and UR directly follows U in memory with its
-+ // full size, predict straight from the frame row without copying
-+ if ((req & ~((AVAIL_UR | AVAIL_U) & avail)) == 0 &&
-+ ((req & AVAIL_UR) == 0 || src_u + 2*size == src_ur + ur_size))
-+ {
-+ top_pred = src_u;
-+ }
-+ else
-+ {
-+#if !PRED_C
-+ s->hpc.intra_filter[log2_size - 2]
-+#else
-+ s->hpc.intra_filter_c[log2_size - 2]
-+#endif
-+ ((uint8_t *)left, (uint8_t *)top, req, avail,
-+ (const uint8_t *)src_l, (const uint8_t *)src_u, (const uint8_t *)src_ur, stride * sizeof(pixel),
-+ ur_size,
-+ FFMIN(size, ((s->ps.sps->height - y0) >> vshift) - size));
-+ }
-+ }
-+
-+
-+ // Dispatch to the predictor for this mode and block size
-+#if !PRED_C
-+ switch (mode) {
-+ case INTRA_PLANAR:
-+ s->hpc.pred_planar[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
-+ (uint8_t *)left, stride);
-+ break;
-+ case INTRA_DC:
-+ s->hpc.pred_dc[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
-+ (uint8_t *)left, stride);
-+ break;
-+ case INTRA_ANGULAR_HORIZONTAL:
-+ s->hpc.pred_horizontal[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
-+ (uint8_t *)left, stride,
-+ mode);
-+ break;
-+ case INTRA_ANGULAR_VERTICAL:
-+ s->hpc.pred_vertical[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
-+ (uint8_t *)left, stride,
-+ mode);
-+ break;
-+ default:
-+ s->hpc.pred_angular[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
-+ (uint8_t *)left, stride,
-+ mode);
-+ break;
-+ }
-+#else
-+ switch (mode) {
-+ case INTRA_PLANAR:
-+ s->hpc.pred_planar_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
-+ (uint8_t *)left, stride);
-+ break;
-+ case INTRA_DC:
-+ s->hpc.pred_dc_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
-+ (uint8_t *)left, stride);
-+ break;
-+ case INTRA_ANGULAR_HORIZONTAL:
-+ s->hpc.pred_horizontal_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
-+ (uint8_t *)left, stride,
-+ mode);
-+ break;
-+ case INTRA_ANGULAR_VERTICAL:
-+ s->hpc.pred_vertical_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
-+ (uint8_t *)left, stride,
-+ mode);
-+ break;
-+ default:
-+ s->hpc.pred_angular_c[log2_size - 2]((uint8_t *)src, (uint8_t *)top_pred,
-+ (uint8_t *)left, stride,
-+ mode);
-+ break;
-+ }
-+
-+#if DUMP_PRED
-+ printf("U pred @ %d, %d: mode=%d\n", x, y, mode);
-+ dump_pred_uv((uint8_t *)src, stride, 1 << log2_size);
-+ printf("V pred @ %d, %d: mode=%d\n", x, y, mode);
-+ dump_pred_uv((uint8_t *)src + 1, stride, 1 << log2_size);
-+#endif
-+#endif
-+}
-+
-+// Instantiate one entry point per log2 block size; each forwards to
-+// FUNC(intra_pred) with log2_size as a constant argument.
-+#define INTRA_PRED(log2_size) \
-+static void FUNC(intra_pred_ ## log2_size)(const struct HEVCRpiContext * const s, \
-+ const enum IntraPredMode mode, const unsigned int x0, const unsigned int y0, const unsigned int avail) \
-+{ \
-+ FUNC(intra_pred)(s, mode, x0, y0, avail, log2_size); \
-+}
-+
-+INTRA_PRED(2)
-+INTRA_PRED(3)
-+INTRA_PRED(4)
-+INTRA_PRED(5)
-+
-+#undef INTRA_PRED
-+
-+#if !PRED_C
-+// Planar prediction: each output pel is the rounded average of a
-+// horizontal blend (left[y] towards top[size]) and a vertical blend
-+// (top[x] towards left[size]). trafo_size is log2 of the block size.
-+static av_always_inline void FUNC(pred_planar)(uint8_t *_src, const uint8_t *_top,
-+                                  const uint8_t *_left, ptrdiff_t stride,
-+                                  int trafo_size)
-+{
-+    pixel *src = (pixel *)_src;
-+    const pixel *top = (const pixel *)_top;
-+    const pixel *left = (const pixel *)_left;
-+    const int size = 1 << trafo_size;
-+    const int shift = trafo_size + 1;
-+    int x, y;
-+
-+    for (y = 0; y < size; y++) {
-+        for (x = 0; x < size; x++) {
-+            const int horiz = (size - 1 - x) * left[y] + (x + 1) * top[size];
-+            const int vert = (size - 1 - y) * top[x] + (y + 1) * left[size];
-+
-+            POS(x, y) = (horiz + vert + size) >> shift;
-+        }
-+    }
-+}
-+#else
-+// Planar prediction for two-component elements: the same blend as above is
-+// applied independently to component [0] and component [1] of every element.
-+static av_always_inline void FUNC(pred_planar)(uint8_t * _src, const uint8_t * _top,
-+                                  const uint8_t * _left, ptrdiff_t stride,
-+                                  int trafo_size)
-+{
-+    const int size = 1 << trafo_size;
-+    const int shift = trafo_size + 1;
-+    c_dst_ptr_t src = (c_dst_ptr_t)_src;
-+    const c_src_ptr_t top = (c_src_ptr_t)_top;
-+    const c_src_ptr_t left = (c_src_ptr_t)_left;
-+    int x, y, c;
-+
-+    for (y = 0; y < size; y++, src += stride) {
-+        for (x = 0; x < size; x++) {
-+            for (c = 0; c < 2; c++) {
-+                const int horiz = (size - 1 - x) * left[y][c] + (x + 1) * top[size][c];
-+                const int vert = (size - 1 - y) * top[x][c] + (y + 1) * left[size][c];
-+
-+                src[x][c] = (horiz + vert + size) >> shift;
-+            }
-+        }
-+    }
-+}
-+#endif
-+
-+// One wrapper per table slot: slot 0..3 maps to log2 size 2..5
-+#define PRED_PLANAR(size)\
-+static void FUNC(pred_planar_ ## size)(uint8_t *src, const uint8_t *top, \
-+                                       const uint8_t *left, ptrdiff_t stride) \
-+{ \
-+    FUNC(pred_planar)(src, top, left, stride, size + 2); \
-+}
-+
-+PRED_PLANAR(0)
-+PRED_PLANAR(1)
-+PRED_PLANAR(2)
-+PRED_PLANAR(3)
-+
-+#undef PRED_PLANAR
-+
-+#if !PRED_C
-+// DC prediction: fill the block with the rounded mean of the size left and
-+// size top reference pels. For blocks smaller than 32 the first row and
-+// first column are then blended towards the adjacent reference pels.
-+static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top,
-+                          const uint8_t *_left,
-+                          ptrdiff_t stride, int log2_size)
-+{
-+    pixel *src = (pixel *)_src;
-+    const pixel *top = (const pixel *)_top;
-+    const pixel *left = (const pixel *)_left;
-+    const int size = (1 << log2_size);
-+    int sum_left = 0;
-+    int sum_top = 0;
-+    int dc;
-+    pixel4 fill;
-+    int x, y;
-+
-+    for (x = 0; x < size; x++) {
-+        sum_left += left[x];
-+        sum_top += top[x];
-+    }
-+    dc = (sum_left + sum_top + size) >> (log2_size + 1);
-+
-+    fill = PIXEL_SPLAT_X4(dc);
-+    for (y = 0; y < size; y++)
-+        for (x = 0; x < size; x += 4)
-+            AV_WN4P(&POS(x, y), fill);
-+
-+    // As we now have separate fns for y & c there is no need to test c_idx
-+    if (size < 32)
-+    {
-+        const int dc3 = 3 * dc + 2;
-+
-+        POS(0, 0) = (left[0] + 2 * dc + top[0] + 2) >> 2;
-+        for (x = 1; x < size; x++)
-+            POS(x, 0) = (top[x] + dc3) >> 2;
-+        for (y = 1; y < size; y++)
-+            POS(0, y) = (left[y] + dc3) >> 2;
-+    }
-+}
-+#else
-+// DC prediction for two-component elements: each component gets its own
-+// rounded mean of the left and top reference pels; no edge blending.
-+static void FUNC(pred_dc)(uint8_t *_src, const uint8_t *_top,
-+                          const uint8_t *_left,
-+                          ptrdiff_t stride, int log2_size)
-+{
-+    const unsigned int size = (1 << log2_size);
-+    c_dst_ptr_t src = (c_dst_ptr_t)_src;
-+    const c_src_ptr_t top = (c_src_ptr_t)_top;
-+    const c_src_ptr_t left = (c_src_ptr_t)_left;
-+    unsigned int dc[2] = { size, size };
-+    unsigned int row, col;
-+
-+    for (col = 0; col < size; col++)
-+    {
-+        dc[0] += left[col][0] + top[col][0];
-+        dc[1] += left[col][1] + top[col][1];
-+    }
-+
-+    dc[0] >>= log2_size + 1;
-+    dc[1] >>= log2_size + 1;
-+
-+    for (row = 0; row < size; row++, src += stride)
-+    {
-+        for (col = 0; col < size; ++col)
-+        {
-+            src[col][0] = dc[0];
-+            src[col][1] = dc[1];
-+        }
-+    }
-+}
-+#endif
-+
-+// One wrapper per table slot: slot 0..3 maps to log2 size 2..5
-+#define PRED_DC(size)\
-+static void FUNC(pred_dc_ ## size)(uint8_t *src, const uint8_t *top, \
-+                                   const uint8_t *left, ptrdiff_t stride) \
-+{ \
-+    FUNC(pred_dc)(src, top, left, stride, size + 2); \
-+}
-+
-+PRED_DC(0)
-+PRED_DC(1)
-+PRED_DC(2)
-+PRED_DC(3)
-+
-+#undef PRED_DC
-+
-+
-+
-+
-+#if !PRED_C
-+// Fill the block with mid-grey, 1 << (BIT_DEPTH - 1), four pels per store.
-+static void FUNC(pred_dc0)(uint8_t *_src, ptrdiff_t stride, int log2_size)
-+{
-+    pixel *src = (pixel *)_src;
-+    const int size = (1 << log2_size);
-+    const pixel4 grey = PIXEL_SPLAT_X4(1 << (BIT_DEPTH - 1));
-+    int row, col;
-+
-+    for (row = 0; row < size; row++)
-+        for (col = 0; col < size; col += 4)
-+            AV_WN4P(&POS(col, row), grey);
-+}
-+#else
-+// Fill both components of every element in the block with mid-grey,
-+// 1 << (BIT_DEPTH - 1).
-+static void FUNC(pred_dc0)(uint8_t *_src, ptrdiff_t stride, int log2_size)
-+{
-+    const unsigned int size = (1 << log2_size);
-+    const pixel grey = (1 << (BIT_DEPTH - 1));
-+    c_dst_ptr_t src = (c_dst_ptr_t)_src;
-+    unsigned int row, col;
-+
-+    for (row = 0; row < size; row++, src += stride)
-+    {
-+        for (col = 0; col < size; ++col)
-+        {
-+            src[col][0] = grey;
-+            src[col][1] = grey;
-+        }
-+    }
-+}
-+#endif
-+
-+// One wrapper per table slot: slot 0..3 maps to log2 size 2..5
-+#define PRED_DC0(size)\
-+static void FUNC(pred_dc0_ ## size)(uint8_t *src, ptrdiff_t stride) \
-+{ \
-+    FUNC(pred_dc0)(src, stride, size + 2); \
-+}
-+
-+PRED_DC0(0)
-+PRED_DC0(1)
-+PRED_DC0(2)
-+PRED_DC0(3)
-+
-+#undef PRED_DC0
-+
-+
-+
-+
-+// Angle tables, guarded so that they are defined only once per translation unit.
-+// intra_pred_angle is indexed by (mode - 2) and inv_angle by (mode - 11)
-+// in pred_angular.
-+#ifndef ANGLE_CONSTS
-+#define ANGLE_CONSTS
-+static const int intra_pred_angle[] = {
-+ 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26, -32,
-+ -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32
-+};
-+static const int inv_angle[] = {
-+ -4096, -1638, -910, -630, -482, -390, -315, -256, -315, -390, -482,
-+ -630, -910, -1638, -4096
-+};
-+#endif
-+
-+#if !PRED_C
-+// Angular prediction for the !PRED_C case.
-+// Modes >= 18 project from the top reference row, lower modes from the left
-+// reference column. Each row (or column) uses an integer offset idx and a
-+// 5-bit fraction fact derived from the mode's angle, and interpolates
-+// linearly between two neighbouring reference pels.
-+// size is the block size in pels; top[-1] and left[-1] are read as well.
-+static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
-+ const uint8_t *_top,
-+ const uint8_t *_left,
-+ ptrdiff_t stride,
-+ int mode, int size)
-+{
-+ int x, y;
-+ pixel *src = (pixel *)_src;
-+ const pixel *top = (const pixel *)_top;
-+ const pixel *left = (const pixel *)_left;
-+
-+ int angle = intra_pred_angle[mode - 2];
-+ pixel ref_array[3 * MAX_TB_SIZE + 4];
-+ // ref_tmp leaves size entries in front of it for the negative indices
-+ pixel *ref_tmp = ref_array + size;
-+ const pixel *ref;
-+ int last = (size * angle) >> 5;
-+
-+ if (mode >= 18) {
-+ ref = top - 1;
-+ // Negative angle reaching left of the top row: build an extended
-+ // reference whose negative indices are projected from the left column
-+ if (angle < 0 && last < -1) {
-+ for (x = 0; x <= size; x += 4)
-+ AV_WN4P(&ref_tmp[x], AV_RN4P(&top[x - 1]));
-+ for (x = last; x <= -1; x++)
-+ ref_tmp[x] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)];
-+ ref = ref_tmp;
-+ }
-+
-+ for (y = 0; y < size; y++) {
-+ int idx = ((y + 1) * angle) >> 5;
-+ int fact = ((y + 1) * angle) & 31;
-+ if (fact) {
-+ for (x = 0; x < size; x += 4) {
-+ POS(x , y) = ((32 - fact) * ref[x + idx + 1] +
-+ fact * ref[x + idx + 2] + 16) >> 5;
-+ POS(x + 1, y) = ((32 - fact) * ref[x + 1 + idx + 1] +
-+ fact * ref[x + 1 + idx + 2] + 16) >> 5;
-+ POS(x + 2, y) = ((32 - fact) * ref[x + 2 + idx + 1] +
-+ fact * ref[x + 2 + idx + 2] + 16) >> 5;
-+ POS(x + 3, y) = ((32 - fact) * ref[x + 3 + idx + 1] +
-+ fact * ref[x + 3 + idx + 2] + 16) >> 5;
-+ }
-+ } else {
-+ for (x = 0; x < size; x += 4)
-+ AV_WN4P(&POS(x, y), AV_RN4P(&ref[x + idx + 1]));
-+ }
-+ }
-+// if (mode == 26 && c_idx == 0 && size < 32) {
-+ // Pure vertical: adjust the first column using the left reference
-+ if (mode == 26 && size < 32) {
-+ for (y = 0; y < size; y++)
-+ POS(0, y) = av_clip_pixel(top[0] + ((left[y] - left[-1]) >> 1));
-+ }
-+
-+ } else {
-+ ref = left - 1;
-+ // Mirror of the case above with the roles of top and left swapped
-+ if (angle < 0 && last < -1) {
-+ for (x = 0; x <= size; x += 4)
-+ AV_WN4P(&ref_tmp[x], AV_RN4P(&left[x - 1]));
-+ for (x = last; x <= -1; x++)
-+ ref_tmp[x] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)];
-+ ref = ref_tmp;
-+ }
-+
-+ for (x = 0; x < size; x++) {
-+ int idx = ((x + 1) * angle) >> 5;
-+ int fact = ((x + 1) * angle) & 31;
-+ if (fact) {
-+ for (y = 0; y < size; y++) {
-+ POS(x, y) = ((32 - fact) * ref[y + idx + 1] +
-+ fact * ref[y + idx + 2] + 16) >> 5;
-+ }
-+ } else {
-+ for (y = 0; y < size; y++)
-+ POS(x, y) = ref[y + idx + 1];
-+ }
-+ }
-+// if (mode == 10 && c_idx == 0 && size < 32) {
-+ // Pure horizontal: adjust the first row using the top reference
-+ if (mode == 10 && size < 32) {
-+ for (x = 0; x < size; x += 4) {
-+ POS(x, 0) = av_clip_pixel(left[0] + ((top[x ] - top[-1]) >> 1));
-+ POS(x + 1, 0) = av_clip_pixel(left[0] + ((top[x + 1] - top[-1]) >> 1));
-+ POS(x + 2, 0) = av_clip_pixel(left[0] + ((top[x + 2] - top[-1]) >> 1));
-+ POS(x + 3, 0) = av_clip_pixel(left[0] + ((top[x + 3] - top[-1]) >> 1));
-+ }
-+ }
-+ }
-+
-+
-+
-+ // Disabled (&& 0) debug code: dumps the C result, runs a NEON routine into
-+ // a scratch buffer and asserts that both outputs match
-+#if BIT_DEPTH == 8 && 0
-+ if ((size == 16 || size == 32) && mode != 10 && mode != 26) {
-+ DECLARE_ALIGNED(16, uint8_t, a[64*32]);
-+ void ff_hevc_rpi_pred_angular_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+// void ff_hevc_rpi_pred_angular_32_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+#if 1
-+ src = (pixel *)_src;
-+ printf("C: Mode=%d\n", mode);
-+ for (y = 0; y < size; y++, src += stride)
-+ {
-+ printf("%2d: ", y);
-+ for (x = 0; x < size; x++)
-+ {
-+ printf("%3x ", src[x]);
-+ }
-+ printf("\n");
-+ }
-+#endif
-+// ff_hevc_rpi_pred_vertical_16_neon_8(a, _top, _left, size);
-+ memset(a, 0, sizeof(a));
-+// ff_hevc_rpi_pred_angular_32_neon_10(a, _top, _left, size, mode);
-+ ff_hevc_rpi_pred_angular_16_neon_8(a, _top, _left, size, mode);
-+#if 1
-+ src = (pixel *)a;
-+ printf("A:\n");
-+ for (y = 0; y < size; y++, src += size)
-+ {
-+ printf("%2d: ", y);
-+ for (x = 0; x < size; x++)
-+ {
-+ printf("%3x ", src[x]);
-+ }
-+ printf("\n");
-+ }
-+#endif
-+ src = (pixel *)_src;
-+ for (y = 0; y < size; y++, src += stride)
-+ {
-+ if (memcmp(src, a + size * sizeof(pixel) * y, size * sizeof(pixel)) != 0) {
-+ printf("Fail at line %d\n", y);
-+ av_assert0(0);
-+ }
-+ }
-+ }
-+#endif
-+
-+}
-+#else
-+static av_always_inline void FUNC(pred_angular)(uint8_t *_src,
-+ const uint8_t *_top,
-+ const uint8_t *_left,
-+ ptrdiff_t stride,
-+ int mode, int size)
-+{
-+ int x, y;
-+ c_dst_ptr_t src = (c_dst_ptr_t)_src;
-+ c_src_ptr_t top = (c_src_ptr_t)_top;
-+ c_src_ptr_t left = (c_src_ptr_t)_left;
-+
-+ const int angle = intra_pred_angle[mode - 2];
-+ cpel ref_array[3 * MAX_TB_SIZE + 4][2];
-+ c_dst_ptr_t ref_tmp = ref_array + size;
-+ c_src_ptr_t ref;
-+ const int last = (size * angle) >> 5;
-+
-+ if (mode >= 18) {
-+ ref = top - 1;
-+ if (angle < 0 && last < -1) {
-+ memcpy(ref_tmp, top - 1, (size + 1) * 2 * PW);
-+ for (x = last; x <= -1; x++)
-+ {
-+ ref_tmp[x][0] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0];
-+ ref_tmp[x][1] = left[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1];
-+ }
-+ ref = (c_src_ptr_t)ref_tmp;
-+ }
-+
-+ for (y = 0; y < size; y++, src += stride) {
-+ const int idx = ((y + 1) * angle) >> 5;
-+ const int fact = ((y + 1) * angle) & 31;
-+ if (fact) {
-+ for (x = 0; x < size; ++x) {
-+ src[x][0] = ((32 - fact) * ref[x + idx + 1][0] +
-+ fact * ref[x + idx + 2][0] + 16) >> 5;
-+ src[x][1] = ((32 - fact) * ref[x + idx + 1][1] +
-+ fact * ref[x + idx + 2][1] + 16) >> 5;
-+ }
-+ } else {
-+ memcpy(src, ref + idx + 1, size * 2 * PW);
-+ }
-+ }
-+ } else {
-+ ref = left - 1;
-+ if (angle < 0 && last < -1) {
-+ memcpy(ref_tmp, left - 1, (size + 1) * 2 * PW);
-+ for (x = last; x <= -1; x++)
-+ {
-+ ref_tmp[x][0] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][0];
-+ ref_tmp[x][1] = top[-1 + ((x * inv_angle[mode - 11] + 128) >> 8)][1];
-+ }
-+ ref = (c_src_ptr_t)ref_tmp;
-+ }
-+
-+ for (x = 0; x < size; x++, src++) {
-+ const int idx = ((x + 1) * angle) >> 5;
-+ const int fact = ((x + 1) * angle) & 31;
-+ if (fact) {
-+ for (y = 0; y < size; y++) {
-+ src[y * stride][0] = ((32 - fact) * ref[y + idx + 1][0] +
-+ fact * ref[y + idx + 2][0] + 16) >> 5;
-+ src[y * stride][1] = ((32 - fact) * ref[y + idx + 1][1] +
-+ fact * ref[y + idx + 2][1] + 16) >> 5;
-+ }
-+ } else {
-+ for (y = 0; y < size; y++)
-+ {
-+ src[y * stride][0] = ref[y + idx + 1][0];
-+ src[y * stride][1] = ref[y + idx + 1][1];
-+ }
-+ }
-+ }
-+ }
-+
-+#if BIT_DEPTH == 10 && 0
-+ if (size == 16 && mode != 10 && mode != 26) {
-+ DECLARE_ALIGNED(16, uint8_t, a[64*32]);
-+// void ff_hevc_rpi_pred_vertical_16_neon_8(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride);
-+ void ff_hevc_rpi_pred_angular_c_16_neon_10(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int mode);
-+
-+ src = (c_dst_ptr_t)_src;
-+ printf("C: mode=%d\n", mode);
-+ for (y = 0; y < size; y++, src += stride)
-+ {
-+ for (x = 0; x < size; x++)
-+ {
-+ printf("%3x:%3x ", src[x][0], src[x][1]);
-+ }
-+ printf("\n");
-+ }
-+
-+ memset(a, 0, sizeof(a));
-+ ff_hevc_rpi_pred_angular_c_16_neon_10(a, _top, _left, size, mode);
-+
-+ src = (c_dst_ptr_t)a;
-+ printf("A:\n");
-+ for (y = 0; y < size; y++, src += size)
-+ {
-+ for (x = 0; x < size; x++)
-+ {
-+ printf("%3x:%3x ", src[x][0], src[x][1]);
-+ }
-+ printf("\n");
-+ }
-+
-+ src = (c_dst_ptr_t)_src;
-+ for (y = 0; y < size; y++, src += stride)
-+ {
-+ if (memcmp(src, a + size * sizeof(pixel) * y, size * sizeof(pixel)) != 0) {
-+ printf("Fail at line %d\n", y);
-+ av_assert0(0);
-+ }
-+ }
-+
-+ }
-+#endif
-+}
-+#endif
-+
-+static void FUNC(pred_angular_0)(uint8_t *src, const uint8_t *top,
-+ const uint8_t *left,
-+ ptrdiff_t stride, int mode)
-+{
-+ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 2);
-+}
-+
-+static void FUNC(pred_angular_1)(uint8_t *src, const uint8_t *top,
-+ const uint8_t *left,
-+ ptrdiff_t stride, int mode)
-+{
-+ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 3);
-+}
-+
-+static void FUNC(pred_angular_2)(uint8_t *src, const uint8_t *top,
-+ const uint8_t *left,
-+ ptrdiff_t stride, int mode)
-+{
-+ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 4);
-+}
-+
-+static void FUNC(pred_angular_3)(uint8_t *src, const uint8_t *top,
-+ const uint8_t *left,
-+ ptrdiff_t stride, int mode)
-+{
-+ FUNC(pred_angular)(src, top, left, stride, mode, 1 << 5);
-+}
-+
-+#undef cpel
-+#undef c_src_ptr_t
-+#undef c_dst_ptr_t
-+
-+#undef EXTEND
-+#undef POS
-+#undef PW
-+
-+#undef filter_light1
-+#undef filter_light
-+#undef filter_strong
-+#undef ref_gen
-+
-+#ifndef INCLUDED_ONCE
-+#define INCLUDED_ONCE
-+#endif
-+
-diff --git a/libavcodec/rpi_mailbox.c b/libavcodec/rpi_mailbox.c
-new file mode 100644
-index 0000000000..20f218f22c
---- /dev/null
-+++ b/libavcodec/rpi_mailbox.c
-@@ -0,0 +1,107 @@
-+/*
-+Copyright (c) 2012, Broadcom Europe Ltd.
-+All rights reserved.
-+
-+Redistribution and use in source and binary forms, with or without
-+modification, are permitted provided that the following conditions are met:
-+ * Redistributions of source code must retain the above copyright
-+ notice, this list of conditions and the following disclaimer.
-+ * Redistributions in binary form must reproduce the above copyright
-+ notice, this list of conditions and the following disclaimer in the
-+ documentation and/or other materials provided with the distribution.
-+ * Neither the name of the copyright holder nor the
-+ names of its contributors may be used to endorse or promote products
-+ derived from this software without specific prior written permission.
-+
-+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
-+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
-+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
-+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
-+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
-+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-+*/
-+
-+#include <stdio.h>
-+#include <string.h>
-+#include <stdlib.h>
-+#include <fcntl.h>
-+#include <unistd.h>
-+#include <assert.h>
-+#include <stdint.h>
-+#include <sys/ioctl.h>
-+
-+#include <linux/ioctl.h>
-+
-+#define MAJOR_NUM 100
-+#define IOCTL_MBOX_PROPERTY _IOWR(MAJOR_NUM, 0, char *)
-+#define DEVICE_FILE_NAME "/dev/vcio"
-+
-+#include "rpi_mailbox.h"
-+//#include <interface/vctypes/vc_image_structs.h>
-+
-+/*
-+ * use ioctl to send mbox property message
-+ */
-+
-+static int mbox_property(int file_desc, void *buf)
-+{
-+ int ret_val = ioctl(file_desc, IOCTL_MBOX_PROPERTY, buf);
-+
-+ if (ret_val < 0) {
-+ printf("ioctl_set_msg failed:%d\n", ret_val);
-+ }
-+
-+#ifdef DEBUG
-+ unsigned *p = buf; int i; unsigned size = *(unsigned *)buf;
-+ for (i=0; i<size/4; i++)
-+ printf("%04x: 0x%08x\n", i*sizeof *p, p[i]);
-+#endif
-+ return ret_val;
-+}
-+
-+#define GET_VCIMAGE_PARAMS 0x30044
-+
-+int mbox_get_image_params(int fd, VC_IMAGE_T * img)
-+{
-+ uint32_t buf[sizeof(*img) / sizeof(uint32_t) + 32];
-+ uint32_t * p = buf;
-+ void * rimg;
-+ int rv;
-+
-+ *p++ = 0; // size
-+ *p++ = 0; // process request
-+ *p++ = GET_VCIMAGE_PARAMS;
-+ *p++ = sizeof(*img);
-+ *p++ = sizeof(*img);
-+ rimg = p;
-+ memcpy(p, img, sizeof(*img));
-+ p += sizeof(*img) / sizeof(*p);
-+ *p++ = 0; // End tag
-+ buf[0] = (p - buf) * sizeof(*p);
-+
-+ rv = mbox_property(fd, buf);
-+ memcpy(img, rimg, sizeof(*img));
-+
-+ return rv;
-+}
-+
-+int mbox_open() {
-+ int file_desc;
-+
-+ // open a char device file used for communicating with kernel mbox driver
-+ file_desc = open(DEVICE_FILE_NAME, 0);
-+ if (file_desc < 0) {
-+ printf("Can't open device file: %s\n", DEVICE_FILE_NAME);
-+ printf("Try creating a device file with: sudo mknod %s c %d 0\n", DEVICE_FILE_NAME, MAJOR_NUM);
-+ }
-+ return file_desc;
-+}
-+
-+void mbox_close(int file_desc) {
-+ close(file_desc);
-+}
-+
-diff --git a/libavcodec/rpi_mailbox.h b/libavcodec/rpi_mailbox.h
-new file mode 100644
-index 0000000000..06709d57fd
---- /dev/null
-+++ b/libavcodec/rpi_mailbox.h
-@@ -0,0 +1,55 @@
-+#ifndef RPI_MAILBOX_H
-+#define RPI_MAILBOX_H
-+
-+/* The image structure. */
-+typedef struct vc_image_extra_uv_s {
-+ void *u, *v;
-+ int vpitch;
-+} VC_IMAGE_EXTRA_UV_T;
-+
-+typedef union {
-+ VC_IMAGE_EXTRA_UV_T uv;
-+// VC_IMAGE_EXTRA_RGBA_T rgba;
-+// VC_IMAGE_EXTRA_PAL_T pal;
-+// VC_IMAGE_EXTRA_TF_T tf;
-+// VC_IMAGE_EXTRA_BAYER_T bayer;
-+// VC_IMAGE_EXTRA_MSBAYER_T msbayer;
-+// VC_IMAGE_EXTRA_CODEC_T codec;
-+// VC_IMAGE_EXTRA_OPENGL_T opengl;
-+} VC_IMAGE_EXTRA_T;
-+
-+
-+typedef struct VC_IMAGE_T {
-+ unsigned short type; /* should restrict to 16 bits */
-+ unsigned short info; /* format-specific info; zero for VC02 behaviour */
-+ unsigned short width; /* width in pixels */
-+ unsigned short height; /* height in pixels */
-+ int pitch; /* pitch of image_data array in bytes */
-+ int size; /* number of bytes available in image_data array */
-+ void *image_data; /* pixel data */
-+ VC_IMAGE_EXTRA_T extra; /* extra data like palette pointer */
-+ void *metadata; /* metadata header for the image */
-+ void *pool_object; /* nonNULL if image was allocated from a vc_pool */
-+ int mem_handle; /* the mem handle for relocatable memory storage */
-+ int metadata_size; /* size of metadata of each channel in bytes */
-+ int channel_offset; /* offset of consecutive channels in bytes */
-+ uint32_t video_timestamp;/* 90000 Hz RTP times domain - derived from audio timestamp */
-+ uint8_t num_channels; /* number of channels (2 for stereo) */
-+ uint8_t current_channel;/* the channel this header is currently pointing to */
-+ uint8_t linked_multichann_flag;/* Indicate the header has the linked-multichannel structure*/
-+ uint8_t is_channel_linked; /* Track if the above structure is been used to link the header
-+ into a linked-mulitchannel image */
-+ uint8_t channel_index; /* index of the channel this header represents while
-+ it is being linked. */
-+ uint8_t _dummy[3]; /* pad struct to 64 bytes */
-+} VC_IMAGE_T;
-+
-+typedef int vc_image_t_size_check[(sizeof(VC_IMAGE_T) == 64) * 2 - 1];
-+
-+
-+extern int mbox_open(void);
-+extern void mbox_close(int file_desc);
-+
-+int mbox_get_image_params(int fd, VC_IMAGE_T * img);
-+
-+#endif
-diff --git a/libavcodec/rpi_qpu.c b/libavcodec/rpi_qpu.c
-new file mode 100644
-index 0000000000..f4498bf7b1
---- /dev/null
-+++ b/libavcodec/rpi_qpu.c
-@@ -0,0 +1,957 @@
-+#include <stdio.h>
-+#include <stdlib.h>
-+#include <string.h>
-+#include <stddef.h>
-+#include <stdint.h>
-+#include "libavutil/avassert.h"
-+
-+#include "config.h"
-+
-+#include <pthread.h>
-+#include <time.h>
-+
-+#include <interface/vcsm/user-vcsm.h>
-+
-+#include "rpi_mailbox.h"
-+#include "rpi_qpu.h"
-+#include "rpi_hevc_shader.h"
-+#include "rpi_hevc_transform8.h"
-+#include "rpi_hevc_transform10.h"
-+#include "libavutil/rpi_sand_fns.h"
-+
-+// Trace time spent waiting for GPU (VPU/QPU) (1=Yes, 0=No)
-+#define RPI_TRACE_TIME_VPU_QPU_WAIT 0
-+
-+// Add profile flags to all QPU requests - generates output in "vcdbg log msg"
-+// Beware this is expensive and will probably throw off all other timing by >10%
-+#define RPI_TRACE_QPU_PROFILE_ALL 0
-+
-+// QPU "noflush" flags
-+// a mixture of flushing & profiling
-+
-+#define QPU_FLAGS_NO_FLUSH_VPU 1 // If unset VPU cache will be flushed
-+#define QPU_FLAGS_PROF_CLEAR_AND_ENABLE 2 // Clear & Enable detailed QPU profiling registers
-+#define QPU_FLAGS_PROF_OUTPUT_COUNTS 4 // Print the results
-+#define QPU_FLAGS_OUTPUT_QPU_TIMES 8 // Print QPU times - independant of the profiling
-+#define QPU_FLAGS_NO_FLUSH_QPU 16 // If unset flush QPU caches & TMUs (uniforms always flushed)
-+
-+#define vcos_verify_ge0(x) ((x)>=0)
-+
-+// Size in 32bit words
-+#define QPU_CODE_SIZE 4098
-+#define VPU_CODE_SIZE 16384
-+
-+static const short rpi_transMatrix2even[32][16] = { // Even rows first
-+{64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64},
-+{90, 87, 80, 70, 57, 43, 25, 9, -9, -25, -43, -57, -70, -80, -87, -90},
-+{89, 75, 50, 18, -18, -50, -75, -89, -89, -75, -50, -18, 18, 50, 75, 89},
-+{87, 57, 9, -43, -80, -90, -70, -25, 25, 70, 90, 80, 43, -9, -57, -87},
-+{83, 36, -36, -83, -83, -36, 36, 83, 83, 36, -36, -83, -83, -36, 36, 83},
-+{80, 9, -70, -87, -25, 57, 90, 43, -43, -90, -57, 25, 87, 70, -9, -80},
-+{75, -18, -89, -50, 50, 89, 18, -75, -75, 18, 89, 50, -50, -89, -18, 75},
-+{70, -43, -87, 9, 90, 25, -80, -57, 57, 80, -25, -90, -9, 87, 43, -70},
-+{64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64, 64, -64, -64, 64},
-+{57, -80, -25, 90, -9, -87, 43, 70, -70, -43, 87, 9, -90, 25, 80, -57},
-+{50, -89, 18, 75, -75, -18, 89, -50, -50, 89, -18, -75, 75, 18, -89, 50},
-+{43, -90, 57, 25, -87, 70, 9, -80, 80, -9, -70, 87, -25, -57, 90, -43},
-+{36, -83, 83, -36, -36, 83, -83, 36, 36, -83, 83, -36, -36, 83, -83, 36},
-+{25, -70, 90, -80, 43, 9, -57, 87, -87, 57, -9, -43, 80, -90, 70, -25},
-+{18, -50, 75, -89, 89, -75, 50, -18, -18, 50, -75, 89, -89, 75, -50, 18},
-+{ 9, -25, 43, -57, 70, -80, 87, -90, 90, -87, 80, -70, 57, -43, 25, -9},
-+// Odd rows
-+{90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4},
-+{90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13},
-+{88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22},
-+{85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31},
-+{82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38},
-+{78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46},
-+{73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54},
-+{67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61},
-+{61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67},
-+{54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73},
-+{46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78},
-+{38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82},
-+{31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85},
-+{22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88},
-+{13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90},
-+{ 4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90}
-+};
-+
-+// Code/constants on GPU
-+struct GPU
-+{
-+// unsigned int qpu_code[QPU_CODE_SIZE];
-+ unsigned int vpu_code8[VPU_CODE_SIZE];
-+ unsigned int vpu_code10[VPU_CODE_SIZE];
-+ short transMatrix2even[16*16*2];
-+};
-+
-+struct rpi_cache_flush_env_s {
-+ struct vcsm_user_clean_invalid2_s v;
-+};
-+
-+#define WAIT_COUNT_MAX 16
-+
-+typedef struct trace_time_one_s
-+{
-+ int count;
-+ int64_t start[WAIT_COUNT_MAX];
-+ int64_t total[WAIT_COUNT_MAX];
-+} trace_time_one_t;
-+
-+typedef struct trace_time_wait_s
-+{
-+ unsigned int jcount;
-+ int64_t start0;
-+ int64_t last_update;
-+ trace_time_one_t active;
-+ trace_time_one_t wait;
-+} trace_time_wait_t;
-+
-+typedef struct vq_wait_s
-+{
-+ sem_t sem;
-+ struct vq_wait_s * next;
-+} vq_wait_t;
-+
-+#define VQ_WAIT_POOL_SIZE 16
-+typedef struct vq_wait_pool_s
-+{
-+ vq_wait_t * head;
-+ vq_wait_t pool[VQ_WAIT_POOL_SIZE];
-+} vq_wait_pool_t;
-+
-+static void vq_wait_pool_init(vq_wait_pool_t * const pool);
-+static void vq_wait_pool_deinit(vq_wait_pool_t * const pool);
-+
-+typedef struct gpu_env_s
-+{
-+ int open_count;
-+ int init_count;
-+ int mb;
-+ int vpu_i_cache_flushed;
-+ GPU_MEM_PTR_T qpu_code_gm_ptr;
-+ GPU_MEM_PTR_T code_gm_ptr;
-+ GPU_MEM_PTR_T dummy_gm_ptr;
-+ vq_wait_pool_t wait_pool;
-+#if RPI_TRACE_TIME_VPU_QPU_WAIT
-+ trace_time_wait_t ttw;
-+#endif
-+} gpu_env_t;
-+
-+// Stop more than one thread trying to allocate memory or use the processing resources at once
-+static pthread_mutex_t gpu_mutex = PTHREAD_MUTEX_INITIALIZER;
-+static gpu_env_t * gpu = NULL;
-+
-+#if RPI_TRACE_TIME_VPU_QPU_WAIT
-+
-+static int64_t ns_time(void)
-+{
-+ struct timespec ts;
-+ clock_gettime(CLOCK_MONOTONIC, &ts);
-+ return (int64_t)ts.tv_sec * (int64_t)1000000000 + ts.tv_nsec;
-+}
-+
-+
-+#define WAIT_TIME_PRINT_PERIOD (int64_t)2000000000
-+
-+#define T_MS(t) ((unsigned int)((t)/(int64_t)1000000) % 1000U)
-+#define T_SEC(t) (unsigned int)((t)/(int64_t)1000000000)
-+#define T_ARG(t) T_SEC(t), T_MS(t)
-+#define T_FMT "%u.%03u"
-+
-+static void tto_print(trace_time_one_t * tto, const int64_t now, const int64_t start0, const char * const prefix)
-+{
-+ // Update totals for levels that are still pending
-+ for (int i = 0; i < tto->count; ++i) {
-+ tto->total[i] += now - tto->start[i];
-+ tto->start[i] = now;
-+ }
-+
-+ printf("%s: Idle:" T_FMT ", 1:" T_FMT ", 2:" T_FMT ", 3:" T_FMT ", 4:" T_FMT "\n",
-+ prefix,
-+ T_ARG(now - start0 - tto->total[0]),
-+ T_ARG(tto->total[0]),
-+ T_ARG(tto->total[1]),
-+ T_ARG(tto->total[2]),
-+ T_ARG(tto->total[3]));
-+}
-+
-+
-+static void tto_start(trace_time_one_t * const tto, const int64_t now)
-+{
-+ av_assert0(tto->count < WAIT_COUNT_MAX);
-+ tto->start[tto->count++] = now;
-+}
-+
-+static void tto_end(trace_time_one_t * const tto, const int64_t now)
-+{
-+ const int n = --tto->count;
-+ av_assert0(n >= 0);
-+ tto->total[n] += now - tto->start[n];
-+}
-+
-+static void ttw_print(trace_time_wait_t * const ttw, const int64_t now)
-+{
-+ printf("Jobs:%d, Total time=" T_FMT "\n", ttw->jcount, T_ARG(now - ttw->start0));
-+ tto_print(&ttw->active, now, ttw->start0, "Active");
-+ tto_print(&ttw->wait, now, ttw->start0, " Wait");
-+}
-+
-+#endif
-+
-+// GPU memory alloc fns (internal)
-+
-+static void gpu_free_internal(GPU_MEM_PTR_T * const p)
-+{
-+ if (p->arm != NULL)
-+ vcsm_unlock_ptr(p->arm);
-+ if (p->vcsm_handle != 0)
-+ vcsm_free(p->vcsm_handle);
-+ memset(p, 0, sizeof(*p)); // Ensure we crash hard if we try and use this again
-+}
-+
-+
-+static int gpu_malloc_internal(GPU_MEM_PTR_T * const p,
-+ const int numbytes, const unsigned int cache_type, const char * const name)
-+{
-+ memset(p, 0, sizeof(*p));
-+ p->numbytes = (numbytes + 255) & ~255; // Round up
-+
-+ if ((p->vcsm_handle = vcsm_malloc_cache(p->numbytes, cache_type | 0x80, (char *)name)) == 0 ||
-+ (p->vc_handle = vcsm_vc_hdl_from_hdl(p->vcsm_handle)) == 0 ||
-+ (p->arm = vcsm_lock(p->vcsm_handle)) == NULL ||
-+ (p->vc = vcsm_vc_addr_from_hdl(p->vcsm_handle)) == 0)
-+ {
-+ gpu_free_internal(p);
-+ return AVERROR(ENOMEM);
-+ }
-+ return 0;
-+}
-+
-+
-+// GPU init, free, lock, unlock
-+
-+static void gpu_term(void)
-+{
-+ gpu_env_t * const ge = gpu;
-+
-+ // We have to hope that eveything has terminated...
-+ gpu = NULL;
-+
-+ vc_gpuserv_deinit();
-+
-+ gpu_free_internal(&ge->code_gm_ptr);
-+ gpu_free_internal(&ge->qpu_code_gm_ptr);
-+ gpu_free_internal(&ge->dummy_gm_ptr);
-+
-+ vcsm_exit();
-+
-+ mbox_close(ge->mb);
-+
-+ vq_wait_pool_deinit(&ge->wait_pool);
-+
-+ free(ge);
-+}
-+
-+
-+// Connect to QPU, returns 0 on success.
-+static int gpu_init(gpu_env_t ** const gpu) {
-+ volatile struct GPU* ptr;
-+ gpu_env_t * const ge = calloc(1, sizeof(gpu_env_t));
-+ int rv;
-+ *gpu = NULL;
-+
-+ if (ge == NULL)
-+ return -1;
-+
-+ if ((ge->mb = mbox_open()) < 0)
-+ return -1;
-+
-+ vq_wait_pool_init(&ge->wait_pool);
-+
-+ vcsm_init();
-+
-+ // Now copy over the QPU code into GPU memory
-+ if ((rv = gpu_malloc_internal(&ge->qpu_code_gm_ptr, QPU_CODE_SIZE * 4, VCSM_CACHE_TYPE_NONE, "ffmpeg qpu code")) != 0)
-+ return rv;
-+
-+ {
-+ int num_bytes = (char *)mc_end - (char *)ff_hevc_rpi_shader;
-+ av_assert0(num_bytes<=QPU_CODE_SIZE*sizeof(unsigned int));
-+ memcpy(ge->qpu_code_gm_ptr.arm, ff_hevc_rpi_shader, num_bytes);
-+ memset(ge->qpu_code_gm_ptr.arm + num_bytes, 0, QPU_CODE_SIZE*4 - num_bytes);
-+ }
-+
-+ // And the VPU code
-+ if ((rv = gpu_malloc_internal(&ge->code_gm_ptr, sizeof(struct GPU), VCSM_CACHE_TYPE_VC, "ffmpeg vpu code")) != 0)
-+ return rv;
-+ ptr = (volatile struct GPU*)ge->code_gm_ptr.arm;
-+
-+ // Zero everything so we have zeros between the code bits
-+ memset((void *)ptr, 0, sizeof(*ptr));
-+ {
-+ int num_bytes = sizeof(rpi_hevc_transform8);
-+ av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
-+ memcpy((void*)ptr->vpu_code8, rpi_hevc_transform8, num_bytes);
-+ }
-+ {
-+ int num_bytes = sizeof(rpi_hevc_transform10);
-+ av_assert0(num_bytes<=VPU_CODE_SIZE*sizeof(unsigned int));
-+ memcpy((void*)ptr->vpu_code10, rpi_hevc_transform10, num_bytes);
-+ }
-+ // And the transform coefficients
-+ memcpy((void*)ptr->transMatrix2even, rpi_transMatrix2even, sizeof(rpi_transMatrix2even));
-+
-+ // Generate a dummy "frame" & fill with 0x80
-+ // * Could reset to 1 <<bit_depth?
-+ if ((rv = gpu_malloc_internal(&ge->dummy_gm_ptr, 0x4000, VCSM_CACHE_TYPE_NONE, "ffmpeg dummy frame")) != 0)
-+ return rv;
-+ memset(ge->dummy_gm_ptr.arm, 0x80, 0x4000);
-+
-+ *gpu = ge;
-+ return 0;
-+}
-+
-+
-+
-+static void gpu_unlock(void) {
-+ pthread_mutex_unlock(&gpu_mutex);
-+}
-+
-+// Make sure we have exclusive access to the mailbox, and enable qpu if necessary.
-+static gpu_env_t * gpu_lock(void) {
-+ pthread_mutex_lock(&gpu_mutex);
-+
-+ av_assert1(gpu != NULL);
-+ return gpu;
-+}
-+
-+static gpu_env_t * gpu_lock_ref(void)
-+{
-+ pthread_mutex_lock(&gpu_mutex);
-+
-+ if (gpu == NULL) {
-+ int rv = gpu_init(&gpu);
-+ if (rv != 0) {
-+ gpu_unlock();
-+ return NULL;
-+ }
-+ }
-+
-+ ++gpu->open_count;
-+ return gpu;
-+}
-+
-+static void gpu_unlock_unref(gpu_env_t * const ge)
-+{
-+ if (--ge->open_count == 0)
-+ gpu_term();
-+
-+ gpu_unlock();
-+}
-+
-+static inline gpu_env_t * gpu_ptr(void)
-+{
-+ av_assert1(gpu != NULL);
-+ return gpu;
-+}
-+
-+// Public gpu fns
-+
-+// Allocate memory on GPU
-+// Fills in structure <p> containing ARM pointer, videocore handle, videocore memory address, numbytes
-+// Returns 0 on success.
-+// This allocates memory that will not be cached in ARM's data cache.
-+// Therefore safe to use without data cache flushing.
-+int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p)
-+{
-+ return gpu_malloc_internal(p, numbytes, VCSM_CACHE_TYPE_NONE, "ffmpeg uncached");
-+}
-+
-+// This allocates data that will be
-+// Cached in ARM L2
-+// Uncached in VPU L2
-+int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p)
-+{
-+ return gpu_malloc_internal(p, numbytes, VCSM_CACHE_TYPE_HOST, "ffmpeg cached");
-+}
-+
-+void gpu_free(GPU_MEM_PTR_T * const p) {
-+ gpu_free_internal(p);
-+}
-+
-+unsigned int vpu_get_fn(const unsigned int bit_depth) {
-+ uint32_t a = 0;
-+
-+ // Make sure that the gpu is initialized
-+ av_assert1(gpu != NULL);
-+ switch (bit_depth){
-+ case 8:
-+ a = gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code8);
-+ break;
-+ case 10:
-+ a = gpu->code_gm_ptr.vc + offsetof(struct GPU, vpu_code10);
-+ break;
-+ default:
-+ av_assert0(0);
-+ }
-+ return a;
-+}
-+
-+unsigned int vpu_get_constants(void) {
-+ av_assert1(gpu != NULL);
-+ return (gpu->code_gm_ptr.vc + offsetof(struct GPU,transMatrix2even));
-+}
-+
-+int gpu_get_mailbox(void)
-+{
-+ av_assert1(gpu);
-+ return gpu->mb;
-+}
-+
-+void gpu_ref(void)
-+{
-+ gpu_lock_ref();
-+ gpu_unlock();
-+}
-+
-+void gpu_unref(void)
-+{
-+ gpu_env_t * const ge = gpu_lock();
-+ gpu_unlock_unref(ge);
-+}
-+
-+// ----------------------------------------------------------------------------
-+//
-+// Cache flush functions
-+
-+#define CACHE_EL_MAX ((sizeof(rpi_cache_buf_t) - sizeof (struct vcsm_user_clean_invalid2_s)) / sizeof (struct vcsm_user_clean_invalid2_block_s))
-+
-+rpi_cache_flush_env_t * rpi_cache_flush_init(rpi_cache_buf_t * const buf)
-+{
-+ rpi_cache_flush_env_t * const rfe = (rpi_cache_flush_env_t *)buf;
-+ rfe->v.op_count = 0;
-+ return rfe;
-+}
-+
-+void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe)
-+{
-+ // Nothing needed
-+}
-+
-+int rpi_cache_flush_execute(rpi_cache_flush_env_t * const rfe)
-+{
-+ int rc = 0;
-+ if (rfe->v.op_count != 0) {
-+ if (vcsm_clean_invalid2(&rfe->v) != 0)
-+ {
-+ av_log(NULL, AV_LOG_ERROR, "vcsm_clean_invalid2 failed: errno=%d\n", errno);
-+ rc = -1;
-+ }
-+ rfe->v.op_count = 0;
-+ }
-+ return rc;
-+}
-+
-+int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe)
-+{
-+ int rc = rpi_cache_flush_execute(rfe);;
-+
-+ return rc;
-+}
-+
-+inline void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
-+ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride)
-+{
-+ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++;
-+
-+ av_assert1(rfe->v.op_count <= CACHE_EL_MAX);
-+
-+ b->invalidate_mode = mode;
-+ b->block_count = blocks;
-+ b->start_address = gm->arm + offset0;
-+ b->block_size = block_size;
-+ b->inter_block_stride = block_stride;
-+}
-+
-+void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
-+ const unsigned int offset, const unsigned int size)
-+{
-+ // Deal with empty pointer trivially
-+ if (gm == NULL || size == 0)
-+ return;
-+
-+ av_assert1(offset <= gm->numbytes);
-+ av_assert1(size <= gm->numbytes);
-+ av_assert1(offset + size <= gm->numbytes);
-+
-+ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, offset, size, 1, 0);
-+}
-+
-+void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode)
-+{
-+ rpi_cache_flush_add_gm_blocks(rfe, gm, mode, 0, gm->numbytes, 1, 0);
-+}
-+
-+
-+void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode)
-+{
-+#if !RPI_ONE_BUF
-+#error Fixme! (NIF)
-+#endif
-+ if (gpu_is_buf1(frame)) {
-+ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf1_gmem(frame), mode);
-+ }
-+ else
-+ {
-+ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 0), mode);
-+ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 1), mode);
-+ rpi_cache_flush_add_gm_ptr(rfe, gpu_buf3_gmem(frame, 2), mode);
-+ }
-+}
-+
-+// Flush an area of a frame
-+// Width, height, x0, y0 in luma pels
-+void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const unsigned int mode,
-+ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height,
-+ const unsigned int uv_shift, const int do_luma, const int do_chroma)
-+{
-+ const unsigned int y_offset = frame->linesize[0] * y0;
-+ const unsigned int y_size = frame->linesize[0] * height;
-+ // Round UV up/down to get everything
-+ const unsigned int uv_rnd = (1U << uv_shift) >> 1;
-+ const unsigned int uv_offset = frame->linesize[1] * (y0 >> uv_shift);
-+ const unsigned int uv_size = frame->linesize[1] * ((y0 + height + uv_rnd) >> uv_shift) - uv_offset;
-+
-+#if 0
-+ // *** frame->height is cropped height so not good
-+ // As all unsigned they will also reject -ve
-+ // Test individually as well as added to reject overflow
-+ av_assert0(start_line <= (unsigned int)frame->height); // ***** frame height cropped
-+ av_assert0(n <= (unsigned int)frame->height);
-+ av_assert0(start_line + n <= (unsigned int)frame->height);
-+#endif
-+
-+ if (!gpu_is_buf1(frame))
-+ {
-+ if (do_luma) {
-+ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 0), mode, y_offset, y_size);
-+ }
-+ if (do_chroma) {
-+ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 1), mode, uv_offset, uv_size);
-+ rpi_cache_flush_add_gm_range(rfe, gpu_buf3_gmem(frame, 2), mode, uv_offset, uv_size);
-+ }
-+ }
-+ else if (!av_rpi_is_sand_frame(frame))
-+ {
-+ const GPU_MEM_PTR_T * const gm = gpu_buf1_gmem(frame);
-+ if (do_luma) {
-+ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[0] - gm->arm) + y_offset, y_size);
-+ }
-+ if (do_chroma) {
-+ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[1] - gm->arm) + uv_offset, uv_size);
-+ rpi_cache_flush_add_gm_range(rfe, gm, mode, (frame->data[2] - gm->arm) + uv_offset, uv_size);
-+ }
-+ }
-+ else
-+ {
-+ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
-+ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
-+ const unsigned int xshl = av_rpi_sand_frame_xshl(frame);
-+ const unsigned int xleft = x0 & ~((stride1 >> xshl) - 1);
-+ const unsigned int block_count = (((x0 + width - xleft) << xshl) + stride1 - 1) / stride1; // Same for Y & C
-+ av_assert1(rfe->v.op_count + do_chroma + do_luma < CACHE_EL_MAX);
-+
-+ if (do_chroma)
-+ {
-+ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++;
-+ b->invalidate_mode = mode;
-+ b->block_count = block_count;
-+ b->start_address = av_rpi_sand_frame_pos_c(frame, xleft >> 1, y0 >> 1);
-+ b->block_size = uv_size;
-+ b->inter_block_stride = stride1 * stride2;
-+ }
-+ if (do_luma)
-+ {
-+ struct vcsm_user_clean_invalid2_block_s * const b = rfe->v.s + rfe->v.op_count++;
-+ b->invalidate_mode = mode;
-+ b->block_count = block_count;
-+ b->start_address = av_rpi_sand_frame_pos_y(frame, xleft, y0);
-+ b->block_size = y_size;
-+ b->inter_block_stride = stride1 * stride2;
-+ }
-+ }
-+}
-+
-+// Call this to clean and invalidate a region of memory
-+void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T *const p, const rpi_cache_flush_mode_t mode)
-+{
-+ rpi_cache_buf_t cbuf;
-+ rpi_cache_flush_env_t * rfe = rpi_cache_flush_init(&cbuf);
-+ rpi_cache_flush_add_gm_ptr(rfe, p, mode);
-+ rpi_cache_flush_finish(rfe);
-+}
-+
-+
-+// ----------------------------------------------------------------------------
-+
-+
-+// Wait abstractions - mostly so we can easily add profile code
-+static void vq_wait_pool_init(vq_wait_pool_t * const wp)
-+{
-+ unsigned int i;
-+ for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) {
-+ sem_init(&wp->pool[i].sem, 0, 0);
-+ wp->pool[i].next = wp->pool + i + 1;
-+ }
-+ wp->head = wp->pool + 0;
-+ wp->pool[VQ_WAIT_POOL_SIZE - 1].next = NULL;
-+}
-+
-+static void vq_wait_pool_deinit(vq_wait_pool_t * const wp)
-+{
-+ unsigned int i;
-+ wp->head = NULL;
-+ for (i = 0; i != VQ_WAIT_POOL_SIZE; ++i) {
-+ sem_destroy(&wp->pool[i].sem);
-+ wp->pool[i].next = NULL;
-+ }
-+}
-+
-+
-+// Take a wait object from the free list kept in the GPU environment.
-+// The environment is obtained with gpu_lock_ref(); the list head is unlinked,
-+// its next pointer is cleared and gpu_unlock() is called before returning it.
-+// NOTE(review): neither the environment pointer nor the list head is checked
-+// for NULL before being dereferenced (vpu_qpu_init() does check the result of
-+// gpu_lock_ref()) - confirm callers cannot exhaust the pool.
-+static vq_wait_t * vq_wait_new(void)
-+{
-+ gpu_env_t * const ge = gpu_lock_ref();
-+ vq_wait_t * const wait = ge->wait_pool.head;
-+ ge->wait_pool.head = wait->next;
-+ wait->next = NULL;
-+
-+#if RPI_TRACE_TIME_VPU_QPU_WAIT
-+ // Profiling only: record the start of the "active" interval
-+ tto_start(&ge->ttw.active, ns_time());
-+#endif
-+
-+ gpu_unlock();
-+ return wait;
-+}
-+
-+// Return a wait object to the head of the free list in the GPU environment.
-+// Pairs gpu_lock() with gpu_unlock_unref(); when wait-time tracing is compiled
-+// in it also closes the "wait" interval and periodically prints statistics.
-+static void vq_wait_delete(vq_wait_t * const wait)
-+{
-+ gpu_env_t * const ge = gpu_lock();
-+ // Push back onto the free list
-+ wait->next = ge->wait_pool.head;
-+ ge->wait_pool.head = wait;
-+
-+#if RPI_TRACE_TIME_VPU_QPU_WAIT
-+ {
-+ trace_time_wait_t * const ttw = &ge->ttw;
-+ const int64_t now = ns_time();
-+ ++ttw->jcount;
-+ tto_end(&ttw->wait, now);
-+
-+ // First completed job: latch the start time used as the reporting base
-+ if (ttw->start0 == 0)
-+ {
-+ ttw->start0 = ttw->active.start[0];
-+ ttw->last_update = ttw->start0;
-+ }
-+ // Print at most once per WAIT_TIME_PRINT_PERIOD
-+ if (now - ttw->last_update > WAIT_TIME_PRINT_PERIOD)
-+ {
-+ ttw->last_update += WAIT_TIME_PRINT_PERIOD;
-+ ttw_print(ttw, now);
-+ }
-+ }
-+#endif
-+ gpu_unlock_unref(ge);
-+}
-+
-+// Block until the semaphore inside wait has been posted, restarting the
-+// wait whenever sem_wait() fails with EINTR. Any other failure ends the wait.
-+static void vq_wait_wait(vq_wait_t * const wait)
-+{
-+#if RPI_TRACE_TIME_VPU_QPU_WAIT
-+    {
-+        const int64_t t_now = ns_time();
-+        gpu_env_t * const env = gpu_lock();
-+        tto_start(&env->ttw.wait, t_now);
-+        gpu_unlock();
-+    }
-+#endif
-+
-+    for (;;)
-+    {
-+        if (sem_wait(&wait->sem) != -1)
-+            break;
-+        if (errno != EINTR)
-+            break;
-+    }
-+}
-+
-+// Post the semaphore inside wait, releasing a thread blocked in
-+// vq_wait_wait(). With wait-time tracing compiled in, the "active" interval
-+// is closed first (under the GPU lock).
-+static void vq_wait_post(vq_wait_t * const wait)
-+{
-+#if RPI_TRACE_TIME_VPU_QPU_WAIT
-+ {
-+ gpu_env_t *const ge = gpu_lock();
-+ tto_end(&ge->ttw.active, ns_time());
-+ gpu_unlock();
-+ }
-+#endif
-+
-+ sem_post(&wait->sem);
-+}
-+
-+
-+
-+// Header comments were wrong for these two
-+#define VPU_QPU_MASK_QPU 1
-+#define VPU_QPU_MASK_VPU 2
-+
-+typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t;
-+
-+// Prepare caller-supplied storage for use as a job environment: the job
-+// count and the pending-queue mask are reset; the job array itself is left
-+// untouched. Returns buf.
-+vpu_qpu_job_env_t * vpu_qpu_job_init(vpu_qpu_job_env_t * const buf)
-+{
-+    buf->mask = 0;
-+    buf->n = 0;
-+    return buf;
-+}
-+
-+// Counterpart of vpu_qpu_job_init(). Currently a no-op: the environment lives
-+// in caller-supplied storage, so there is nothing to release here.
-+void vpu_qpu_job_delete(vpu_qpu_job_env_t * const vqj)
-+{
-+// memset(vqj, 0, sizeof(*vqj));
-+// free(vqj);
-+}
-+
-+// Claim the next free slot in the job array and bump the job count.
-+// The count is asserted (av_assert1) not to exceed VPU_QPU_JOB_MAX.
-+static inline struct gpu_job_s * new_job(vpu_qpu_job_env_t * const vqj)
-+{
-+    const unsigned int slot = vqj->n;
-+
-+    vqj->n = slot + 1;
-+    av_assert1(vqj->n <= VPU_QPU_JOB_MAX);
-+    return &vqj->j[slot];
-+}
-+
-+// Append an EXECUTE_VPU job that runs vpu_code with arguments r0..r5.
-+// Does nothing when vpu_code is 0. Marks the VPU queue as pending in vqj->mask.
-+// NOTE(review): reads and writes the file-level gpu->vpu_i_cache_flushed flag
-+// without taking the GPU lock in this function - confirm that is intended.
-+void vpu_qpu_job_add_vpu(vpu_qpu_job_env_t * const vqj, const uint32_t vpu_code,
-+ const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5)
-+{
-+ if (vpu_code != 0) {
-+ struct gpu_job_s *const j = new_job(vqj);
-+ vqj->mask |= VPU_QPU_MASK_VPU;
-+
-+ j->command = EXECUTE_VPU;
-+ j->callback.func = 0;
-+ j->callback.cookie = NULL;
-+ // The bottom two bits of the execute address contain no-flush flags
-+ // b0 will flush the VPU I-cache if unset so we nearly always want that set
-+ // as we never reload code
-+ j->u.v.q[0] = vpu_code | gpu->vpu_i_cache_flushed;
-+ j->u.v.q[1] = r0;
-+ j->u.v.q[2] = r1;
-+ j->u.v.q[3] = r2;
-+ j->u.v.q[4] = r3;
-+ j->u.v.q[5] = r4;
-+ j->u.v.q[6] = r5;
-+ // Every job after this one is submitted with bit 0 set
-+ gpu->vpu_i_cache_flushed = 1;
-+ }
-+}
-+
-+// Append an EXECUTE_QPU job made of n mail entries, each QPU_MAIL_EL_VALS
-+// uint32_t values long, copied from mail. Does nothing when n is 0.
-+// The QPU_FLAGS_xxx no-flush/profiling flags are chosen here at compile time,
-+// not passed in by the caller. The job timeout field is set to 5000.
-+// NOTE(review): n is not range-checked before the memcpy into
-+// j->u.q.control - confirm callers keep it within the array size.
-+void vpu_qpu_job_add_qpu(vpu_qpu_job_env_t * const vqj, const unsigned int n, const uint32_t * const mail)
-+{
-+ if (n != 0) {
-+ struct gpu_job_s *const j = new_job(vqj);
-+ vqj->mask |= VPU_QPU_MASK_QPU;
-+
-+ j->command = EXECUTE_QPU;
-+ j->callback.func = 0;
-+ j->callback.cookie = NULL;
-+
-+ j->u.q.jobs = n;
-+#if RPI_TRACE_QPU_PROFILE_ALL
-+ j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU | QPU_FLAGS_PROF_CLEAR_AND_ENABLE | QPU_FLAGS_PROF_OUTPUT_COUNTS;
-+#else
-+ j->u.q.noflush = QPU_FLAGS_NO_FLUSH_VPU;
-+#endif
-+ j->u.q.timeout = 5000;
-+ memcpy(j->u.q.control, mail, n * QPU_MAIL_EL_VALS * sizeof(uint32_t));
-+ }
-+}
-+
-+// Job-completion callback: the cookie is a wait object, which is posted
-+static void vpu_qpu_job_callback_wait(void * v)
-+{
-+    vq_wait_t * const wait = v;
-+    vq_wait_post(wait);
-+}
-+
-+// Job-completion callback: the cookie is a caller-owned semaphore, which is posted
-+static void vpu_qpu_job_callback_sem(void * v)
-+{
-+    sem_t * const user_sem = v;
-+    sem_post(user_sem);
-+}
-+
-+// Arrange for func(cookie) to be called once everything queued so far on vqj
-+// has completed, then clear the pending-queue mask.
-+// There are 2 VPU Qs & 1 QPU Q so the sync can be collapsed onto the last job
-+// if we only posted one thing or only QPU jobs; otherwise an explicit
-+// EXECUTE_SYNC job covering the pending queues is appended.
-+// Must only be called with vqj->mask != 0.
-+static void vpu_qpu_job_set_sync(vpu_qpu_job_env_t * const vqj, void (* const func)(void *), void * const cookie)
-+{
-+    struct gpu_job_s * j;
-+
-+    if (vqj->n == 1 || vqj->mask == VPU_QPU_MASK_QPU)
-+    {
-+        // Reuse the last job - it must not already carry a callback
-+        j = vqj->j + (vqj->n - 1);
-+        av_assert1(j->callback.func == 0);
-+    }
-+    else
-+    {
-+        j = new_job(vqj);
-+        j->command = EXECUTE_SYNC;
-+        j->u.s.mask = vqj->mask;
-+    }
-+
-+    j->callback.func = func;
-+    j->callback.cookie = cookie;
-+    vqj->mask = 0;
-+}
-+
-+// Add a sync point that can later be waited on with vpu_qpu_wait().
-+// *wait_h receives the wait handle, or NULL if nothing is pending on vqj
-+// (in which case no sync is added).
-+void vpu_qpu_job_add_sync_this(vpu_qpu_job_env_t * const vqj, vpu_qpu_wait_h * const wait_h)
-+{
-+    vq_wait_t * wait;
-+
-+    if (vqj->mask == 0) {
-+        *wait_h = NULL;
-+        return;
-+    }
-+
-+    // We are going to want a sync object
-+    wait = vq_wait_new();
-+    vpu_qpu_job_set_sync(vqj, vpu_qpu_job_callback_wait, wait);
-+    *wait_h = wait;
-+}
-+
-+// Add a sync point that posts the caller-supplied semaphore on completion.
-+// Returns 0 if no sync added ('cos Q empty), 1 if sync added
-+int vpu_qpu_job_add_sync_sem(vpu_qpu_job_env_t * const vqj, sem_t * const sem)
-+{
-+    // If nothing on q then just return
-+    if (vqj->mask == 0)
-+        return 0;
-+
-+    vpu_qpu_job_set_sync(vqj, vpu_qpu_job_callback_sem, sem);
-+    return 1;
-+}
-+
-+
-+// Submit all queued jobs with vc_gpuserv_execute_code() and return its
-+// result. An empty queue is not submitted and yields 0.
-+int vpu_qpu_job_start(vpu_qpu_job_env_t * const vqj)
-+{
-+    return vqj->n != 0 ? vc_gpuserv_execute_code(vqj->n, vqj->j) : 0;
-+}
-+
-+// Convenience wrapper: start the queued jobs, then delete the environment.
-+// Returns whatever vpu_qpu_job_start() returned.
-+int vpu_qpu_job_finish(vpu_qpu_job_env_t * const vqj)
-+{
-+    const int start_rv = vpu_qpu_job_start(vqj);
-+
-+    vpu_qpu_job_delete(vqj);
-+    return start_rv;
-+}
-+
-+// Wait for the sync point referenced by *wait_h, then recycle the wait
-+// object. *wait_h is nulled before waiting. A NULL wait_h or a NULL *wait_h
-+// is accepted and makes this a no-op.
-+void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h)
-+{
-+    vq_wait_t * pending;
-+
-+    if (wait_h == NULL)
-+        return;
-+
-+    pending = *wait_h;
-+    if (pending == NULL)
-+        return;
-+
-+    *wait_h = NULL;
-+    vq_wait_wait(pending);
-+    vq_wait_delete(pending);
-+}
-+
-+// Take a reference on the GPU environment and, on the first call only,
-+// call vc_gpuserv_init(). Each successful call should be balanced by a call
-+// to vpu_qpu_term().
-+// Returns 0 on success, -1 if gpu_lock_ref() yields no environment.
-+int vpu_qpu_init(void)
-+{
-+    gpu_env_t * const ge = gpu_lock_ref();
-+    if (ge == NULL)
-+        return -1;
-+
-+    // Only the first initialiser brings the service up
-+    if (ge->init_count++ == 0)
-+    {
-+        vc_gpuserv_init();
-+    }
-+
-+    gpu_unlock();
-+    return 0;
-+}
-+
-+// Undo one vpu_qpu_init(): when the init count drops to zero
-+// vc_gpuserv_deinit() is called (and the wait statistics are printed if
-+// tracing is compiled in). The environment reference is dropped in all cases.
-+void vpu_qpu_term(void)
-+{
-+    gpu_env_t * const ge = gpu_lock();
-+
-+    if (--ge->init_count == 0) {
-+        vc_gpuserv_deinit();
-+
-+#if RPI_TRACE_TIME_VPU_QPU_WAIT
-+        ttw_print(&ge->ttw, ns_time());
-+#endif
-+    }
-+
-+    gpu_unlock_unref(ge);
-+}
-+
-+// Translate the address of a shader entry point inside ff_hevc_rpi_shader
-+// into a GPU-side value: the byte offset of mc_fn from the start of the
-+// shader array is added to gpu->qpu_code_gm_ptr.vc.
-+uint32_t qpu_fn(const int * const mc_fn)
-+{
-+    const char * const shader_base = (const char *)ff_hevc_rpi_shader;
-+    const char * const fn_addr = (const char *)mc_fn;
-+
-+    return gpu->qpu_code_gm_ptr.vc + (fn_addr - shader_base);
-+}
-+
-+// Return the vc field of the file-level gpu->dummy_gm_ptr block.
-+uint32_t qpu_dummy(void)
-+{
-+ return gpu->dummy_gm_ptr.vc;
-+}
-+
-+// Fill qf with the QPU entry points for the given bit depth.
-+// Supported depths are 8 and 10. For any other depth -1 is returned and qf
-+// holds only the dummy values; 0 is returned on success.
-+int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth)
-+{
-+    // Dummy values we can catch with emulation
-+    qf->y_pxx = ~1U;
-+    qf->y_bxx = ~2U;
-+    qf->y_p00 = ~3U;
-+    qf->y_b00 = ~4U;
-+    qf->c_pxx = ~5U;
-+    qf->c_bxx = ~6U;
-+    // Previously left untouched (so possibly uninitialised) on the failure path
-+    qf->c_pxx_l1 = ~7U;
-+
-+    switch (bit_depth) {
-+    case 8:
-+        qf->y_pxx = qpu_fn(mc_filter_y_pxx);
-+        qf->y_bxx = qpu_fn(mc_filter_y_bxx);
-+        qf->y_p00 = qpu_fn(mc_filter_y_p00);
-+        qf->y_b00 = qpu_fn(mc_filter_y_b00);
-+        qf->c_pxx = qpu_fn(mc_filter_c_p);
-+        qf->c_pxx_l1 = qpu_fn(mc_filter_c_p_l1);
-+        qf->c_bxx = qpu_fn(mc_filter_c_b);
-+        break;
-+    case 10:
-+        qf->c_pxx = qpu_fn(mc_filter_c10_p);
-+        qf->c_pxx_l1 = qpu_fn(mc_filter_c10_p_l1);
-+        qf->c_bxx = qpu_fn(mc_filter_c10_b);
-+        qf->y_pxx = qpu_fn(mc_filter_y10_pxx);
-+        qf->y_bxx = qpu_fn(mc_filter_y10_bxx);
-+        qf->y_p00 = qpu_fn(mc_filter_y10_p00);
-+        qf->y_b00 = qpu_fn(mc_filter_y10_b00);
-+        break;
-+    default:
-+        return -1;
-+    }
-+    return 0;
-+}
-+
-diff --git a/libavcodec/rpi_qpu.h b/libavcodec/rpi_qpu.h
-new file mode 100644
-index 0000000000..e1b4d9c39e
---- /dev/null
-+++ b/libavcodec/rpi_qpu.h
-@@ -0,0 +1,229 @@
-+#ifndef RPI_QPU_H
-+#define RPI_QPU_H
-+
-+#pragma GCC diagnostic push
-+// Many many redundant decls in the header files
-+#pragma GCC diagnostic ignored "-Wredundant-decls"
-+#pragma GCC diagnostic ignored "-Wstrict-prototypes"
-+#include "interface/vmcs_host/vc_vchi_gpuserv.h"
-+#pragma GCC diagnostic pop
-+
-+
-+#define RPI_ONE_BUF 1
-+
-+// Descriptor for one block of GPU memory, carrying both the ARM-side mapping
-+// and the handles/address used on the GPU side.
-+typedef struct gpu_mem_ptr_s {
-+ unsigned char *arm; // Pointer to memory mapped on ARM side
-+ int vc_handle; // Videocore handle of relocatable memory
-+ int vcsm_handle; // Handle for use by VCSM
-+ int vc; // Address for use in GPU code
-+ int numbytes; // Size of memory block
-+} GPU_MEM_PTR_T;
-+
-+// General GPU functions
-+extern int gpu_malloc_cached(int numbytes, GPU_MEM_PTR_T *p);
-+extern int gpu_malloc_uncached(int numbytes, GPU_MEM_PTR_T *p);
-+extern void gpu_free(GPU_MEM_PTR_T * const p);
-+
-+#include "libavutil/frame.h"
-+#if !RPI_ONE_BUF
-+// Variant built only when RPI_ONE_BUF is 0: returns the vc field of the
-+// GPU_MEM_PTR_T obtained from frame->buf[0] via av_buffer_pool_opaque().
-+static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
-+ GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[0]);
-+ return p->vc;
-+}
-+
-+// Variant built only when RPI_ONE_BUF is 0: returns the vc field of the
-+// GPU_MEM_PTR_T obtained from frame->buf[1] via av_buffer_pool_opaque().
-+static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
-+ GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[1]);
-+ return p->vc;
-+}
-+
-+// Variant built only when RPI_ONE_BUF is 0: returns the vc field of the
-+// GPU_MEM_PTR_T obtained from frame->buf[2] via av_buffer_pool_opaque().
-+static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
-+ GPU_MEM_PTR_T *p = av_buffer_pool_opaque(frame->buf[2]);
-+ return p->vc;
-+}
-+
-+// Variant built only when RPI_ONE_BUF is 0: returns a copy of the
-+// GPU_MEM_PTR_T attached to frame->buf[0].
-+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
-+ return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[0]);
-+}
-+
-+// Variant built only when RPI_ONE_BUF is 0: returns a copy of the
-+// GPU_MEM_PTR_T attached to frame->buf[1].
-+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
-+ return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[1]);
-+}
-+
-+// Variant built only when RPI_ONE_BUF is 0: returns a copy of the
-+// GPU_MEM_PTR_T attached to frame->buf[2].
-+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
-+ return *(GPU_MEM_PTR_T *)av_buffer_pool_opaque(frame->buf[2]);
-+}
-+
-+#else
-+
-+// Non-zero when the frame has no second buffer reference (buf[1] is NULL),
-+// i.e. all of its planes are carried by buf[0].
-+static inline int gpu_is_buf1(const AVFrame * const frame)
-+{
-+    return !frame->buf[1];
-+}
-+
-+// GPU memory descriptor for a single-buffer frame: the opaque pointer stored
-+// with frame->buf[0], as returned by av_buffer_get_opaque().
-+static inline GPU_MEM_PTR_T * gpu_buf1_gmem(const AVFrame * const frame)
-+{
-+ return av_buffer_get_opaque(frame->buf[0]);
-+}
-+
-+// GPU memory descriptor for plane n of a multi-buffer frame, obtained from
-+// frame->buf[n] with av_buffer_pool_opaque().
-+static inline GPU_MEM_PTR_T * gpu_buf3_gmem(const AVFrame * const frame, const unsigned int n)
-+{
-+ return av_buffer_pool_opaque(frame->buf[n]);
-+}
-+
-+// GPU-side address of plane n of the frame: the vc base of the owning
-+// memory block plus the offset of frame->data[n] from the block's ARM mapping.
-+static inline uint32_t get_vc_address3(const AVFrame * const frame, const unsigned int n)
-+{
-+    const GPU_MEM_PTR_T * gm;
-+
-+    if (gpu_is_buf1(frame))
-+        gm = gpu_buf1_gmem(frame);
-+    else
-+        gm = gpu_buf3_gmem(frame, n);
-+
-+    return gm->vc + (frame->data[n] - gm->arm);
-+}
-+
-+
-+// GPU-side address of plane 0 of the frame (see get_vc_address3)
-+static inline uint32_t get_vc_address_y(const AVFrame * const frame) {
-+ return get_vc_address3(frame, 0);
-+}
-+
-+// GPU-side address of plane 1 of the frame (see get_vc_address3)
-+static inline uint32_t get_vc_address_u(const AVFrame * const frame) {
-+ return get_vc_address3(frame, 1);
-+}
-+
-+// GPU-side address of plane 2 of the frame (see get_vc_address3)
-+static inline uint32_t get_vc_address_v(const AVFrame * const frame) {
-+ return get_vc_address3(frame, 2);
-+}
-+
-+#if 0
-+// Disabled (#if 0). Returns a descriptor for plane 0: for a single-buffer
-+// frame it is a copy of the whole-buffer descriptor with numbytes narrowed to
-+// the distance between data[0] and data[1]; otherwise the per-plane descriptor.
-+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_y(const AVFrame * const frame) {
-+ if (gpu_is_buf1(frame))
-+ {
-+ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
-+ g.numbytes = frame->data[1] - frame->data[0];
-+ return g;
-+ }
-+ else
-+ return *gpu_buf3_gmem(frame, 0);
-+}
-+
-+// Disabled (#if 0). Returns a descriptor for plane 1: for a single-buffer
-+// frame the whole-buffer descriptor is copied and its arm/vc bases advanced
-+// by the offset of data[1] from data[0]; otherwise the per-plane descriptor.
-+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_u(const AVFrame * const frame) {
-+ if (gpu_is_buf1(frame))
-+ {
-+ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
-+ g.arm += frame->data[1] - frame->data[0];
-+ g.vc += frame->data[1] - frame->data[0];
-+ g.numbytes = frame->data[2] - frame->data[1]; // chroma size
-+ return g;
-+ }
-+ else
-+ return *gpu_buf3_gmem(frame, 1);
-+}
-+
-+// Disabled (#if 0). Returns a descriptor for plane 2: for a single-buffer
-+// frame the whole-buffer descriptor is copied and its arm/vc bases advanced
-+// by the offset of data[2] from data[0]; otherwise the per-plane descriptor.
-+static inline GPU_MEM_PTR_T get_gpu_mem_ptr_v(const AVFrame * const frame) {
-+ if (gpu_is_buf1(frame))
-+ {
-+ GPU_MEM_PTR_T g = *gpu_buf1_gmem(frame);
-+ g.arm += frame->data[2] - frame->data[0];
-+ g.vc += frame->data[2] - frame->data[0];
-+ g.numbytes = frame->data[2] - frame->data[1]; // chroma size
-+ return g;
-+ }
-+ else
-+ return *gpu_buf3_gmem(frame, 2);
-+}
-+#endif
-+#endif
-+
-+// Cache flush stuff
-+
-+struct rpi_cache_flush_env_s;
-+typedef struct rpi_cache_flush_env_s rpi_cache_flush_env_t;
-+
-+// Caller-provided storage (33 x uint32_t) handed to rpi_cache_flush_init().
-+// NOTE(review): presumably sized to hold a rpi_cache_flush_env_t - confirm
-+// against the definition in rpi_qpu.c.
-+typedef struct {uint32_t t[33];} rpi_cache_buf_t;
-+
-+rpi_cache_flush_env_t * rpi_cache_flush_init(rpi_cache_buf_t * const buf);
-+// Free env without flushing
-+void rpi_cache_flush_abort(rpi_cache_flush_env_t * const rfe);
-+// Do the accumulated flush & clear but do not free the env
-+int rpi_cache_flush_execute(rpi_cache_flush_env_t * const rfe);
-+// Do the accumulated flush & free the env
-+int rpi_cache_flush_finish(rpi_cache_flush_env_t * const rfe);
-+
-+// Cache maintenance operation requested for a region.
-+// The values form a bit set: WB_INVALIDATE (3) == INVALIDATE (1) | WRITEBACK (2).
-+typedef enum
-+{
-+ RPI_CACHE_FLUSH_MODE_INVALIDATE = 1,
-+ RPI_CACHE_FLUSH_MODE_WRITEBACK = 2,
-+ RPI_CACHE_FLUSH_MODE_WB_INVALIDATE = 3
-+} rpi_cache_flush_mode_t;
-+
-+void rpi_cache_flush_add_gm_ptr(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode);
-+void rpi_cache_flush_add_gm_range(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const rpi_cache_flush_mode_t mode,
-+ const unsigned int offset, const unsigned int size);
-+void rpi_cache_flush_add_gm_blocks(rpi_cache_flush_env_t * const rfe, const GPU_MEM_PTR_T * const gm, const unsigned int mode,
-+ const unsigned int offset0, const unsigned int block_size, const unsigned int blocks, const unsigned int block_stride);
-+void rpi_cache_flush_add_frame(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode);
-+void rpi_cache_flush_add_frame_block(rpi_cache_flush_env_t * const rfe, const AVFrame * const frame, const rpi_cache_flush_mode_t mode,
-+ const unsigned int x0, const unsigned int y0, const unsigned int width, const unsigned int height,
-+ const unsigned int uv_shift, const int do_luma, const int do_chroma);
-+
-+// init, add, finish for one gm ptr
-+void rpi_cache_flush_one_gm_ptr(const GPU_MEM_PTR_T * const p, const rpi_cache_flush_mode_t mode);
-+
-+
-+// QPU specific functions
-+
-+// Table of QPU shader entry points for one bit depth, filled in by
-+// rpi_hevc_qpu_init_fn() with values produced by qpu_fn().
-+// c_* entries are the chroma filters, y_* entries the luma filters.
-+typedef struct HEVCRpiQpu {
-+ uint32_t c_pxx;
-+ uint32_t c_pxx_l1;
-+ uint32_t c_bxx;
-+ uint32_t y_pxx;
-+ uint32_t y_bxx;
-+ uint32_t y_p00;
-+ uint32_t y_b00;
-+} HEVCRpiQpu;
-+
-+int rpi_hevc_qpu_init_fn(HEVCRpiQpu * const qf, const unsigned int bit_depth);
-+
-+uint32_t qpu_fn(const int * const mc_fn);
-+uint32_t qpu_dummy(void);
-+
-+#define QPU_N_GRP 4
-+#define QPU_N_MAX 12
-+
-+#define QPU_MAIL_EL_VALS 2
-+
-+// Opaque handle for a pending sync point; pass its address to vpu_qpu_wait().
-+// NOTE(review): the forward declaration names struct vpu_qpu_wait_s but the
-+// handle typedef refers to struct vq_wait_s - confirm which tag is intended.
-+struct vpu_qpu_wait_s;
-+typedef struct vq_wait_s * vpu_qpu_wait_h;
-+
-+// VPU specific functions
-+
-+struct vpu_qpu_job_env_s;
-+typedef struct vpu_qpu_job_env_s * vpu_qpu_job_h;
-+
-+// Maximum number of jobs (including any sync job) in one environment
-+#define VPU_QPU_JOB_MAX 4
-+// A small batch of VPU/QPU jobs that is built up and then submitted together
-+struct vpu_qpu_job_env_s
-+{
-+ unsigned int n; // Number of entries of j[] in use
-+ unsigned int mask; // VPU_QPU_MASK_xxx bits for queues with jobs added since the last sync
-+ struct gpu_job_s j[VPU_QPU_JOB_MAX];
-+};
-+typedef struct vpu_qpu_job_env_s vpu_qpu_job_env_t;
-+
-+vpu_qpu_job_h vpu_qpu_job_init(vpu_qpu_job_env_t * const buf);
-+void vpu_qpu_job_delete(const vpu_qpu_job_h vqj);
-+void vpu_qpu_job_add_vpu(const vpu_qpu_job_h vqj, const uint32_t vpu_code,
-+ const unsigned r0, const unsigned r1, const unsigned r2, const unsigned r3, const unsigned r4, const unsigned r5);
-+void vpu_qpu_job_add_qpu(const vpu_qpu_job_h vqj, const unsigned int n, const uint32_t * const mail);
-+void vpu_qpu_job_add_sync_this(const vpu_qpu_job_h vqj, vpu_qpu_wait_h * const wait_h);
-+int vpu_qpu_job_add_sync_sem(vpu_qpu_job_env_t * const vqj, sem_t * const sem);
-+int vpu_qpu_job_start(const vpu_qpu_job_h vqj);
-+int vpu_qpu_job_finish(const vpu_qpu_job_h vqj);
-+
-+extern unsigned int vpu_get_fn(const unsigned int bit_depth);
-+extern unsigned int vpu_get_constants(void);
-+
-+// Waits for the jobs covered by *wait_h to complete and nulls out *wait_h after use
-+void vpu_qpu_wait(vpu_qpu_wait_h * const wait_h);
-+int vpu_qpu_init(void);
-+void vpu_qpu_term(void);
-+
-+extern int gpu_get_mailbox(void);
-+void gpu_ref(void);
-+void gpu_unref(void);
-+
-+#endif
-diff --git a/libavcodec/rpi_zc.c b/libavcodec/rpi_zc.c
-new file mode 100644
-index 0000000000..185288da5a
---- /dev/null
-+++ b/libavcodec/rpi_zc.c
-@@ -0,0 +1,741 @@
-+#include "libavcodec/avcodec.h"
-+#include "rpi_qpu.h"
-+#include "rpi_mailbox.h"
-+#include "rpi_zc.h"
-+#include "libavutil/avassert.h"
-+#include "libavutil/rpi_sand_fns.h"
-+#include <pthread.h>
-+
-+#include "libavutil/buffer_internal.h"
-+#include <interface/vctypes/vc_image_types.h>
-+
-+#define TRACE_ALLOC 0
-+
-+struct ZcPoolEnt;
-+
-+// Free-list pool of same-sized GPU buffers
-+typedef struct ZcPool
-+{
-+ int numbytes; // Size of the buffers currently pooled; -1 after init/flush
-+ unsigned int n; // Running count of entries allocated, used to number them
-+ struct ZcPoolEnt * head; // Singly linked list of free entries
-+ pthread_mutex_t lock; // Taken around list and numbytes updates in alloc/free
-+} ZcPool;
-+
-+// One pooled GPU buffer
-+typedef struct ZcPoolEnt
-+{
-+ // It is important that we start with gmem as other bits of code will expect to see that
-+ GPU_MEM_PTR_T gmem;
-+ unsigned int n; // Sequence number taken from pool->n at allocation
-+ struct ZcPoolEnt * next; // Next free entry while on the pool's free list
-+ struct ZcPool * pool; // Owning pool, used when the entry is released
-+} ZcPoolEnt;
-+
-+#define ALLOC_PAD 0
-+#define ALLOC_ROUND 0x1000
-+#define ALLOC_N_OFFSET 0
-+#define STRIDE_ROUND 64
-+#define STRIDE_OR 0
-+
-+#define DEBUG_ZAP0_BUFFERS 0
-+
-+
-+// Allocate a pool entry together with its cached GPU memory.
-+// req_size is padded by ALLOC_PAD and rounded up to a multiple of ALLOC_ROUND.
-+// On success pool->numbytes is set to the size actually allocated and the
-+// entry is given the next sequence number from pool->n.
-+// Returns NULL (after logging) if either allocation fails.
-+static ZcPoolEnt * zc_pool_ent_alloc(ZcPool * const pool, const unsigned int req_size)
-+{
-+    // Add the pad then round up to the allocation granule
-+    const unsigned int alloc_size = (req_size + ALLOC_PAD + ALLOC_ROUND - 1) & ~(ALLOC_ROUND - 1);
-+    ZcPoolEnt * const zp = av_malloc(sizeof(*zp));
-+
-+    if (zp == NULL) {
-+        av_log(NULL, AV_LOG_ERROR, "av_malloc(ZcPoolEnt) failed\n");
-+        return NULL;
-+    }
-+
-+    if (gpu_malloc_cached(alloc_size, &zp->gmem) != 0)
-+    {
-+        // alloc_size is unsigned, so print it with %u
-+        av_log(NULL, AV_LOG_ERROR, "gpu_malloc_cached(%u) failed\n", alloc_size);
-+        av_free(zp);
-+        return NULL;
-+    }
-+
-+#if TRACE_ALLOC
-+    printf("%s: Alloc %#x bytes @ %p\n", __func__, zp->gmem.numbytes, zp->gmem.arm);
-+#endif
-+
-+    pool->numbytes = zp->gmem.numbytes;
-+    zp->next = NULL;
-+    zp->pool = pool;
-+    zp->n = pool->n++;
-+    return zp;
-+}
-+
-+// Release a pool entry: its GPU memory is freed with gpu_free() and then the
-+// entry structure itself with av_free(). zp must not be NULL.
-+static void zc_pool_ent_free(ZcPoolEnt * const zp)
-+{
-+#if TRACE_ALLOC
-+ printf("%s: Free %#x bytes @ %p\n", __func__, zp->gmem.numbytes, zp->gmem.arm);
-+#endif
-+
-+ gpu_free(&zp->gmem);
-+ av_free(zp);
-+}
-+
-+// Empty the pool: detach the free list, mark the pool size as unknown (-1)
-+// and free every entry that was on the list. Does not take the pool lock.
-+static void zc_pool_flush(ZcPool * const pool)
-+{
-+    ZcPoolEnt * ent = pool->head;
-+
-+    pool->numbytes = -1;
-+    pool->head = NULL;
-+
-+    while (ent != NULL)
-+    {
-+        // Fetch the link before the entry is freed
-+        ZcPoolEnt * const following = ent->next;
-+        zc_pool_ent_free(ent);
-+        ent = following;
-+    }
-+}
-+
-+// Get an entry able to hold req_bytes, reusing a pooled one when possible.
-+// The pool is flushed first if its current buffer size is too small for the
-+// request or more than 128k larger than it. Returns NULL if a fresh
-+// allocation was needed and failed.
-+static ZcPoolEnt * zc_pool_alloc(ZcPool * const pool, const int req_bytes)
-+{
-+    ZcPoolEnt * ent;
-+    int want_bytes;
-+
-+    pthread_mutex_lock(&pool->lock);
-+
-+    want_bytes = pool->numbytes;
-+
-+    // Keep the pool only if the request fits and is within 128k of its size
-+    if (!(req_bytes <= want_bytes && req_bytes + 0x20000 >= want_bytes))
-+    {
-+        zc_pool_flush(pool);
-+        want_bytes = req_bytes;
-+    }
-+
-+    ent = pool->head;
-+    if (ent == NULL)
-+        ent = zc_pool_ent_alloc(pool, want_bytes);
-+    else
-+        pool->head = ent->next;
-+
-+    pthread_mutex_unlock(&pool->lock);
-+
-+    return ent;
-+}
-+
-+// Give an entry back. If its size still matches the pool's current size it
-+// is pushed onto the free list, otherwise it is freed outright (outside the
-+// lock). A NULL zp is ignored.
-+static void zc_pool_free(ZcPoolEnt * const zp)
-+{
-+    ZcPool * pool;
-+    int recycle;
-+
-+    if (zp == NULL)
-+        return;
-+
-+    pool = zp->pool;
-+
-+    pthread_mutex_lock(&pool->lock);
-+#if TRACE_ALLOC
-+    printf("%s: Recycle %#x, %#x\n", __func__, pool->numbytes, zp->gmem.numbytes);
-+#endif
-+
-+    recycle = (pool->numbytes == zp->gmem.numbytes);
-+    if (recycle)
-+    {
-+        zp->next = pool->head;
-+        pool->head = zp;
-+    }
-+    pthread_mutex_unlock(&pool->lock);
-+
-+    if (!recycle)
-+        zc_pool_ent_free(zp);
-+}
-+
-+// Put a pool into its empty state: no known buffer size, no free entries,
-+// entry numbering restarted, and a freshly initialised lock.
-+static void
-+zc_pool_init(ZcPool * const pool)
-+{
-+    pool->numbytes = -1;
-+    // Do not rely on the caller having zeroed the structure
-+    pool->n = 0;
-+    pool->head = NULL;
-+    pthread_mutex_init(&pool->lock, NULL);
-+}
-+
-+// Tear a pool down: free everything on the free list, leave the size marked
-+// as unknown and destroy the lock.
-+static void
-+zc_pool_destroy(ZcPool * const pool)
-+{
-+    zc_pool_flush(pool);
-+    pool->numbytes = -1;
-+    pthread_mutex_destroy(&pool->lock);
-+}
-+
-+// Copies of AVCodecContext fields with the same names.
-+// NOTE(review): presumably saved when zero-copy is installed so they can be
-+// restored later - the code using them is outside this chunk; confirm.
-+typedef struct ZcOldCtxVals
-+{
-+ int thread_safe_callbacks;
-+ int (*get_buffer2)(struct AVCodecContext *s, AVFrame *frame, int flags);
-+ void * get_buffer_context;
-+} ZcOldCtxVals;
-+
-+// Zero-copy environment: the buffer pool plus saved codec-context values.
-+// Allocated zeroed by av_rpi_zc_env_alloc() and released by av_rpi_zc_env_free().
-+typedef struct AVZcEnv
-+{
-+ unsigned int refcount;
-+ ZcPool pool;
-+ ZcOldCtxVals old;
-+} ZcEnv;
-+
-+// AVBuffer free callback, run when the last reference goes away: the opaque
-+// pointer is the pool entry backing the buffer, which is handed back to its
-+// pool. The data pointer is not used.
-+static void rpi_free_display_buffer(void *opaque, uint8_t *data)
-+{
-+    zc_pool_free((ZcPoolEnt *)opaque);
-+}
-+
-+// Return the GPU memory descriptor behind buf, or NULL if buf is NULL or is
-+// not one of our buffers. Ownership is recognised by comparing the buffer's
-+// free function with rpi_free_display_buffer - a kludge, but there is no
-+// better test available.
-+static inline GPU_MEM_PTR_T * pic_gm_ptr(AVBufferRef * const buf)
-+{
-+    if (buf == NULL)
-+        return NULL;
-+
-+    if (buf->buffer->free != rpi_free_display_buffer)
-+        return NULL;
-+
-+    return av_buffer_get_opaque(buf);
-+}
-+
-+// Work out the buffer layout (strides, padded heights, number of chroma
-+// planes, stripe count, bytes per pel) for a frame of the given pixel format
-+// and size. For the planar formats the layout is computed locally; for the
-+// SAND formats the figures are derived from mbox_get_image_params() and
-+// cached per format group. An unsupported format yields an all-zero geometry.
-+AVRpiZcFrameGeometry av_rpi_zc_frame_geometry(
-+ const int format, const unsigned int video_width, const unsigned int video_height)
-+{
-+ AVRpiZcFrameGeometry geo;
-+
-+ switch (format)
-+ {
-+ case AV_PIX_FMT_YUV420P:
-+ // Width + 32 rounded up to STRIDE_ROUND; height + 32 rounded up to 32
-+ geo.stride_y = ((video_width + 32 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR;
-+ geo.stride_c = geo.stride_y / 2;
-+ geo.height_y = (video_height + 32 + 31) & ~31;
-+ geo.height_c = geo.height_y / 2;
-+ geo.planes_c = 2;
-+ geo.stripes = 1;
-+ geo.bytes_per_pel = 1;
-+ break;
-+
-+ case AV_PIX_FMT_YUV420P10:
-+ // As above but with two bytes per pel, so width and pad are doubled
-+ geo.stride_y = ((video_width * 2 + 64 + STRIDE_ROUND - 1) & ~(STRIDE_ROUND - 1)) | STRIDE_OR;
-+ geo.stride_c = geo.stride_y / 2;
-+ geo.height_y = (video_height + 32 + 31) & ~31;
-+ geo.height_c = geo.height_y / 2;
-+ geo.planes_c = 2;
-+ geo.stripes = 1;
-+ geo.bytes_per_pel = 2;
-+ break;
-+
-+ case AV_PIX_FMT_SAND128:
-+ {
-+ const unsigned int stripe_w = 128;
-+
-+ static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER;
-+ static VC_IMAGE_T img = {0};
-+
-+ // Given the overhead of calling the mailbox keep a stashed
-+ // copy as we will almost certainly just want the same numbers again
-+ // but that means we need a lock
-+ pthread_mutex_lock(&sand_lock);
-+
-+ if (img.width != video_width || img.height != video_height)
-+ {
-+ VC_IMAGE_T new_img = {
-+ .type = VC_IMAGE_YUV_UV,
-+ .width = video_width,
-+ .height = video_height
-+ };
-+
-+ gpu_ref();
-+ mbox_get_image_params(gpu_get_mailbox(), &new_img);
-+ gpu_unref();
-+ img = new_img;
-+ }
-+
-+ // Heights are derived from the plane offset and pitch reported in img
-+ geo.stride_y = stripe_w;
-+ geo.stride_c = stripe_w;
-+ geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w;
-+ geo.height_c = img.pitch / stripe_w - geo.height_y;
-+ geo.planes_c = 1;
-+ geo.stripes = (video_width + stripe_w - 1) / stripe_w;
-+ geo.bytes_per_pel = 1;
-+
-+ pthread_mutex_unlock(&sand_lock);
-+
-+ // Sanity check the figures obtained from the mailbox
-+ av_assert0((int)geo.height_y > 0 && (int)geo.height_c > 0);
-+ av_assert0(geo.height_y >= video_height && geo.height_c >= video_height / 2);
-+ break;
-+ }
-+
-+ case AV_PIX_FMT_SAND64_16:
-+ case AV_PIX_FMT_SAND64_10:
-+ {
-+ const unsigned int stripe_w = 128; // bytes
-+
-+ static pthread_mutex_t sand_lock = PTHREAD_MUTEX_INITIALIZER;
-+ static VC_IMAGE_T img = {0};
-+
-+ // Given the overhead of calling the mailbox keep a stashed
-+ // copy as we will almost certainly just want the same numbers again
-+ // but that means we need a lock
-+ pthread_mutex_lock(&sand_lock);
-+
-+ if (img.width != video_width || img.height != video_height)
-+ {
-+ VC_IMAGE_T new_img = {
-+ .type = VC_IMAGE_YUV_UV_16,
-+ .width = video_width,
-+ .height = video_height
-+ };
-+
-+ gpu_ref();
-+ mbox_get_image_params(gpu_get_mailbox(), &new_img);
-+ gpu_unref();
-+ img = new_img;
-+ }
-+
-+ geo.stride_y = stripe_w;
-+ geo.stride_c = stripe_w;
-+ geo.height_y = ((intptr_t)img.extra.uv.u - (intptr_t)img.image_data) / stripe_w;
-+ geo.height_c = img.pitch / stripe_w - geo.height_y;
-+ geo.planes_c = 1;
-+ // Two bytes per pel, so twice as many stripes for the same width
-+ geo.stripes = (video_width * 2 + stripe_w - 1) / stripe_w;
-+ geo.bytes_per_pel = 2;
-+
-+ pthread_mutex_unlock(&sand_lock);
-+ // NOTE(review): unlike the SAND128 case the heights are not asserted here
-+ break;
-+ }
-+
-+ default:
-+ memset(&geo, 0, sizeof(geo));
-+ break;
-+ }
-+ return geo;
-+}
-+
-+
-+// Get a buffer of at least size bytes from the pool and wrap it in an
-+// AVBufferRef whose free callback returns the entry to the pool.
-+// Returns NULL (after logging) if the pool allocation or the wrapper
-+// creation fails.
-+static AVBufferRef * rpi_buf_pool_alloc(ZcPool * const pool, int size)
-+{
-+    ZcPoolEnt *const zp = zc_pool_alloc(pool, size);
-+    AVBufferRef * buf;
-+    intptr_t idata;
-+#if ALLOC_N_OFFSET != 0
-+    intptr_t noff;
-+#endif
-+
-+    // Check before touching zp: the pool allocation can fail
-+    if (zp == NULL) {
-+        av_log(NULL, AV_LOG_ERROR, "zc_pool_alloc(%d) failed\n", size);
-+        return NULL;
-+    }
-+
-+    idata = (intptr_t)zp->gmem.arm;
-+
-+#if ALLOC_N_OFFSET != 0
-+    noff = (zp->n * ALLOC_N_OFFSET) & (ALLOC_PAD - 1);
-+    idata = ((idata & ~(ALLOC_PAD - 1)) | noff) + (((idata & (ALLOC_PAD - 1)) > noff) ? ALLOC_PAD : 0);
-+#endif
-+
-+#if DEBUG_ZAP0_BUFFERS
-+    memset((void*)idata, 0, size);
-+#endif
-+
-+    if ((buf = av_buffer_create((void *)idata, size, rpi_free_display_buffer, zp, AV_BUFFER_FLAG_READONLY)) == NULL)
-+    {
-+        av_log(NULL, AV_LOG_ERROR, "av_buffer_create() failed\n");
-+        // The wrapper does not own the entry yet, so give it back ourselves
-+        zc_pool_free(zp);
-+        return NULL;
-+    }
-+
-+    return buf;
-+}
-+
-+// Allocate a single pooled buffer big enough for the whole picture described
-+// by frame->format/width/height and point the frame's buf, data, linesize and
-+// extended_data fields at it. Returns 0 on success or AVERROR(ENOMEM) if no
-+// buffer could be obtained.
-+static int rpi_get_display_buffer(ZcEnv *const zc, AVFrame * const frame)
-+{
-+ const AVRpiZcFrameGeometry geo = av_rpi_zc_frame_geometry(frame->format, frame->width, frame->height);
-+ const unsigned int size_y = geo.stride_y * geo.height_y;
-+ const unsigned int size_c = geo.stride_c * geo.height_c;
-+ const unsigned int size_pic = (size_y + size_c * geo.planes_c) * geo.stripes;
-+ AVBufferRef * buf;
-+ unsigned int i;
-+
-+// printf("Do local alloc: format=%#x, %dx%d: %u\n", frame->format, frame->width, frame->height, size_pic);
-+
-+ if ((buf = rpi_buf_pool_alloc(&zc->pool, size_pic)) == NULL)
-+ {
-+ av_log(NULL, AV_LOG_ERROR, "rpi_get_display_buffer: Failed to get buffer from pool\n");
-+ return AVERROR(ENOMEM);
-+ }
-+
-+ // Start from a clean slate - any previous contents are overwritten, not unreferenced
-+ for (i = 0; i < AV_NUM_DATA_POINTERS; i++) {
-+ frame->buf[i] = NULL;
-+ frame->data[i] = NULL;
-+ frame->linesize[i] = 0;
-+ }
-+
-+ frame->buf[0] = buf;
-+
-+ frame->linesize[0] = geo.stride_y;
-+ frame->linesize[1] = geo.stride_c;
-+ frame->linesize[2] = geo.stride_c;
-+ // abuse: linesize[3] = "stripe stride"
-+ // stripe_stride is NOT the stride between slices it is (that / geo.stride_y).
-+ // In a general case this makes the calculation an xor and multiply rather
-+ // than a divide and multiply
-+ if (geo.stripes > 1)
-+ frame->linesize[3] = geo.height_y + geo.height_c;
-+
-+ // Planes are laid out back to back inside the one buffer
-+ frame->data[0] = buf->data;
-+ frame->data[1] = frame->data[0] + size_y;
-+ if (geo.planes_c > 1)
-+ frame->data[2] = frame->data[1] + size_c;
-+
-+ frame->extended_data = frame->data;
-+ // Leave extended buf alone
-+
-+#if RPI_ZC_SAND_8_IN_10_BUF != 0
-+ // *** If we intend to use this for real we will want a 2nd buffer pool
-+ // NOTE(review): the result of this second allocation is not checked
-+ frame->buf[RPI_ZC_SAND_8_IN_10_BUF] = rpi_buf_pool_alloc(&zc->pool, size_pic); // *** 2 * wanted size - kludge
-+#endif
-+
-+ return 0;
-+}
-+
-+#define RPI_GET_BUFFER2 1
-+
-+// get_buffer2 callback: frames in YUV420P or a SAND format requested by a
-+// codec that advertises AV_CODEC_CAP_DR1 are allocated from the zero-copy
-+// pool held in s->get_buffer_context; everything else falls through to
-+// avcodec_default_get_buffer2(). Returns the allocator's result.
-+int av_rpi_zc_get_buffer2(struct AVCodecContext *s, AVFrame *frame, int flags)
-+{
-+#if !RPI_GET_BUFFER2
-+    return avcodec_default_get_buffer2(s, frame, flags);
-+#else
-+    const int codec_has_dr1 = (s->codec->capabilities & AV_CODEC_CAP_DR1) != 0;
-+    int rv;
-+
-+    if (codec_has_dr1 &&
-+        (frame->format == AV_PIX_FMT_YUV420P || av_rpi_is_sand_frame(frame)))
-+    {
-+        rv = rpi_get_display_buffer(s->get_buffer_context, frame);
-+    }
-+    else
-+    {
-+        rv = avcodec_default_get_buffer2(s, frame, flags);
-+    }
-+
-+#if 0
-+    printf("%s: fmt:%d, %dx%d lsize=%d/%d/%d/%d data=%p/%p/%p bref=%p/%p/%p opaque[0]=%p\n", __func__,
-+        frame->format, frame->width, frame->height,
-+        frame->linesize[0], frame->linesize[1], frame->linesize[2], frame->linesize[3],
-+        frame->data[0], frame->data[1], frame->data[2],
-+        frame->buf[0], frame->buf[1], frame->buf[2],
-+        av_buffer_get_opaque(frame->buf[0]));
-+#endif
-+    return rv;
-+#endif
-+}
-+
-+
-+// Copy src into a freshly allocated zero-copy buffer of the same format and
-+// size, and return that buffer (NULL if the allocation fails).
-+// The copy is row by row over three planes with 1 byte per sample and chroma
-+// at half resolution, i.e. an 8-bit 4:2:0 planar layout.
-+// NOTE(review): av_rpi_zc_ref() can also reach this with other formats
-+// (e.g. SAND), for which this plane-by-plane copy does not match the layout -
-+// confirm whether those paths are reachable in practice.
-+static AVBufferRef * zc_copy(struct AVCodecContext * const s,
-+    const AVFrame * const src)
-+{
-+    AVFrame dest_frame;
-+    AVFrame * const dest = &dest_frame;
-+    unsigned int i;
-+    unsigned int width_c, height_c;
-+    uint8_t * psrc, * pdest;
-+
-+    // Start from a fully defined frame, as the sibling converters do
-+    memset(dest, 0, sizeof(*dest));
-+    dest->format = src->format;
-+    dest->width = src->width;
-+    dest->height = src->height;
-+
-+    if (rpi_get_display_buffer(s->get_buffer_context, dest) != 0)
-+    {
-+        return NULL;
-+    }
-+
-+    // Chroma dimensions round up so odd sizes keep their last row/column
-+    width_c = (dest->width + 1) / 2;
-+    height_c = (dest->height + 1) / 2;
-+
-+    for (i = 0, psrc = src->data[0], pdest = dest->data[0];
-+        i != dest->height;
-+        ++i, psrc += src->linesize[0], pdest += dest->linesize[0])
-+    {
-+        memcpy(pdest, psrc, dest->width);
-+    }
-+    for (i = 0, psrc = src->data[1], pdest = dest->data[1];
-+        i != height_c;
-+        ++i, psrc += src->linesize[1], pdest += dest->linesize[1])
-+    {
-+        memcpy(pdest, psrc, width_c);
-+    }
-+    for (i = 0, psrc = src->data[2], pdest = dest->data[2];
-+        i != height_c;
-+        ++i, psrc += src->linesize[2], pdest += dest->linesize[2])
-+    {
-+        memcpy(pdest, psrc, width_c);
-+    }
-+
-+    return dest->buf[0];
-+}
-+
-+
-+// Convert a planar 16-bit-per-sample 4:2:0 frame into a newly allocated
-+// SAND128 buffer, dropping the two low bits of every sample (>> 2).
-+// Luma is written stripe by stripe; chroma is written as interleaved U,V
-+// pairs. Returns the new buffer, or NULL if the allocation fails.
-+static AVBufferRef * zc_420p10_to_sand128(struct AVCodecContext * const s,
-+ const AVFrame * const src)
-+{
-+ AVFrame dest_frame;
-+ AVFrame * const dest = &dest_frame;
-+ unsigned int i;
-+ uint8_t * psrc, * psrc2, * pdest;
-+
-+ memset(dest, 0, sizeof(*dest));
-+ dest->format = AV_PIX_FMT_SAND128;
-+ dest->width = src->width;
-+ dest->height = src->height;
-+
-+ if (rpi_get_display_buffer(s->get_buffer_context, dest) != 0)
-+ {
-+ return NULL;
-+ }
-+
-+ // Y
-+ for (i = 0, psrc = src->data[0], pdest = dest->data[0];
-+ i != dest->height;
-+ ++i, psrc += src->linesize[0], pdest += dest->linesize[0])
-+ {
-+ // NOTE(review): this local s shadows the AVCodecContext parameter s
-+ uint16_t * s = (uint16_t*)psrc;
-+ uint8_t * d = pdest;
-+ // One pass per stripe: linesize[0] is the stripe width
-+ for (unsigned int k = 0; k < dest->width; k += dest->linesize[0])
-+ {
-+ const unsigned int n = FFMIN(dest->linesize[0], dest->width - k);
-+ for (unsigned int j = 0; j != n; ++j)
-+ *d++ = (uint8_t)(*s++ >> 2);
-+ // Jump to the same row in the next stripe (linesize[3] is the stripe stride in rows)
-+ d += (dest->linesize[3] - 1) * dest->linesize[0];
-+ }
-+ }
-+
-+ // C
-+ for (i = 0, psrc = src->data[1], psrc2 = src->data[2], pdest = dest->data[1];
-+ i != dest->height / 2;
-+ ++i, psrc += src->linesize[1], psrc2 += src->linesize[2], pdest += dest->linesize[1])
-+ {
-+ const uint16_t * su = (uint16_t*)psrc;
-+ const uint16_t * sv = (uint16_t*)psrc2;
-+ uint8_t * d = pdest;
-+ for (unsigned int k = 0; k < dest->width; k += dest->linesize[1])
-+ {
-+ // Each U,V pair takes two output bytes, hence half as many pairs as bytes
-+ const unsigned int n = FFMIN(dest->linesize[1], dest->width - k) / 2;
-+ for (unsigned int j = 0; j != n; ++j)
-+ {
-+ *d++ = (uint8_t)(*su++ >> 2);
-+ *d++ = (uint8_t)(*sv++ >> 2);
-+ }
-+ d += (dest->linesize[3] - 1) * dest->linesize[1];
-+ }
-+ }
-+
-+ return dest->buf[0];
-+}
-+
-+
-+// Convert a 16-bit SAND frame holding src_bits significant bits into a newly
-+// allocated 8-bit SAND128 buffer by shifting each sample right by
-+// (src_bits - 8). Returns the new buffer, or NULL if the allocation fails.
-+static AVBufferRef * zc_sand64_16_to_sand128(struct AVCodecContext * const s,
-+ const AVFrame * const src, const unsigned int src_bits)
-+{
-+ AVFrame dest_frame = {
-+ .format = AV_PIX_FMT_SAND128,
-+ .width = src->width,
-+ .height = src->height
-+ };
-+ AVFrame * const dest = &dest_frame;
-+ const unsigned int shr = src_bits - 8;
-+
-+ if (rpi_get_display_buffer(s->get_buffer_context, dest) != 0)
-+ {
-+ return NULL;
-+ }
-+
-+ // NOTE(review): the second stride2 argument in both calls below is taken
-+ // from dest rather than src - confirm against av_rpi_sand16_to_sand8()'s
-+ // parameter list whether the source stride2 was intended.
-+ // Y
-+ av_rpi_sand16_to_sand8(dest->data[0], dest->linesize[0], av_rpi_sand_frame_stride2(dest),
-+ src->data[0], src->linesize[0], av_rpi_sand_frame_stride2(dest),
-+ src->width, src->height, shr);
-+ // C
-+ av_rpi_sand16_to_sand8(dest->data[1], dest->linesize[1], av_rpi_sand_frame_stride2(dest),
-+ src->data[1], src->linesize[1], av_rpi_sand_frame_stride2(dest),
-+ src->width, src->height / 2, shr);
-+
-+ return dest->buf[0];
-+}
-+
-+
-+
-+// Obtain a reference to a zero-copy buffer holding the picture in frame.
-+// If the frame is already a single buffer of ours in expected_format a new
-+// reference to that buffer is returned. Otherwise, when maycopy is non-zero,
-+// the picture is copied/converted into a fresh buffer; when maycopy is zero
-+// NULL is returned. NULL is also returned for unsupported formats.
-+AVRpiZcRefPtr av_rpi_zc_ref(struct AVCodecContext * const s,
-+ const AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy)
-+{
-+ // NOTE(review): plain assert() here while the rest of the file uses av_assert*
-+ assert(s != NULL);
-+
-+ // Only YUV420P, YUV420P10 and the SAND formats can be handled at all
-+ if (frame->format != AV_PIX_FMT_YUV420P &&
-+ frame->format != AV_PIX_FMT_YUV420P10 &&
-+ !av_rpi_is_sand_frame(frame))
-+ {
-+ av_log(s, AV_LOG_WARNING, "%s: *** Format not SAND/YUV420P: %d\n", __func__, frame->format);
-+ return NULL;
-+ }
-+
-+ // Multi-buffer frame or wrong format: needs a copy/conversion
-+ if (frame->buf[1] != NULL || frame->format != expected_format)
-+ {
-+#if RPI_ZC_SAND_8_IN_10_BUF
-+ if (frame->format == AV_PIX_FMT_SAND64_10 && expected_format == AV_PIX_FMT_SAND128 && frame->buf[RPI_ZC_SAND_8_IN_10_BUF] != NULL)
-+ {
-+// av_log(s, AV_LOG_INFO, "%s: --- found buf[4]\n", __func__);
-+ return av_buffer_ref(frame->buf[RPI_ZC_SAND_8_IN_10_BUF]);
-+ }
-+#endif
-+
-+ if (maycopy)
-+ {
-+ if (frame->buf[1] != NULL)
-+ av_log(s, AV_LOG_INFO, "%s: *** Not a single buf frame: copying\n", __func__);
-+ else
-+ av_log(s, AV_LOG_INFO, "%s: *** Unexpected frame format %d: copying to %d\n", __func__, frame->format, expected_format);
-+
-+ // The converter is chosen by source format only
-+ switch (frame->format)
-+ {
-+ case AV_PIX_FMT_YUV420P10:
-+ return zc_420p10_to_sand128(s, frame);
-+
-+ case AV_PIX_FMT_SAND64_10:
-+ return zc_sand64_16_to_sand128(s, frame, 10);
-+
-+ default:
-+ return zc_copy(s, frame);
-+ }
-+ }
-+ else
-+ {
-+ if (frame->buf[1] != NULL)
-+ av_log(s, AV_LOG_WARNING, "%s: *** Not a single buf frame: buf[1] != NULL\n", __func__);
-+ else
-+ av_log(s, AV_LOG_INFO, "%s: *** Unexpected frame format: %d != %d\n", __func__, frame->format, expected_format);
-+ return NULL;
-+ }
-+ }
-+
-+ // Right shape and format, but was it allocated from our pool?
-+ if (pic_gm_ptr(frame->buf[0]) == NULL)
-+ {
-+ if (maycopy)
-+ {
-+ av_log(s, AV_LOG_INFO, "%s: *** Not one of our buffers: copying\n", __func__);
-+ return zc_copy(s, frame);
-+ }
-+ else
-+ {
-+ av_log(s, AV_LOG_WARNING, "%s: *** Not one of our buffers: NULL\n", __func__);
-+ return NULL;
-+ }
-+ }
-+
-+ return av_buffer_ref(frame->buf[0]);
-+}
-+
-+int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref)
-+{
-+ const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref);
-+ return p == NULL ? -1 : p->vc_handle;
-+}
-+
-+int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref)
-+{
-+ const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref);
-+ return p == NULL ? 0 : fr_ref->data - p->arm;
-+}
-+
-+int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref)
-+{
-+ return fr_ref == NULL ? 0 : fr_ref->size;
-+}
-+
-+
-+int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref)
-+{
-+ const GPU_MEM_PTR_T * const p = pic_gm_ptr(fr_ref);
-+ return p == NULL ? 0 : p->numbytes;
-+}
-+
-+void av_rpi_zc_unref(AVRpiZcRefPtr fr_ref)
-+{
-+ if (fr_ref != NULL)
-+ {
-+ av_buffer_unref(&fr_ref);
-+ }
-+}
-+
-+AVZcEnvPtr av_rpi_zc_env_alloc(void)
-+{
-+ ZcEnv * const zc = av_mallocz(sizeof(ZcEnv));
-+ if (zc == NULL)
-+ {
-+ av_log(NULL, AV_LOG_ERROR, "av_rpi_zc_env_alloc: Context allocation failed\n");
-+ return NULL;
-+ }
-+
-+ zc_pool_init(&zc->pool);
-+ return zc;
-+}
-+
-+void av_rpi_zc_env_free(AVZcEnvPtr zc)
-+{
-+ if (zc != NULL)
-+ {
-+ zc_pool_destroy(&zc->pool); ;
-+ av_free(zc);
-+ }
-+}
-+
-+int av_rpi_zc_in_use(const struct AVCodecContext * const s)
-+{
-+ return s->get_buffer2 == av_rpi_zc_get_buffer2;
-+}
-+
-+int av_rpi_zc_init(struct AVCodecContext * const s)
-+{
-+ if (av_rpi_zc_in_use(s))
-+ {
-+ ZcEnv * const zc = s->get_buffer_context;
-+ ++zc->refcount;
-+ }
-+ else
-+ {
-+ ZcEnv *const zc = av_rpi_zc_env_alloc();
-+ if (zc == NULL)
-+ {
-+ return AVERROR(ENOMEM);
-+ }
-+
-+ zc->refcount = 1;
-+ zc->old.get_buffer_context = s->get_buffer_context;
-+ zc->old.get_buffer2 = s->get_buffer2;
-+ zc->old.thread_safe_callbacks = s->thread_safe_callbacks;
-+
-+ s->get_buffer_context = zc;
-+ s->get_buffer2 = av_rpi_zc_get_buffer2;
-+ s->thread_safe_callbacks = 1;
-+ }
-+ return 0;
-+}
-+
-+void av_rpi_zc_uninit(struct AVCodecContext * const s)
-+{
-+ if (av_rpi_zc_in_use(s))
-+ {
-+ ZcEnv * const zc = s->get_buffer_context;
-+ if (--zc->refcount == 0)
-+ {
-+ s->get_buffer2 = zc->old.get_buffer2;
-+ s->get_buffer_context = zc->old.get_buffer_context;
-+ s->thread_safe_callbacks = zc->old.thread_safe_callbacks;
-+ av_rpi_zc_env_free(zc);
-+ }
-+ }
-+}
-+
-diff --git a/libavcodec/rpi_zc.h b/libavcodec/rpi_zc.h
-new file mode 100644
-index 0000000000..26fb3be999
---- /dev/null
-+++ b/libavcodec/rpi_zc.h
-@@ -0,0 +1,105 @@
-+#ifndef LIBAVCODEC_RPI_ZC_H
-+#define LIBAVCODEC_RPI_ZC_H
-+
-+// Zero-Copy frame code for RPi
-+// RPi needs Y/U/V planes to be contiguous for display. By default
-+// ffmpeg will allocate separated planes so a memcpy is needed before
-+// display. This code provides a method of making ffmpeg allocate a single
-+// block of memory for the frame which can then be reference counted until
-+// display has finished with it.
-+
-+// Frame buffer number in which to stuff an 8-bit copy of a 16-bit frame
-+// 0 disables
-+// *** This option still in development
-+// Only works if SAO active
-+// Allocates buffers that are twice the required size
-+#define RPI_ZC_SAND_8_IN_10_BUF 0
-+
-+struct AVBufferRef;
-+struct AVFrame;
-+struct AVCodecContext;
-+enum AVPixelFormat;
-+
-+// "Opaque" pointer to whatever we are using as a buffer reference
-+typedef struct AVBufferRef * AVRpiZcRefPtr;
-+
-+struct AVZcEnv;
-+typedef struct AVZcEnv * AVZcEnvPtr;
-+
-+typedef struct AVRpiZcFrameGeometry
-+{
-+ unsigned int stride_y; // Luma stride (bytes)
-+ unsigned int height_y; // Luma height (lines)
-+ unsigned int stride_c; // Chroma stride (bytes)
-+ unsigned int height_c; // Chroma height (lines)
-+ unsigned int planes_c; // Chroma plane count (U, V = 2, interleaved = 1)
-+ unsigned int stripes; // Number of stripes (sand)
-+ unsigned int bytes_per_pel;
-+} AVRpiZcFrameGeometry;
-+
-+
-+AVRpiZcFrameGeometry av_rpi_zc_frame_geometry(
-+ const int format,
-+ const unsigned int video_width, const unsigned int video_height);
-+
-+// Replacement fn for avctx->get_buffer2
-+// Should be set before calling avcodec_open2
-+//
-+// N.B. in addition to setting avctx->get_buffer2, avctx->refcounted_frames
-+// must be set to 1 as otherwise the buffer info is killed before being returned
-+// by avcodec_decode_video2. Note also that this means that the AVFrame that is
-+// returned must be manually derefed with av_frame_unref. This should be done
-+// after av_rpi_zc_ref has been called.
-+int av_rpi_zc_get_buffer2(struct AVCodecContext *s, AVFrame *frame, int flags);
-+
-+// Generate a ZC reference to the buffer(s) in this frame
-+// If the buffer doesn't appear to be one allocated by av_rpi_zc_get_buffer2
-+// then the behaviour depends on maycopy:
-+// If maycopy=0 then return NULL
-+// If maycopy=1 && the src frame is in a form where we can easily copy
-+// the data, then allocate a new buffer and copy the data into it
-+// Otherwise return NULL
-+AVRpiZcRefPtr av_rpi_zc_ref(struct AVCodecContext * const s,
-+ const struct AVFrame * const frame, const enum AVPixelFormat expected_format, const int maycopy);
-+
-+// Get the vc_handle from the frame ref
-+// Returns -1 if ref doesn't look valid
-+int av_rpi_zc_vc_handle(const AVRpiZcRefPtr fr_ref);
-+// Get offset from the start of the memory referenced
-+// by the vc_handle to valid data
-+int av_rpi_zc_offset(const AVRpiZcRefPtr fr_ref);
-+// Length of buffer data
-+int av_rpi_zc_length(const AVRpiZcRefPtr fr_ref);
-+// Get the number of bytes allocated from the frame ref
-+// Returns 0 if ref doesn't look valid
-+int av_rpi_zc_numbytes(const AVRpiZcRefPtr fr_ref);
-+
-+// Unreference the buffer refed/allocated by _zc_ref
-+// If fr_ref is NULL then this will NOP
-+void av_rpi_zc_unref(AVRpiZcRefPtr fr_ref);
-+
-+// Allocate an environment for the buffer pool used by the ZC code
-+// This should be put in avctx->get_buffer_context so it can be found by
-+// av_rpi_zc_get_buffer2 when it is called from ffmpeg
-+AVZcEnvPtr av_rpi_zc_env_alloc(void);
-+
-+// Free the environment used by the ZC code
-+void av_rpi_zc_env_free(AVZcEnvPtr);
-+
-+// Test to see if the context is using zc (checks get_buffer2)
-+int av_rpi_zc_in_use(const struct AVCodecContext * const s);
-+
-+// Init ZC into a context
-+// There is nothing magic in this fn - it just packages setting
-+// get_buffer2 & get_buffer_context
-+int av_rpi_zc_init(struct AVCodecContext * const s);
-+
-+// Free ZC from a context
-+// There is nothing magic in this fn - it just packages unsetting
-+// get_buffer2 & get_buffer_context
-+void av_rpi_zc_uninit(struct AVCodecContext * const s);
-+
-+
-+
-+#endif
-+
-diff --git a/libavfilter/Makefile b/libavfilter/Makefile
-index bcd5d437ff..ccb49ec8c0 100644
---- a/libavfilter/Makefile
-+++ b/libavfilter/Makefile
-@@ -346,6 +346,7 @@ OBJS-$(CONFIG_TONEMAP_FILTER) += vf_tonemap.o
- OBJS-$(CONFIG_TRANSPOSE_FILTER) += vf_transpose.o
- OBJS-$(CONFIG_TRIM_FILTER) += trim.o
- OBJS-$(CONFIG_UNPREMULTIPLY_FILTER) += vf_premultiply.o framesync.o
-+OBJS-$(CONFIG_UNSAND_FILTER) += vf_unsand.o
- OBJS-$(CONFIG_UNSHARP_FILTER) += vf_unsharp.o
- OBJS-$(CONFIG_UNSHARP_OPENCL_FILTER) += vf_unsharp_opencl.o opencl.o \
- opencl/unsharp.o
-diff --git a/libavfilter/allfilters.c b/libavfilter/allfilters.c
-index 68b2992027..3b059fce4e 100644
---- a/libavfilter/allfilters.c
-+++ b/libavfilter/allfilters.c
-@@ -338,6 +338,7 @@ extern AVFilter ff_vf_transpose;
- extern AVFilter ff_vf_trim;
- extern AVFilter ff_vf_unpremultiply;
- extern AVFilter ff_vf_unsharp;
-+extern AVFilter ff_vf_unsand;
- extern AVFilter ff_vf_unsharp_opencl;
- extern AVFilter ff_vf_uspp;
- extern AVFilter ff_vf_vaguedenoiser;
-diff --git a/libavfilter/avfiltergraph.c b/libavfilter/avfiltergraph.c
-index 4cc6892404..9db92322a4 100644
---- a/libavfilter/avfiltergraph.c
-+++ b/libavfilter/avfiltergraph.c
-@@ -32,6 +32,9 @@
- #include "libavutil/internal.h"
- #include "libavutil/opt.h"
- #include "libavutil/pixdesc.h"
-+#if CONFIG_UNSAND_FILTER
-+#include "libavutil/rpi_sand_fns.h"
-+#endif
-
- #define FF_INTERNAL_FIELDS 1
- #include "framequeue.h"
-@@ -427,6 +430,19 @@ static int can_merge_formats(AVFilterFormats *a_arg,
- }
- }
-
-+#if CONFIG_UNSAND_FILTER
-+static int has_sand_format(const AVFilterFormats * const ff)
-+{
-+ int i;
-+ for (i = 0; i != ff->nb_formats; ++i) {
-+ if (av_rpi_is_sand_format(ff->formats[i])) {
-+ return 1;
-+ }
-+ }
-+ return 0;
-+}
-+#endif
-+
- /**
- * Perform one round of query_formats() and merging formats lists on the
- * filter graph.
-@@ -467,6 +483,7 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx)
- for (j = 0; j < filter->nb_inputs; j++) {
- AVFilterLink *link = filter->inputs[j];
- int convert_needed = 0;
-+ unsigned int extra_convert_tried = 0;
-
- if (!link)
- continue;
-@@ -514,11 +531,14 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx)
- )
- #undef MERGE_DISPATCH
-
-- if (convert_needed) {
-+ while (convert_needed) {
- AVFilterContext *convert;
- const AVFilter *filter;
- AVFilterLink *inlink, *outlink;
- char inst_name[30];
-+ int can_retry = 0;
-+
-+ convert_needed = 0;
-
- if (graph->disable_auto_convert) {
- av_log(log_ctx, AV_LOG_ERROR,
-@@ -531,19 +551,45 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx)
- /* couldn't merge format lists. auto-insert conversion filter */
- switch (link->type) {
- case AVMEDIA_TYPE_VIDEO:
-- if (!(filter = avfilter_get_by_name("scale"))) {
-- av_log(log_ctx, AV_LOG_ERROR, "'scale' filter "
-- "not present, cannot convert pixel formats.\n");
-- return AVERROR(EINVAL);
-- }
--
-- snprintf(inst_name, sizeof(inst_name), "auto_scaler_%d",
-- scaler_count++);
-+#if CONFIG_UNSAND_FILTER
-+ // Only try each extra conversion once
-+ // The unsand output pad should never trigger has_sand_format
-+ // but it is better to be safe
-+ if ((extra_convert_tried & 1) == 0 && has_sand_format(link->in_formats)) {
-+ if (!(filter = avfilter_get_by_name("unsand"))) {
-+ av_log(log_ctx, AV_LOG_ERROR, "'unsand' filter "
-+ "not present, cannot convert pixel formats.\n");
-+ return AVERROR(EINVAL);
-+ }
-+
-+ snprintf(inst_name, sizeof(inst_name), "auto_unsand_%d",
-+ scaler_count++);
-+
-+ if ((ret = avfilter_graph_create_filter(&convert, filter,
-+ inst_name, "", NULL,
-+ graph)) < 0)
-+ return ret;
-
-- if ((ret = avfilter_graph_create_filter(&convert, filter,
-- inst_name, graph->scale_sws_opts, NULL,
-- graph)) < 0)
-- return ret;
-+ extra_convert_tried |= 1;
-+ can_retry = 1;
-+ }
-+ else
-+#endif
-+ {
-+ if (!(filter = avfilter_get_by_name("scale"))) {
-+ av_log(log_ctx, AV_LOG_ERROR, "'scale' filter "
-+ "not present, cannot convert pixel formats.\n");
-+ return AVERROR(EINVAL);
-+ }
-+
-+ snprintf(inst_name, sizeof(inst_name), "auto_scaler_%d",
-+ scaler_count++);
-+
-+ if ((ret = avfilter_graph_create_filter(&convert, filter,
-+ inst_name, graph->scale_sws_opts, NULL,
-+ graph)) < 0)
-+ return ret;
-+ }
- break;
- case AVMEDIA_TYPE_AUDIO:
- if (!(filter = avfilter_get_by_name("aresample"))) {
-@@ -585,9 +631,19 @@ static int query_formats(AVFilterGraph *graph, AVClass *log_ctx)
- av_assert0(outlink-> in_channel_layouts->refcount > 0);
- av_assert0(outlink->out_channel_layouts->refcount > 0);
- }
-- if (!ff_merge_formats( inlink->in_formats, inlink->out_formats, inlink->type) ||
-- !ff_merge_formats(outlink->in_formats, outlink->out_formats, outlink->type))
-+ // If we have added an extra filter we must merge the input
-+ // side but we can have another go at the output
-+ if (!ff_merge_formats( inlink->in_formats, inlink->out_formats, inlink->type))
-+ ret = AVERROR(ENOSYS);
-+ else if (!ff_merge_formats(outlink->in_formats, outlink->out_formats, outlink->type))
-+ {
-+ if (can_retry) {
-+ link = outlink;
-+ convert_needed = 1;
-+ continue;
-+ }
- ret = AVERROR(ENOSYS);
-+ }
- if (inlink->type == AVMEDIA_TYPE_AUDIO &&
- (!ff_merge_samplerates(inlink->in_samplerates,
- inlink->out_samplerates) ||
-diff --git a/libavfilter/buffersrc.c b/libavfilter/buffersrc.c
-index cd56f8ca45..813a682aa1 100644
---- a/libavfilter/buffersrc.c
-+++ b/libavfilter/buffersrc.c
-@@ -207,7 +207,7 @@ static int av_buffersrc_add_frame_internal(AVFilterContext *ctx,
-
- switch (ctx->outputs[0]->type) {
- case AVMEDIA_TYPE_VIDEO:
-- CHECK_VIDEO_PARAM_CHANGE(ctx, s, frame->width, frame->height,
-+ CHECK_VIDEO_PARAM_CHANGE(ctx, s, av_frame_cropped_width(frame), av_frame_cropped_height(frame),
- frame->format);
- break;
- case AVMEDIA_TYPE_AUDIO:
-diff --git a/libavfilter/vf_unsand.c b/libavfilter/vf_unsand.c
-new file mode 100644
-index 0000000000..64578b7ac4
---- /dev/null
-+++ b/libavfilter/vf_unsand.c
-@@ -0,0 +1,232 @@
-+/*
-+ * Copyright (c) 2007 Bobby Bingham
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+/**
-+ * @file
-+ * format and noformat video filters
-+ */
-+
-+#include <string.h>
-+
-+#include "libavutil/internal.h"
-+#include "libavutil/mem.h"
-+#include "libavutil/pixdesc.h"
-+#include "libavutil/opt.h"
-+#include "libavutil/rpi_sand_fns.h"
-+
-+#include "avfilter.h"
-+#include "formats.h"
-+#include "internal.h"
-+#include "video.h"
-+
-+typedef struct UnsandContext {
-+ const AVClass *class;
-+} UnsandContext;
-+
-+static av_cold void uninit(AVFilterContext *ctx)
-+{
-+// UnsandContext *s = ctx->priv;
-+}
-+
-+static av_cold int init(AVFilterContext *ctx)
-+{
-+// UnsandContext *s = ctx->priv;
-+
-+ return 0;
-+}
-+
-+
-+static int filter_frame(AVFilterLink *link, AVFrame *in)
-+{
-+ AVFilterLink * const outlink = link->dst->outputs[0];
-+ AVFrame *out = NULL;
-+ int rv = 0;
-+
-+ if (outlink->format == in->format) {
-+ // If nothing to do then do nothing
-+ out = in;
-+ }
-+ else
-+ {
-+ if ((out = ff_get_video_buffer(outlink, av_frame_cropped_width(in), av_frame_cropped_height(in))) == NULL)
-+ {
-+ rv = AVERROR(ENOMEM);
-+ goto fail;
-+ }
-+ if (av_rpi_sand_to_planar_frame(out, in) != 0)
-+ {
-+ rv = -1;
-+ goto fail;
-+ }
-+
-+ av_frame_free(&in);
-+ }
-+
-+ return ff_filter_frame(outlink, out);
-+
-+fail:
-+ av_frame_free(&out);
-+ av_frame_free(&in);
-+ return rv;
-+}
-+
-+#if 0
-+static void dump_fmts(const AVFilterFormats * fmts)
-+{
-+ int i;
-+ if (fmts== NULL) {
-+ printf("NULL\n");
-+ return;
-+ }
-+ for (i = 0; i < fmts->nb_formats; ++i) {
-+ printf(" %d", fmts->formats[i]);
-+ }
-+ printf("\n");
-+}
-+#endif
-+
-+static int query_formats(AVFilterContext *ctx)
-+{
-+ // Negotiate formats: offer on the output pad the input's formats with sand mapped to planar YUV420
-+ int ret;
-+
-+ // If we aren't connected at both ends then just do nothing
-+ if (ctx->inputs[0] == NULL || ctx->outputs[0] == NULL)
-+ return 0;
-+
-+// printf("Unsand: %s in: ", __func__);
-+// dump_fmts(ctx->inputs[0]->in_formats);
-+// printf("Unsand: %s out: ", __func__);
-+// dump_fmts(ctx->outputs[0]->out_formats);
-+
-+ // Our output formats depend on our input formats and we can't/don't
-+ // want to convert between bit depths so we need to wait for the source
-+ // to have an opinion before we do
-+ if (ctx->inputs[0]->in_formats == NULL)
-+ return AVERROR(EAGAIN);
-+
-+ // Accept anything
-+ if (ctx->inputs[0]->out_formats == NULL &&
-+ (ret = ff_formats_ref(ctx->inputs[0]->in_formats, &ctx->inputs[0]->out_formats)) < 0)
-+ return ret;
-+
-+ // Filter out sand formats
-+
-+ // Generate a container if we don't already have one
-+ if (ctx->outputs[0]->in_formats == NULL)
-+ {
-+ // Placeholder list: only used to obtain a valid container, its contents are replaced below
-+ static const enum AVPixelFormat out_fmts[] = {AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV420P, AV_PIX_FMT_NONE};
-+ AVFilterFormats *formats = ff_make_format_list(out_fmts);
-+
-+ if (formats == NULL)
-+ return AVERROR(ENOMEM);
-+ if ((ret = ff_formats_ref(formats, &ctx->outputs[0]->in_formats)) < 0)
-+ return ret;
-+ }
-+
-+ // Replace old format list with new filtered list derived from what our
-+ // input says it can do
-+ {
-+ const AVFilterFormats * const src_ff = ctx->inputs[0]->out_formats;
-+ AVFilterFormats * const dst_ff = ctx->outputs[0]->in_formats;
-+ enum AVPixelFormat *dst_fmts = av_malloc(sizeof(enum AVPixelFormat) * src_ff->nb_formats);
-+ int i, n = 0;
-+ int seen_420p = 0, seen_420p10 = 0;
-+ if (dst_fmts == NULL) // allocation failed: report it rather than write through NULL
-+ return AVERROR(ENOMEM);
-+
-+ for (i = 0; i < src_ff->nb_formats; ++i) {
-+ const enum AVPixelFormat f = src_ff->formats[i];
-+
-+ switch (f){
-+ case AV_PIX_FMT_YUV420P:
-+ case AV_PIX_FMT_SAND128:
-+ if (!seen_420p) {
-+ seen_420p = 1;
-+ dst_fmts[n++] = AV_PIX_FMT_YUV420P;
-+ }
-+ break;
-+ case AV_PIX_FMT_SAND64_10:
-+ case AV_PIX_FMT_YUV420P10:
-+ if (!seen_420p10) {
-+ seen_420p10 = 1;
-+ dst_fmts[n++] = AV_PIX_FMT_YUV420P10;
-+ }
-+ break;
-+ default:
-+ dst_fmts[n++] = f;
-+ break;
-+ }
-+ }
-+
-+ av_freep(&dst_ff->formats);
-+ dst_ff->formats = dst_fmts;
-+ dst_ff->nb_formats = n;
-+ }
-+
-+// printf("Unsand: %s calc: ", __func__);
-+// dump_fmts(ctx->outputs[0]->in_formats);
-+
-+ return 0;
-+}
-+
-+
-+#define OFFSET(x) offsetof(UnsandContext, x)
-+static const AVOption unsand_options[] = {
-+ { NULL }
-+};
-+
-+
-+AVFILTER_DEFINE_CLASS(unsand);
-+
-+static const AVFilterPad avfilter_vf_unsand_inputs[] = {
-+ {
-+ .name = "default",
-+ .type = AVMEDIA_TYPE_VIDEO,
-+ .filter_frame = filter_frame,
-+ },
-+ { NULL }
-+};
-+
-+static const AVFilterPad avfilter_vf_unsand_outputs[] = {
-+ {
-+ .name = "default",
-+ .type = AVMEDIA_TYPE_VIDEO
-+ },
-+ { NULL }
-+};
-+
-+AVFilter ff_vf_unsand = {
-+ .name = "unsand",
-+ .description = NULL_IF_CONFIG_SMALL("Convert sand pix fmt to yuv"),
-+
-+ .init = init,
-+ .uninit = uninit,
-+
-+ .query_formats = query_formats,
-+
-+ .priv_size = sizeof(UnsandContext),
-+ .priv_class = &unsand_class,
-+
-+ .inputs = avfilter_vf_unsand_inputs,
-+ .outputs = avfilter_vf_unsand_outputs,
-+};
-+
-diff --git a/libavformat/utils.c b/libavformat/utils.c
-index c25eab4d49..4db44315c7 100644
---- a/libavformat/utils.c
-+++ b/libavformat/utils.c
-@@ -3005,6 +3005,40 @@ static int has_codec_parameters(AVStream *st, const char **errmsg_ptr)
- return 1;
- }
-
-+#if CONFIG_HEVC_RPI_DECODER && CONFIG_HEVC_DECODER
-+// This should be quite general purpose but avoid possible conflicts
-+// by limiting usage to cases where we know it works.
-+static int try_fallback_decoder(AVCodecContext * const avctx, const AVCodec *const old_codec, AVDictionary ** const opts)
-+{
-+ // Only try fallback if we know it is supported (HEVC only)
-+ const AVCodec *const new_codec = old_codec->id != AV_CODEC_ID_HEVC ? NULL :
-+ avcodec_find_decoder_by_id_and_fmt(old_codec->id, AV_PIX_FMT_NONE);
-+ int err;
-+
-+ // Failed to find fallback or we are already at the fallback
-+ if (new_codec == NULL || new_codec == old_codec)
-+ {
-+ return AVERROR_DECODER_NOT_FOUND;
-+ }
-+
-+ // * This may be dodgy - header says to not use this fn,
-+ // especially if we are going to reopen the context...
-+ // (but it does seem to work for our cases)
-+ if (avcodec_is_open(avctx)) {
-+ avcodec_close(avctx);
-+ }
-+
-+ if ((err = avcodec_open2(avctx, new_codec, opts)) < 0)
-+ {
-+ return err;
-+ }
-+
-+ return 0;
-+}
-+#else
-+#define try_fallback_decoder(avctx, old_codec, opts) (AVERROR_DECODER_NOT_FOUND)
-+#endif
-+
- /* returns 1 or 0 if or if not decoded data was returned, or a negative error */
- static int try_decode_frame(AVFormatContext *s, AVStream *st, AVPacket *avpkt,
- AVDictionary **options)
-@@ -3039,7 +3073,11 @@ static int try_decode_frame(AVFormatContext *s, AVStream *st, AVPacket *avpkt,
- av_dict_set(options ? options : &thread_opt, "threads", "1", 0);
- if (s->codec_whitelist)
- av_dict_set(options ? options : &thread_opt, "codec_whitelist", s->codec_whitelist, 0);
-- ret = avcodec_open2(avctx, codec, options ? options : &thread_opt);
-+ if ((ret = avcodec_open2(avctx, codec, options ? options : &thread_opt)) == AVERROR_DECODER_NOT_FOUND)
-+ {
-+ // Try fallback if it looks worth a try
-+ ret = try_fallback_decoder(avctx, codec, options ? options : &thread_opt);
-+ }
- if (!options)
- av_dict_free(&thread_opt);
- if (ret < 0) {
-@@ -3070,6 +3108,14 @@ static int try_decode_frame(AVFormatContext *s, AVStream *st, AVPacket *avpkt,
- if (avctx->codec_type == AVMEDIA_TYPE_VIDEO ||
- avctx->codec_type == AVMEDIA_TYPE_AUDIO) {
- ret = avcodec_send_packet(avctx, &pkt);
-+
-+ // If we are going to want to fall back we should know here
-+ if (ret == AVERROR_DECODER_NOT_FOUND) {
-+ if ((ret = try_fallback_decoder(avctx, avctx->codec, options)) < 0)
-+ break;
-+ continue;
-+ }
-+
- if (ret < 0 && ret != AVERROR(EAGAIN) && ret != AVERROR_EOF)
- break;
- if (ret >= 0)
-@@ -3663,9 +3709,20 @@ FF_ENABLE_DEPRECATION_WARNINGS
- // Try to just open decoders, in case this is enough to get parameters.
- if (!has_codec_parameters(st, NULL) && st->request_probe <= 0) {
- if (codec && !avctx->codec)
-- if (avcodec_open2(avctx, codec, options ? &options[i] : &thread_opt) < 0)
-- av_log(ic, AV_LOG_WARNING,
-- "Failed to open codec in %s\n",__FUNCTION__);
-+ {
-+ int err;
-+
-+ if ((err = avcodec_open2(avctx, codec, options ? &options[i] : &thread_opt)) < 0)
-+ {
-+ if (err == AVERROR_DECODER_NOT_FOUND) {
-+ err = try_fallback_decoder(avctx, codec, options ? &options[i] : &thread_opt);
-+ }
-+ if (err < 0) {
-+ av_log(ic, AV_LOG_WARNING,
-+ "Failed to open codec in %s\n",__FUNCTION__);
-+ }
-+ }
-+ }
- }
- if (!options)
- av_dict_free(&thread_opt);
-diff --git a/libavutil/Makefile b/libavutil/Makefile
-index a63ba523c9..4f9a19e800 100644
---- a/libavutil/Makefile
-+++ b/libavutil/Makefile
-@@ -164,6 +164,7 @@ OBJS-$(CONFIG_QSV) += hwcontext_qsv.o
- OBJS-$(CONFIG_LIBDRM) += hwcontext_drm.o
- OBJS-$(CONFIG_LZO) += lzo.o
- OBJS-$(CONFIG_OPENCL) += hwcontext_opencl.o
-+OBJS-$(CONFIG_SAND) += rpi_sand_fns.o
- OBJS-$(CONFIG_VAAPI) += hwcontext_vaapi.o
- OBJS-$(CONFIG_VIDEOTOOLBOX) += hwcontext_videotoolbox.o
- OBJS-$(CONFIG_VDPAU) += hwcontext_vdpau.o
-diff --git a/libavutil/arm/Makefile b/libavutil/arm/Makefile
-index 5da44b0542..b74b7c4e2f 100644
---- a/libavutil/arm/Makefile
-+++ b/libavutil/arm/Makefile
-@@ -6,3 +6,4 @@ VFP-OBJS += arm/float_dsp_init_vfp.o \
-
- NEON-OBJS += arm/float_dsp_init_neon.o \
- arm/float_dsp_neon.o \
-+ arm/rpi_sand_neon.o \
-diff --git a/libavutil/arm/rpi_sand_neon.S b/libavutil/arm/rpi_sand_neon.S
-new file mode 100644
-index 0000000000..dbffdaefa4
---- /dev/null
-+++ b/libavutil/arm/rpi_sand_neon.S
-@@ -0,0 +1,40 @@
-+#include "libavutil/arm/asm.S"
-+
-+@ void rpi_sand128b_stripe_to_8_10(
-+@ uint8_t * dest, [r0]
-+@ const uint8_t * src1, [r1]
-+@ const uint8_t * src2, [r2]
-+@ unsigned int lines); [r3]
-+
-+.macro stripe2_to_8, bit_depth
-+ vpush {q4-q7}
-+1:
-+ vldm r1!, {q0-q7}
-+ subs r3, #1
-+ vldm r2!, {q8-q15}
-+ vqrshrn.u16 d0, q0, #\bit_depth - 8
-+ vqrshrn.u16 d1, q1, #\bit_depth - 8
-+ vqrshrn.u16 d2, q2, #\bit_depth - 8
-+ vqrshrn.u16 d3, q3, #\bit_depth - 8
-+ vqrshrn.u16 d4, q4, #\bit_depth - 8
-+ vqrshrn.u16 d5, q5, #\bit_depth - 8
-+ vqrshrn.u16 d6, q6, #\bit_depth - 8
-+ vqrshrn.u16 d7, q7, #\bit_depth - 8
-+ vqrshrn.u16 d8, q8, #\bit_depth - 8
-+ vqrshrn.u16 d9, q9, #\bit_depth - 8
-+ vqrshrn.u16 d10, q10, #\bit_depth - 8
-+ vqrshrn.u16 d11, q11, #\bit_depth - 8
-+ vqrshrn.u16 d12, q12, #\bit_depth - 8
-+ vqrshrn.u16 d13, q13, #\bit_depth - 8
-+ vqrshrn.u16 d14, q14, #\bit_depth - 8
-+ vqrshrn.u16 d15, q15, #\bit_depth - 8
-+ vstm r0!, {q0-q7}
-+ bne 1b
-+ vpop {q4-q7}
-+ bx lr
-+.endm
-+
-+function rpi_sand128b_stripe_to_8_10, export=1
-+ stripe2_to_8 10
-+endfunc
-+
-diff --git a/libavutil/buffer.c b/libavutil/buffer.c
-index 8d1aa5fa84..649876db77 100644
---- a/libavutil/buffer.c
-+++ b/libavutil/buffer.c
-@@ -355,3 +355,9 @@ AVBufferRef *av_buffer_pool_get(AVBufferPool *pool)
-
- return ret;
- }
-+
-+// Return the pool entry's opaque for this buffer (gives us a GPU_MEM_PTR_T), or NULL if the buffer has no opaque
-+void *av_buffer_pool_opaque(AVBufferRef *ref) {
-+ BufferPoolEntry *buf = av_buffer_get_opaque(ref);
-+ return buf == NULL ? NULL : buf->opaque;
-+}
-diff --git a/libavutil/buffer.h b/libavutil/buffer.h
-index 73b6bd0b14..d907de3f1c 100644
---- a/libavutil/buffer.h
-+++ b/libavutil/buffer.h
-@@ -284,6 +284,9 @@ void av_buffer_pool_uninit(AVBufferPool **pool);
- */
- AVBufferRef *av_buffer_pool_get(AVBufferPool *pool);
-
-+// Return the opaque for the underlying frame
-+void *av_buffer_pool_opaque(AVBufferRef *ref);
-+
- /**
- * @}
- */
-diff --git a/libavutil/frame.c b/libavutil/frame.c
-index 00215ac29a..d068f437e7 100644
---- a/libavutil/frame.c
-+++ b/libavutil/frame.c
-@@ -16,6 +16,8 @@
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
- */
-
-+#include "config.h"
-+
- #include "channel_layout.h"
- #include "avassert.h"
- #include "buffer.h"
-@@ -25,6 +27,9 @@
- #include "imgutils.h"
- #include "mem.h"
- #include "samplefmt.h"
-+#if CONFIG_SAND
-+#include "rpi_sand_fns.h"
-+#endif
-
- #if FF_API_FRAME_GET_SET
- MAKE_ACCESSORS(AVFrame, frame, int64_t, best_effort_timestamp)
-@@ -885,6 +890,12 @@ int av_frame_apply_cropping(AVFrame *frame, int flags)
- (frame->crop_top + frame->crop_bottom) >= frame->height)
- return AVERROR(ERANGE);
-
-+#if CONFIG_SAND
-+ // Sand cannot be cropped - do not try
-+ if (av_rpi_is_sand_format(frame->format))
-+ return 0;
-+#endif
-+
- desc = av_pix_fmt_desc_get(frame->format);
- if (!desc)
- return AVERROR_BUG;
-diff --git a/libavutil/frame.h b/libavutil/frame.h
-index 9d57d6ce66..1ade7bd707 100644
---- a/libavutil/frame.h
-+++ b/libavutil/frame.h
-@@ -886,6 +886,16 @@ int av_frame_apply_cropping(AVFrame *frame, int flags);
- */
- const char *av_frame_side_data_name(enum AVFrameSideDataType type);
-
-+
-+static inline int av_frame_cropped_width(const AVFrame * const frame)
-+{
-+ return frame->width - (frame->crop_left + frame->crop_right);
-+}
-+static inline int av_frame_cropped_height(const AVFrame * const frame)
-+{
-+ return frame->height - (frame->crop_top + frame->crop_bottom);
-+}
-+
- /**
- * @}
- */
-diff --git a/libavutil/pixdesc.c b/libavutil/pixdesc.c
-index 8ed52751c1..71d6dd4250 100644
---- a/libavutil/pixdesc.c
-+++ b/libavutil/pixdesc.c
-@@ -2185,6 +2185,30 @@ static const AVPixFmtDescriptor av_pix_fmt_descriptors[AV_PIX_FMT_NB] = {
- .name = "opencl",
- .flags = AV_PIX_FMT_FLAG_HWACCEL,
- },
-+ [AV_PIX_FMT_SAND128] = {
-+ .name = "sand128",
-+ .nb_components = 3,
-+ .log2_chroma_w = 1,
-+ .log2_chroma_h = 1,
-+ .comp = {
-+ { 0, 1, 0, 0, 8, 0, 7, 1 }, /* Y */
-+ { 1, 2, 0, 0, 8, 1, 7, 1 }, /* U */
-+ { 1, 2, 1, 0, 8, 1, 7, 2 }, /* V */
-+ },
-+ .flags = 0,
-+ },
-+ [AV_PIX_FMT_SAND64_10] = {
-+ .name = "sand64_10",
-+ .nb_components = 3,
-+ .log2_chroma_w = 1,
-+ .log2_chroma_h = 1,
-+ .comp = {
-+ { 0, 2, 0, 0, 10, 0, 9, 1 }, /* Y */
-+ { 1, 4, 0, 0, 10, 1, 9, 1 }, /* U */
-+ { 1, 4, 1, 0, 10, 1, 9, 2 }, /* V */
-+ },
-+ .flags = 0,
-+ },
- };
- #if FF_API_PLUS1_MINUS1
- FF_ENABLE_DEPRECATION_WARNINGS
-diff --git a/libavutil/pixfmt.h b/libavutil/pixfmt.h
-index e184a56672..1078c192a6 100644
---- a/libavutil/pixfmt.h
-+++ b/libavutil/pixfmt.h
-@@ -330,6 +330,11 @@ enum AVPixelFormat {
- */
- AV_PIX_FMT_OPENCL,
-
-+ // RPI - not on ifdef so can be got at by calling progs
-+ AV_PIX_FMT_SAND128, ///< 4:2:0 8-bit 128x*Y stripe, 64x*UV stripe, then next x stripe, mysterious padding
-+ AV_PIX_FMT_SAND64_10, ///< 4:2:0 10-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding
-+ AV_PIX_FMT_SAND64_16, ///< 4:2:0 16-bit 64x*Y stripe, 32x*UV stripe, then next x stripe, mysterious padding
-+
- AV_PIX_FMT_NB ///< number of pixel formats, DO NOT USE THIS if you want to link with shared libav* because the number of formats might differ between versions
- };
-
-diff --git a/libavutil/rpi_sand_fn_pw.h b/libavutil/rpi_sand_fn_pw.h
-new file mode 100644
-index 0000000000..52d52a2a83
---- /dev/null
-+++ b/libavutil/rpi_sand_fn_pw.h
-@@ -0,0 +1,182 @@
-+// * Included twice from rpi_sand_fn with different PW
-+
-+#define STRCAT(x,y) x##y
-+
-+#if PW == 1
-+#define pixel uint8_t
-+#define FUNC(f) STRCAT(f, 8)
-+#elif PW == 2
-+#define pixel uint16_t
-+#define FUNC(f) STRCAT(f, 16)
-+#else
-+#error Unexpected PW
-+#endif
-+
-+// Fetches a single patch - offscreen fixup not done here
-+// w <= stride1
-+// unclipped
-+void FUNC(av_rpi_sand_to_planar_y)(uint8_t * dst, const unsigned int dst_stride,
-+ const uint8_t * src,
-+ unsigned int stride1, unsigned int stride2,
-+ unsigned int _x, unsigned int y,
-+ unsigned int _w, unsigned int h)
-+{
-+ const unsigned int x = _x;
-+ const unsigned int w = _w;
-+ const unsigned int mask = stride1 - 1;
-+
-+ if ((x & ~mask) == ((x + w) & ~mask)) {
-+ // All in one sand stripe
-+ const uint8_t * p = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
-+ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p += stride1) {
-+ memcpy(dst, p, w);
-+ }
-+ }
-+ else
-+ {
-+ // Two+ stripe
-+ const unsigned int sstride = stride1 * stride2;
-+ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
-+ const uint8_t * p2 = p1 + sstride - (x & mask);
-+ const unsigned int w1 = stride1 - (x & mask);
-+ const unsigned int w3 = (x + w) & mask;
-+ const unsigned int w2 = w - (w1 + w3);
-+
-+ for (unsigned int i = 0; i != h; ++i, dst += dst_stride, p1 += stride1, p2 += stride1) {
-+ unsigned int j;
-+ const uint8_t * p = p2;
-+ uint8_t * d = dst;
-+ memcpy(d, p1, w1);
-+ d += w1;
-+ for (j = 0; j < w2; j += stride1, d += stride1, p += sstride) {
-+ memcpy(d, p, stride1);
-+ }
-+ memcpy(d, p, w3);
-+ }
-+ }
-+}
-+
-+// x & w in bytes but not of interleave (i.e. offset = x*2 for U&V)
-+
-+void FUNC(av_rpi_sand_to_planar_c)(uint8_t * dst_u, const unsigned int dst_stride_u,
-+ uint8_t * dst_v, const unsigned int dst_stride_v,
-+ const uint8_t * src,
-+ unsigned int stride1, unsigned int stride2,
-+ unsigned int _x, unsigned int y,
-+ unsigned int _w, unsigned int h)
-+{
-+ const unsigned int x = _x * 2;
-+ const unsigned int w = _w * 2;
-+ const unsigned int mask = stride1 - 1;
-+
-+ if ((x & ~mask) == ((x + w) & ~mask)) {
-+ // All in one sand stripe
-+ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
-+ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1) {
-+ pixel * du = (pixel *)dst_u;
-+ pixel * dv = (pixel *)dst_v;
-+ const pixel * p = (const pixel *)p1;
-+ for (unsigned int k = 0; k < w; k += 2 * PW) {
-+ *du++ = *p++;
-+ *dv++ = *p++;
-+ }
-+ }
-+ }
-+ else
-+ {
-+ // Two+ stripe
-+ const unsigned int sstride = stride1 * stride2;
-+ const unsigned int sstride_p = (sstride - stride1) / PW;
-+
-+ const uint8_t * p1 = src + (x & mask) + y * stride1 + (x & ~mask) * stride2;
-+ const uint8_t * p2 = p1 + sstride - (x & mask);
-+ const unsigned int w1 = stride1 - (x & mask);
-+ const unsigned int w3 = (x + w) & mask;
-+ const unsigned int w2 = w - (w1 + w3);
-+
-+ for (unsigned int i = 0; i != h; ++i, dst_u += dst_stride_u, dst_v += dst_stride_v, p1 += stride1, p2 += stride1) {
-+ unsigned int j;
-+ const pixel * p = (const pixel *)p1;
-+ pixel * du = (pixel *)dst_u;
-+ pixel * dv = (pixel *)dst_v;
-+ for (unsigned int k = 0; k < w1; k += 2 * PW) {
-+ *du++ = *p++;
-+ *dv++ = *p++;
-+ }
-+ for (j = 0, p = (const pixel *)p2; j < w2; j += stride1, p += sstride_p) {
-+ for (unsigned int k = 0; k < stride1; k += 2 * PW) {
-+ *du++ = *p++;
-+ *dv++ = *p++;
-+ }
-+ }
-+ for (unsigned int k = 0; k < w3; k += 2 * PW) {
-+ *du++ = *p++;
-+ *dv++ = *p++;
-+ }
-+ }
-+ }
-+}
-+
-+void FUNC(av_rpi_planar_to_sand_c)(uint8_t * dst_c,
-+ unsigned int stride1, unsigned int stride2,
-+ const uint8_t * src_u, const unsigned int src_stride_u,
-+ const uint8_t * src_v, const unsigned int src_stride_v,
-+ unsigned int _x, unsigned int y,
-+ unsigned int _w, unsigned int h)
-+{
-+ const unsigned int x = _x * 2;
-+ const unsigned int w = _w * 2;
-+ const unsigned int mask = stride1 - 1;
-+ if ((x & ~mask) == ((x + w) & ~mask)) {
-+ // All in one sand stripe
-+ uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2;
-+ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1) {
-+ const pixel * su = (const pixel *)src_u;
-+ const pixel * sv = (const pixel *)src_v;
-+ pixel * p = (pixel *)p1;
-+ for (unsigned int k = 0; k < w; k += 2 * PW) {
-+ *p++ = *su++;
-+ *p++ = *sv++;
-+ }
-+ }
-+ }
-+ else
-+ {
-+ // Two+ stripe
-+ const unsigned int sstride = stride1 * stride2;
-+ const unsigned int sstride_p = (sstride - stride1) / PW;
-+
-+ const uint8_t * p1 = dst_c + (x & mask) + y * stride1 + (x & ~mask) * stride2;
-+ const uint8_t * p2 = p1 + sstride - (x & mask);
-+ const unsigned int w1 = stride1 - (x & mask);
-+ const unsigned int w3 = (x + w) & mask;
-+ const unsigned int w2 = w - (w1 + w3);
-+
-+ for (unsigned int i = 0; i != h; ++i, src_u += src_stride_u, src_v += src_stride_v, p1 += stride1, p2 += stride1) {
-+ unsigned int j;
-+ const pixel * su = (const pixel *)src_u;
-+ const pixel * sv = (const pixel *)src_v;
-+ pixel * p = (pixel *)p1;
-+ for (unsigned int k = 0; k < w1; k += 2 * PW) {
-+ *p++ = *su++;
-+ *p++ = *sv++;
-+ }
-+ for (j = 0, p = (pixel *)p2; j < w2; j += stride1, p += sstride_p) {
-+ for (unsigned int k = 0; k < stride1; k += 2 * PW) {
-+ *p++ = *su++;
-+ *p++ = *sv++;
-+ }
-+ }
-+ for (unsigned int k = 0; k < w3; k += 2 * PW) {
-+ *p++ = *su++;
-+ *p++ = *sv++;
-+ }
-+ }
-+ }
-+}
-+
-+
-+#undef pixel
-+#undef STRCAT
-+#undef FUNC
-+
-diff --git a/libavutil/rpi_sand_fns.c b/libavutil/rpi_sand_fns.c
-new file mode 100644
-index 0000000000..3e31ef77ec
---- /dev/null
-+++ b/libavutil/rpi_sand_fns.c
-@@ -0,0 +1,151 @@
-+#include "config.h"
-+#include <stdint.h>
-+#include <string.h>
-+#include "rpi_sand_fns.h"
-+#include "avassert.h"
-+#include "frame.h"
-+
-+#define PW 1
-+#include "rpi_sand_fn_pw.h"
-+#undef PW
-+
-+#define PW 2
-+#include "rpi_sand_fn_pw.h"
-+#undef PW
-+
-+#if HAVE_NEON
-+void rpi_sand128b_stripe_to_8_10(uint8_t * dest, const uint8_t * src1, const uint8_t * src2, unsigned int lines);
-+#endif
-+
-+#if 1
-+// Simple round
-+static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr)
-+{
-+ const unsigned int rnd = (1 << shr) >> 1;
-+ const uint16_t * src = (const uint16_t *)_src;
-+
-+ for (; n != 0; --n) {
-+ *dst++ = (*src++ + rnd) >> shr;
-+ }
-+}
-+#else
-+// Dithered variation
-+static void cpy16_to_8(uint8_t * dst, const uint8_t * _src, unsigned int n, const unsigned int shr)
-+{
-+ unsigned int rnd = (1 << shr) >> 1;
-+ const unsigned int mask = ((1 << shr) - 1);
-+ const uint16_t * src = (const uint16_t *)_src;
-+
-+ for (; n != 0; --n) {
-+ rnd = *src++ + (rnd & mask);
-+ *dst++ = rnd >> shr;
-+ }
-+}
-+#endif
-+
-+// w/h in pixels
-+void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2,
-+ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2,
-+ unsigned int w, unsigned int h, const unsigned int shr)
-+{
-+ const unsigned int n = dst_stride1 / 2;
-+ unsigned int j;
-+
-+ // This is true for our current layouts
-+ av_assert0(dst_stride1 == src_stride1);
-+
-+ // As we have the same stride1 for src & dest and src is wider than dest
-+ // then if we loop on src we can always write contiguously to dest
-+ // We make no effort to copy an exact width - round up to nearest src stripe
-+ // as we will always have storage in dest for that
-+
-+#if HAVE_NEON
-+ if (shr == 3 && src_stride1 == 128) {
-+ for (j = 0; j + n < w; j += dst_stride1) {
-+ uint8_t * d = dst + j * dst_stride2;
-+ const uint8_t * s1 = src + j * 2 * src_stride2;
-+ const uint8_t * s2 = s1 + src_stride1 * src_stride2;
-+
-+ rpi_sand128b_stripe_to_8_10(d, s1, s2, h);
-+ }
-+ }
-+ else
-+#endif
-+ {
-+ for (j = 0; j + n < w; j += dst_stride1) {
-+ uint8_t * d = dst + j * dst_stride2;
-+ const uint8_t * s1 = src + j * 2 * src_stride2;
-+ const uint8_t * s2 = s1 + src_stride1 * src_stride2;
-+
-+ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, s2 += src_stride1, d += dst_stride1) {
-+ cpy16_to_8(d, s1, n, shr);
-+ cpy16_to_8(d + n, s2, n, shr);
-+ }
-+ }
-+ }
-+
-+ // Fix up a trailing dest half stripe
-+ if (j < w) {
-+ uint8_t * d = dst + j * dst_stride2;
-+ const uint8_t * s1 = src + j * 2 * src_stride2;
-+
-+ for (unsigned int i = 0; i != h; ++i, s1 += src_stride1, d += dst_stride1) {
-+ cpy16_to_8(d, s1, n, shr);
-+ }
-+ }
-+}
-+
-+int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src)
-+{
-+ const int w = av_frame_cropped_width(src);
-+ const int h = av_frame_cropped_height(src);
-+ const int x = src->crop_left;
-+ const int y = src->crop_top;
-+
-+ // We will crop as part of the conversion
-+ dst->crop_top = 0;
-+ dst->crop_left = 0;
-+ dst->crop_bottom = 0;
-+ dst->crop_right = 0;
-+
-+ switch (src->format){
-+ case AV_PIX_FMT_SAND128:
-+ switch (dst->format){
-+ case AV_PIX_FMT_YUV420P:
-+ av_rpi_sand_to_planar_y8(dst->data[0], dst->linesize[0],
-+ src->data[0],
-+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
-+ x, y, w, h);
-+ av_rpi_sand_to_planar_c8(dst->data[1], dst->linesize[1],
-+ dst->data[2], dst->linesize[2],
-+ src->data[1],
-+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
-+ x/2, y/2, w/2, h/2);
-+ break;
-+ default:
-+ return -1;
-+ }
-+ break;
-+ case AV_PIX_FMT_SAND64_10:
-+ switch (dst->format){
-+ case AV_PIX_FMT_YUV420P10:
-+ av_rpi_sand_to_planar_y16(dst->data[0], dst->linesize[0],
-+ src->data[0],
-+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
-+ x*2, y, w*2, h);
-+ av_rpi_sand_to_planar_c16(dst->data[1], dst->linesize[1],
-+ dst->data[2], dst->linesize[2],
-+ src->data[1],
-+ av_rpi_sand_frame_stride1(src), av_rpi_sand_frame_stride2(src),
-+ x, y/2, w, h/2);
-+ break;
-+ default:
-+ return -1;
-+ }
-+ break;
-+ default:
-+ return -1;
-+ }
-+
-+ return av_frame_copy_props(dst, src);
-+}
-diff --git a/libavutil/rpi_sand_fns.h b/libavutil/rpi_sand_fns.h
-new file mode 100644
-index 0000000000..1f50b68ea8
---- /dev/null
-+++ b/libavutil/rpi_sand_fns.h
-@@ -0,0 +1,136 @@
-+#ifndef AVUTIL_RPI_SAND_FNS
-+#define AVUTIL_RPI_SAND_FNS
-+
-+#include "libavutil/frame.h"
-+
-+// For all these fns _x & _w are measured as coord * PW
-+// For the C fns coords are in chroma pels (so luma / 2)
-+// Strides are in bytes
-+
-+void av_rpi_sand_to_planar_y8(uint8_t * dst, const unsigned int dst_stride,
-+ const uint8_t * src,
-+ unsigned int stride1, unsigned int stride2,
-+ unsigned int _x, unsigned int y,
-+ unsigned int _w, unsigned int h);
-+void av_rpi_sand_to_planar_y16(uint8_t * dst, const unsigned int dst_stride,
-+ const uint8_t * src,
-+ unsigned int stride1, unsigned int stride2,
-+ unsigned int _x, unsigned int y,
-+ unsigned int _w, unsigned int h);
-+
-+void av_rpi_sand_to_planar_c8(uint8_t * dst_u, const unsigned int dst_stride_u,
-+ uint8_t * dst_v, const unsigned int dst_stride_v,
-+ const uint8_t * src,
-+ unsigned int stride1, unsigned int stride2,
-+ unsigned int _x, unsigned int y,
-+ unsigned int _w, unsigned int h);
-+void av_rpi_sand_to_planar_c16(uint8_t * dst_u, const unsigned int dst_stride_u,
-+ uint8_t * dst_v, const unsigned int dst_stride_v,
-+ const uint8_t * src,
-+ unsigned int stride1, unsigned int stride2,
-+ unsigned int _x, unsigned int y,
-+ unsigned int _w, unsigned int h);
-+
-+void av_rpi_planar_to_sand_c8(uint8_t * dst_c,
-+ unsigned int stride1, unsigned int stride2,
-+ const uint8_t * src_u, const unsigned int src_stride_u,
-+ const uint8_t * src_v, const unsigned int src_stride_v,
-+ unsigned int _x, unsigned int y,
-+ unsigned int _w, unsigned int h);
-+void av_rpi_planar_to_sand_c16(uint8_t * dst_c,
-+ unsigned int stride1, unsigned int stride2,
-+ const uint8_t * src_u, const unsigned int src_stride_u,
-+ const uint8_t * src_v, const unsigned int src_stride_v,
-+ unsigned int _x, unsigned int y,
-+ unsigned int _w, unsigned int h);
-+
-+// w/h in pixels
-+void av_rpi_sand16_to_sand8(uint8_t * dst, const unsigned int dst_stride1, const unsigned int dst_stride2,
-+ const uint8_t * src, const unsigned int src_stride1, const unsigned int src_stride2,
-+ unsigned int w, unsigned int h, const unsigned int shr);
-+
-+
-+// dst must contain required pixel format & allocated data buffers
-+// Cropping on the src buffer will be honoured and dst crop will be set to zero
-+int av_rpi_sand_to_planar_frame(AVFrame * const dst, const AVFrame * const src);
-+
-+
-+static inline unsigned int av_rpi_sand_frame_stride1(const AVFrame * const frame)
-+{
-+#ifdef RPI_ZC_SAND128_ONLY
-+ // If we are sure we only only support 128 byte sand formats replace the
-+ // var with a constant which should allow for better optimisation
-+ return 128;
-+#else
-+ return frame->linesize[0];
-+#endif
-+}
-+
-+static inline unsigned int av_rpi_sand_frame_stride2(const AVFrame * const frame)
-+{
-+ return frame->linesize[3];
-+}
-+
-+
-+static inline int av_rpi_is_sand_format(const int format)
-+{
-+ return (format >= AV_PIX_FMT_SAND128 && format <= AV_PIX_FMT_SAND64_16);
-+}
-+
-+static inline int av_rpi_is_sand_frame(const AVFrame * const frame)
-+{
-+ return av_rpi_is_sand_format(frame->format);
-+}
-+
-+static inline int av_rpi_is_sand8_frame(const AVFrame * const frame)
-+{
-+ return (frame->format == AV_PIX_FMT_SAND128);
-+}
-+
-+static inline int av_rpi_is_sand16_frame(const AVFrame * const frame)
-+{
-+ return (frame->format >= AV_PIX_FMT_SAND64_10 && frame->format <= AV_PIX_FMT_SAND64_16);
-+}
-+
-+static inline int av_rpi_sand_frame_xshl(const AVFrame * const frame)
-+{
-+ return av_rpi_is_sand8_frame(frame) ? 0 : 1;
-+}
-+
-+// If x is measured in bytes (not pixels) then this works for sand64_16 as
-+// well as sand128 - but in the general case we work that out
-+
-+static inline unsigned int av_rpi_sand_frame_off_y(const AVFrame * const frame, const unsigned int x_y, const unsigned int y)
-+{
-+ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
-+ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
-+ const unsigned int x = x_y << av_rpi_sand_frame_xshl(frame);
-+ const unsigned int x1 = x & (stride1 - 1);
-+ const unsigned int x2 = x ^ x1;
-+
-+ return x1 + stride1 * y + stride2 * x2;
-+}
-+
-+static inline unsigned int av_rpi_sand_frame_off_c(const AVFrame * const frame, const unsigned int x_c, const unsigned int y_c)
-+{
-+ const unsigned int stride1 = av_rpi_sand_frame_stride1(frame);
-+ const unsigned int stride2 = av_rpi_sand_frame_stride2(frame);
-+ const unsigned int x = x_c << (av_rpi_sand_frame_xshl(frame) + 1);
-+ const unsigned int x1 = x & (stride1 - 1);
-+ const unsigned int x2 = x ^ x1;
-+
-+ return x1 + stride1 * y_c + stride2 * x2;
-+}
-+
-+static inline uint8_t * av_rpi_sand_frame_pos_y(const AVFrame * const frame, const unsigned int x, const unsigned int y)
-+{
-+ return frame->data[0] + av_rpi_sand_frame_off_y(frame, x, y);
-+}
-+
-+static inline uint8_t * av_rpi_sand_frame_pos_c(const AVFrame * const frame, const unsigned int x, const unsigned int y)
-+{
-+ return frame->data[1] + av_rpi_sand_frame_off_c(frame, x, y);
-+}
-+
-+#endif
-+
-diff --git a/pi-util/BUILD.txt b/pi-util/BUILD.txt
-new file mode 100644
-index 0000000000..b1e99a6a89
---- /dev/null
-+++ b/pi-util/BUILD.txt
-@@ -0,0 +1,25 @@
-+Building Pi FFmpeg
-+==================
-+
-+Configuration:
-+=============
-+
-+pi-util/conf_pi2.sh
-+
-+contains suitable options to build the code for Pi2/3. It expects to find
-+git clones of
-+
-+https://github.com/raspberrypi/tools
-+https://github.com/raspberrypi/firmware
-+
-+in the parent of the FFmpeg directory. I recommend using --depth 1 to avoid a
-+lot of history you don't want.
-+
-+If you have a copy of qasm.py in ../local/bin then the .qasm sources will be
-+rebuilt. Otherwise the prebuilt .c & .h files will be used.
-+Likewise ../local/bin/vasmvidcore_std will enable VPU code rebuild
-+
-+pi-util/conf_p1.sh should configure for Pi1. Beware that as of this time
-+H265 QPU acceleration is broken on Pi1 and so it is disabled.
-+
-+
-diff --git a/pi-util/conf_h265.2016.csv b/pi-util/conf_h265.2016.csv
-new file mode 100644
-index 0000000000..3e90f6893f
---- /dev/null
-+++ b/pi-util/conf_h265.2016.csv
-@@ -0,0 +1,195 @@
-+1,HEVC_v1/AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5
-+1,HEVC_v1/AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5
-+1,HEVC_v1/AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5
-+1,HEVC_v1/AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5
-+1,HEVC_v1/AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5
-+1,HEVC_v1/AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5
-+1,HEVC_v1/AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5
-+1,HEVC_v1/AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5
-+1,HEVC_v1/BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5
-+1,HEVC_v1/CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5
-+1,HEVC_v1/CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5
-+1,HEVC_v1/CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5
-+1,HEVC_v1/CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5
-+1,HEVC_v1/CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5
-+1,HEVC_v1/CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5
-+1,HEVC_v1/CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5
-+1,HEVC_v1/CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5
-+1,HEVC_v1/CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5
-+1,HEVC_v1/cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5
-+1,HEVC_v1/CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5
-+1,HEVC_v1/CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5
-+1,HEVC_v1/DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5
-+1,HEVC_v1/DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5
-+1,HEVC_v1/DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5
-+1,HEVC_v1/DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5
-+1,HEVC_v1/DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5
-+1,HEVC_v1/DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5
-+1,HEVC_v1/DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5
-+1,HEVC_v1/DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5
-+1,HEVC_v1/DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5
-+1,HEVC_v1/DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5
-+1,HEVC_v1/DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5
-+1,HEVC_v1/DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5
-+1,HEVC_v1/DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5
-+1,HEVC_v1/DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5
-+1,HEVC_v1/ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5
-+1,HEVC_v1/ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5
-+1,HEVC_v1/ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5
-+1,HEVC_v1/EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5
-+1,HEVC_v1/FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5
-+1,HEVC_v1/HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5
-+1,HEVC_v1/INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5
-+1,HEVC_v1/INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5
-+1,HEVC_v1/ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5
-+1,HEVC_v1/ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5
-+1,HEVC_v1/ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5
-+1,HEVC_v1/ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5
-+1,HEVC_v1/ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5
-+1,HEVC_v1/IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5
-+1,HEVC_v1/IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5
-+1,HEVC_v1/IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5
-+1,HEVC_v1/LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5
-+1,HEVC_v1/LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5
-+1,HEVC_v1/LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5
-+1,HEVC_v1/MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5
-+1,HEVC_v1/MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5
-+1,HEVC_v1/MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5
-+1,HEVC_v1/MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5
-+1,HEVC_v1/MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5
-+1,HEVC_v1/MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5
-+1,HEVC_v1/MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5
-+1,HEVC_v1/MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5
-+1,HEVC_v1/MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5
-+1,HEVC_v1/MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5
-+1,HEVC_v1/MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5
-+1,HEVC_v1/MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5
-+1,HEVC_v1/MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5
-+1,HEVC_v1/NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5
-+1,HEVC_v1/NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5
-+1,HEVC_v1/NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5
-+1,HEVC_v1/OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5
-+1,HEVC_v1/OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5
-+1,HEVC_v1/OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5
-+1,HEVC_v1/PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5
-+1,HEVC_v1/PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5
-+1,HEVC_v1/PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5
-+1,HEVC_v1/PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5
-+1,HEVC_v1/PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5
-+1,HEVC_v1/PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5
-+1,HEVC_v1/PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5
-+1,HEVC_v1/PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5
-+1,HEVC_v1/PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5
-+1,HEVC_v1/POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5
-+1,HEVC_v1/PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5
-+1,HEVC_v1/PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5
-+1,HEVC_v1/RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5
-+1,HEVC_v1/RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5
-+1,HEVC_v1/RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5
-+1,HEVC_v1/RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5
-+1,HEVC_v1/RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5
-+1,HEVC_v1/RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5
-+1,HEVC_v1/RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5
-+1,HEVC_v1/RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5
-+1,HEVC_v1/RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5
-+1,HEVC_v1/RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5
-+1,HEVC_v1/RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5
-+1,HEVC_v1/RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5
-+1,HEVC_v1/RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5
-+1,HEVC_v1/RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5
-+1,HEVC_v1/RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5
-+1,HEVC_v1/RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5
-+1,HEVC_v1/RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5
-+1,HEVC_v1/SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5
-+1,HEVC_v1/SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5
-+1,HEVC_v1/SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5
-+1,HEVC_v1/SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5
-+1,HEVC_v1/SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5
-+1,HEVC_v1/SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5
-+1,HEVC_v1/SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5
-+1,HEVC_v1/SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5
-+1,HEVC_v1/SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt
-+1,HEVC_v1/SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt
-+1,HEVC_v1/SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5
-+1,HEVC_v1/SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5
-+1,HEVC_v1/SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5
-+1,HEVC_v1/SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5
-+1,HEVC_v1/SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5
-+1,HEVC_v1/SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5
-+1,HEVC_v1/SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5
-+1,HEVC_v1/STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5
-+1,HEVC_v1/STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5
-+1,HEVC_v1/TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5
-+1,HEVC_v1/TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5
-+1,HEVC_v1/TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5
-+1,HEVC_v1/TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5
-+1,HEVC_v1/TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5
-+1,HEVC_v1/TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5
-+3,HEVC_v1/TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth
-+1,HEVC_v1/TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5
-+1,HEVC_v1/VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5
-+3,HEVC_v1/VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ???
-+1,HEVC_v1/WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5
-+1,HEVC_v1/WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5
-+1,HEVC_v1/WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5
-+1,HEVC_v1/WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5
-+1,HEVC_v1/WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5
-+1,HEVC_v1/WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5
-+1,HEVC_v1/WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5
-+1,HEVC_v1/WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5
-+1,HEVC_v1/WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5
-+1,HEVC_v1/WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5
-+1,HEVC_v1/WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5
-+1,HEVC_v1/WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5
-+1,HEVC_v1/WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5
-+1,HEVC_v1/WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5
-+1,HEVC_v1/WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5
-+1,HEVC_v1/WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5
-+1,RExt/ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_2.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_yuv_2.md5
-+0,RExt/Bitdepth_A_RExt_Sony_1,Bitdepth_A_RExt_Sony_1.bin,md5sum.txt
-+0,RExt/Bitdepth_B_RExt_Sony_1,Bitdepth_B_RExt_Sony_1.bin,md5sum.txt
-+0,RExt/CCP_10bit_RExt_QCOM,CCP_10bit_RExt_QCOM.bin,CCP_10bit_RExt_QCOM_md5sum.txt
-+0,RExt/CCP_12bit_RExt_QCOM,CCP_12bit_RExt_QCOM.bin,CCP_12bit_RExt_QCOM_md5sum.txt
-+0,RExt/CCP_8bit_RExt_QCOM,CCP_8bit_RExt_QCOM.bin,CCP_8bit_RExt_QCOM_md5sum.txt
-+1,RExt/ExplicitRdpcm_A_BBC_1,ExplicitRdpcm_A_BBC_1.bit,md5sum.txt
-+0,RExt/ExplicitRdpcm_B_BBC_2,ExplicitRdpcm_B_BBC_1.bit,md5sum.txt
-+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_10BIT_RExt_Sony_1.md5
-+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_12BIT_RExt_Sony_1.md5
-+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_16BIT_RExt_Sony_1.md5
-+0,RExt/EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_HIGHTHROUGHPUT_444_16_INTRA_8BIT_RExt_Sony_1.md5
-+0,RExt/EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_10BIT_RExt_Sony_1.md5
-+0,RExt/EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_12BIT_RExt_Sony_1.md5
-+0,RExt/EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_16BIT_RExt_Sony_1.md5
-+0,RExt/EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.bit,EXTPREC_MAIN_444_16_INTRA_8BIT_RExt_Sony_1.md5
-+1,RExt/GENERAL_10b_420_RExt_Sony_1,GENERAL_10b_420_RExt_Sony_1.bit,GENERAL_10b_420_RExt_Sony_1.md5
-+1,RExt/GENERAL_10b_422_RExt_Sony_1,GENERAL_10b_422_RExt_Sony_1.bit,GENERAL_10b_422_RExt_Sony_1.md5
-+1,RExt/GENERAL_10b_444_RExt_Sony_2,GENERAL_10b_444_RExt_Sony_2.bit,GENERAL_10b_444_RExt_Sony_2.md5
-+1,RExt/GENERAL_12b_400_RExt_Sony_1,GENERAL_12b_400_RExt_Sony_1.bit,GENERAL_12b_400_RExt_Sony_1.md5
-+1,RExt/GENERAL_12b_420_RExt_Sony_1,GENERAL_12b_420_RExt_Sony_1.bit,GENERAL_12b_420_RExt_Sony_1.md5
-+1,RExt/GENERAL_12b_422_RExt_Sony_1,GENERAL_12b_422_RExt_Sony_1.bit,GENERAL_12b_422_RExt_Sony_1.md5
-+1,RExt/GENERAL_12b_444_RExt_Sony_2,GENERAL_12b_444_RExt_Sony_2.bit,GENERAL_12b_444_RExt_Sony_2.md5
-+0,RExt/GENERAL_16b_400_RExt_Sony_1,GENERAL_16b_400_RExt_Sony_1.bit,GENERAL_16b_400_RExt_Sony_1.md5
-+0,RExt/GENERAL_16b_444_highThroughput_RExt_Sony_2,GENERAL_16b_444_highThroughput_RExt_Sony_2.bit,GENERAL_16b_444_highThroughput_RExt_Sony_2.md5
-+0,RExt/GENERAL_16b_444_RExt_Sony_2,GENERAL_16b_444_RExt_Sony_2.bit,GENERAL_16b_444_RExt_Sony_2.md5
-+1,RExt/GENERAL_8b_400_RExt_Sony_1,GENERAL_8b_400_RExt_Sony_1.bit,GENERAL_8b_400_RExt_Sony_1.md5
-+1,RExt/GENERAL_8b_420_RExt_Sony_1,GENERAL_8b_420_RExt_Sony_1.bit,GENERAL_8b_420_RExt_Sony_1.md5
-+1,RExt/GENERAL_8b_444_RExt_Sony_2,GENERAL_8b_444_RExt_Sony_2.bit,GENERAL_8b_444_RExt_Sony_2.md5
-+1,RExt/IPCM_A_RExt_NEC_2,IPCM_A_RExt_NEC_2.bit,IPCM_A_RExt_NEC_2_yuv.md5
-+1,RExt/IPCM_B_RExt_NEC,IPCM_B_RExt_NEC.bit,IPCM_B_RExt_NEC_yuv.md5
-+1,RExt/Main_422_10_A_RExt_Sony_2,Main_422_10_A_RExt_Sony_2.bin,md5sum.txt
-+1,RExt/Main_422_10_B_RExt_Sony_2,Main_422_10_B_RExt_Sony_2.bin,md5sum.txt
-+1,RExt/PERSIST_RPARAM_A_RExt_Sony_3,PERSIST_RPARAM_A_RExt_Sony_3.bit,PERSIST_RPARAM_A_RExt_Sony_3.md5
-+1,RExt/QMATRIX_A_RExt_Sony_1,QMATRIX_A_RExt_Sony_1.bit,QMATRIX_A_RExt_Sony_1.md5
-+0,RExt/SAO_A_RExt_MediaTek_1,SAO_A_RExt_MediaTek_1.bit,SAO_A_RExt_MediaTek_1.md5, # Runs out of memory - could be fixed
-+0,RExt/TSCTX_10bit_I_RExt_SHARP_1,TSCTX_10bit_I_RExt_SHARP_1.bin,TSCTX_10bit_I_RExt_SHARP_1.md5
-+0,RExt/TSCTX_10bit_RExt_SHARP_1,TSCTX_10bit_RExt_SHARP_1.bin,TSCTX_10bit_RExt_SHARP_1.md5
-+0,RExt/TSCTX_12bit_I_RExt_SHARP_1,TSCTX_12bit_I_RExt_SHARP_1.bin,TSCTX_12bit_I_RExt_SHARP_1.md5
-+0,RExt/TSCTX_12bit_RExt_SHARP_1,TSCTX_12bit_RExt_SHARP_1.bin,TSCTX_12bit_RExt_SHARP_1.md5
-+0,RExt/TSCTX_8bit_I_RExt_SHARP_1,TSCTX_8bit_I_RExt_SHARP_1.bin,TSCTX_8bit_I_RExt_SHARP_1.md5
-+0,RExt/TSCTX_8bit_RExt_SHARP_1,TSCTX_8bit_RExt_SHARP_1.bin,TSCTX_8bit_RExt_SHARP_1.md5
-+0,RExt/WAVETILES_RExt_Sony_2,WAVETILES_RExt_Sony_2.bit,WAVETILES_RExt_Sony_2.md5
-+1,local/sao_cu16_mobile_344x280,sao_cu16_mobile_344x280.265,sao_cu16_mobile_344x280.md5
-+1,local/dblk_cu16_mobile_344x280,dblk_cu16_mobile_344x280.265,dblk_cu16_mobile_344x280.md5
-+1,local/dblksao_cu16_mobile_344x280,dblksao_cu16_mobile_344x280.265,dblksao_cu16_mobile_344x280.md5
-+1,local/dblk_pu32_horses_832x448,dblk_pu32_horses_832x448.265,dblk_pu32_horses_832x448.md5
-+1,local/intra_pred_21_laps,intra_pred_21_laps.265,intra_pred_21_laps.md5
-diff --git a/pi-util/conf_h265.2016_HEVC_v1.csv b/pi-util/conf_h265.2016_HEVC_v1.csv
-new file mode 100644
-index 0000000000..6082641271
---- /dev/null
-+++ b/pi-util/conf_h265.2016_HEVC_v1.csv
-@@ -0,0 +1,147 @@
-+1,AMP_A_Samsung_7,AMP_A_Samsung_7.bin,AMP_A_Samsung_7.md5
-+1,AMP_B_Samsung_7,AMP_B_Samsung_7.bin,AMP_B_Samsung_7.md5
-+1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5
-+1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5
-+1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5
-+1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5
-+1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5
-+1,AMVP_C_Samsung_7,AMVP_C_Samsung_7.bin,AMVP_C_Samsung_7.md5
-+1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5
-+1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5
-+1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5
-+1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5
-+1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5
-+1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5
-+1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5
-+1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5
-+1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5
-+1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5
-+1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5
-+1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5
-+1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5
-+1,DBLK_A_MAIN10_VIXS_4,DBLK_A_MAIN10_VIXS_4.bit,DBLK_A_MAIN10_VIXS_4.md5
-+1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5
-+1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5
-+1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5
-+1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5
-+1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5
-+1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5
-+1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5
-+1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5
-+1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5
-+1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5
-+1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5
-+1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5
-+1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5
-+1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5
-+1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5
-+1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5
-+1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5
-+1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5
-+1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5
-+1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5
-+1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5
-+1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5
-+1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5
-+1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5
-+1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5
-+1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5
-+1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5
-+1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5
-+1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5
-+1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5
-+1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5
-+1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5
-+1,MAXBINS_A_TI_5,MAXBINS_A_TI_5.bit,MAXBINS_A_TI_5_yuv.md5
-+1,MAXBINS_B_TI_5,MAXBINS_B_TI_5.bit,MAXBINS_B_TI_5_yuv.md5
-+1,MAXBINS_C_TI_5,MAXBINS_C_TI_5.bit,MAXBINS_C_TI_5_yuv.md5
-+1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5
-+1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5
-+1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5
-+1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5
-+1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5
-+1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5
-+1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5
-+1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5
-+1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5
-+1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5
-+1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5
-+1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5
-+1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5
-+1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5
-+1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5
-+1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5
-+1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5
-+1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5
-+1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5
-+1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5
-+1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5
-+1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5
-+1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5
-+1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5
-+1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5
-+1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5
-+1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5
-+1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5
-+1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5
-+1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5
-+1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5
-+1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5
-+1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5
-+1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5
-+1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5
-+1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5
-+1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5
-+1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5
-+1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5
-+1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5
-+1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5
-+1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5
-+1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5
-+1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5
-+1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5
-+1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5
-+1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5
-+1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5
-+1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5
-+1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5
-+1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5
-+1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5
-+1,SAO_H_Parabola_1,SAO_H_Parabola_1.bit,SAO_H_Parabola_1.md5
-+2,SAODBLK_A_MainConcept_4,SAODBLK_A_MainConcept_4.bin,SAODBLK_A_MainConcept_4_md5.txt
-+2,SAODBLK_B_MainConcept_4,SAODBLK_B_MainConcept_4.bin,SAODBLK_B_MainConcept_4_md5.txt
-+1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5
-+1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5
-+1,SLIST_A_Sony_5,SLIST_A_Sony_5.bin,SLIST_A_Sony_5_yuv.md5
-+1,SLIST_B_Sony_9,SLIST_B_Sony_9.bin,SLIST_B_Sony_9_yuv.md5
-+1,SLIST_C_Sony_4,SLIST_C_Sony_4.bin,SLIST_C_Sony_4_yuv.md5
-+1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5
-+1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5
-+1,STRUCT_A_Samsung_7,STRUCT_A_Samsung_7.bin,STRUCT_A_Samsung_7.md5
-+1,STRUCT_B_Samsung_7,STRUCT_B_Samsung_7.bin,STRUCT_B_Samsung_7.md5
-+1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5
-+1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5
-+1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5
-+1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5
-+1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5
-+1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5
-+3,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # unequal bit depth
-+1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5
-+1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5
-+3,VPSSPSPPS_A_MainConcept_1,VPSSPSPPS_A_MainConcept_1.bin,VPSSPSPPS_A_MainConcept_1_md5.txt, # ???
-+1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5
-+1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5
-+1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5
-+1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5
-+1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5
-+1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5
-+1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5
-+1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5
-+1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5
-+1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5
-+1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5
-+1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5
-+1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5
-+1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5
-+1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5
-+1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5
-diff --git a/pi-util/conf_h265.csv b/pi-util/conf_h265.csv
-new file mode 100644
-index 0000000000..fc14f2a3c2
---- /dev/null
-+++ b/pi-util/conf_h265.csv
-@@ -0,0 +1,144 @@
-+1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.bit,ADJUST_IPRED_ANGLE_A_RExt_Mitsubishi_1.md5
-+1,AMP_A_Samsung_6,AMP_A_Samsung_6.bin,AMP_A_Samsung_6.md5
-+1,AMP_B_Samsung_6,AMP_B_Samsung_6.bin,AMP_B_Samsung_6.md5
-+1,AMP_D_Hisilicon_3,AMP_D_Hisilicon.bit,AMP_D_Hisilicon_3.yuv.md5
-+1,AMP_E_Hisilicon_3,AMP_E_Hisilicon.bit,AMP_E_Hisilicon_3.yuv.md5
-+1,AMP_F_Hisilicon_3,AMP_F_Hisilicon_3.bit,AMP_F_Hisilicon_3.yuv.md5
-+1,AMVP_A_MTK_4,AMVP_A_MTK_4.bit,AMVP_A_MTK_4.md5
-+1,AMVP_B_MTK_4,AMVP_B_MTK_4.bit,AMVP_B_MTK_4.md5
-+1,AMVP_C_Samsung_6,AMVP_C_Samsung_6.bin,AMVP_C_Samsung_6.md5
-+1,BUMPING_A_ericsson_1,BUMPING_A_ericsson_1.bit,BUMPING_A_ericsson_1.md5
-+1,CAINIT_A_SHARP_4,CAINIT_A_SHARP_4.bit,CAINIT_A_SHARP_4.md5
-+1,CAINIT_B_SHARP_4,CAINIT_B_SHARP_4.bit,CAINIT_B_SHARP_4.md5
-+1,CAINIT_C_SHARP_3,CAINIT_C_SHARP_3.bit,CAINIT_C_SHARP_3.md5
-+1,CAINIT_D_SHARP_3,CAINIT_D_SHARP_3.bit,CAINIT_D_SHARP_3.md5
-+1,CAINIT_E_SHARP_3,CAINIT_E_SHARP_3.bit,CAINIT_E_SHARP_3.md5
-+1,CAINIT_F_SHARP_3,CAINIT_F_SHARP_3.bit,CAINIT_F_SHARP_3.md5
-+1,CAINIT_G_SHARP_3,CAINIT_G_SHARP_3.bit,CAINIT_G_SHARP_3.md5
-+1,CAINIT_H_SHARP_3,CAINIT_H_SHARP_3.bit,CAINIT_H_SHARP_3.md5
-+1,CIP_A_Panasonic_3,CIP_A_Panasonic_3.bit,CIP_A_Panasonic_3_yuv.md5
-+1,cip_B_NEC_3,cip_B_NEC_3.bit,cip_B_NEC_3.md5
-+1,CIP_C_Panasonic_2,CIP_C_Panasonic_2.bit,CIP_C_Panasonic_2_yuv.md5
-+1,CONFWIN_A_Sony_1,CONFWIN_A_Sony_1.bit,CONFWIN_A_Sony_1.md5
-+1,DBLK_A_MAIN10_VIXS_3,DBLK_A_MAIN10_VIXS_3.bit,DBLK_A_MAIN10_VIXS_3.md5
-+1,DBLK_A_SONY_3,DBLK_A_SONY_3.bit,DBLK_A_SONY_3.bit.yuv.md5
-+1,DBLK_B_SONY_3,DBLK_B_SONY_3.bit,DBLK_B_SONY_3.bit.yuv.md5
-+1,DBLK_C_SONY_3,DBLK_C_SONY_3.bit,DBLK_C_SONY_3.bit.yuv.md5
-+1,DBLK_D_VIXS_2,DBLK_D_VIXS_2.bit,DBLK_D_VIXS_2_yuv.md5
-+1,DBLK_E_VIXS_2,DBLK_E_VIXS_2.bit,DBLK_E_VIXS_2_yuv.md5
-+1,DBLK_F_VIXS_2,DBLK_F_VIXS_2.bit,DBLK_F_VIXS_2_yuv.md5
-+1,DBLK_G_VIXS_2,DBLK_G_VIXS_2.bit,DBLK_G_VIXS_2_yuv.md5
-+1,DELTAQP_A_BRCM_4,DELTAQP_A_BRCM_4.bit,DELTAQP_A_BRCM_4_yuv.md5
-+1,DELTAQP_B_SONY_3,DELTAQP_B_SONY_3.bit,DELTAQP_B_SONY_3.bit.yuv.md5
-+1,DELTAQP_C_SONY_3,DELTAQP_C_SONY_3.bit,DELTAQP_C_SONY_3.bit.yuv.md5
-+1,DSLICE_A_HHI_5,DSLICE_A_HHI_5.bin,DSLICE_A_HHI_5.md5
-+1,DSLICE_B_HHI_5,DSLICE_B_HHI_5.bin,DSLICE_B_HHI_5.md5
-+1,DSLICE_C_HHI_5,DSLICE_C_HHI_5.bin,DSLICE_C_HHI_5.md5
-+1,ENTP_A_QUALCOMM_1,ENTP_A_Qualcomm_1.bit,ENTP_A_Qualcomm_1.md5
-+1,ENTP_B_Qualcomm_1,ENTP_B_Qualcomm_1.bit,ENTP_B_Qualcomm_1.md5
-+1,ENTP_C_Qualcomm_1,ENTP_C_Qualcomm_1.bit,ENTP_C_Qualcomm_1.md5
-+1,EXT_A_ericsson_4,EXT_A_ericsson_4.bit,EXT_A_ericsson_4.md5
-+1,FILLER_A_Sony_1,FILLER_A_Sony_1.bit,FILLER_A_Sony_1.md5
-+1,HRD_A_Fujitsu_3,HRD_A_Fujitsu_3.bin,HRD_A_Fujitsu_3.md5
-+1,INITQP_A_Sony_1,INITQP_A_Sony_1.bit,INITQP_A_Sony_1.md5
-+1,INITQP_B_Main10_Sony_1,INITQP_B_Main10_Sony_1.bit,INITQP_B_Main10_Sony_1.md5
-+1,ipcm_A_NEC_3,ipcm_A_NEC_3.bit,ipcm_A_NEC_3.md5
-+1,ipcm_B_NEC_3,ipcm_B_NEC_3.bit,ipcm_B_NEC_3.md5
-+1,ipcm_C_NEC_3,ipcm_C_NEC_3.bit,ipcm_C_NEC_3.md5
-+1,ipcm_D_NEC_3,ipcm_D_NEC_3.bit,ipcm_D_NEC_3.md5
-+1,ipcm_E_NEC_2,ipcm_E_NEC_2.bit,ipcm_E_NEC_2.md5
-+1,IPRED_A_docomo_2,IPRED_A_docomo_2.bit,IPRED_A_docomo_2.md5
-+1,IPRED_B_Nokia_3,IPRED_B_Nokia_3.bit,IPRED_B_Nokia_3_yuv.md5
-+1,IPRED_C_Mitsubishi_3,IPRED_C_Mitsubishi_3.bit,IPRED_C_Mitsubishi_3_yuv.md5
-+1,LS_A_Orange_2,LS_A_Orange_2.bit,LS_A_Orange_2_yuv.md5
-+1,LS_B_Orange_4,LS_B_Orange_4.bit,LS_B_Orange_4_yuv.md5
-+1,LTRPSPS_A_Qualcomm_1,LTRPSPS_A_Qualcomm_1.bit,LTRPSPS_A_Qualcomm_1.md5
-+1,MAXBINS_A_TI_4,MAXBINS_A_TI_4.bit,MAXBINS_A_TI_4.md5
-+1,MAXBINS_B_TI_4,MAXBINS_B_TI_4.bit,MAXBINS_B_TI_4.md5
-+1,MAXBINS_C_TI_4,MAXBINS_C_TI_4.bit,MAXBINS_C_TI_4.md5
-+1,MERGE_A_TI_3,MERGE_A_TI_3.bit,MERGE_A_TI_3.md5
-+1,MERGE_B_TI_3,MERGE_B_TI_3.bit,MERGE_B_TI_3.md5
-+1,MERGE_C_TI_3,MERGE_C_TI_3.bit,MERGE_C_TI_3.md5
-+1,MERGE_D_TI_3,MERGE_D_TI_3.bit,MERGE_D_TI_3.md5
-+1,MERGE_E_TI_3,MERGE_E_TI_3.bit,MERGE_E_TI_3.md5
-+1,MERGE_F_MTK_4,MERGE_F_MTK_4.bit,MERGE_F_MTK_4.md5
-+1,MERGE_G_HHI_4,MERGE_G_HHI_4.bit,MERGE_G_HHI_4.md5
-+1,MVCLIP_A_qualcomm_3,MVCLIP_A_qualcomm_3.bit,MVCLIP_A_qualcomm_3.yuv.md5
-+1,MVDL1ZERO_A_docomo_4,MVDL1ZERO_A_docomo_4.bit,MVDL1ZERO_A_docomo_4.md5
-+1,MVEDGE_A_qualcomm_3,MVEDGE_A_qualcomm_3.bit,MVEDGE_A_qualcomm_3.yuv.md5
-+1,NoOutPrior_A_Qualcomm_1,NoOutPrior_A_Qualcomm_1.bit,NoOutPrior_A_Qualcomm_1.md5
-+1,NoOutPrior_B_Qualcomm_1,NoOutPrior_B_Qualcomm_1.bit,NoOutPrior_B_Qualcomm_1.md5
-+1,NUT_A_ericsson_5,NUT_A_ericsson_5.bit,NUT_A_ericsson_5.md5
-+1,OPFLAG_A_Qualcomm_1,OPFLAG_A_Qualcomm_1.bit,OPFLAG_A_Qualcomm_1.md5
-+1,OPFLAG_B_Qualcomm_1,OPFLAG_B_Qualcomm_1.bit,OPFLAG_B_Qualcomm_1.md5
-+1,OPFLAG_C_Qualcomm_1,OPFLAG_C_Qualcomm_1.bit,OPFLAG_C_Qualcomm_1.md5
-+1,PICSIZE_A_Bossen_1,PICSIZE_A_Bossen_1.bin,PICSIZE_A_Bossen_1.md5
-+1,PICSIZE_B_Bossen_1,PICSIZE_B_Bossen_1.bin,PICSIZE_B_Bossen_1.md5
-+1,PICSIZE_C_Bossen_1,PICSIZE_C_Bossen_1.bin,PICSIZE_C_Bossen_1.md5
-+1,PICSIZE_D_Bossen_1,PICSIZE_D_Bossen_1.bin,PICSIZE_D_Bossen_1.md5
-+1,PMERGE_A_TI_3,PMERGE_A_TI_3.bit,PMERGE_A_TI_3.md5
-+1,PMERGE_B_TI_3,PMERGE_B_TI_3.bit,PMERGE_B_TI_3.md5
-+1,PMERGE_C_TI_3,PMERGE_C_TI_3.bit,PMERGE_C_TI_3.md5
-+1,PMERGE_D_TI_3,PMERGE_D_TI_3.bit,PMERGE_D_TI_3.md5
-+1,PMERGE_E_TI_3,PMERGE_E_TI_3.bit,PMERGE_E_TI_3.md5
-+1,POC_A_Bossen_3,POC_A_Bossen_3.bin,POC_A_Bossen_3.md5
-+1,PPS_A_qualcomm_7,PPS_A_qualcomm_7.bit,PPS_A_qualcomm_7.yuv.md5
-+1,PS_B_VIDYO_3,PS_B_VIDYO_3.bit,PS_B_VIDYO_3_yuv.md5
-+1,RAP_A_docomo_6,RAP_A_docomo_6.bit,RAP_A_docomo_6.md5
-+1,RAP_B_Bossen_2,RAP_B_Bossen_2.bit,RAP_B_Bossen_2.md5
-+1,RPLM_A_qualcomm_4,RPLM_A_qualcomm_4.bit,RPLM_A_qualcomm_4.yuv.md5
-+1,RPLM_B_qualcomm_4,RPLM_B_qualcomm_4.bit,RPLM_B_qualcomm_4.yuv.md5
-+1,RPS_A_docomo_5,RPS_A_docomo_5.bit,RPS_A_docomo_5.md5
-+1,RPS_B_qualcomm_5,RPS_B_qualcomm_5.bit,RPS_B_qualcomm_5.yuv.md5
-+1,RPS_C_ericsson_5,RPS_C_ericsson_5.bit,RPS_C_ericsson_5.md5
-+1,RPS_D_ericsson_6,RPS_D_ericsson_6.bit,RPS_D_ericsson_6.md5
-+1,RPS_E_qualcomm_5,RPS_E_qualcomm_5.bit,RPS_E_qualcomm_5.yuv.md5
-+1,RPS_F_docomo_2,RPS_F_docomo_2.bit,RPS_F_docomo_2.md5
-+1,RQT_A_HHI_4,RQT_A_HHI_4.bit,RQT_A_HHI_4.md5
-+1,RQT_B_HHI_4,RQT_B_HHI_4.bit,RQT_B_HHI_4.md5
-+1,RQT_C_HHI_4,RQT_C_HHI_4.bit,RQT_C_HHI_4.md5
-+1,RQT_D_HHI_4,RQT_D_HHI_4.bit,RQT_D_HHI_4.md5
-+1,RQT_E_HHI_4,RQT_E_HHI_4.bit,RQT_E_HHI_4.md5
-+1,RQT_F_HHI_4,RQT_F_HHI_4.bit,RQT_F_HHI_4.md5
-+1,RQT_G_HHI_4,RQT_G_HHI_4.bit,RQT_G_HHI_4.md5
-+1,SAO_A_MediaTek_4,SAO_A_MediaTek_4.bit,SAO_A_MediaTek_4.md5
-+1,SAO_B_MediaTek_5,SAO_B_MediaTek_5.bit,SAO_B_MediaTek_5.md5
-+1,SAO_C_Samsung_5,SAO_C_Samsung_5.bin,SAO_C_Samsung_5.md5
-+1,SAO_D_Samsung_5,SAO_D_Samsung_5.bin,SAO_D_Samsung_5.md5
-+1,SAO_E_Canon_4,SAO_E_Canon_4.bit,SAO_E_Canon_4.md5
-+1,SAO_F_Canon_3,SAO_F_Canon_3.bit,SAO_F_Canon_3.md5
-+1,SAO_G_Canon_3,SAO_G_Canon_3.bit,SAO_G_Canon_3.md5
-+1,SDH_A_Orange_4,SDH_A_Orange_4.bit,SDH_A_Orange_4_yuv.md5
-+1,SLICES_A_Rovi_3,SLICES_A_Rovi_3.bin,SLICES_A_Rovi_3.md5
-+1,SLIST_A_Sony_4,str.bin,SLIST_A_Sony_4_yuv.md5
-+1,SLIST_B_Sony_8,str.bin,SLIST_B_Sony_8_yuv.md5
-+1,SLIST_C_Sony_3,str.bin,SLIST_C_Sony_3_yuv.md5
-+1,SLIST_D_Sony_9,str.bin,SLIST_D_Sony_9_yuv.md5
-+1,SLPPLP_A_VIDYO_2,SLPPLP_A_VIDYO_2.bit,SLPPLP_A_VIDYO_2_yuv.md5
-+1,STRUCT_A_Samsung_6,STRUCT_A_Samsung_6.bin,STRUCT_A_Samsung_6.md5
-+1,STRUCT_B_Samsung_6,STRUCT_B_Samsung_6.bin,STRUCT_B_Samsung_6.md5
-+1,TILES_A_Cisco_2,TILES_A_Cisco_2.bin,TILES_A_Cisco_2_yuv.md5
-+1,TILES_B_Cisco_1,TILES_B_Cisco_1.bin,TILES_B_Cisco_1_yuv.md5
-+1,TMVP_A_MS_3,TMVP_A_MS_3.bit,TMVP_A_MS_3.yuv.md5
-+1,TSCL_A_VIDYO_5,TSCL_A_VIDYO_5.bit,TSCL_A_VIDYO_5_yuv.md5
-+1,TSCL_B_VIDYO_4,TSCL_B_VIDYO_4.bit,TSCL_B_VIDYO_4_yuv.md5
-+1,TSKIP_A_MS_3,TSKIP_A_MS_3.bit,TSKIP_A_MS_3.yuv.md5
-+0,TSUNEQBD_A_MAIN10_Technicolor_2,TSUNEQBD_A_MAIN10_Technicolor_2.bit,TSUNEQBD_A_MAIN10_Technicolor_2_yuv.md5, # Y/C bit depth unmatched
-+1,TUSIZE_A_Samsung_1,TUSIZE_A_Samsung_1.bin,TUSIZE_A_Samsung_1.md5
-+1,VPSID_A_VIDYO_2,VPSID_A_VIDYO_2.bit,VPSID_A_VIDYO_2_yuv.md5
-+1,WP_A_MAIN10_Toshiba_3,WP_A_MAIN10_Toshiba_3.bit,WP_A_MAIN10_Toshiba_3_yuv.md5
-+1,WP_A_Toshiba_3,WP_A_Toshiba_3.bit,WP_A_Toshiba_3_yuv.md5
-+1,WP_B_Toshiba_3,WP_B_Toshiba_3.bit,WP_B_Toshiba_3_yuv.md5
-+1,WP_MAIN10_B_Toshiba_3,WP_MAIN10_B_Toshiba_3.bit,WP_MAIN10_B_Toshiba_3_yuv.md5
-+1,WPP_A_ericsson_MAIN10_2,WPP_A_ericsson_MAIN10_2.bit,WPP_A_ericsson_MAIN10_yuv.md5
-+1,WPP_A_ericsson_MAIN_2,WPP_A_ericsson_MAIN_2.bit,WPP_A_ericsson_MAIN_2_yuv.md5
-+1,WPP_B_ericsson_MAIN10_2,WPP_B_ericsson_MAIN10_2.bit,WPP_B_ericsson_MAIN10_yuv.md5
-+1,WPP_B_ericsson_MAIN_2,WPP_B_ericsson_MAIN_2.bit,WPP_B_ericsson_MAIN_2_yuv.md5
-+1,WPP_C_ericsson_MAIN10_2,WPP_C_ericsson_MAIN10_2.bit,WPP_C_ericsson_MAIN10_yuv.md5
-+1,WPP_C_ericsson_MAIN_2,WPP_C_ericsson_MAIN_2.bit,WPP_C_ericsson_MAIN_2_yuv.md5
-+1,WPP_D_ericsson_MAIN10_2,WPP_D_ericsson_MAIN10_2.bit,WPP_D_ericsson_MAIN10_yuv.md5
-+1,WPP_D_ericsson_MAIN_2,WPP_D_ericsson_MAIN_2.bit,WPP_D_ericsson_MAIN_2_yuv.md5
-+1,WPP_E_ericsson_MAIN10_2,WPP_E_ericsson_MAIN10_2.bit,WPP_E_ericsson_MAIN10_yuv.md5
-+1,WPP_E_ericsson_MAIN_2,WPP_E_ericsson_MAIN_2.bit,WPP_E_ericsson_MAIN_2_yuv.md5
-+1,WPP_F_ericsson_MAIN10_2,WPP_F_ericsson_MAIN10_2.bit,WPP_F_ericsson_MAIN10_yuv.md5
-+1,WPP_F_ericsson_MAIN_2,WPP_F_ericsson_MAIN_2.bit,WPP_F_ericsson_MAIN_2_yuv.md5
-diff --git a/pi-util/conf_pi1.sh b/pi-util/conf_pi1.sh
-new file mode 100755
-index 0000000000..59c0d3959e
---- /dev/null
-+++ b/pi-util/conf_pi1.sh
-@@ -0,0 +1,30 @@
-+echo "Configure for Pi1"
-+
-+RPI_TOOLROOT=`pwd`/../tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf
-+RPI_OPT_VC=`pwd`/../firmware/opt/vc
-+
-+RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
-+RPI_LIBDIRS="-L$RPI_TOOLROOT/lib -L$RPI_OPT_VC/lib"
-+#RPI_KEEPS="-save-temps=obj"
-+RPI_KEEPS=""
-+
-+./configure --enable-cross-compile\
-+ --cpu=arm1176jzf-s\
-+ --arch=arm\
-+ --disable-neon\
-+ --target-os=linux\
-+ --disable-stripping\
-+ --enable-mmal\
-+ --extra-cflags="-g $RPI_KEEPS $RPI_INCLUDES"\
-+ --extra-cxxflags="$RPI_INCLUDES"\
-+ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_TOOLROOT/lib,-rpath-link=$RPI_TOOLROOT/lib"\
-+ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\
-+ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf-
-+
-+
-+# --enable-extra-warnings\
-+# --arch=armv71\
-+# --enable-shared\
-+
-+# gcc option for getting asm listing
-+# -Wa,-ahls
-diff --git a/pi-util/conf_pi2.sh b/pi-util/conf_pi2.sh
-new file mode 100755
-index 0000000000..40549a35e5
---- /dev/null
-+++ b/pi-util/conf_pi2.sh
-@@ -0,0 +1,32 @@
-+echo "Configure for Pi2/3"
-+
-+RPI_TOOLROOT=`pwd`/../tools/arm-bcm2708/arm-rpi-4.9.3-linux-gnueabihf
-+RPI_OPT_VC=`pwd`/../firmware/hardfp/opt/vc
-+
-+RPI_INCLUDES="-I$RPI_OPT_VC/include -I$RPI_OPT_VC/include/interface/vcos/pthreads -I$RPI_OPT_VC/include/interface/vmcs_host/linux"
-+RPI_LIBDIRS="-L$RPI_TOOLROOT/lib -L$RPI_OPT_VC/lib"
-+RPI_DEFINES="-D__VCCOREVER__=0x4000000 -mfpu=neon-vfpv4"
-+#RPI_KEEPS="-save-temps=obj"
-+RPI_KEEPS=""
-+
-+./configure --enable-cross-compile\
-+ --arch=armv6t2\
-+ --cpu=cortex-a7\
-+ --target-os=linux\
-+ --disable-stripping\
-+ --disable-thumb\
-+ --enable-mmal\
-+ --enable-rpi\
-+ --extra-cflags="-ggdb $RPI_KEEPS $RPI_DEFINES $RPI_INCLUDES"\
-+ --extra-cxxflags="$RPI_DEFINES $RPI_INCLUDES"\
-+ --extra-ldflags="$RPI_LIBDIRS -Wl,-rpath=/opt/vc/lib,-rpath-link=$RPI_OPT_VC/lib,-rpath=/lib,-rpath=/usr/lib,-rpath-link=$RPI_TOOLROOT/lib,-rpath-link=$RPI_TOOLROOT/lib"\
-+ --extra-libs="-Wl,--start-group -lbcm_host -lmmal -lmmal_util -lmmal_core -lvcos -lvcsm -lvchostif -lvchiq_arm"\
-+ --cross-prefix=$RPI_TOOLROOT/bin/arm-linux-gnueabihf-
-+
-+# --enable-decoder=hevc_rpi\
-+# --enable-extra-warnings\
-+# --arch=armv71\
-+# --enable-shared\
-+
-+# gcc option for getting asm listing
-+# -Wa,-ahls
-diff --git a/pi-util/ffconf.py b/pi-util/ffconf.py
-new file mode 100755
-index 0000000000..e9556f0837
---- /dev/null
-+++ b/pi-util/ffconf.py
-@@ -0,0 +1,175 @@
-+#!/usr/bin/env python
-+
-+import string
-+import os
-+import subprocess
-+import re
-+import argparse
-+import sys
-+import csv
-+from stat import *
-+
-+ffmpeg_exec = "./ffmpeg"
-+
-+def testone(fileroot, srcname, es_file, md5_file, vcodec):
-+ tmp_root = "/tmp"
-+
-+ names = srcname.split('/')
-+ while len(names) > 1:
-+ tmp_root = os.path.join(tmp_root, names[0])
-+ del names[0]
-+ name = names[0]
-+
-+ if not os.path.exists(tmp_root):
-+ os.makedirs(tmp_root)
-+
-+ dec_file = os.path.join(tmp_root, name + ".dec.md5")
-+ try:
-+ os.remove(dec_file)
-+ except:
-+ pass
-+
-+ flog = open(os.path.join(tmp_root, name + ".log"), "wt")
-+
-+ # Unaligned needed for cropping conformance
-+ rstr = subprocess.call(
-+ [ffmpeg_exec, "-flags", "unaligned", "-vcodec", vcodec, "-i", os.path.join(fileroot, es_file), "-f", "md5", dec_file],
-+ stdout=flog, stderr=subprocess.STDOUT)
-+
-+ try:
-+ m1 = None
-+ m2 = None
-+ with open(os.path.join(fileroot, md5_file)) as f:
-+ for line in f:
-+ m1 = re.search("[0-9a-f]{32}", line.lower())
-+ if m1:
-+ break
-+
-+ with open(dec_file) as f:
-+ m2 = re.search("[0-9a-f]{32}", f.readline())
-+ except:
-+ pass
-+
-+ if m1 and m2 and m1.group() == m2.group():
-+ print >> flog, "Match: " + m1.group()
-+ rv = 0
-+ elif not m1:
-+ print >> flog, "****** Cannot find m1"
-+ rv = 3
-+ elif not m2:
-+ print >> flog, "****** Cannot find m2"
-+ rv = 2
-+ else:
-+ print >> flog, "****** Mismatch: " + m1.group() + " != " + m2.group()
-+ rv = 1
-+ flog.close()
-+ return rv
-+
-+def scandir(root):
-+ aconf = []
-+ ents = os.listdir(root)
-+ ents.sort(key=str.lower)
-+ for name in ents:
-+ test_path = os.path.join(root, name)
-+ if S_ISDIR(os.stat(test_path).st_mode):
-+ files = os.listdir(test_path)
-+ es_file = "?"
-+ md5_file = "?"
-+ for f in files:
-+ (base, ext) = os.path.splitext(f)
-+ if base[0] == '.':
-+ pass
-+ elif ext == ".bit" or ext == ".bin":
-+ es_file = f
-+ elif ext == ".md5" or (ext == ".txt" and (base[-4:] == "_md5" or base[-6:] == "md5sum")):
-+ if md5_file == "?":
-+ md5_file = f
-+ elif base[-3:] == "yuv":
-+ md5_file = f
-+ aconf.append((1, name, es_file, md5_file))
-+ return aconf
-+
-+def runtest(name, tests):
-+ if not tests:
-+ return True
-+ for t in tests:
-+ if name[0:len(t)] == t or name.find("/" + t) != -1:
-+ return True
-+ return False
-+
-+def doconf(csva, tests, test_root, vcodec):
-+ unx_failures = []
-+ unx_success = []
-+ failures = 0
-+ successes = 0
-+ for a in csva:
-+ exp_test = int(a[0])
-+ if (exp_test and runtest(a[1], tests)):
-+ name = a[1]
-+ print "==== ", name,
-+ sys.stdout.flush()
-+
-+ rv = testone(os.path.join(test_root, name), name, a[2], a[3], vcodec=vcodec)
-+ if (rv == 0):
-+ successes += 1
-+ else:
-+ failures += 1
-+
-+ if (rv == 0):
-+ if exp_test == 2:
-+ print ": * OK *"
-+ unx_success.append(name)
-+ else:
-+ print ": ok"
-+ elif exp_test == 2 and rv == 1:
-+ print ": fail"
-+ elif exp_test == 3 and rv == 2:
-+ # Call an expected "crash" an abort
-+ print ": abort"
-+ else:
-+ unx_failures.append(name)
-+ if rv == 1:
-+ print ": * FAIL *"
-+ elif (rv == 2) :
-+ print ": * CRASH *"
-+ elif (rv == 3) :
-+ print ": * MD5 MISSING *"
-+ else :
-+ print ": * BANG *"
-+
-+ if unx_failures or unx_success:
-+ print "Unexpected Failures:", unx_failures
-+ print "Unexpected Success: ", unx_success
-+ else:
-+ print "All tests normal:", successes, "ok,", failures, "failed"
-+
-+
-+class ConfCSVDialect(csv.Dialect):
-+ delimiter = ','
-+ doublequote = True
-+ lineterminator = '\n'
-+ quotechar='"'
-+ quoting = csv.QUOTE_MINIMAL
-+ skipinitialspace = True
-+ strict = True
-+
-+if __name__ == '__main__':
-+
-+ argp = argparse.ArgumentParser(description="FFmpeg h265 conformance tester")
-+ argp.add_argument("tests", nargs='*')
-+ argp.add_argument("--test_root", default="/opt/conform/h265.2016", help="Root dir for test")
-+ argp.add_argument("--csvgen", action='store_true', help="Generate CSV file for dir")
-+ argp.add_argument("--csv", default="pi-util/conf_h265.2016.csv", help="CSV filename")
-+ argp.add_argument("--vcodec", default="hevc_rpi", help="vcodec name to use")
-+ args = argp.parse_args()
-+
-+ if args.csvgen:
-+ csv.writer(sys.stdout).writerows(scandir(args.test_root))
-+ exit(0)
-+
-+ with open(args.csv, 'rt') as csvfile:
-+ csva = [a for a in csv.reader(csvfile, ConfCSVDialect())]
-+
-+
-+ doconf(csva, args.tests, args.test_root, args.vcodec)
-+
-diff --git a/pi-util/ffperf.py b/pi-util/ffperf.py
-new file mode 100755
-index 0000000000..8bb326943f
---- /dev/null
-+++ b/pi-util/ffperf.py
-@@ -0,0 +1,125 @@
-+#!/usr/bin/env python3
-+
-+import time
-+import string
-+import os
-+import tempfile
-+import subprocess
-+import re
-+import argparse
-+import sys
-+import csv
-+from stat import *
-+
-+class tstats:
-+ close_threshold = 0.01
-+
-+ def __init__(self, stats_dict=None):
-+ if stats_dict != None:
-+ self.name = stats_dict["name"]
-+ self.elapsed = float(stats_dict["elapsed"])
-+ self.user = float(stats_dict["user"])
-+ self.sys = float(stats_dict["sys"])
-+
-+ def times_str(self):
-+ ctime = self.sys + self.user
-+ return "time=%6.2f, cpu=%6.2f (%4.2f%%)" % (self.elapsed, ctime, (ctime * 100.0) / self.elapsed)
-+
-+ def dict(self):
-+ return {"name":self.name, "elapsed":self.elapsed, "user":self.user, "sys":self.sys}
-+
-+ def is_close(self, other):
-+ return abs(self.elapsed - other.elapsed) / self.elapsed < self.close_threshold
-+
-+ def __lt__(self, other):
-+ return self.elapsed < other.elapsed
-+ def __gt__(self, other):
-+ return self.elapsed > other.elapsed
-+
-+ def time_file(name, prefix):
-+ stats = tstats()
-+ stats.name = name
-+ start_time = time.clock_gettime(time.CLOCK_MONOTONIC);
-+ cproc = subprocess.Popen(["./ffmpeg", "-t", "30", "-i", prefix + name,
-+ "-f", "null", os.devnull], bufsize=-1, stdout=flog, stderr=flog);
-+ pinfo = os.wait4(cproc.pid, 0)
-+ end_time = time.clock_gettime(time.CLOCK_MONOTONIC);
-+ stats.elapsed = end_time - start_time
-+ stats.user = pinfo[2].ru_utime
-+ stats.sys = pinfo[2].ru_stime
-+ return stats
-+
-+
-+def common_prefix(s1, s2):
-+ for i in range(min(len(s1),len(s2))):
-+ if s1[i] != s2[i]:
-+ return s1[:i]
-+ return s1[:i+1]
-+
-+def main():
-+ global flog
-+
-+ argp = argparse.ArgumentParser(description="FFmpeg performance tester", epilog="""
-+To blank the screen before starting use "xdg-screensaver activate"
-+(For some reason this doesn't seem to work from within python).
-+""")
-+
-+ argp.add_argument("streams", nargs='*')
-+ argp.add_argument("--csv_out", default="ffperf_out.csv", help="CSV output filename")
-+ argp.add_argument("--csv_in", help="CSV input filename")
-+ argp.add_argument("--prefix", help="Filename prefix (include terminal '/' if a directory).")
-+ argp.add_argument("--repeat", default=3, type=int, help="Run repeat count")
-+
-+ args = argp.parse_args()
-+
-+ csv_out = csv.DictWriter(open(args.csv_out, 'w', newline=''), ["name", "elapsed", "user", "sys"])
-+ csv_out.writeheader()
-+
-+ stats_in = {}
-+ if args.csv_in != None:
-+ with open(args.csv_in, 'r', newline='') as f_in:
-+ stats_in = {x["name"]:tstats(x) for x in csv.DictReader(f_in)}
-+
-+ flog = open(os.path.join(tempfile.gettempdir(), "ffperf.log"), "wt")
-+
-+ streams = args.streams
-+ if not streams:
-+ if not stats_in:
-+ print ("No source streams specified")
-+ return 1
-+ prefix = "" if args.prefix == None else args.prefix
-+ streams = [k for k in stats_in]
-+ elif args.prefix != None:
-+ prefix = args.prefix
-+ else:
-+ prefix = streams[0]
-+ for f in streams[1:]:
-+ prefix = common_prefix(prefix, f)
-+ pp = prefix.rpartition(os.sep)
-+ prefix = pp[0] + pp[1]
-+ streams = [s[len(prefix):] for s in streams]
-+
-+ for f in sorted(streams, key=lambda x : "~" * x.count(os.sep) + x.lower()):
-+ print ("====", f)
-+
-+ t0 = tstats({"name":f, "elapsed":999, "user":999, "sys":999})
-+ for i in range(args.repeat):
-+ t = tstats.time_file(f, prefix)
-+ print ("...", t.times_str())
-+ if t0 > t:
-+ t0 = t
-+
-+ if t0.name in stats_in:
-+ pstat = stats_in[t0.name]
-+ print("---" if pstat.is_close(t0) else "<<<" if t0 < pstat else ">>>", pstat.times_str())
-+
-+ csv_out.writerow(t0.dict())
-+
-+ print ()
-+
-+ return 0
-+
-+
-+if __name__ == '__main__':
-+ exit(main())
-+
-diff --git a/pi-util/make_array.py b/pi-util/make_array.py
-new file mode 100755
-index 0000000000..67b22d2d51
---- /dev/null
-+++ b/pi-util/make_array.py
-@@ -0,0 +1,23 @@
-+#!/usr/bin/env python
-+
-+# Usage
-+# make_array file.bin
-+# Produces file.h with array of bytes.
-+#
-+import sys
-+for file in sys.argv[1:]:
-+ prefix,suffix = file.split('.')
-+ assert suffix=='bin'
-+ name=prefix.split('/')[-1]
-+ print 'Converting',file
-+ with open(prefix+'.h','wb') as out:
-+ print >>out, 'static const unsigned char',name,'[] = {'
-+ with open(file,'rb') as fd:
-+ i = 0
-+ for byte in fd.read():
-+ print >>out, '0x%02x, ' % ord(byte),
-+ i = i + 1
-+ if i % 8 == 0:
-+ print >>out, ' // %04x' % (i - 8)
-+ print >>out,'};'
-+
-diff --git a/pi-util/perfcmp.py b/pi-util/perfcmp.py
-new file mode 100644
-index 0000000000..e44cfa0c3c
---- /dev/null
-+++ b/pi-util/perfcmp.py
-@@ -0,0 +1,101 @@
-+#!/usr/bin/env python3
-+
-+import time
-+import string
-+import os
-+import tempfile
-+import subprocess
-+import re
-+import argparse
-+import sys
-+import csv
-+from stat import *
-+
-+class tstats:
-+ close_threshold = 0.01
-+
-+ def __init__(self, stats_dict=None):
-+ if stats_dict != None:
-+ self.name = stats_dict["name"]
-+ self.elapsed = float(stats_dict["elapsed"])
-+ self.user = float(stats_dict["user"])
-+ self.sys = float(stats_dict["sys"])
-+
-+ def times_str(self):
-+ ctime = self.sys + self.user
-+ return "time=%6.2f, cpu=%6.2f (%4.2f%%)" % (self.elapsed, ctime, (ctime * 100.0) / self.elapsed)
-+
-+ def dict(self):
-+ return {"name":self.name, "elapsed":self.elapsed, "user":self.user, "sys":self.sys}
-+
-+ def is_close(self, other):
-+ return abs(self.elapsed - other.elapsed) / self.elapsed < self.close_threshold
-+
-+ def __lt__(self, other):
-+ return self.elapsed < other.elapsed
-+ def __gt__(self, other):
-+ return self.elapsed > other.elapsed
-+
-+ def time_file(name, prefix):
-+ stats = tstats()
-+ stats.name = name
-+ start_time = time.clock_gettime(time.CLOCK_MONOTONIC);
-+ cproc = subprocess.Popen(["./ffmpeg", "-t", "30", "-i", prefix + name,
-+ "-f", "null", os.devnull], bufsize=-1, stdout=flog, stderr=flog);
-+ pinfo = os.wait4(cproc.pid, 0)
-+ end_time = time.clock_gettime(time.CLOCK_MONOTONIC);
-+ stats.elapsed = end_time - start_time
-+ stats.user = pinfo[2].ru_utime
-+ stats.sys = pinfo[2].ru_stime
-+ return stats
-+
-+
-+def common_prefix(s1, s2):
-+ for i in range(min(len(s1),len(s2))):
-+ if s1[i] != s2[i]:
-+ return s1[:i]
-+ return s1[:i+1]
-+
-+def main():
-+ argp = argparse.ArgumentParser(description="FFmpeg performance compare")
-+
-+ argp.add_argument("stream0", help="CSV to compare")
-+ argp.add_argument("stream1", nargs='?', default="ffperf_out.csv", help="CSV to compare")
-+
-+ args = argp.parse_args()
-+
-+ with open(args.stream0, 'r', newline='') as f_in:
-+ stats0 = {x["name"]:tstats(x) for x in csv.DictReader(f_in)}
-+ with open(args.stream1, 'r', newline='') as f_in:
-+ stats1 = {x["name"]:tstats(x) for x in csv.DictReader(f_in)}
-+
-+ print (args.stream0, "<<-->>", args.stream1)
-+ print ()
-+
-+ for f in sorted(stats0.keys() | stats1.keys(), key=lambda x : "~" * x.count(os.sep) + x.lower()):
-+ if not (f in stats0) :
-+ print (" XX :", f)
-+ continue
-+ if not (f in stats1) :
-+ print (" XX :", f)
-+ continue
-+
-+ s0 = stats0[f]
-+ s1 = stats1[f]
-+
-+ pcent = ((s0.elapsed - s1.elapsed) / s0.elapsed) * 100.0
-+ thresh = 0.3
-+ tc = 6
-+
-+ nchar = min(tc - 1, int(abs(pcent) / thresh))
-+ cc = " -- " if nchar == 0 else "<" * nchar + " " * (tc - nchar) if pcent < 0 else " " * (tc - nchar) + ">" * nchar
-+
-+ print ("%6.2f %s%6.2f (%+5.2f) : %s" %
-+ (s0.elapsed, cc, s1.elapsed, pcent, f))
-+
-+ return 0
-+
-+
-+if __name__ == '__main__':
-+ exit(main())
-+
-diff --git a/pi-util/qem.sh b/pi-util/qem.sh
-new file mode 100755
-index 0000000000..a4dbb6eacd
---- /dev/null
-+++ b/pi-util/qem.sh
-@@ -0,0 +1,9 @@
-+TARGET_DIR=../src/eupton_vc4dev_2012a/software/vc4/DEV/applications/tutorials/user_shader_example_tex
-+QASM=python\ ../local/bin/qasm.py
-+SRC_FILE=libavcodec/rpi_hevc_shader.qasm
-+DST_BASE=shader
-+
-+cp libavcodec/rpi_hevc_shader_cmd.h $TARGET_DIR
-+$QASM -mc_c:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.c
-+$QASM -mc_h:$DST_BASE,$DST_BASE,$DST_BASE $SRC_FILE > $TARGET_DIR/$DST_BASE.h
-+
-diff --git a/pi-util/v3dusage.py b/pi-util/v3dusage.py
-new file mode 100755
-index 0000000000..5935a11ca5
---- /dev/null
-+++ b/pi-util/v3dusage.py
-@@ -0,0 +1,128 @@
-+#!/usr/bin/env python
-+
-+import sys
-+import argparse
-+import re
-+
-+def do_logparse(logname):
-+
-+ rmatch = re.compile(r'^([0-9]+\.[0-9]{3}): (done )?((vpu0)|(vpu1)|(qpu1)) ([A-Z_]+) cb:([0-9a-f]+) ')
-+ rqcycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs doing vertex/coordinate shading +([0-9]+)$')
-+ rqtscycle = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: QPU Total clock cycles for all QPUs stalled waiting for TMUs +([0-9]+)$')
-+ rl2hits = re.compile(r'^([0-9]+\.[0-9]{3}): v3d: L2C Total Level 2 cache ([a-z]+) +([0-9]+)$')
-+
-+ ttotal = {'idle':0.0}
-+ tstart = {}
-+ qctotal = {}
-+ qtstotal = {}
-+ l2hits = {}
-+ l2total = {}
-+ time0 = None
-+ idle_start = None
-+ qpu_op_no = 0
-+ op_count = 0
-+
-+ with open(logname, "rt") as infile:
-+ for line in infile:
-+ match = rmatch.match(line)
-+ if match:
-+# print match.group(1), ":", match.group(2), ":", match.group(3), ":", match.group(7), ":"
-+ time = float(match.group(1))
-+ unit = match.group(3)
-+ opstart = not match.group(2)
-+ optype = match.group(7)
-+ hascb = match.group(8) != "0"
-+
-+ if unit == 'qpu1':
-+ unit = unit + "." + str(qpu_op_no)
-+ if not opstart:
-+ if hascb or optype == 'EXECUTE_SYNC':
-+ qpu_op_no = 0
-+ else:
-+ qpu_op_no += 1
-+
-+ # Ignore sync type
-+ if optype == 'EXECUTE_SYNC':
-+ continue
-+
-+ if not time0:
-+ time0 = time
-+
-+ if opstart:
-+ tstart[unit] = time;
-+ elif unit in tstart:
-+ op_count += 1
-+ if not unit in ttotal:
-+ ttotal[unit] = 0.0
-+ ttotal[unit] += time - tstart[unit]
-+ del tstart[unit]
-+
-+ if not idle_start and not tstart:
-+ idle_start = time
-+ elif idle_start and tstart:
-+ ttotal['idle'] += time - idle_start
-+ idle_start = None
-+
-+ match = rqcycle.match(line)
-+ if match:
-+ unit = "qpu1." + str(qpu_op_no)
-+ if not unit in qctotal:
-+ qctotal[unit] = 0
-+ qctotal[unit] += int(match.group(2))
-+
-+ match = rqtscycle.match(line)
-+ if match:
-+ unit = "qpu1." + str(qpu_op_no)
-+ if not unit in qtstotal:
-+ qtstotal[unit] = 0
-+ qtstotal[unit] += int(match.group(2))
-+
-+ match = rl2hits.match(line)
-+ if match:
-+ unit = "qpu1." + str(qpu_op_no)
-+ if not unit in l2total:
-+ l2total[unit] = 0
-+ l2hits[unit] = 0
-+ l2total[unit] += int(match.group(3))
-+ if match.group(2) == "hits":
-+ l2hits[unit] += int(match.group(3))
-+
-+
-+ if not time0:
-+ print "No v3d profile records found"
-+ else:
-+ tlogged = time - time0
-+
-+ print "Logged time:", tlogged, " Op count:", op_count
-+ for unit in sorted(ttotal):
-+ print b'%6s: %10.3f %7.3f%%' % (unit, ttotal[unit], ttotal[unit] * 100.0 / tlogged)
-+ print
-+ for unit in sorted(qctotal):
-+ if not unit in qtstotal:
-+ qtstotal[unit] = 0;
-+ print b'%6s: Qcycles: %10d, TMU stall: %10d (%7.3f%%)' % (unit, qctotal[unit], qtstotal[unit], (qtstotal[unit] * 100.0)/qctotal[unit])
-+ if unit in l2total:
-+ print b' L2Total: %10d, hits: %10d (%7.3f%%)' % (l2total[unit], l2hits[unit], (l2hits[unit] * 100.0)/l2total[unit])
-+
-+
-+
-+if __name__ == '__main__':
-+ argp = argparse.ArgumentParser(
-+ formatter_class=argparse.RawDescriptionHelpFormatter,
-+ description="QPU/VPU perf summary from VC logging",
-+ epilog = """
-+Will also summarise TMU stalls if logging requests set in qpu noflush param
-+in the profiled code.
-+
-+Example use:
-+ vcgencmd set_logging level=0xc0
-+ <command to profile>
-+ sudo vcdbg log msg >& t.log
-+ v3dusage.py t.log
-+""")
-+
-+ argp.add_argument("logfile")
-+ args = argp.parse_args()
-+
-+ do_logparse(args.logfile)
-+
diff --git a/ffmpeg-rpi/ffmpeg-99.1004-added_upstream_mvc_patches.patch b/ffmpeg-rpi/ffmpeg-99.1004-added_upstream_mvc_patches.patch
deleted file mode 100644
index 551a271..0000000
--- a/ffmpeg-rpi/ffmpeg-99.1004-added_upstream_mvc_patches.patch
+++ /dev/null
@@ -1,284 +0,0 @@
-From 20af7af23a9f366476e67669f14957dfaf58f141 Mon Sep 17 00:00:00 2001
-From: Hendrik Leppkes <h.leppkes@gmail.com>
-Date: Sat, 9 Jan 2016 16:34:09 +0100
-Subject: [PATCH 1/4] avcodec: add h264_mvc codec id and profiles
-
----
- libavcodec/avcodec.h | 3 +++
- libavcodec/codec_desc.c | 7 +++++++
- libavcodec/profiles.c | 1 +
- libavformat/mpegts.c | 2 +-
- 4 files changed, 12 insertions(+), 1 deletion(-)
-
-diff --git a/libavcodec/avcodec.h b/libavcodec/avcodec.h
-index d962b9cf0a..4c4581c895 100644
---- a/libavcodec/avcodec.h
-+++ b/libavcodec/avcodec.h
-@@ -447,6 +447,8 @@ enum AVCodecID {
- AV_CODEC_ID_GDV,
- AV_CODEC_ID_FITS,
-
-+ AV_CODEC_ID_H264_MVC,
-+
- /* various PCM "codecs" */
- AV_CODEC_ID_FIRST_AUDIO = 0x10000, ///< A dummy id pointing at the start of audio codecs
- AV_CODEC_ID_PCM_S16LE = 0x10000,
-@@ -2895,6 +2897,7 @@ typedef struct AVCodecContext {
- #define FF_PROFILE_H264_HIGH_444_PREDICTIVE 244
- #define FF_PROFILE_H264_HIGH_444_INTRA (244|FF_PROFILE_H264_INTRA)
- #define FF_PROFILE_H264_CAVLC_444 44
-+#define FF_PROFILE_H264_MULTIVIEW_HIGH_DEPTH 138
-
- #define FF_PROFILE_VC1_SIMPLE 0
- #define FF_PROFILE_VC1_MAIN 1
-diff --git a/libavcodec/codec_desc.c b/libavcodec/codec_desc.c
-index 79552a910d..b55955476c 100644
---- a/libavcodec/codec_desc.c
-+++ b/libavcodec/codec_desc.c
-@@ -1647,6 +1647,13 @@ static const AVCodecDescriptor codec_descriptors[] = {
- .long_name = NULL_IF_CONFIG_SMALL("FITS (Flexible Image Transport System)"),
- .props = AV_CODEC_PROP_INTRA_ONLY | AV_CODEC_PROP_LOSSLESS,
- },
-+ {
-+ .id = AV_CODEC_ID_H264_MVC,
-+ .type = AVMEDIA_TYPE_VIDEO,
-+ .name = "h264_mvc",
-+ .long_name = NULL_IF_CONFIG_SMALL("H264 MVC"),
-+ .props = AV_CODEC_PROP_LOSSY,
-+ },
-
- /* various PCM "codecs" */
- {
-diff --git a/libavcodec/profiles.c b/libavcodec/profiles.c
-index d7dc960f36..e4651f12f9 100644
---- a/libavcodec/profiles.c
-+++ b/libavcodec/profiles.c
-@@ -72,6 +72,7 @@ const AVProfile ff_h264_profiles[] = {
- { FF_PROFILE_H264_CAVLC_444, "CAVLC 4:4:4" },
- { FF_PROFILE_H264_MULTIVIEW_HIGH, "Multiview High" },
- { FF_PROFILE_H264_STEREO_HIGH, "Stereo High" },
-+ { FF_PROFILE_H264_MULTIVIEW_HIGH_DEPTH, "Multiview High Depth" },
- { FF_PROFILE_UNKNOWN },
- };
-
-diff --git a/libavformat/mpegts.c b/libavformat/mpegts.c
-index 37a6aa8bff..52c5b659c4 100644
---- a/libavformat/mpegts.c
-+++ b/libavformat/mpegts.c
-@@ -701,7 +701,7 @@ static const StreamType ISO_types[] = {
- #endif
- { 0x1b, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264 },
- { 0x1c, AVMEDIA_TYPE_AUDIO, AV_CODEC_ID_AAC },
-- { 0x20, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264 },
-+ { 0x20, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_H264_MVC },
- { 0x21, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_JPEG2000 },
- { 0x24, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_HEVC },
- { 0x42, AVMEDIA_TYPE_VIDEO, AV_CODEC_ID_CAVS },
---
-2.17.0
-
-
-From 0f3fda4e348e6b12570f5d279713f6da46511846 Mon Sep 17 00:00:00 2001
-From: Hendrik Leppkes <h.leppkes@gmail.com>
-Date: Sat, 9 Jan 2016 16:34:40 +0100
-Subject: [PATCH 2/4] h264_parser: add support for parsing h264 mvc NALUs
-
----
- libavcodec/h264.h | 2 ++
- libavcodec/h264_parser.c | 34 ++++++++++++++++++++++++++++++----
- libavcodec/parser.c | 1 +
- 3 files changed, 33 insertions(+), 4 deletions(-)
-
-diff --git a/libavcodec/h264.h b/libavcodec/h264.h
-index 650580bf3a..c44a0cbedd 100644
---- a/libavcodec/h264.h
-+++ b/libavcodec/h264.h
-@@ -41,7 +41,9 @@ enum {
- H264_NAL_END_STREAM = 11,
- H264_NAL_FILLER_DATA = 12,
- H264_NAL_SPS_EXT = 13,
-+ H264_NAL_SPS_SUBSET = 15,
- H264_NAL_AUXILIARY_SLICE = 19,
-+ H264_NAL_SLICE_EXT = 20,
- };
-
-
-diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c
-index 1a9840a62c..be8b9db9b0 100644
---- a/libavcodec/h264_parser.c
-+++ b/libavcodec/h264_parser.c
-@@ -62,6 +62,7 @@ typedef struct H264ParseContext {
- int parse_last_mb;
- int64_t reference_dts;
- int last_frame_num, last_picture_structure;
-+ int is_mvc;
- } H264ParseContext;
-
-
-@@ -109,14 +110,18 @@ static int h264_find_frame_end(H264ParseContext *p, const uint8_t *buf,
- } else if (state <= 5) {
- int nalu_type = buf[i] & 0x1F;
- if (nalu_type == H264_NAL_SEI || nalu_type == H264_NAL_SPS ||
-- nalu_type == H264_NAL_PPS || nalu_type == H264_NAL_AUD) {
-+ nalu_type == H264_NAL_PPS || nalu_type == H264_NAL_AUD ||
-+ nalu_type == H264_NAL_SPS_SUBSET) {
- if (pc->frame_start_found) {
- i++;
- goto found;
- }
- } else if (nalu_type == H264_NAL_SLICE || nalu_type == H264_NAL_DPA ||
-- nalu_type == H264_NAL_IDR_SLICE) {
-+ nalu_type == H264_NAL_IDR_SLICE || (p->is_mvc && nalu_type == H264_NAL_SLICE_EXT)) {
- state += 8;
-+
-+ if (nalu_type == H264_NAL_SLICE_EXT)
-+ i += 3; // skip mvc extension
- continue;
- }
- state = 7;
-@@ -601,7 +606,8 @@ static int h264_parse(AVCodecParserContext *s,
- }
- }
-
-- parse_nal_units(s, avctx, buf, buf_size);
-+ if (!p->is_mvc)
-+ parse_nal_units(s, avctx, buf, buf_size);
-
- if (avctx->framerate.num)
- avctx->time_base = av_inv_q(av_mul_q(avctx->framerate, (AVRational){avctx->ticks_per_frame, 1}));
-@@ -658,7 +664,7 @@ static int h264_split(AVCodecContext *avctx,
- if ((state & 0xFFFFFF00) != 0x100)
- break;
- nalu_type = state & 0x1F;
-- if (nalu_type == H264_NAL_SPS) {
-+ if (nalu_type == H264_NAL_SPS || nalu_type == H264_NAL_SPS_SUBSET) {
- has_sps = 1;
- } else if (nalu_type == H264_NAL_PPS)
- has_pps = 1;
-@@ -710,3 +716,23 @@ AVCodecParser ff_h264_parser = {
- .parser_close = h264_close,
- .split = h264_split,
- };
-+
-+static av_cold int init_mvc(AVCodecParserContext *s)
-+{
-+ H264ParseContext *p = s->priv_data;
-+ int ret = init(s);
-+ if (ret < 0)
-+ return ret;
-+
-+ p->is_mvc = 1;
-+ return 0;
-+}
-+
-+AVCodecParser ff_h264_mvc_parser = {
-+ .codec_ids = { AV_CODEC_ID_H264_MVC },
-+ .priv_data_size = sizeof(H264ParseContext),
-+ .parser_init = init_mvc,
-+ .parser_parse = h264_parse,
-+ .parser_close = h264_close,
-+ .split = h264_split,
-+};
-diff --git a/libavcodec/parser.c b/libavcodec/parser.c
-index f43b197d5e..f96e005ef3 100644
---- a/libavcodec/parser.c
-+++ b/libavcodec/parser.c
-@@ -54,6 +54,7 @@ extern AVCodecParser ff_gsm_parser;
- extern AVCodecParser ff_h261_parser;
- extern AVCodecParser ff_h263_parser;
- extern AVCodecParser ff_h264_parser;
-+extern AVCodecParser ff_h264_mvc_parser;
- extern AVCodecParser ff_hevc_parser;
- extern AVCodecParser ff_mjpeg_parser;
- extern AVCodecParser ff_mlp_parser;
---
-2.17.0
-
-
-From cdd668dc436b9c78dcb31df477e329492356e7ec Mon Sep 17 00:00:00 2001
-From: Hendrik Leppkes <h.leppkes@gmail.com>
-Date: Tue, 28 Nov 2017 16:12:12 +0000
-Subject: [PATCH 3/4] h264_parser: force grabing a new timestamp until a frame
- start was found
-
----
- libavcodec/h264_parser.c | 3 +++
- 1 file changed, 3 insertions(+)
-
-diff --git a/libavcodec/h264_parser.c b/libavcodec/h264_parser.c
-index be8b9db9b0..81c9a1bbae 100644
---- a/libavcodec/h264_parser.c
-+++ b/libavcodec/h264_parser.c
-@@ -594,6 +594,9 @@ static int h264_parse(AVCodecParserContext *s,
- } else {
- next = h264_find_frame_end(p, buf, buf_size, avctx);
-
-+ if (next == END_NOT_FOUND && pc->frame_start_found == 0)
-+ s->fetch_timestamp = 1;
-+
- if (ff_combine_frame(pc, next, &buf, &buf_size) < 0) {
- *poutbuf = NULL;
- *poutbuf_size = 0;
---
-2.17.0
-
-
-From fb0ec9a132d6eb8fd74348ef87b1176c7ca34a00 Mon Sep 17 00:00:00 2001
-From: popcornmix <popcornmix@gmail.com>
-Date: Mon, 28 May 2018 13:35:36 +0100
-Subject: [PATCH 4/4] fixup
-
----
- libavcodec/extract_extradata_bsf.c | 8 +++++---
- 1 file changed, 5 insertions(+), 3 deletions(-)
-
-diff --git a/libavcodec/extract_extradata_bsf.c b/libavcodec/extract_extradata_bsf.c
-index 082b3e749b..7612749efc 100644
---- a/libavcodec/extract_extradata_bsf.c
-+++ b/libavcodec/extract_extradata_bsf.c
-@@ -59,7 +59,7 @@ static int extract_extradata_h2645(AVBSFContext *ctx, AVPacket *pkt,
- HEVC_NAL_VPS, HEVC_NAL_SPS, HEVC_NAL_PPS,
- };
- static const int extradata_nal_types_h264[] = {
-- H264_NAL_SPS, H264_NAL_PPS,
-+ H264_NAL_SPS, H264_NAL_SPS_SUBSET, H264_NAL_PPS,
- };
-
- ExtractExtradataContext *s = ctx->priv_data;
-@@ -90,7 +90,7 @@ static int extract_extradata_h2645(AVBSFContext *ctx, AVPacket *pkt,
- if (nal->type == HEVC_NAL_SPS) has_sps = 1;
- if (nal->type == HEVC_NAL_VPS) has_vps = 1;
- } else {
-- if (nal->type == H264_NAL_SPS) has_sps = 1;
-+ if (nal->type == H264_NAL_SPS || nal->type == H264_NAL_SPS_SUBSET) has_sps = 1;
- }
- } else if (s->remove) {
- filtered_size += nal->raw_size + 3;
-@@ -99,7 +99,7 @@ static int extract_extradata_h2645(AVBSFContext *ctx, AVPacket *pkt,
-
- if (extradata_size &&
- ((ctx->par_in->codec_id == AV_CODEC_ID_HEVC && has_sps && has_vps) ||
-- (ctx->par_in->codec_id == AV_CODEC_ID_H264 && has_sps))) {
-+ ((ctx->par_in->codec_id == AV_CODEC_ID_H264 || ctx->par_in->codec_id == AV_CODEC_ID_H264_MVC) && has_sps))) {
- AVBufferRef *filtered_buf;
- uint8_t *extradata, *filtered_data;
-
-@@ -253,6 +253,7 @@ static const struct {
- } extract_tab[] = {
- { AV_CODEC_ID_CAVS, extract_extradata_mpeg4 },
- { AV_CODEC_ID_H264, extract_extradata_h2645 },
-+ { AV_CODEC_ID_H264_MVC, extract_extradata_h2645 },
- { AV_CODEC_ID_HEVC, extract_extradata_h2645 },
- { AV_CODEC_ID_MPEG1VIDEO, extract_extradata_mpeg12 },
- { AV_CODEC_ID_MPEG2VIDEO, extract_extradata_mpeg12 },
-@@ -317,6 +318,7 @@ static void extract_extradata_close(AVBSFContext *ctx)
- static const enum AVCodecID codec_ids[] = {
- AV_CODEC_ID_CAVS,
- AV_CODEC_ID_H264,
-+ AV_CODEC_ID_H264_MVC,
- AV_CODEC_ID_HEVC,
- AV_CODEC_ID_MPEG1VIDEO,
- AV_CODEC_ID_MPEG2VIDEO,
---
-2.17.0
-
diff --git a/ffmpeg-rpi/ffmpeg-99.1008-dav1d-enable-av1.patch b/ffmpeg-rpi/ffmpeg-99.1008-dav1d-enable-av1.patch
deleted file mode 100644
index 213c7c1..0000000
--- a/ffmpeg-rpi/ffmpeg-99.1008-dav1d-enable-av1.patch
+++ /dev/null
@@ -1,407 +0,0 @@
-diff -Nur a/configure b/configure
---- a/configure 2018-11-23 12:03:27.041287929 -0500
-+++ b/configure 2018-11-23 12:08:52.945786916 -0500
-@@ -226,6 +226,7 @@
- --enable-libcelt enable CELT decoding via libcelt [no]
- --enable-libcdio enable audio CD grabbing with libcdio [no]
- --enable-libcodec2 enable codec2 en/decoding using libcodec2 [no]
-+ --enable-libdav1d enable AV1 decoding via libdav1d [no]
- --enable-libdc1394 enable IIDC-1394 grabbing using libdc1394
- and libraw1394 [no]
- --enable-libfdk-aac enable AAC de/encoding via libfdk-aac [no]
-@@ -1700,6 +1701,7 @@
- libcaca
- libcelt
- libcodec2
-+ libdav1d
- libdc1394
- libdrm
- libflite
-@@ -3062,6 +3064,7 @@
- libcelt_decoder_deps="libcelt"
- libcodec2_decoder_deps="libcodec2"
- libcodec2_encoder_deps="libcodec2"
-+libdav1d_decoder_deps="libdav1d"
- libfdk_aac_decoder_deps="libfdk_aac"
- libfdk_aac_encoder_deps="libfdk_aac"
- libfdk_aac_encoder_select="audio_frame_queue"
-@@ -6003,6 +6006,7 @@
- die "ERROR: libcelt must be installed and version must be >= 0.11.0."; }
- enabled libcaca && require_pkg_config libcaca caca caca.h caca_create_canvas
- enabled libcodec2 && require libcodec2 codec2/codec2.h codec2_create -lcodec2
-+enabled libdav1d && require_pkg_config libdav1d "dav1d >= 0.0.1" "dav1d/dav1d.h" dav1d_version
- enabled libdc1394 && require_pkg_config libdc1394 libdc1394-2 dc1394/dc1394.h dc1394_new
- enabled libdrm && require_pkg_config libdrm libdrm xf86drm.h drmGetVersion
- enabled libfdk_aac && { check_pkg_config libfdk_aac fdk-aac "fdk-aac/aacenc_lib.h" aacEncOpen ||
-diff -Nur a/libavcodec/allcodecs.c b/libavcodec/allcodecs.c
---- a/libavcodec/allcodecs.c 2018-11-23 12:03:27.041287929 -0500
-+++ b/libavcodec/allcodecs.c 2018-11-23 12:11:08.584268221 -0500
-@@ -670,6 +670,7 @@
- extern AVCodec ff_libcelt_decoder;
- extern AVCodec ff_libcodec2_encoder;
- extern AVCodec ff_libcodec2_decoder;
-+extern AVCodec ff_libdav1d_decoder;
- extern AVCodec ff_libfdk_aac_encoder;
- extern AVCodec ff_libfdk_aac_decoder;
- extern AVCodec ff_libgsm_encoder;
-diff -Nur a/libavcodec/libdav1d.c b/libavcodec/libdav1d.c
---- a/libavcodec/libdav1d.c 1969-12-31 19:00:00.000000000 -0500
-+++ b/libavcodec/libdav1d.c 2018-11-23 12:33:35.820468086 -0500
-@@ -0,0 +1,346 @@
-+/*
-+ * Copyright (c) 2018 Ronald S. Bultje <rsbultje gmail com>
-+ * Copyright (c) 2018 James Almer <jamrial gmail com>
-+ *
-+ * This file is part of FFmpeg.
-+ *
-+ * FFmpeg is free software; you can redistribute it and/or
-+ * modify it under the terms of the GNU Lesser General Public
-+ * License as published by the Free Software Foundation; either
-+ * version 2.1 of the License, or (at your option) any later version.
-+ *
-+ * FFmpeg is distributed in the hope that it will be useful,
-+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
-+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
-+ * Lesser General Public License for more details.
-+ *
-+ * You should have received a copy of the GNU Lesser General Public
-+ * License along with FFmpeg; if not, write to the Free Software
-+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
-+ */
-+
-+#include <dav1d/dav1d.h>
-+
-+#include "libavutil/avassert.h"
-+#include "libavutil/mastering_display_metadata.h"
-+#include "libavutil/imgutils.h"
-+#include "libavutil/opt.h"
-+
-+#include "avcodec.h"
-+#include "decode.h"
-+#include "internal.h"
-+
-+typedef struct Libdav1dContext {
-+ AVClass *class;
-+ Dav1dContext *c;
-+ AVBufferPool *pool;
-+ int pool_size;
-+
-+ Dav1dData data;
-+ int tile_threads;
-+ int apply_grain;
-+} Libdav1dContext;
-+
-+static const enum AVPixelFormat pix_fmt[][3] = {
-+ [DAV1D_PIXEL_LAYOUT_I400] = { AV_PIX_FMT_GRAY8, AV_PIX_FMT_GRAY10, AV_PIX_FMT_GRAY12 },
-+ [DAV1D_PIXEL_LAYOUT_I420] = { AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV420P10, AV_PIX_FMT_YUV420P12 },
-+ [DAV1D_PIXEL_LAYOUT_I422] = { AV_PIX_FMT_YUV422P, AV_PIX_FMT_YUV422P10, AV_PIX_FMT_YUV422P12 },
-+ [DAV1D_PIXEL_LAYOUT_I444] = { AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV444P10, AV_PIX_FMT_YUV444P12 },
-+};
-+
-+static void libdav1d_log_callback(void *opaque, const char *fmt, va_list vl)
-+{
-+ AVCodecContext *c = opaque;
-+
-+ av_vlog(c, AV_LOG_ERROR, fmt, vl);
-+}
-+
-+static int libdav1d_picture_allocator(Dav1dPicture *p, void *cookie)
-+{
-+ Libdav1dContext *dav1d = cookie;
-+ enum AVPixelFormat format = pix_fmt[p->p.layout][p->seq_hdr->hbd];
-+ int ret, linesize[4], h = FFALIGN(p->p.h, 128);
-+ uint8_t *aligned_ptr, *data[4];
-+ AVBufferRef *buf;
-+
-+ ret = av_image_fill_arrays(data, linesize, NULL, format, FFALIGN(p->p.w, 128),
-+ h, DAV1D_PICTURE_ALIGNMENT);
-+ if (ret < 0)
-+ return ret;
-+
-+ if (ret != dav1d->pool_size) {
-+ av_buffer_pool_uninit(&dav1d->pool);
-+ // Use twice the amount of required padding bytes for aligned_ptr below.
-+ dav1d->pool = av_buffer_pool_init(ret + DAV1D_PICTURE_ALIGNMENT * 2, NULL);
-+ if (!dav1d->pool) {
-+ dav1d->pool_size = 0;
-+ return AVERROR(ENOMEM);
-+ }
-+ dav1d->pool_size = ret;
-+ }
-+ buf = av_buffer_pool_get(dav1d->pool);
-+ if (!buf)
-+ return AVERROR(ENOMEM);
-+
-+ // libdav1d requires DAV1D_PICTURE_ALIGNMENT aligned buffers, which av_malloc()
-+ // doesn't guarantee for example when AVX is disabled at configure time.
-+ // Use the extra DAV1D_PICTURE_ALIGNMENT padding bytes in the buffer to align it
-+ // if required.
-+ aligned_ptr = (uint8_t *)FFALIGN((uintptr_t)buf->data, DAV1D_PICTURE_ALIGNMENT);
-+ ret = av_image_fill_pointers(data, format, h, aligned_ptr, linesize);
-+ if (ret < 0) {
-+ av_buffer_unref(&buf);
-+ return ret;
-+ }
-+
-+ p->data[0] = data[0];
-+ p->data[1] = data[1];
-+ p->data[2] = data[2];
-+ p->stride[0] = linesize[0];
-+ p->stride[1] = linesize[1];
-+ p->allocator_data = buf;
-+
-+ return 0;
-+}
-+
-+static void libdav1d_picture_release(Dav1dPicture *p, void *cookie)
-+{
-+ AVBufferRef *buf = p->allocator_data;
-+
-+ av_buffer_unref(&buf);
-+}
-+
-+static av_cold int libdav1d_init(AVCodecContext *c)
-+{
-+ Libdav1dContext *dav1d = c->priv_data;
-+ Dav1dSettings s;
-+ int res;
-+
-+ av_log(c, AV_LOG_INFO, "libdav1d %s\n", dav1d_version());
-+
-+ dav1d_default_settings(&s);
-+ s.logger.cookie = c;
-+ s.logger.callback = libdav1d_log_callback;
-+ s.allocator.cookie = dav1d;
-+ s.allocator.alloc_picture_callback = libdav1d_picture_allocator;
-+ s.allocator.release_picture_callback = libdav1d_picture_release;
-+ s.n_tile_threads = dav1d->tile_threads;
-+ s.apply_grain = dav1d->apply_grain;
-+ s.n_frame_threads = FFMIN(c->thread_count ? c->thread_count : av_cpu_count(), DAV1D_MAX_FRAME_THREADS);
-+
-+ res = dav1d_open(&dav1d->c, &s);
-+ if (res < 0)
-+ return AVERROR(ENOMEM);
-+
-+ return 0;
-+}
-+
-+static void libdav1d_flush(AVCodecContext *c)
-+{
-+ Libdav1dContext *dav1d = c->priv_data;
-+
-+ dav1d_data_unref(&dav1d->data);
-+ dav1d_flush(dav1d->c);
-+}
-+
-+static void libdav1d_data_free(const uint8_t *data, void *opaque) {
-+ AVBufferRef *buf = opaque;
-+
-+ av_buffer_unref(&buf);
-+}
-+
-+static int libdav1d_receive_frame(AVCodecContext *c, AVFrame *frame)
-+{
-+ Libdav1dContext *dav1d = c->priv_data;
-+ Dav1dData *data = &dav1d->data;
-+ Dav1dPicture pic = { 0 }, *p = &pic;
-+ int res;
-+
-+ if (!data->sz) {
-+ AVPacket pkt = { 0 };
-+
-+ res = ff_decode_get_packet(c, &pkt);
-+ if (res < 0 && res != AVERROR_EOF)
-+ return res;
-+
-+ if (pkt.size) {
-+ res = dav1d_data_wrap(data, pkt.data, pkt.size, libdav1d_data_free, pkt.buf);
-+ if (res < 0) {
-+ av_packet_unref(&pkt);
-+ return res;
-+ }
-+
-+ data->m.timestamp = pkt.pts;
-+ data->m.offset = pkt.pos;
-+ data->m.duration = pkt.duration;
-+
-+ pkt.buf = NULL;
-+ av_packet_unref(&pkt);
-+ }
-+ }
-+
-+ res = dav1d_send_data(dav1d->c, data);
-+ if (res < 0) {
-+ if (res == AVERROR(EINVAL))
-+ res = AVERROR_INVALIDDATA;
-+ if (res != AVERROR(EAGAIN))
-+ return res;
-+ }
-+
-+ res = dav1d_get_picture(dav1d->c, p);
-+ if (res < 0) {
-+ if (res == AVERROR(EINVAL))
-+ res = AVERROR_INVALIDDATA;
-+ else if (res == AVERROR(EAGAIN) && c->internal->draining)
-+ res = AVERROR_EOF;
-+
-+ return res;
-+ }
-+
-+ av_assert0(p->data[0] != NULL);
-+
-+ // This requires the custom allocator above
-+ frame->buf[0] = av_buffer_ref(p->allocator_data);
-+ if (!frame->buf[0]) {
-+ dav1d_picture_unref(p);
-+ return AVERROR(ENOMEM);
-+ }
-+
-+ frame->data[0] = p->data[0];
-+ frame->data[1] = p->data[1];
-+ frame->data[2] = p->data[2];
-+ frame->linesize[0] = p->stride[0];
-+ frame->linesize[1] = p->stride[1];
-+ frame->linesize[2] = p->stride[1];
-+
-+ c->profile = p->seq_hdr->profile;
-+ frame->format = c->pix_fmt = pix_fmt[p->p.layout][p->seq_hdr->hbd];
-+ frame->width = p->p.w;
-+ frame->height = p->p.h;
-+ if (c->width != p->p.w || c->height != p->p.h) {
-+ res = ff_set_dimensions(c, p->p.w, p->p.h);
-+ if (res < 0)
-+ goto fail;
-+ }
-+
-+ switch (p->seq_hdr->chr) {
-+ case DAV1D_CHR_VERTICAL:
-+ frame->chroma_location = c->chroma_sample_location = AVCHROMA_LOC_LEFT;
-+ break;
-+ case DAV1D_CHR_COLOCATED:
-+ frame->chroma_location = c->chroma_sample_location = AVCHROMA_LOC_TOPLEFT;
-+ break;
-+ }
-+ frame->colorspace = c->colorspace = (enum AVColorSpace) p->seq_hdr->mtrx;
-+ frame->color_primaries = c->color_primaries = (enum AVColorPrimaries) p->seq_hdr->pri;
-+ frame->color_trc = c->color_trc = (enum AVColorTransferCharacteristic) p->seq_hdr->trc;
-+ frame->color_range = c->color_range = p->seq_hdr->color_range ? AVCOL_RANGE_JPEG : AVCOL_RANGE_MPEG;
-+
-+ // match timestamps and packet size
-+ frame->pts = frame->best_effort_timestamp = p->m.timestamp;
-+#if FF_API_PKT_PTS
-+FF_DISABLE_DEPRECATION_WARNINGS
-+ frame->pkt_pts = p->m.timestamp;
-+FF_ENABLE_DEPRECATION_WARNINGS
-+#endif
-+ frame->pkt_dts = p->m.timestamp;
-+ frame->pkt_pos = p->m.offset;
-+ frame->pkt_size = p->m.size;
-+ frame->pkt_duration = p->m.duration;
-+ frame->key_frame = p->frame_hdr->frame_type == DAV1D_FRAME_TYPE_KEY;
-+
-+ switch (p->frame_hdr->frame_type) {
-+ case DAV1D_FRAME_TYPE_KEY:
-+ case DAV1D_FRAME_TYPE_INTRA:
-+ frame->pict_type = AV_PICTURE_TYPE_I;
-+ break;
-+ case DAV1D_FRAME_TYPE_INTER:
-+ frame->pict_type = AV_PICTURE_TYPE_P;
-+ break;
-+ case DAV1D_FRAME_TYPE_SWITCH:
-+ frame->pict_type = AV_PICTURE_TYPE_SP;
-+ break;
-+ default:
-+ res = AVERROR_INVALIDDATA;
-+ goto fail;
-+ }
-+
-+ if (p->mastering_display) {
-+ AVMasteringDisplayMetadata *mastering = av_mastering_display_metadata_create_side_data(frame);
-+ if (!mastering) {
-+ res = AVERROR(ENOMEM);
-+ goto fail;
-+ }
-+
-+ for (int i = 0; i < 3; i++) {
-+ mastering->display_primaries[i][0] = av_make_q(p->mastering_display->primaries[i][0], 1 << 16);
-+ mastering->display_primaries[i][1] = av_make_q(p->mastering_display->primaries[i][1], 1 << 16);
-+ }
-+ mastering->white_point[0] = av_make_q(p->mastering_display->white_point[0], 1 << 16);
-+ mastering->white_point[1] = av_make_q(p->mastering_display->white_point[1], 1 << 16);
-+
-+ mastering->max_luminance = av_make_q(p->mastering_display->max_luminance, 1 << 8);
-+ mastering->min_luminance = av_make_q(p->mastering_display->min_luminance, 1 << 14);
-+
-+ mastering->has_primaries = 1;
-+ mastering->has_luminance = 1;
-+ }
-+ if (p->content_light) {
-+ AVContentLightMetadata *light = av_content_light_metadata_create_side_data(frame);
-+ if (!light) {
-+ res = AVERROR(ENOMEM);
-+ goto fail;
-+ }
-+ light->MaxCLL = p->content_light->max_content_light_level;
-+ light->MaxFALL = p->content_light->max_frame_average_light_level;
-+ }
-+
-+ res = 0;
-+fail:
-+ dav1d_picture_unref(p);
-+ if (res < 0)
-+ av_frame_unref(frame);
-+ return res;
-+}
-+
-+static av_cold int libdav1d_close(AVCodecContext *c)
-+{
-+ Libdav1dContext *dav1d = c->priv_data;
-+
-+ av_buffer_pool_uninit(&dav1d->pool);
-+ dav1d_data_unref(&dav1d->data);
-+ dav1d_close(&dav1d->c);
-+
-+ return 0;
-+}
-+
-+#define OFFSET(x) offsetof(Libdav1dContext, x)
-+#define VD AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_DECODING_PARAM
-+static const AVOption libdav1d_options[] = {
-+ { "tilethreads", "Tile threads", OFFSET(tile_threads), AV_OPT_TYPE_INT, { .i64 = 1 }, 1, DAV1D_MAX_TILE_THREADS, VD },
-+ { "filmgrain", "Apply Film Grain", OFFSET(apply_grain), AV_OPT_TYPE_BOOL, { .i64 = 1 }, 0, 1, VD },
-+ { NULL }
-+};
-+
-+static const AVClass libdav1d_class = {
-+ .class_name = "libdav1d decoder",
-+ .item_name = av_default_item_name,
-+ .option = libdav1d_options,
-+ .version = LIBAVUTIL_VERSION_INT,
-+};
-+
-+AVCodec ff_libdav1d_decoder = {
-+ .name = "libdav1d",
-+ .long_name = NULL_IF_CONFIG_SMALL("dav1d AV1 decoder by VideoLAN"),
-+ .type = AVMEDIA_TYPE_VIDEO,
-+ .id = AV_CODEC_ID_AV1,
-+ .priv_data_size = sizeof(Libdav1dContext),
-+ .init = libdav1d_init,
-+ .close = libdav1d_close,
-+ .flush = libdav1d_flush,
-+ .receive_frame = libdav1d_receive_frame,
-+ .capabilities = AV_CODEC_CAP_DELAY | AV_CODEC_CAP_AUTO_THREADS,
-+ .caps_internal = FF_CODEC_CAP_INIT_THREADSAFE | FF_CODEC_CAP_SETS_PKT_DTS,
-+ .priv_class = &libdav1d_class,
-+ .wrapper_name = "libdav1d",
-+};
-diff -Nur a/libavcodec/Makefile b/libavcodec/Makefile
---- a/libavcodec/Makefile 2018-11-23 12:03:27.041287929 -0500
-+++ b/libavcodec/Makefile 2018-11-23 12:10:28.676717867 -0500
-@@ -954,6 +954,7 @@
- OBJS-$(CONFIG_LIBCELT_DECODER) += libcelt_dec.o
- OBJS-$(CONFIG_LIBCODEC2_DECODER) += libcodec2.o codec2utils.o
- OBJS-$(CONFIG_LIBCODEC2_ENCODER) += libcodec2.o codec2utils.o
-+OBJS-$(CONFIG_LIBDAV1D_DECODER) += libdav1d.o
- OBJS-$(CONFIG_LIBFDK_AAC_DECODER) += libfdk-aacdec.o
- OBJS-$(CONFIG_LIBFDK_AAC_ENCODER) += libfdk-aacenc.o
- OBJS-$(CONFIG_LIBGSM_DECODER) += libgsmdec.o
diff --git a/ffmpeg-rpi/ffmpeg-99.1009-dav1d-fix-multithreaded-av1-sw-decoding.patch b/ffmpeg-rpi/ffmpeg-99.1009-dav1d-fix-multithreaded-av1-sw-decoding.patch
deleted file mode 100644
index 1d087b4..0000000
--- a/ffmpeg-rpi/ffmpeg-99.1009-dav1d-fix-multithreaded-av1-sw-decoding.patch
+++ /dev/null
@@ -1,60 +0,0 @@
-From 0ae5ba3567a896af2b272e3a52ca574b7f41ec5a Mon Sep 17 00:00:00 2001
-From: Lukas Rusak <lorusak@gmail.com>
-Date: Wed, 10 Apr 2019 13:40:07 -0700
-Subject: [PATCH 0/1] *** SUBJECT HERE ***
-
-*** BLURB HERE ***
-
-Lukas Rusak (1):
- libavcodec/libdav1d: add libdav1d_get_format method in order to call
- ff_get_format
-
- libavcodec/libdav1d.c | 12 +++++++++++-
- 1 file changed, 11 insertions(+), 1 deletion(-)
-
---
-2.20.1
-
-From 0ae5ba3567a896af2b272e3a52ca574b7f41ec5a Mon Sep 17 00:00:00 2001
-From: Lukas Rusak <lorusak@gmail.com>
-Date: Wed, 10 Apr 2019 13:39:21 -0700
-Subject: [PATCH 1/1] libavcodec/libdav1d: add libdav1d_get_format method in
- order to call ff_get_format
-
----
- libavcodec/libdav1d.c | 12 +++++++++++-
- 1 file changed, 11 insertions(+), 1 deletion(-)
-
-diff --git a/libavcodec/libdav1d.c b/libavcodec/libdav1d.c
-index 30c6eccfef..fa71834543 100644
---- a/libavcodec/libdav1d.c
-+++ b/libavcodec/libdav1d.c
-@@ -48,6 +48,16 @@ static const enum AVPixelFormat pix_fmt[][3] = {
- [DAV1D_PIXEL_LAYOUT_I444] = { AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV444P10, AV_PIX_FMT_YUV444P12 },
- };
-
-+static enum AVPixelFormat libdav1d_get_format(AVCodecContext *avctx, const Dav1dPicture *p)
-+{
-+ enum AVPixelFormat pix_fmts[2], *fmt = pix_fmts;
-+
-+ *fmt++ = pix_fmt[p->p.layout][p->seq_hdr->hbd];
-+ *fmt = AV_PIX_FMT_NONE;
-+
-+ return ff_get_format(avctx, pix_fmts);
-+}
-+
- static void libdav1d_log_callback(void *opaque, const char *fmt, va_list vl)
- {
- AVCodecContext *c = opaque;
-@@ -214,7 +224,7 @@ static int libdav1d_receive_frame(AVCodecContext *c, AVFrame *frame)
- frame->linesize[2] = p->stride[1];
-
- c->profile = p->seq_hdr->profile;
-- frame->format = c->pix_fmt = pix_fmt[p->p.layout][p->seq_hdr->hbd];
-+ frame->format = c->pix_fmt = libdav1d_get_format(c, p);
- frame->width = p->p.w;
- frame->height = p->p.h;
- if (c->width != p->p.w || c->height != p->p.h) {
---
-2.20.1
-
diff --git a/ffmpeg-rpi/ffmpeg-99.1010-yuv2rgb-logspam.patch b/ffmpeg-rpi/ffmpeg-99.1010-yuv2rgb-logspam.patch
deleted file mode 100644
index 2895d7a..0000000
--- a/ffmpeg-rpi/ffmpeg-99.1010-yuv2rgb-logspam.patch
+++ /dev/null
@@ -1,13 +0,0 @@
---- a/libswscale/yuv2rgb.c 2018-07-22 10:00:00.000000000 +0100
-+++ b/libswscale/yuv2rgb.c 2018-08-20 11:55:46.391543992 +0100
-@@ -687,10 +687,6 @@ SwsFunc ff_yuv2rgb_get_func_ptr(SwsConte
- if (t)
- return t;
-
-- av_log(c, AV_LOG_WARNING,
-- "No accelerated colorspace conversion found from %s to %s.\n",
-- av_get_pix_fmt_name(c->srcFormat), av_get_pix_fmt_name(c->dstFormat));
--
- switch (c->dstFormat) {
- case AV_PIX_FMT_BGR48BE:
- case AV_PIX_FMT_BGR48LE: