Welcome to Doom9's Forum, THE in-place to be for everyone interested in DVD conversion. Before you start posting please read the forum rules. By posting to this forum you agree to abide by the rules. |
![]() |
#1 | Link |
Registered User
Join Date: Feb 2016
Location: Nonsense land
Posts: 336
|
HBD/OPTIMIZATION Request :)
I wrote this simple sharpening filter, which applies an approximate 3x3 Gaussian blur and adds back the difference, reducing it heuristically to avoid halos.
Now my request is: can someone add high-bit-depth (HBD) support to this simple plugin and give me some tips on how to make it faster? Code:
#pragma once #include <cmath> #include <windows.h> #include <avisynth.h> using namespace std; struct CSharpenFilter { PVideoFrame dst; PVideoFrame src; int height; int row_size; int src_pitch; int dst_pitch; unsigned char* dstp; const unsigned char* srcc; const unsigned char* srcp; const unsigned char* srcn; void GetFrame(PClip child, int n, VideoInfo vi, IScriptEnvironment* env) { src = child->GetFrame(n,env); dst = env->NewVideoFrame(vi); } void GetPlaneY() { dstp = dst->GetWritePtr(PLANAR_Y); dst_pitch = src->GetPitch(PLANAR_Y); srcc = src->GetReadPtr(PLANAR_Y); srcp = src->GetReadPtr(PLANAR_Y); srcn = src->GetReadPtr(PLANAR_Y); src_pitch = src->GetPitch(PLANAR_Y); height = src->GetHeight(PLANAR_Y); row_size = src->GetRowSize(PLANAR_Y); } void GetPlaneU() { dstp = dst->GetWritePtr(PLANAR_U); dst_pitch = src->GetPitch(PLANAR_U); srcc = src->GetReadPtr(PLANAR_U); srcp = src->GetReadPtr(PLANAR_U); srcn = src->GetReadPtr(PLANAR_U); src_pitch = src->GetPitch(PLANAR_U); height = src->GetHeight(PLANAR_U); row_size = src->GetRowSize(PLANAR_U); } void GetPlaneV() { dstp = dst->GetWritePtr(PLANAR_V); dst_pitch = src->GetPitch(PLANAR_V); srcc = src->GetReadPtr(PLANAR_V); srcp = src->GetReadPtr(PLANAR_V); srcn = src->GetReadPtr(PLANAR_V); src_pitch = src->GetPitch(PLANAR_V); height = src->GetHeight(PLANAR_V); row_size = src->GetRowSize(PLANAR_V); } void CopyPlane() { int x, y; for (y = 0; y < height; y++) { for (x = 0; x < row_size; x++) { dstp[x] = srcc[x]; } dstp += dst_pitch; srcc += src_pitch; } } //CORE FILTER SUPPORT FUNCTIONS int AddDiff(int x, int y, int nt, int mode) { int i, j, k; //GETS DIFF, ABS DIFF, SIGN DIFF; i = x - y; j = abs(i); k = (i > 0) - (i < 0); //SET TO 0 LOW FREQUENCY; i = j < nt ? 0 : i; j = j < nt ? 0 : j; //REDUCE DIFFERENCE; i = mode > 0 ? (int)sqrt(j) : i; i = mode > 1 ? 
(int)atan(j) * i : i; //ADD DIFFERENCE i = i * k; i = min(255, x + i); i = max( 0, i); return i; } void CoreFilter(int nt, int mode) { int x, y, i, j, k; //START PITCH; srcp -= src_pitch; srcn += src_pitch; //FILTER FIRST ROW; for (x = 0; x < row_size; x++) { j = max(0, x - 1); k = min(row_size - 1, x + 1); i = srcc[j] + srcc[x] * 2 + srcc[k]; i += srcc[j] * 2 + srcc[x] * 4 + srcc[k] * 2; i += srcn[j] + srcn[x] * 2 + srcn[k]; i = (int)(i / 16.0f + 0.5f); dstp[x] = AddDiff(srcc[x], i, nt, mode); } dstp += dst_pitch; srcp += src_pitch; srcc += src_pitch; srcn += src_pitch; //FILTER MAIN ROWS; for (y = 1; y < height - 1; y++) { for (x = 0; x < row_size; x++) { j = max(0, x - 1); k = min(row_size - 1, x + 1); i = srcp[j] + srcp[x] * 2 + srcp[k]; i += srcc[j] * 2 + srcc[x] * 4 + srcc[k] * 2; i += srcn[j] + srcn[x] * 2 + srcn[k]; i = (int)(i / 16.0f + 0.5f); dstp[x] = AddDiff(srcc[x], i, nt, mode); } dstp += dst_pitch; srcp += src_pitch; srcc += src_pitch; srcn += src_pitch; } //FILTER LAST ROW; for (x = 0; x < row_size; x++) { j = max(0, x - 1); k = min(row_size - 1, x + 1); i = srcp[j] + srcp[x] * 2 + srcp[k]; i += srcc[j] * 2 + srcc[x] * 4 + srcc[k] * 2; i += srcc[j] + srcc[x] * 2 + srcc[k]; i = (int)(i / 16.0f + 0.5f); dstp[x] = AddDiff(srcc[x], i, nt, mode); } } }; class CSharpen : public GenericVideoFilter { int nt, mode; bool Y, U, V; public: CSharpen(PClip _child, int _nt, int _mode, bool _Y, bool _U, bool _V, IScriptEnvironment* env) : GenericVideoFilter(_child), nt(_nt), mode(_mode), Y(_Y), U(_U), V(_V) { if (!vi.IsYUV()) { env->ThrowError("CSharpen: supported colorspaces are Y8, YV12, YV16, YV24!"); } else if (nt < 0 || nt > 255) { env->ThrowError("CSharpen: nt avaible range is [0, 255]!"); } else if (mode < 0 || mode > 2) { env->ThrowError("CSharpen: mode avaible mode values are 0, 1, 2!"); } } PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment* env) { CSharpenFilter Frame; Frame.GetFrame(child, n, vi, env); if (Y) Frame.GetPlaneY(); if (Y) 
Frame.CoreFilter(nt, mode); if (!vi.IsY8()) { Frame.GetPlaneU(); U ? Frame.CoreFilter(nt, mode) : Frame.CopyPlane(); Frame.GetPlaneV(); V ? Frame.CoreFilter(nt, mode) : Frame.CopyPlane(); } return Frame.dst; } }; const AVS_Linkage* AVS_linkage = 0; AVSValue __cdecl Create_CSharpen(AVSValue args, void* user_data, IScriptEnvironment* env) { return new CSharpen(args[0].AsClip(),args[1].AsInt(3),args[2].AsInt(2),args[3].AsBool(true),args[4].AsBool(false),args[5].AsBool(false),env); } extern "C" __declspec(dllexport) const char* __stdcall AvisynthPluginInit3(IScriptEnvironment * env, const AVS_Linkage* const vectors) { AVS_linkage = vectors; env->AddFunction("CSharpen", "c[nt]i[mode]i[Y]b[U]b[V]b", Create_CSharpen, 0); return "CSharpen"; }
__________________
CQTGMC/CTools I come from nonsense land. I usually post under the effect of alchool and I don't think before writing, so don't get it personal, I didn't mean to. |
![]() |
![]() |
![]() |
#2 | Link |
Registered User
Join Date: Jan 2018
Posts: 1,927
|
Ideas from Asd-g
https://github.com/Asd-g/AviSynthPlu...ent-1404461580 |
![]() |
![]() |
![]() |
#3 | Link |
Registered User
Join Date: Jul 2018
Posts: 989
|
To make it faster on the CPU you need to use the SIMD co-processor.
So in the plugin init you put some selector of the main processing function for each combination of input params and SIMD co-processor available. Like bitdepth/SIMDfamily/mode/etc. And call selected function as processing each frame at GetFrame(). Example of selector is https://github.com/pinterf/mvtools/b...ocks.cpp#L3756 First you put to SIMD the 3x3 blocks convolution: Code:
i = srcp[j] + srcp[x] * 2 + srcp[k]; i += srcc[j] * 2 + srcc[x] * 4 + srcc[k] * 2; i += srcn[j] + srcn[x] * 2 + srcn[k]; i = (int)(i / 16.0f + 0.5f); Code:
for (x = 0; x < row_size; x+=iBlocksPerPass) 3x3 blocks with +1 x stepping (x++ in original program) are overlapping so it is better to use less SIMD register file loads from cache to process several horizontal blocks in single SIMD pass to save time. Also the coefficients for convolution are constants for all blocks. Number of sequential blocks to process depends on the 'width' of dataword for SIMD coprocessor used so different for each SSE2/AVX2/AVX512 functions versions. Also with 'large' architectures like AVX512 with 32 directly addressed by instructions datawords (of 512bits each) available it may be possible to process several sets of blocks at single pass. So iBlocksPerPass for AVX512 may be much larger in compare with possibly 2x difference between SSE2 and AVX2 versions. At the output of SIMD blocks convolution you get vector of i[n] elements from each block and better to replace AddDiff(srcc[x], i, nt, mode); function to SIMD input and output (final saving) too. The AddDiff() is much more complex for SIMD co-processor so the only help is approximate SQRT instruction available. No atan(). So mode=atan() will mostly probably will be C-only and much slower. Because saving values depend on bitdepth it is better to put storing to memory inside function so it better to be AddDiffAndStoreResultVector() or even Process_i_vector(). Also the virtual vector or 'return' values of SIMD blocks convolution will be 'real SIMD register' so it is good to check if compiler really use 'register' transfer and not cache store-load or better not use AddDiff call at all and inline it manually in the main SIMD program. Inlining may require to make more versions of separated functions for main selector. So to support 8/16/32 bitdepth and SSE2/AVX2/AVX512(F) it is required to make at least 3x3=9 versions of main processing to select. If adding 'mode' of 3 - it is 3x3x3 functions. Or 3x3x2 if making condition Code:
i = mode > 0 ? (int)sqrt(j) : i; Your current program loaded with lots of conditional branching - it make program text much smaller but chip execution slower. If possible all non-realtime conditional branching with some rare changing params or even single set params for all frame pass better to make separate program blocks. It make program text much larger but helps to execution speed. Though with compiler analyser like godbolt you can try to check if some wise enough compiler can use runtime conditional instructions for simple enough tasks like min()/max() instead of conditional jumps. Last edited by DTL; 26th January 2023 at 12:49. |
![]() |
![]() |
![]() |
#4 | Link |
Registered User
Join Date: Feb 2016
Location: Nonsense land
Posts: 336
|
Thanks DTL! I will update with your tips in the evening! I will post the update tomorrow
![]()
__________________
CQTGMC/CTools I come from nonsense land. I usually post under the effect of alchool and I don't think before writing, so don't get it personal, I didn't mean to. |
![]() |
![]() |
![]() |
#5 | Link |
Registered User
Join Date: Jul 2018
Posts: 989
|
Also for integer samples
Code:
i = (int)(i / 16.0f + 0.5f); Code:
i = i >> 4; For 8bit unsigned samples up to 255 Code:
i = srcp[j] + srcp[x] * 2 + srcp[k]; i += srcc[j] * 2 + srcc[x] * 4 + srcc[k] * 2; i += srcn[j] + srcn[x] * 2 + srcn[k]; Float calculation you will need only in the float32 samples function. Also +0.5f for better rounding not needed for floats. For float32 samples version it is much better to use something like Code:
const float my1div16=1.0f / 16.0f; i *= my1div16; Code:
i *= 0.0625f; // 1.0f / 16.0f; Last edited by DTL; 26th January 2023 at 18:38. |
![]() |
![]() |
![]() |
#6 | Link |
Registered User
Join Date: Feb 2016
Location: Nonsense land
Posts: 336
|
Almost 3x speed:
Code:
#pragma once #include <windows.h> #include <avisynth.h> using namespace std; struct CSharpenFilter { VideoInfo vi; PVideoFrame dst; PVideoFrame src; int height; int row_size; int src_pitch; int dst_pitch; unsigned char* dstp; const unsigned char* srcc; const unsigned char* srcp; const unsigned char* srcn; void GetFrame(PClip child, int n, VideoInfo info, IScriptEnvironment* env) { vi = info; src = child->GetFrame(n,env); dst = env->NewVideoFrame(vi); } void GetPlaneY() { dstp = dst->GetWritePtr(PLANAR_Y); dst_pitch = src->GetPitch(PLANAR_Y); srcc = src->GetReadPtr(PLANAR_Y); srcp = src->GetReadPtr(PLANAR_Y); srcn = src->GetReadPtr(PLANAR_Y); src_pitch = src->GetPitch(PLANAR_Y); height = src->GetHeight(PLANAR_Y); row_size = src->GetRowSize(PLANAR_Y); } void GetPlaneU() { dstp = dst->GetWritePtr(PLANAR_U); dst_pitch = src->GetPitch(PLANAR_U); srcc = src->GetReadPtr(PLANAR_U); srcp = src->GetReadPtr(PLANAR_U); srcn = src->GetReadPtr(PLANAR_U); src_pitch = src->GetPitch(PLANAR_U); height = src->GetHeight(PLANAR_U); row_size = src->GetRowSize(PLANAR_U); } void GetPlaneV() { dstp = dst->GetWritePtr(PLANAR_V); dst_pitch = src->GetPitch(PLANAR_V); srcc = src->GetReadPtr(PLANAR_V); srcp = src->GetReadPtr(PLANAR_V); srcn = src->GetReadPtr(PLANAR_V); src_pitch = src->GetPitch(PLANAR_V); height = src->GetHeight(PLANAR_V); row_size = src->GetRowSize(PLANAR_V); } void CopyPlane() { memcpy(dstp, srcp, row_size * height); } //CORE FILTER FUNCTIONS; int Clamp(int x) { return max(0, min(255, x)); } int fast_sqrt(const int n) { for (int j = 2;;j++) { if (j * j > n) return j - 1; } } int fast_atan(const float n) { return 1.57 - 1 / n; } int AddDiff0(int x, int y, int nt, int str) { int i, j, k; //GETS DIFF, ABS DIFF, SIGN DIFF; i = x - y; j = abs(i); k = (i > 0) - (i < 0); //SET TO 0 LOW FREQUENCY; j = j < nt ? 
0 : j; if (j == 0) return x; //BOOST DIFFERENCE AND CHANGE SIGN; j = j * str * k; //CLAMP DIFFERENCE j = Clamp(x + j); return j; } int AddDiff1(int x, int y, int nt, int str) { int i, j, k; //GETS DIFF, ABS DIFF, SIGN DIFF; i = x - y; j = abs(i); k = (i > 0) - (i < 0); //SET TO 0 LOW FREQUENCY; j = j < nt ? 0 : j; if (j == 0) return x; //BOOST DIFFERENCE; j = j * str; //REDUCE DIFFERENCE; j = fast_sqrt(j); //CHANGE SIGN; j = j * k; //CLAMP DIFFERENCE j = Clamp(x + j); return j; } int AddDiff2(int x, int y, int nt, int str) { int i, j, k; //GETS DIFF, ABS DIFF, SIGN DIFF; i = x - y; j = abs(i); k = (i > 0) - (i < 0); //SET TO 0 LOW FREQUENCY; j = j < nt ? 0 : j; if (j == 0) return x; //BOOST DIFFERENCE; j = j * str; //REDUCE DIFFERENCE; j = int(fast_sqrt(j) * fast_atan(j)); //CHANGE SIGN; j = j * k; //CLAMP DIFFERENCE j = Clamp(x + j); return j; } void BoxBlur() { int x, y, i; //START PITCH; srcp -= src_pitch; srcn += src_pitch; //COPY FIRST ROW; for (x = 0; x < row_size; x++) { dstp[x] = srcc[x]; } dstp += dst_pitch; srcp += src_pitch; srcc += src_pitch; srcn += src_pitch; //FILTER MAIN ROWS; for (y = 1; y < height - 1; y++) { dstp[0] = srcc[0]; dstp[row_size - 1] = srcc[row_size - 1]; for (x = 1; x < row_size - 1; x++) { i = srcp[x - 1] + srcp[x] + srcp[x + 1]; i += srcc[x - 1] + srcc[x] + srcc[x + 1]; i += srcn[x - 1] + srcn[x] + srcn[x + 1]; dstp[x] = int(i * 0.1111f + 0.5f); } dstp += dst_pitch; srcp += src_pitch; srcc += src_pitch; srcn += src_pitch; } //COPY LAST ROW; for (x = 0; x < row_size; x++) { dstp[x] = srcc[x]; } dstp -= dst_pitch * (height - 1); srcp -= src_pitch * (height - 1); srcc -= src_pitch * (height - 1); srcn -= src_pitch * (height - 1); } void GaussBlur() { int x, y, i; //START PITCH; srcp -= src_pitch; srcn += src_pitch; //COPY FIRST ROW; for (x = 0; x < row_size; x++) { dstp[x] = srcc[x]; } dstp += dst_pitch; srcp += src_pitch; srcc += src_pitch; srcn += src_pitch; //FILTER MAIN ROWS; for (y = 1; y < height - 1; y++) { dstp[0] = srcc[0]; 
dstp[row_size - 1] = srcc[row_size - 1]; for (x = 1; x < row_size - 1; x++) { i = srcp[x - 1] + srcp[x] * 2 + srcp[x + 1]; i += srcc[x - 1] * 2 + srcc[x] * 4 + srcc[x + 1] * 2; i += srcn[x - 1] + srcn[x] * 2 + srcn[x + 1]; dstp[x] = int(i * 0.0625f + 0.5f); } dstp += dst_pitch; srcp += src_pitch; srcc += src_pitch; srcn += src_pitch; } //COPY LAST ROW; for (x = 0; x < row_size; x++) { dstp[x] = srcc[x]; } dstp -= dst_pitch * (height - 1); srcp -= src_pitch * (height - 1); srcc -= src_pitch * (height - 1); srcn -= src_pitch * (height - 1); } void CoreFilter(int nt, int str, int smode, int bmode) { bmode ? GaussBlur() : BoxBlur(); if (smode == 0) { for (int y = 0; y < height; y++) { for (int x = 0; x < row_size; x++) { dstp[x] = AddDiff0(srcc[x], dstp[x], nt, str); } dstp += dst_pitch; srcc += src_pitch; } } else if (smode == 1) { for (int y = 0; y < height; y++) { for (int x = 0; x < row_size; x++) { dstp[x] = AddDiff1(srcc[x], dstp[x], nt, str); } dstp += dst_pitch; srcc += src_pitch; } } else if (smode == 2) { for (int y = 0; y < height; y++) { for (int x = 0; x < row_size; x++) { dstp[x] = AddDiff2(srcc[x], dstp[x], nt, str); } dstp += dst_pitch; srcc += src_pitch; } } } }; class CSharpen : public GenericVideoFilter { int str, smode, bmode, nt; bool Y, U, V; public: CSharpen(PClip _child, int _str, int _smode, int _bmode, int _nt, bool _Y, bool _U, bool _V, IScriptEnvironment* env) : GenericVideoFilter(_child), str(_str), nt(_nt), smode(_smode), bmode(_bmode), Y(_Y), U(_U), V(_V) { if (!vi.IsYUV()) { env->ThrowError("CSharpen: supported colorspaces are Y8, YV12, YV16, YV24!"); } else if (nt < 0 || nt > 255) { env->ThrowError("CSharpen: nt avaible range is [0, 255]!"); } else if (str < 1 || str > 255) { env->ThrowError("CSharpen: str avaible range is [1, 255]!"); } else if (smode < 0 || smode > 2) { env->ThrowError("CSharpen: mode avaible mode values are 0, 1, 2!"); } else if (bmode < 0 || bmode > 1) { env->ThrowError("CSharpen: mode avaible mode values are 0, 
1!"); } } PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment* env) { CSharpenFilter Frame; Frame.GetFrame(child, n, vi, env); Frame.GetPlaneY(); Y ? Frame.CoreFilter(nt, str, smode, bmode) : Frame.CopyPlane(); if (!vi.IsY8()) { Frame.GetPlaneU(); U ? Frame.CoreFilter(nt, str, smode, bmode) : Frame.CopyPlane(); Frame.GetPlaneV(); V ? Frame.CoreFilter(nt, str, smode, bmode) : Frame.CopyPlane(); } return Frame.dst; } }; const AVS_Linkage* AVS_linkage = 0; AVSValue __cdecl Create_CSharpen(AVSValue args, void* user_data, IScriptEnvironment* env) { return new CSharpen(args[0].AsClip(),args[1].AsInt(10),args[2].AsInt(2),args[3].AsInt(1),args[4].AsInt(3),args[5].AsBool(true),args[6].AsBool(false),args[7].AsBool(false),env); } extern "C" __declspec(dllexport) const char* __stdcall AvisynthPluginInit3(IScriptEnvironment * env, const AVS_Linkage* const vectors) { AVS_linkage = vectors; env->AddFunction("CSharpen", "c[str]i[smode]i[bmode]i[nt]i[Y]b[U]b[V]b", Create_CSharpen, 0); return "CSharpen"; }
__________________
CQTGMC/CTools I come from nonsense land. I usually post under the effect of alchool and I don't think before writing, so don't get it personal, I didn't mean to. |
![]() |
![]() |
![]() |
#7 | Link |
Registered User
Join Date: Jul 2018
Posts: 989
|
In
Code:
dstp[x] = int(i * 0.0625f + 0.5f); Code:
dstp[x] = i >> 4; You will need i * 0.0625f only for float32 version of functions when you will add HBD of float32 samples. And your 'i' accumulator will be of 'float' type. " if you see something off in this pure C++ code" I really not like idea of 2 pass large memory read/write with separated 'blur' pass and 'adddiff' pass. It will work about good until all your saved 'blur' data in dstp[] buffer still fit into CPU cache, but may significantly drop speed when your frame size will be large so dstp[] buff will be (mostly) trashed from cache to very slow main RAM. The previous version with single pass processing of blur+adddiff in single pass may work better with large frame sizes and at CPUs with small enouth caches. For optimization tips it is useful to read very big but still useful intel document named "Intel® 64 and IA-32 Architectures Optimization Reference Manual'. Try to found most new version for new architectures. May be https://www.intel.com/content/www/us...intel-sdm.html and https://cdrdv2.intel.com/v1/dl/getContent/671488 Also depending on your C compiler it may be useful to add forced inline directives for small helper functions like Clamp(), fast_sqrt() and others with frequent calling. See documentation on your compiler for nominal forced inline directives. It may be something like __inline__ before function declaration. For some set of compilers you can try to use MV_FORCEINLINE macro from mvtools: Code:
#ifndef MV_FORCEINLINE #if defined(__clang__) // Check clang first. clang-cl also defines __MSC_VER // We set MSVC because they are mostly compatible # define CLANG #if defined(_MSC_VER) # define MSVC # define MV_FORCEINLINE __attribute__((always_inline)) inline #else # define MV_FORCEINLINE __attribute__((always_inline)) inline #endif #elif defined(_MSC_VER) # define MSVC # define MSVC_PURE # define MV_FORCEINLINE __forceinline #elif defined(__GNUC__) # define GCC # define MV_FORCEINLINE __attribute__((always_inline)) inline #else # error Unsupported compiler. # define MV_FORCEINLINE inline # undef __forceinline # define __forceinline inline #endif #endif Last edited by DTL; 27th January 2023 at 00:17. |
![]() |
![]() |
![]() |
#8 | Link |
Registered User
Join Date: Feb 2016
Location: Nonsense land
Posts: 336
|
Visual Studio already has a __forceinline keyword according to Microsoft, so I guess I don't need the macro. Also, Visual Studio has a #pragma omp simd which gave me a small speed-up. I suppose it is better to write the SIMD yourself, but this is better than nothing.
ATM on a bluray: ffvideosource only about 66fps CSharpen about 57fps, so it's not going so bad (I guess ![]() Here the code: Code:
#pragma once #include <windows.h> #include <avisynth.h> using namespace std; struct CSharpenFilter { VideoInfo vi; PVideoFrame dst; PVideoFrame src; int height; int row_size; int src_pitch; int dst_pitch; unsigned char* dstp; const unsigned char* srcc; const unsigned char* srcp; const unsigned char* srcn; void GetFrame(PClip child, int n, VideoInfo info, IScriptEnvironment* env) { vi = info; src = child->GetFrame(n,env); dst = env->NewVideoFrame(vi); } void GetPlaneY() { dstp = dst->GetWritePtr(PLANAR_Y); dst_pitch = src->GetPitch(PLANAR_Y); srcc = src->GetReadPtr(PLANAR_Y); srcp = src->GetReadPtr(PLANAR_Y); srcn = src->GetReadPtr(PLANAR_Y); src_pitch = src->GetPitch(PLANAR_Y); height = src->GetHeight(PLANAR_Y); row_size = src->GetRowSize(PLANAR_Y); } void GetPlaneU() { dstp = dst->GetWritePtr(PLANAR_U); dst_pitch = src->GetPitch(PLANAR_U); srcc = src->GetReadPtr(PLANAR_U); srcp = src->GetReadPtr(PLANAR_U); srcn = src->GetReadPtr(PLANAR_U); src_pitch = src->GetPitch(PLANAR_U); height = src->GetHeight(PLANAR_U); row_size = src->GetRowSize(PLANAR_U); } void GetPlaneV() { dstp = dst->GetWritePtr(PLANAR_V); dst_pitch = src->GetPitch(PLANAR_V); srcc = src->GetReadPtr(PLANAR_V); srcp = src->GetReadPtr(PLANAR_V); srcn = src->GetReadPtr(PLANAR_V); src_pitch = src->GetPitch(PLANAR_V); height = src->GetHeight(PLANAR_V); row_size = src->GetRowSize(PLANAR_V); } void CopyPlane() { memcpy(dstp, srcc, src_pitch * height); } //CORE FILTER FUNCTIONS; __forceinline int clamp(int x) { return max(0, min(255, x)); } __forceinline int fast_sqrt(const int n) { for (int j = 2;;j++) { if (j * j > n) return j - 1; } } __forceinline int fast_atan(const float n) { return 1.57f - 1 / n; } inline int AddDiff0(int x, int y, int nt, int str) { int i, j, k; //GETS DIFF, ABS DIFF, SIGN DIFF; i = x - y; j = abs(i); k = (i > 0) - (i < 0); //SET TO 0 LOW FREQUENCY; j = j < nt ? 
0 : j; if (j == 0) return x; //BOOST DIFFERENCE AND CHANGE SIGN; j = j * str * k; //CLAMP DIFFERENCE j = clamp(x + j); return j; } inline int AddDiff1(int x, int y, int nt, int str) { int i, j, k; //GETS DIFF, ABS DIFF, SIGN DIFF; i = x - y; j = abs(i); k = (i > 0) - (i < 0); //SET TO 0 LOW FREQUENCY; j = j < nt ? 0 : j; if (j == 0) return x; //BOOST DIFFERENCE; j = j * str; //REDUCE DIFFERENCE; j = fast_sqrt(j); //CHANGE SIGN; j = j * k; //CLAMP DIFFERENCE j = clamp(x + j); return j; } inline int AddDiff2(int x, int y, int nt, int str) { int i, j, k; //GETS DIFF, ABS DIFF, SIGN DIFF; i = x - y; j = abs(i); k = (i > 0) - (i < 0); //SET TO 0 LOW FREQUENCY; j = j < nt ? 0 : j; if (j == 0) return x; //BOOST DIFFERENCE; j = j * str; //REDUCE DIFFERENCE; j = int(fast_sqrt(j) * fast_atan(j)); //CHANGE SIGN; j = j * k; //CLAMP DIFFERENCE j = clamp(x + j); return j; } void BoxBlur(int nt, int str, int smode) { int x, y, i; //START PITCH; srcp -= src_pitch; srcn += src_pitch; //COPY FIRST ROW; memcpy(dstp, srcc, row_size); dstp += dst_pitch; srcp += src_pitch; srcc += src_pitch; srcn += src_pitch; //FILTER MAIN ROWS; if (smode == 0) { for (y = 1; y < height - 1; y++) { dstp[0] = srcc[0]; dstp[row_size - 1] = srcc[row_size - 1]; #pragma omp simd for (x = 1; x < row_size - 1; x++) { i = srcp[x - 1] + srcp[x] + srcp[x + 1]; i += srcc[x - 1] + srcc[x] + srcc[x + 1]; i += srcn[x - 1] + srcn[x] + srcn[x + 1]; dstp[x] = AddDiff0(srcc[x], int(i * 0.1111f + 0.5f), nt, str); } dstp += dst_pitch; srcp += src_pitch; srcc += src_pitch; srcn += src_pitch; } } else if (smode == 1) { for (y = 1; y < height - 1; y++) { dstp[0] = srcc[0]; dstp[row_size - 1] = srcc[row_size - 1]; #pragma omp simd for (x = 1; x < row_size - 1; x++) { i = srcp[x - 1] + srcp[x] + srcp[x + 1]; i += srcc[x - 1] + srcc[x] + srcc[x + 1]; i += srcn[x - 1] + srcn[x] + srcn[x + 1]; dstp[x] = AddDiff1(srcc[x], int(i * 0.1111f + 0.5f), nt, str); } dstp += dst_pitch; srcp += src_pitch; srcc += src_pitch; srcn += 
src_pitch; } } else if (smode == 2) { for (y = 1; y < height - 1; y++) { dstp[0] = srcc[0]; dstp[row_size - 1] = srcc[row_size - 1]; #pragma omp simd for (x = 1; x < row_size - 1; x++) { i = srcp[x - 1] + srcp[x] + srcp[x + 1]; i += srcc[x - 1] + srcc[x] + srcc[x + 1]; i += srcn[x - 1] + srcn[x] + srcn[x + 1]; dstp[x] = AddDiff2(srcc[x], int(i * 0.1111f + 0.5f), nt, str); } dstp += dst_pitch; srcp += src_pitch; srcc += src_pitch; srcn += src_pitch; } } //COPY LAST ROW; for (x = 0; x < row_size; x++) { memcpy(dstp, srcc, row_size); } } void GaussBlur(int nt, int str, int smode) { int x, y, i; //START PITCH; srcp -= src_pitch; srcn += src_pitch; //COPY FIRST ROW; memcpy(dstp, srcc, row_size); dstp += dst_pitch; srcp += src_pitch; srcc += src_pitch; srcn += src_pitch; //FILTER MAIN ROWS; if (smode == 0) { for (y = 1; y < height - 1; y++) { dstp[0] = srcc[0]; dstp[row_size - 1] = srcc[row_size - 1]; #pragma omp simd for (x = 1; x < row_size - 1; x++) { i = srcp[x - 1] + srcp[x] * 2 + srcp[x + 1]; i += srcc[x - 1] * 2 + srcc[x] * 4 + srcc[x + 1] * 2; i += srcn[x - 1] + srcn[x] * 2 + srcn[x + 1]; dstp[x] = AddDiff0(srcc[x], i >> 4, nt, str); } dstp += dst_pitch; srcp += src_pitch; srcc += src_pitch; srcn += src_pitch; } } else if (smode == 1) { for (y = 1; y < height - 1; y++) { dstp[0] = srcc[0]; dstp[row_size - 1] = srcc[row_size - 1]; #pragma omp simd for (x = 1; x < row_size - 1; x++) { i = srcp[x - 1] + srcp[x] * 2 + srcp[x + 1]; i += srcc[x - 1] * 2 + srcc[x] * 4 + srcc[x + 1] * 2; i += srcn[x - 1] + srcn[x] * 2 + srcn[x + 1]; dstp[x] = AddDiff1(srcc[x], i >> 4, nt, str); } dstp += dst_pitch; srcp += src_pitch; srcc += src_pitch; srcn += src_pitch; } } else if (smode == 2) { for (y = 1; y < height - 1; y++) { dstp[0] = srcc[0]; dstp[row_size - 1] = srcc[row_size - 1]; #pragma omp simd for (x = 1; x < row_size - 1; x++) { i = srcp[x - 1] + srcp[x] * 2 + srcp[x + 1]; i += srcc[x - 1] * 2 + srcc[x] * 4 + srcc[x + 1] * 2; i += srcn[x - 1] + srcn[x] * 2 + srcn[x + 1]; 
dstp[x] = AddDiff2(srcc[x], i >> 4, nt, str); } dstp += dst_pitch; srcp += src_pitch; srcc += src_pitch; srcn += src_pitch; } } //COPY LAST ROW; memcpy(dstp, srcc, row_size); } void CoreFilter(int nt, int str, int smode, int bmode) { if (!bmode) { BoxBlur(nt, str, smode); } else { GaussBlur(nt, str, smode); } } }; class CSharpen : public GenericVideoFilter { int str, smode, bmode, nt; bool Y, U, V; public: CSharpen(PClip _child, int _str, int _smode, int _bmode, int _nt, bool _Y, bool _U, bool _V, IScriptEnvironment* env) : GenericVideoFilter(_child), str(_str), nt(_nt), smode(_smode), bmode(_bmode), Y(_Y), U(_U), V(_V) { if (!vi.IsYUV()) { env->ThrowError("CSharpen: supported colorspaces are Y8, YV12, YV16, YV24!"); } else if (nt < 0 || nt > 255) { env->ThrowError("CSharpen: nt avaible range is [0, 255]!"); } else if (str < 1 || str > 255) { env->ThrowError("CSharpen: str avaible range is [1, 255]!"); } else if (smode < 0 || smode > 2) { env->ThrowError("CSharpen: mode avaible mode values are 0, 1, 2!"); } else if (bmode < 0 || bmode > 1) { env->ThrowError("CSharpen: mode avaible mode values are 0, 1!"); } } PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment* env) { CSharpenFilter Frame; Frame.GetFrame(child, n, vi, env); Frame.GetPlaneY(); Y ? Frame.CoreFilter(nt, str, smode, bmode) : Frame.CopyPlane(); if (!vi.IsY8()) { Frame.GetPlaneU(); U ? Frame.CoreFilter(nt, str, smode, bmode) : Frame.CopyPlane(); Frame.GetPlaneV(); V ? 
Frame.CoreFilter(nt, str, smode, bmode) : Frame.CopyPlane(); } return Frame.dst; } }; const AVS_Linkage* AVS_linkage = 0; AVSValue __cdecl Create_CSharpen(AVSValue args, void* user_data, IScriptEnvironment* env) { return new CSharpen(args[0].AsClip(),args[1].AsInt(10),args[2].AsInt(2),args[3].AsInt(1),args[4].AsInt(3),args[5].AsBool(true),args[6].AsBool(false),args[7].AsBool(false),env); } extern "C" __declspec(dllexport) const char* __stdcall AvisynthPluginInit3(IScriptEnvironment * env, const AVS_Linkage* const vectors) { AVS_linkage = vectors; env->AddFunction("CSharpen", "c[str]i[smode]i[bmode]i[nt]i[Y]b[U]b[V]b", Create_CSharpen, 0); return "CSharpen"; } ![]()
__________________
CQTGMC/CTools I come from nonsense land. I usually post under the effect of alchool and I don't think before writing, so don't get it personal, I didn't mean to. |
![]() |
![]() |
![]() |
#10 | Link |
Registered User
Join Date: Feb 2016
Location: Nonsense land
Posts: 336
|
Thanks is the whole point of this filter
![]()
__________________
CQTGMC/CTools I come from nonsense land. I usually post under the effect of alchool and I don't think before writing, so don't get it personal, I didn't mean to. |
![]() |
![]() |
![]() |
#11 | Link |
Registered User
Join Date: Jul 2018
Posts: 989
|
To add HBD you add copy of your processing functions with 16bit and float32 input and output for source and target planes. The copy over templating may be recommended because of later adding different SIMD to different functions.
At plugin startup you place selector based on input bitdepth to select the function to use. May be the source for selector to detect the input bitdepth may be vi.componentsize of the input clip. As I see from MAnalyse: Code:
pixelsize = vi.ComponentSize(); if vi.ComponentSize() == 2 - you use 16bit (unsigned short pointers), if vi.ComponentSize() == 4 - you use float32 (float pointers and float processing). Make copy of your CoreFilter() function like CoreFilter_8() , CoreFilter_16() and CoreFilter_Float(). I think all >8 to 16bit like 10,14,16 may be processed with single 16bit function. For Code:
__forceinline int clamp(int x) { return max(0, min(255, x)); } Code:
__forceinline int clamp_int(int x) { return max(0, min(((1 << bits_per_pixel) - 1), x)); } __forceinline float clamp_float(float x) { return max(0, min(1.0f, x)); } The GetRead/GetWrite ptrs functions may always return unsigned char pointer so you cast it to unsigned short or float if your ComponentSize not 8 but 10..16 or float. The Pitch may be 'real' bytes measured so you need to adjust it if use as operand with short and float pointers or C compiler will make datasize calculation. So for short pointer num_of_shorts=pitch/2 and for float pointer num_of_floats=pitch/4. RowSize looks like the same. So for 16bit you change 'unsigned char' type of input and output pointers to 'unsigned short' and for float32 to 'float'. Also for float version of function change internal processing to float. For 16bit input/output the int is enough I think. To use SIMD with intrinsics you typically add immintrin.h include and with not very old VisualStudio can use up to AVX2 (or may be including many AVX512) types and pseudofunctions. Example of simple C that I use to design and debug SIMD functions is https://github.com/DTL2020/Sub_shift...sm_test002.cpp it have only Code:
#include <immintrin.h> // MS version of immintrin.h covers AVX, AVX2 and FMA3 The 128bit integer type is __m128i. I typically use online (or downloadable offline available) intrinsics web-help like https://www.laruence.com/sse/ . #pragma omp before for() loop make you local internal multithreading (cut row to threads) but the 'degree of SIMD' need to be checked in disassembler. Also the too small workunits for MT may be not very good so you can MT the sets of rows (as work in internal MT in mvtools) - try to put #pragma omp before y-loop. But for MT you need local pointers and local variables for each thread - something like Code:
#pragma omp simd for (y = 1; y < height - 1; y++) { // use local data each thread int (short, float..) data; // calculate each thread ptrs from y-var unsigned char* l_dstp = dstp + y * dst_pitch; // unsigned short* and float* for 16bit and float32 unsigned char* l_srcp = srcp + y * src_pitch; unsigned char* l_srcc = srcc + y * src_pitch; unsigned char* l_srcn = srcn + y * src_pitch; // dstp[0] = srcc[0]; l_dstp[0] = l_srcc[0]; // and so all others ptrs l_dstp[row_size - 1] = l_srcc[row_size - 1]; for (x = 1; x < row_size - 1; x++) { data = l_srcp[x - 1] + l_srcp[x] + l_srcp[x + 1]; data += l_srcc[x - 1] + l_srcc[x] + l_srcc[x + 1]; data += l_srcn[x - 1] + l_srcn[x] + l_srcn[x + 1]; l_dstp[x] = AddDiff0(l_srcc[x], int(i * 0.1111f + 0.5f), nt, str); } } Updated: replaced array of ptrs into MT part to locally calculated ptrs. Last edited by DTL; 27th January 2023 at 17:06. |
![]() |
![]() |
![]() |
#12 | Link |
HeartlessS Usurer
Join Date: Dec 2009
Location: Over the rainbow
Posts: 10,812
|
From the little I remember about C++ (about 4 weeks of study back in 1996), when you fully define a member function inside a class declaration,
it is a hint for the compiler that you want it inlined {the compiler is not compelled to inline it, and may only do so if it is a reasonably small function}. However, C++ has changed a bit since the 90's. Thanks for the thread guys, it is quite interesting, and maybe a potential sticky contender for optimising code for HBD and SIMD. EDIT: "sticky contender": ideally, it would have been a simpler filter like a simple Average() [or similar], to better concentrate on the optimisation. EDIT: Or a simpler Invert() style filter.
__________________
I sometimes post sober. StainlessS@MediaFire ::: AND/OR ::: StainlessS@SendSpace "Some infinities are bigger than other infinities", but how many of them are infinitely bigger ??? Last edited by StainlessS; 27th January 2023 at 10:32. |
![]() |
![]() |
![]() |
#13 | Link |
Registered User
Join Date: Mar 2012
Location: Texas
Posts: 1,646
|
I'm not a programmer but I thought I'd share this. While working on the avs+ docs, I had to scroll back through the commit history. From there you can see how the code changed for all of the filters. For example, here is the first change pinterf did to the blur/sharpen filters to support 16-bit: https://github.com/AviSynth/AviSynth...8f4d95b931c7f8. Here's the "luma" mode of Histogram when HBD was added: https://github.com/AviSynth/AviSynth...4eaaab12918e81. All of the internals filters have these changes to look at, starting when they were 8-bit only.
|
![]() |
![]() |
![]() |
#14 | Link |
Registered User
Join Date: Feb 2016
Location: Nonsense land
Posts: 336
|
This SEEMS to work, but of course it's probably not how you are supposed to do it. BTW, thanks for all the info; once I have figured out this HBD stuff, I will treasure it.
Code:
#include <windows.h>
#include <avisynth.h>

// Inverts one plane whose samples are of type `pixel_t` (unsigned char,
// unsigned short or float): every output sample is `mirror - input`.
//
//   srcp / dstp           first byte of the source / destination plane
//   src_pitch / dst_pitch distance between rows in BYTES, exactly as returned
//                         by VideoFrame::GetPitch()
//   width                 samples per row (GetRowSize() / ComponentSize())
//   height                rows in the plane
//   mirror                value the samples are reflected around
template<typename pixel_t>
static void InvertPlane(const unsigned char* srcp, unsigned char* dstp,
                        int src_pitch, int dst_pitch,
                        int width, int height, pixel_t mirror)
{
    for (int y = 0; y < height; y++) {
        // The row start is computed on byte pointers (the pitch is in bytes)
        // and only then reinterpreted as the real sample type.
        const pixel_t* src_row = reinterpret_cast<const pixel_t*>(srcp);
        pixel_t* dst_row = reinterpret_cast<pixel_t*>(dstp);
        for (int x = 0; x < width; x++) {
            dst_row[x] = static_cast<pixel_t>(mirror - src_row[x]);
        }
        srcp += src_pitch;
        dstp += dst_pitch;
    }
}

// Sample filter: inverts the Y, U and V planes of a planar clip at any
// component size (8-bit, 10..16-bit stored in 16-bit words, 32-bit float).
class InvertNeg : public GenericVideoFilter {
public:
    InvertNeg(PClip _child, IScriptEnvironment* env) : GenericVideoFilter(_child) { }

    PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment* env)
    {
        PVideoFrame src = child->GetFrame(n, env);
        PVideoFrame dst = env->NewVideoFrame(vi);

        const int pixel_size = vi.ComponentSize();
        const int bits = vi.BitsPerComponent();
        const int planes[] = { PLANAR_Y, PLANAR_V, PLANAR_U };

        for (int p = 0; p < 3; p++) {
            const int plane = planes[p];
            const unsigned char* srcp = src->GetReadPtr(plane);
            unsigned char* dstp = dst->GetWritePtr(plane);
            // Pitches stay in bytes and the height stays in rows; only the row
            // size is converted from bytes to samples.
            const int src_pitch = src->GetPitch(plane);
            const int dst_pitch = dst->GetPitch(plane);
            const int width = src->GetRowSize(plane) / pixel_size;
            const int height = src->GetHeight(plane);

            if (pixel_size == 1) {
                InvertPlane<unsigned char>(srcp, dstp, src_pitch, dst_pitch, width, height,
                                           static_cast<unsigned char>(255));
            }
            else if (pixel_size == 2) {
                // 10/12/14/16-bit samples: the peak value depends on the bit depth.
                InvertPlane<unsigned short>(srcp, dstp, src_pitch, dst_pitch, width, height,
                                            static_cast<unsigned short>((1 << bits) - 1));
            }
            else {
                // 32-bit float: luma is mirrored around the 0..1 range.
                // NOTE(review): AviSynth+ keeps float chroma centred on 0
                // (-0.5..+0.5), hence the 0.0f mirror for U/V - confirm against
                // the AviSynth+ version you target.
                InvertPlane<float>(srcp, dstp, src_pitch, dst_pitch, width, height,
                                   plane == PLANAR_Y ? 1.0f : 0.0f);
            }
        }
        return dst;
    }
};

AVSValue __cdecl Create_InvertNeg(AVSValue args, void* user_data, IScriptEnvironment* env)
{
    return new InvertNeg(args[0].AsClip(), env);
}

const AVS_Linkage* AVS_linkage = 0;

extern "C" __declspec(dllexport) const char* __stdcall AvisynthPluginInit3(IScriptEnvironment* env, const AVS_Linkage* const vectors)
{
    AVS_linkage = vectors;
    env->AddFunction("InvertNeg", "c", Create_InvertNeg, 0);
    return "InvertNeg sample plugin";
}
__________________
CQTGMC/CTools I come from nonsense land. I usually post under the effect of alchool and I don't think before writing, so don't get it personal, I didn't mean to. |
![]() |
![]() |
![]() |
#15 | Link |
Registered User
Join Date: Jul 2018
Posts: 989
|
Code:
height = src->GetHeight(planes[p]) * vi.ComponentSize(); With this line loop will run out of buffer very far (and will meet hardware memory protection error when run to the next 4kB memory page typically). It may temporarily work with very small frame sizes but cause memory corruption after actual buffer length. As for 'auto' pointers types - I not sure if compiler really knows how many types do you need to support and may be not compile 'real' 3 different versions of functions. May be it will take only types of functions of unsigned char from AVS headers. If you want to use 'templating' you need to declare template and 3 real functions of types unsigned char, unsigned short and float somewhere. As I remember first we declare 'template function' like https://github.com/DTL2020/mvtools/b...olation.h#L137 in the header. With pixel_t param as our bitdepth. Next make function implementation - https://github.com/DTL2020/mvtools/b...tion.cpp#L2791 using pixel_t as param of data type. It is example of 'universal HBD' C-function for all data types. See how src and dst accessed via pixel_t type: Code:
pixel_t* pctDst = reinterpret_cast<pixel_t*>(pDst); const pixel_t* pSrc; pSrc = reinterpret_cast<const pixel_t*>(_pSrc) ... pctDst[j * nDstPitch + i] = (pixel_t)fOut; In your 'one for all' HBD C-function you can use pixel_t type as conditional assignment of types of variables like https://github.com/DTL2020/mvtools/b...grainN.cpp#L98 And next declare 3 real functions of 3 types to compile - https://github.com/DTL2020/mvtools/b...tion.cpp#L4766 . So compiler will make 3 real copies of function to use. Next at the class constructor you select the required type of function depends on 'pixelsize': https://github.com/DTL2020/mvtools/b...VPlane.cpp#L86 . When AVS construct filtergraph it call class constructors and provide bitdepth to use. So at this point you can select the required version of function to use. And call function by its pointer at processing time: https://github.com/DTL2020/mvtools/b...Plane.cpp#L693 Last edited by DTL; 27th January 2023 at 23:17. |
![]() |
![]() |
![]() |
#16 | Link |
Registered User
Join Date: Feb 2016
Location: Nonsense land
Posts: 336
|
This works with all bits depth
![]() Code:
#include <windows.h>
#include <avisynth.h>

// Inverts one plane of `pixel_t` samples.
//
//   _srcp / _dstp         first byte of the source / destination plane
//   src_pitch / dst_pitch distance between rows, in SAMPLES (byte pitch
//                         divided by sizeof(pixel_t)), because the pointers are
//                         advanced as pixel_t*
//   height                rows in the plane
//   row_size              samples per row
//   bits                  bits per component; 32 selects the float formula
//                         (1.0f - x), anything else uses ((1 << bits) - 1) - x
template<typename pixel_t>
void Invert(const unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits)
{
    const pixel_t* srcp = reinterpret_cast<const pixel_t*>(_srcp);
    pixel_t* dstp = reinterpret_cast<pixel_t*>(_dstp);

    if (bits == 32) {
        for (int y = 0; y < height; y++) {
            for (int x = 0; x < row_size; x++) {
                dstp[x] = static_cast<pixel_t>(1.0f - srcp[x]);
            }
            dstp += dst_pitch;
            srcp += src_pitch;
        }
    }
    else {
        // Peak code value for the integer formats (255, 1023, ..., 65535).
        const int max_value = (1 << bits) - 1;
        for (int y = 0; y < height; y++) {
            for (int x = 0; x < row_size; x++) {
                dstp[x] = static_cast<pixel_t>(max_value - srcp[x]);
            }
            dstp += dst_pitch;
            srcp += src_pitch;
        }
    }
}

template void Invert<uint8_t>(const unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits);
template void Invert<uint16_t>(const unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits);
template void Invert<float>(const unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits);

// Sample filter: inverts the luma (Y) plane at any bit depth and passes the
// chroma planes through unchanged.
class InvertNeg : public GenericVideoFilter {
public:
    InvertNeg(PClip _child, IScriptEnvironment* env) : GenericVideoFilter(_child) { }

    PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment* env)
    {
        PVideoFrame src = child->GetFrame(n, env);
        PVideoFrame dst = env->NewVideoFrame(vi);

        const int pixel_size = vi.ComponentSize();
        const int bits = vi.BitsPerComponent();

        const unsigned char* srcp = src->GetReadPtr(PLANAR_Y);
        unsigned char* dstp = dst->GetWritePtr(PLANAR_Y);
        const int height = src->GetHeight(PLANAR_Y);
        // Invert<>() indexes typed pointers, so sizes and pitches are
        // converted from bytes to samples here.
        const int row_size = src->GetRowSize(PLANAR_Y) / pixel_size;
        const int src_pitch = src->GetPitch(PLANAR_Y) / pixel_size;
        const int dst_pitch = dst->GetPitch(PLANAR_Y) / pixel_size;

        if (pixel_size == 1) {
            Invert<uint8_t>(srcp, dstp, src_pitch, dst_pitch, height, row_size, bits);
        }
        else if (pixel_size == 2) {
            Invert<uint16_t>(srcp, dstp, src_pitch, dst_pitch, height, row_size, bits);
        }
        else if (pixel_size == 4) {
            Invert<float>(srcp, dstp, src_pitch, dst_pitch, height, row_size, bits);
        }

        // NewVideoFrame() returns a frame whose planes are not initialised, so
        // the chroma planes have to be copied or the output chroma is garbage.
        // NOTE(review): for Y-only clips the U/V row size is expected to be 0,
        // which should make these copies no-ops - confirm with your AviSynth
        // headers.
        const int chroma_planes[] = { PLANAR_U, PLANAR_V };
        for (int i = 0; i < 2; i++) {
            const int plane = chroma_planes[i];
            env->BitBlt(dst->GetWritePtr(plane), dst->GetPitch(plane),
                        src->GetReadPtr(plane), src->GetPitch(plane),
                        src->GetRowSize(plane), src->GetHeight(plane));
        }
        return dst;
    }
};

AVSValue __cdecl Create_InvertNeg(AVSValue args, void* user_data, IScriptEnvironment* env)
{
    return new InvertNeg(args[0].AsClip(), env);
}

const AVS_Linkage* AVS_linkage = 0;

extern "C" __declspec(dllexport) const char* __stdcall AvisynthPluginInit3(IScriptEnvironment* env, const AVS_Linkage* const vectors)
{
    AVS_linkage = vectors;
    env->AddFunction("InvertNeg", "c", Create_InvertNeg, 0);
    return "InvertNeg sample plugin";
}
__________________
CQTGMC/CTools I come from nonsense land. I usually post under the effect of alchool and I don't think before writing, so don't get it personal, I didn't mean to. |
![]() |
![]() |
![]() |
#17 | Link |
Registered User
Join Date: Jul 2018
Posts: 989
|
This is an example of OpenMP internal MT (a 'threads' user control parameter, default 1) and SIMD up to AVX512, for float32 processing only (to keep the example shorter).
Code:
#include <windows.h>
#include "include\avisynth.h"
#include <immintrin.h>

// Row kernel, AVX-512: dst = 1.0f - src for `count` floats. Full groups of
// 64 floats go through four 512-bit registers; the leftover samples are
// handled one by one.
template<typename pixel_t>
static inline void InvertFloatRow_AVX512(float* src, float* dst, int count)
{
    const int vec_end = count - (count % 64);
    const __m512 ones = _mm512_set1_ps(1.0f);

    int64_t i = 0;
    for (; i < vec_end; i += 64) {
        // Unaligned loads/stores: nothing here guarantees the row alignment.
        __m512 a = _mm512_loadu_ps(src + i);
        __m512 b = _mm512_loadu_ps(src + i + 16);
        __m512 c = _mm512_loadu_ps(src + i + 32);
        __m512 d = _mm512_loadu_ps(src + i + 48);
        a = _mm512_sub_ps(ones, a);
        b = _mm512_sub_ps(ones, b);
        c = _mm512_sub_ps(ones, c);
        d = _mm512_sub_ps(ones, d);
        _mm512_storeu_ps(dst + i, a);
        _mm512_storeu_ps(dst + i + 16, b);
        _mm512_storeu_ps(dst + i + 32, c);
        _mm512_storeu_ps(dst + i + 48, d);
    }
    for (; i < count; ++i) {
        dst[i] = (pixel_t)(1.0f - src[i]);
    }
}

// Row kernel, AVX: same as above with four 256-bit registers (32 floats per step).
template<typename pixel_t>
static inline void InvertFloatRow_AVX(float* src, float* dst, int count)
{
    const int vec_end = count - (count % 32);
    const __m256 ones = _mm256_set1_ps(1.0f);

    int64_t i = 0;
    for (; i < vec_end; i += 32) {
        __m256 a = _mm256_loadu_ps(src + i);
        __m256 b = _mm256_loadu_ps(src + i + 8);
        __m256 c = _mm256_loadu_ps(src + i + 16);
        __m256 d = _mm256_loadu_ps(src + i + 24);
        a = _mm256_sub_ps(ones, a);
        b = _mm256_sub_ps(ones, b);
        c = _mm256_sub_ps(ones, c);
        d = _mm256_sub_ps(ones, d);
        _mm256_storeu_ps(dst + i, a);
        _mm256_storeu_ps(dst + i + 8, b);
        _mm256_storeu_ps(dst + i + 16, c);
        _mm256_storeu_ps(dst + i + 24, d);
    }
    for (; i < count; ++i) {
        dst[i] = (pixel_t)(1.0f - src[i]);
    }
}

// Row kernel, SSE: same as above with four 128-bit registers (16 floats per step).
template<typename pixel_t>
static inline void InvertFloatRow_SSE(float* src, float* dst, int count)
{
    const int vec_end = count - (count % 16);
    const __m128 ones = _mm_set1_ps(1.0f);

    int64_t i = 0;
    for (; i < vec_end; i += 16) {
        __m128 a = _mm_loadu_ps(src + i);
        __m128 b = _mm_loadu_ps(src + i + 4);
        __m128 c = _mm_loadu_ps(src + i + 8);
        __m128 d = _mm_loadu_ps(src + i + 12);
        a = _mm_sub_ps(ones, a);
        b = _mm_sub_ps(ones, b);
        c = _mm_sub_ps(ones, c);
        d = _mm_sub_ps(ones, d);
        _mm_storeu_ps(dst + i, a);
        _mm_storeu_ps(dst + i + 4, b);
        _mm_storeu_ps(dst + i + 8, c);
        _mm_storeu_ps(dst + i + 12, d);
    }
    for (; i < count; ++i) {
        dst[i] = (pixel_t)(1.0f - src[i]);
    }
}

// Inverts one plane of `pixel_t` samples.
//
// Pitches and row_size are in samples (the pointers are advanced as
// pixel_t*). When bits == 32 the rows are spread over `threads` OpenMP
// threads and each row uses the widest float kernel allowed by `cpuFlags`
// (AVX-512, then AVX, then SSE, then plain C). Any other bit depth runs a
// single-threaded scalar loop computing ((1 << bits) - 1) - x.
template<typename pixel_t>
void Invert(unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits, int threads, int cpuFlags)
{
    pixel_t* const dst_base = reinterpret_cast<pixel_t*>(_dstp);
    pixel_t* const src_base = reinterpret_cast<pixel_t*>(_srcp);

    if (bits != 32) {
        // Integer formats: plain scalar reference path.
        const int peak = (1 << bits) - 1;
        for (int row = 0; row < height; row++) {
            pixel_t* out = dst_base + row * dst_pitch;
            pixel_t* in = src_base + row * src_pitch;
            for (int col = 0; col < row_size; col++) {
                out[col] = peak - in[col];
            }
        }
        return;
    }

    // Each iteration derives its own row pointers from `row`, so the threads
    // share no mutable state.
    #pragma omp parallel for num_threads(threads)
    for (int row = 0; row < height; row++) {
        pixel_t* out = dst_base + row * dst_pitch;
        pixel_t* in = src_base + row * src_pitch;

        if (cpuFlags & CPUF_AVX512F) {
            InvertFloatRow_AVX512<pixel_t>(reinterpret_cast<float*>(in), reinterpret_cast<float*>(out), row_size);
        }
        else if (cpuFlags & CPUF_AVX) {
            InvertFloatRow_AVX<pixel_t>(reinterpret_cast<float*>(in), reinterpret_cast<float*>(out), row_size);
        }
        else if (cpuFlags & CPUF_SSE) {
            InvertFloatRow_SSE<pixel_t>(reinterpret_cast<float*>(in), reinterpret_cast<float*>(out), row_size);
        }
        else {
            // No usable SIMD flag: C reference.
            for (int col = 0; col < row_size; col++) {
                out[col] = (pixel_t)(1.0f - in[col]);
            }
        }
    }
}

template void Invert<uint8_t>(unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits, int threads, int cpuFlags);
template void Invert<uint16_t>(unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits, int threads, int cpuFlags);
template void Invert<float>(unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits, int threads, int cpuFlags);

// Sample filter: inverts the Y plane. `threads` is the OpenMP thread count
// used for the float path; the CPU flags are read once at construction.
class InvertNeg : public GenericVideoFilter {
    int threads;
    int _cpuFlags;

public:
    InvertNeg(PClip _child, int threads_, IScriptEnvironment* env)
        : GenericVideoFilter(_child), threads(threads_)
    {
        _cpuFlags = env->GetCPUFlags();
    }

    PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment* env)
    {
        PVideoFrame dst = env->NewVideoFrame(vi);
        PVideoFrame src = child->GetFrame(n, env);

        const unsigned char* read_ptr = src->GetReadPtr(PLANAR_Y);
        unsigned char* write_ptr = dst->GetWritePtr(PLANAR_Y);
        const int rows = src->GetHeight(PLANAR_Y);
        // Byte quantities converted to sample counts for the typed kernels.
        const int samples_per_row = src->GetRowSize(PLANAR_Y) / vi.ComponentSize();
        const int src_stride = src->GetPitch(PLANAR_Y) / vi.ComponentSize();
        const int dst_stride = dst->GetPitch(PLANAR_Y) / vi.ComponentSize();

        // Invert<>() takes a non-const source pointer, hence the const_cast.
        unsigned char* src_bytes = const_cast<unsigned char*>(read_ptr);

        switch (vi.ComponentSize()) {
        case 1:
            Invert<uint8_t>(src_bytes, write_ptr, src_stride, dst_stride, rows, samples_per_row, vi.BitsPerComponent(), threads, _cpuFlags);
            break;
        case 2:
            Invert<uint16_t>(src_bytes, write_ptr, src_stride, dst_stride, rows, samples_per_row, vi.BitsPerComponent(), threads, _cpuFlags);
            break;
        case 4:
            Invert<float>(src_bytes, write_ptr, src_stride, dst_stride, rows, samples_per_row, vi.BitsPerComponent(), threads, _cpuFlags);
            break;
        default:
            break;
        }
        return dst;
    }
};

AVSValue __cdecl Create_InvertNeg(AVSValue args, void* user_data, IScriptEnvironment* env)
{
    return new InvertNeg(args[0].AsClip(), args[1].AsInt(1), env);
}

const AVS_Linkage* AVS_linkage = 0;

extern "C" __declspec(dllexport) const char* __stdcall AvisynthPluginInit3(IScriptEnvironment* env, const AVS_Linkage* const vectors)
{
    AVS_linkage = vectors;
    env->AddFunction("InvertNeg", "c[threads]i", Create_InvertNeg, 0);
    return "InvertNeg sample plugin";
}
// Note from the post: this sample processes Y-only formats; the U/V planes are not copied.
A 'one for all' processing function quickly becomes very complex when it has to support different bit depths and SIMD families, so it is better to write separate functions and select the right one (by function pointer) at plugin init. Edit: corrected Y8 to Y-only format. Last edited by DTL; 28th January 2023 at 19:43. |
![]() |
![]() |
![]() |
#18 | Link |
Registered User
Join Date: Feb 2016
Location: Nonsense land
Posts: 336
|
Inductive logic is not demostrative, but since I'm ignorant (
![]() Code:
// XORs every whole 256-byte group at the start of a row with `mask`, four
// 512-bit registers per step, and returns the number of BYTES processed (a
// multiple of 256, possibly 0). The caller finishes the rest of the row.
// Integer pixels are plain bit patterns, so they are loaded with the integer
// intrinsics; XOR with the peak value equals (peak - x) for every in-range
// sample and needs nothing beyond AVX-512F.
static inline int XorBlocks_AVX512(const unsigned char* src, unsigned char* dst, int byte_count, __m512i mask)
{
    const int vector_bytes = byte_count - (byte_count % 256);
    for (int offset = 0; offset < vector_bytes; offset += 256) {
        __m512i v0 = _mm512_loadu_si512(src + offset);
        __m512i v1 = _mm512_loadu_si512(src + offset + 64);
        __m512i v2 = _mm512_loadu_si512(src + offset + 128);
        __m512i v3 = _mm512_loadu_si512(src + offset + 192);
        v0 = _mm512_xor_si512(v0, mask);
        v1 = _mm512_xor_si512(v1, mask);
        v2 = _mm512_xor_si512(v2, mask);
        v3 = _mm512_xor_si512(v3, mask);
        _mm512_storeu_si512(dst + offset, v0);
        _mm512_storeu_si512(dst + offset + 64, v1);
        _mm512_storeu_si512(dst + offset + 128, v2);
        _mm512_storeu_si512(dst + offset + 192, v3);
    }
    return vector_bytes;
}

// AVX-512 inversion of one plane, rows distributed over `threads` OpenMP threads.
//
//   _srcp / _dstp         first byte of the source / destination plane
//   src_pitch / dst_pitch row distance in SAMPLES of pixel_t
//   height, row_size      rows in the plane, samples per row
//   bits                  32 -> float (1.0f - x); 10/12/14/16 -> 16-bit
//                         samples; anything else is treated as 8-bit
//
// The branch taken must match pixel_t (float for 32, uint16_t for 10..16,
// uint8_t otherwise); the row pointers are computed with the pixel_t pitch and
// then reinterpreted, so every branch compiles for every instantiation.
template<typename pixel_t>
void Invert_AVX512(const unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits, int threads)
{
    const pixel_t* srcp = reinterpret_cast<const pixel_t*>(_srcp);
    pixel_t* dstp = reinterpret_cast<pixel_t*>(_dstp);

    if (bits == 32)
    {
        const int row_size_mod64 = row_size - (row_size % 64);

        #pragma omp parallel for num_threads(threads)
        for (int y = 0; y < height; y++)
        {
            const float* src_row = reinterpret_cast<const float*>(srcp + y * src_pitch);
            float* dst_row = reinterpret_cast<float*>(dstp + y * dst_pitch);
            const __m512 vector_one = _mm512_set1_ps(1.0f);

            // 4 registers x 16 floats = 64 samples per step.
            for (int column = 0; column < row_size_mod64; column += 64)
            {
                __m512 v0 = _mm512_loadu_ps(src_row + column);
                __m512 v1 = _mm512_loadu_ps(src_row + column + 16);
                __m512 v2 = _mm512_loadu_ps(src_row + column + 32);
                __m512 v3 = _mm512_loadu_ps(src_row + column + 48);
                v0 = _mm512_sub_ps(vector_one, v0);
                v1 = _mm512_sub_ps(vector_one, v1);
                v2 = _mm512_sub_ps(vector_one, v2);
                v3 = _mm512_sub_ps(vector_one, v3);
                _mm512_storeu_ps(dst_row + column, v0);
                _mm512_storeu_ps(dst_row + column + 16, v1);
                _mm512_storeu_ps(dst_row + column + 32, v2);
                _mm512_storeu_ps(dst_row + column + 48, v3);
            }
            // Scalar tail for the samples that do not fill a whole step.
            for (int column = row_size_mod64; column < row_size; column++)
            {
                dst_row[column] = 1.0f - src_row[column];
            }
        }
    }
    else if (bits == 16 || bits == 14 || bits == 12 || bits == 10)
    {
        const uint16_t max_value = static_cast<uint16_t>((1 << bits) - 1);
        // The peak value replicated into both 16-bit halves of a 32-bit lane.
        const unsigned int max_pair = (static_cast<unsigned int>(max_value) << 16) | max_value;

        #pragma omp parallel for num_threads(threads)
        for (int y = 0; y < height; y++)
        {
            const uint16_t* src_row = reinterpret_cast<const uint16_t*>(srcp + y * src_pitch);
            uint16_t* dst_row = reinterpret_cast<uint16_t*>(dstp + y * dst_pitch);
            const __m512i mask = _mm512_set1_epi32(static_cast<int>(max_pair));

            // 256 bytes = 128 16-bit samples per step.
            const int done = XorBlocks_AVX512(reinterpret_cast<const unsigned char*>(src_row),
                                              reinterpret_cast<unsigned char*>(dst_row),
                                              row_size * 2, mask) / 2;
            for (int column = done; column < row_size; column++)
            {
                dst_row[column] = static_cast<uint16_t>(src_row[column] ^ max_value);
            }
        }
    }
    else
    {
        #pragma omp parallel for num_threads(threads)
        for (int y = 0; y < height; y++)
        {
            const uint8_t* src_row = reinterpret_cast<const uint8_t*>(srcp + y * src_pitch);
            uint8_t* dst_row = reinterpret_cast<uint8_t*>(dstp + y * dst_pitch);
            const __m512i mask = _mm512_set1_epi32(-1); // 0xFF in every byte

            // 256 bytes = 256 8-bit samples per step.
            const int done = XorBlocks_AVX512(src_row, dst_row, row_size, mask);
            for (int column = done; column < row_size; column++)
            {
                dst_row[column] = static_cast<uint8_t>(src_row[column] ^ 255);
            }
        }
    }
}

template void Invert_AVX512<uint8_t>(const unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits, int threads);
template void Invert_AVX512<uint16_t>(const unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits, int threads);
template void Invert_AVX512<float>(const unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits, int threads);
// Code: (forum marker - the second draft of Invert_AVX512 follows)
// AVX-512 inversion of one plane (second draft: 64 samples per step at every
// bit depth), rows distributed over `threads` OpenMP threads.
//
//   _srcp / _dstp         first byte of the source / destination plane
//   src_pitch / dst_pitch row distance in SAMPLES of pixel_t
//   height, row_size      rows in the plane, samples per row
//   bits                  32 -> float (1.0f - x); 10/12/14/16 -> 16-bit
//                         samples; anything else is treated as 8-bit
//
// Integer samples are loaded/stored with the integer intrinsics and inverted
// with XOR against the peak value: for in-range samples this equals
// (peak - x), it matches the scalar tail, and it only needs AVX-512F. Row
// pointers are computed with the pixel_t pitch and then reinterpreted, so
// every branch compiles for every instantiation.
template<typename pixel_t>
void Invert_AVX512(const unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits, int threads)
{
    const pixel_t* srcp = reinterpret_cast<const pixel_t*>(_srcp);
    pixel_t* dstp = reinterpret_cast<pixel_t*>(_dstp);

    // Every branch consumes 64 samples per vector step.
    const int row_size_mod64 = row_size - (row_size % 64);

    if (bits == 32)
    {
        #pragma omp parallel for num_threads(threads)
        for (int y = 0; y < height; y++)
        {
            const float* src_float_ptr = reinterpret_cast<const float*>(srcp + y * src_pitch);
            float* dst_float_ptr = reinterpret_cast<float*>(dstp + y * dst_pitch);
            const __m512 vector_max_512 = _mm512_set1_ps(1.0f);

            // 4 registers x 16 floats.
            for (int column = 0; column < row_size_mod64; column += 64)
            {
                __m512 vector_src_00 = _mm512_loadu_ps(src_float_ptr);
                __m512 vector_src_16 = _mm512_loadu_ps(src_float_ptr + 16);
                __m512 vector_src_32 = _mm512_loadu_ps(src_float_ptr + 32);
                __m512 vector_src_48 = _mm512_loadu_ps(src_float_ptr + 48);
                vector_src_00 = _mm512_sub_ps(vector_max_512, vector_src_00);
                vector_src_16 = _mm512_sub_ps(vector_max_512, vector_src_16);
                vector_src_32 = _mm512_sub_ps(vector_max_512, vector_src_32);
                vector_src_48 = _mm512_sub_ps(vector_max_512, vector_src_48);
                _mm512_storeu_ps(dst_float_ptr, vector_src_00);
                _mm512_storeu_ps(dst_float_ptr + 16, vector_src_16);
                _mm512_storeu_ps(dst_float_ptr + 32, vector_src_32);
                _mm512_storeu_ps(dst_float_ptr + 48, vector_src_48);
                src_float_ptr += 64;
                dst_float_ptr += 64;
            }
            // Scalar tail.
            for (int column = row_size_mod64; column < row_size; column++)
            {
                *dst_float_ptr = 1.0f - *src_float_ptr;
                dst_float_ptr++;
                src_float_ptr++;
            }
        }
    }
    else if (bits == 16 || bits == 14 || bits == 12 || bits == 10)
    {
        const uint16_t MAX = static_cast<uint16_t>((1 << bits) - 1);
        // The peak value replicated into both 16-bit halves of a 32-bit lane.
        const unsigned int max_pair = (static_cast<unsigned int>(MAX) << 16) | MAX;

        #pragma omp parallel for num_threads(threads)
        for (int y = 0; y < height; y++)
        {
            const uint16_t* src_uint16_t_ptr = reinterpret_cast<const uint16_t*>(srcp + y * src_pitch);
            uint16_t* dst_uint16_t_ptr = reinterpret_cast<uint16_t*>(dstp + y * dst_pitch);
            const __m512i vector_max_512 = _mm512_set1_epi32(static_cast<int>(max_pair));

            // 2 registers x 32 16-bit samples.
            for (int column = 0; column < row_size_mod64; column += 64)
            {
                __m512i vector_src_00 = _mm512_loadu_si512(src_uint16_t_ptr);
                __m512i vector_src_32 = _mm512_loadu_si512(src_uint16_t_ptr + 32);
                vector_src_00 = _mm512_xor_si512(vector_src_00, vector_max_512);
                vector_src_32 = _mm512_xor_si512(vector_src_32, vector_max_512);
                _mm512_storeu_si512(dst_uint16_t_ptr, vector_src_00);
                _mm512_storeu_si512(dst_uint16_t_ptr + 32, vector_src_32);
                src_uint16_t_ptr += 64;
                dst_uint16_t_ptr += 64;
            }
            // Scalar tail.
            for (int column = row_size_mod64; column < row_size; column++)
            {
                *dst_uint16_t_ptr = static_cast<uint16_t>(*src_uint16_t_ptr ^ MAX);
                dst_uint16_t_ptr++;
                src_uint16_t_ptr++;
            }
        }
    }
    else
    {
        #pragma omp parallel for num_threads(threads)
        for (int y = 0; y < height; y++)
        {
            const uint8_t* src_uint8_t_ptr = reinterpret_cast<const uint8_t*>(srcp + y * src_pitch);
            uint8_t* dst_uint8_t_ptr = reinterpret_cast<uint8_t*>(dstp + y * dst_pitch);
            const __m512i vector_max_512 = _mm512_set1_epi32(-1); // 0xFF in every byte

            // 1 register x 64 8-bit samples.
            for (int column = 0; column < row_size_mod64; column += 64)
            {
                __m512i vector_src_00 = _mm512_loadu_si512(src_uint8_t_ptr);
                vector_src_00 = _mm512_xor_si512(vector_src_00, vector_max_512);
                _mm512_storeu_si512(dst_uint8_t_ptr, vector_src_00);
                src_uint8_t_ptr += 64;
                dst_uint8_t_ptr += 64;
            }
            // Scalar tail.
            for (int column = row_size_mod64; column < row_size; column++)
            {
                *dst_uint8_t_ptr = static_cast<uint8_t>(*src_uint8_t_ptr ^ 255);
                dst_uint8_t_ptr++;
                src_uint8_t_ptr++;
            }
        }
    }
}

template void Invert_AVX512<uint8_t>(const unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits, int threads);
template void Invert_AVX512<uint16_t>(const unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits, int threads);
template void Invert_AVX512<float>(const unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits, int threads);
__________________
CQTGMC/CTools I come from nonsense land. I usually post under the effect of alchool and I don't think before writing, so don't get it personal, I didn't mean to. Last edited by Ceppo; 28th January 2023 at 19:01. |
![]() |
![]() |
![]() |
#19 | Link |
Registered User
Join Date: Jul 2018
Posts: 989
|
Code:
float* line_dstp = dstp + y * dst_pitch; float* line_srcp = srcp + y * src_pitch; float* src_float_ptr = (float*)line_srcp; float* dst_float_ptr = (float*)line_dstp; Code:
float* local_dstp = (float*)(dstp + y * dst_pitch); float* local_srcp = (float*)(srcp + y * src_pitch); For Code:
else if(bits == 16 || bits == 14 || bits == 12 || bits == 10) For 16bit integer AVX512 SIMD op you need to use : __m512i type, _mm512_loadu_si512() load 512bit unaligned for load, _mm512_subs_epu16() for 16bit subtraction _mm512_storeu_si512() for unaligned store of 512bit dataword _mm512_set1_epi16() for setting 16bit unsigned short MAX value to all 16bit positions of 512bit dataword. For 8bit: _mm512_set1_epi8() and _mm512_subs_epu8() with same integer 512bit type and load/store. Also columns loaded to one 512bit dataword is 512/16=32 for 10..16bit and 512/8=64. So even 4x512bits storage process 128..256 columns per pass. The residual columns for scalar proc may be too large with such big main SIMD loops so may be good to make some immediate ending process with some like 512 or 256bit single dataword after massive main proc before last columns with scalar operation. So each row processing will be even 3 stages: 1. Main SIMD processing with large load and store to registerfile (hundreds or even thousands columns per loop pass, with 30 AVX512 'registers' and 64 8bit columns per 'register' load you can load 30*64=1920 columns at single load and store op). 2. Some medium size SIMD post-processing of dozens columns like 1 of 512 or lower register per loop pass. Some residual columns that not as much as required to fit in main 1. loop. 3. Last columns in C-only scalar processing. It looks I quickly fix broken VisualStudio 2019 installation at my work place with uninstalling SDKs/DDKs after lots of driver development kits installation and will try to make 8 and 10..16 bit example too in a few hours later. " I did the AVX512 template" Now when you make separate functions for SIMD architectures it may be no need to make it 'templated' because you have 'bits' param passed and know conversion of bits-values to pointers types. And different bits processing is completely separated inside each SIMD-function. You may left 'templating' only for C-reference/only part of program. 
So you can make Code:
void Invert_AVX512(const unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits, int threads) { if (bits == 32) { float* dstp = reinterpret_cast<float*>(_dstp); float* srcp = reinterpret_cast<float*>(_srcp); #pragma omp parallel for num_threads(threads) <make 32bit load and proc and store> } else if(bits == 16 || bits == 14 || bits == 12 || bits == 10) { uint16_t* dstp = reinterpret_cast<uint16_t*>(_dstp); uint16_t* srcp = reinterpret_cast<uint16_t*>(_srcp); #pragma omp parallel for num_threads(threads) <make 16bit load and proc and store> else { uint8_t* dstp = reinterpret_cast<uint8_t*>(_dstp); uint8_t* srcp = reinterpret_cast<uint8_t*>(_srcp); #pragma omp parallel for num_threads(threads) <make 8bit load and proc and store> } } Last edited by DTL; 28th January 2023 at 21:15. |
![]() |
![]() |
![]() |
#20 | Link |
Registered User
Join Date: Feb 2016
Location: Nonsense land
Posts: 336
|
Thanks for the tips;
I noticed here https://www.laruence.com/sse/# that for AVX and SSE some functions do not have an __m256i / __m128i variant; how do you handle the 16-bit and 8-bit cases?
__________________
CQTGMC/CTools I come from nonsense land. I usually post under the effect of alchool and I don't think before writing, so don't get it personal, I didn't mean to. |
![]() |
![]() |
![]() |
Thread Tools | Search this Thread |
Display Modes | |
|
|