HBD/OPTIMIZATION Request :) [Archive]

Ceppo

25th January 2023, 22:45

I did this simple sharpening filter who does a 3x3 approximated gaussian blur and adds back the difference, reducing it by heuristic means to not get halo.

Now my request is: can someone HBD this simple plugin and give me some tips on how to make it faster?

#pragma once
#include <cmath>
#include <windows.h>
#include <avisynth.h>
using namespace std;

struct CSharpenFilter
{
PVideoFrame dst;
PVideoFrame src;
int height;
int row_size;
int src_pitch;
int dst_pitch;
unsigned char* dstp;
const unsigned char* srcc;
const unsigned char* srcp;
const unsigned char* srcn;

void GetFrame(PClip child, int n, VideoInfo vi, IScriptEnvironment* env)
{
src = child->GetFrame(n,env);
dst = env->NewVideoFrame(vi);
}
void GetPlaneY()
{
dstp = dst->GetWritePtr(PLANAR_Y);
dst_pitch = src->GetPitch(PLANAR_Y);

srcc = src->GetReadPtr(PLANAR_Y);
srcp = src->GetReadPtr(PLANAR_Y);
srcn = src->GetReadPtr(PLANAR_Y);
src_pitch = src->GetPitch(PLANAR_Y);
height = src->GetHeight(PLANAR_Y);
row_size = src->GetRowSize(PLANAR_Y);
}
void GetPlaneU()
{
dstp = dst->GetWritePtr(PLANAR_U);
dst_pitch = src->GetPitch(PLANAR_U);

srcc = src->GetReadPtr(PLANAR_U);
srcp = src->GetReadPtr(PLANAR_U);
srcn = src->GetReadPtr(PLANAR_U);
src_pitch = src->GetPitch(PLANAR_U);
height = src->GetHeight(PLANAR_U);
row_size = src->GetRowSize(PLANAR_U);
}
void GetPlaneV()
{
dstp = dst->GetWritePtr(PLANAR_V);
dst_pitch = src->GetPitch(PLANAR_V);

srcc = src->GetReadPtr(PLANAR_V);
srcp = src->GetReadPtr(PLANAR_V);
srcn = src->GetReadPtr(PLANAR_V);
src_pitch = src->GetPitch(PLANAR_V);
height = src->GetHeight(PLANAR_V);
row_size = src->GetRowSize(PLANAR_V);
}
void CopyPlane()
{
int x, y;

for (y = 0; y < height; y++)
{
for (x = 0; x < row_size; x++)
{
dstp[x] = srcc[x];
}
dstp += dst_pitch;
srcc += src_pitch;
}
}
//CORE FILTER SUPPORT FUNCTIONS
int AddDiff(int x, int y, int nt, int mode)
{
int i, j, k;

//GETS DIFF, ABS DIFF, SIGN DIFF;
i = x - y;
j = abs(i);
k = (i > 0) - (i < 0);

//SET TO 0 LOW FREQUENCY;
i = j < nt ? 0 : i;
j = j < nt ? 0 : j;

//REDUCE DIFFERENCE;
i = mode > 0 ? (int)sqrt(j) : i;
i = mode > 1 ? (int)atan(j) * i : i;

//ADD DIFFERENCE
i = i * k;
i = min(255, x + i);
i = max( 0, i);

return i;
}
void CoreFilter(int nt, int mode)
{
int x, y, i, j, k;

//START PITCH;
srcp -= src_pitch;
srcn += src_pitch;

//FILTER FIRST ROW;
for (x = 0; x < row_size; x++)
{
j = max(0, x - 1);
k = min(row_size - 1, x + 1);

i = srcc[j] + srcc[x] * 2 + srcc[k];
i += srcc[j] * 2 + srcc[x] * 4 + srcc[k] * 2;
i += srcn[j] + srcn[x] * 2 + srcn[k];
i = (int)(i / 16.0f + 0.5f);
dstp[x] = AddDiff(srcc[x], i, nt, mode);
}
dstp += dst_pitch;
srcp += src_pitch;
srcc += src_pitch;
srcn += src_pitch;

//FILTER MAIN ROWS;
for (y = 1; y < height - 1; y++)
{
for (x = 0; x < row_size; x++)
{
j = max(0, x - 1);
k = min(row_size - 1, x + 1);

i = srcp[j] + srcp[x] * 2 + srcp[k];
i += srcc[j] * 2 + srcc[x] * 4 + srcc[k] * 2;
i += srcn[j] + srcn[x] * 2 + srcn[k];
i = (int)(i / 16.0f + 0.5f);
dstp[x] = AddDiff(srcc[x], i, nt, mode);
}
dstp += dst_pitch;
srcp += src_pitch;
srcc += src_pitch;
srcn += src_pitch;
}

//FILTER LAST ROW;
for (x = 0; x < row_size; x++)
{
j = max(0, x - 1);
k = min(row_size - 1, x + 1);

i = srcp[j] + srcp[x] * 2 + srcp[k];
i += srcc[j] * 2 + srcc[x] * 4 + srcc[k] * 2;
i += srcc[j] + srcc[x] * 2 + srcc[k];
i = (int)(i / 16.0f + 0.5f);
dstp[x] = AddDiff(srcc[x], i, nt, mode);
}
}
};

class CSharpen : public GenericVideoFilter
{
int nt, mode;
bool Y, U, V;
public:
CSharpen(PClip _child, int _nt, int _mode, bool _Y, bool _U, bool _V, IScriptEnvironment* env) : GenericVideoFilter(_child), nt(_nt), mode(_mode), Y(_Y), U(_U), V(_V)
{
if (!vi.IsYUV())
{
env->ThrowError("CSharpen: supported colorspaces are Y8, YV12, YV16, YV24!");
}
else if (nt < 0 || nt > 255)
{
env->ThrowError("CSharpen: nt avaible range is [0, 255]!");
}
else if (mode < 0 || mode > 2)
{
env->ThrowError("CSharpen: mode avaible mode values are 0, 1, 2!");
}
}
PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment* env)
{

CSharpenFilter Frame;
Frame.GetFrame(child, n, vi, env);

if (Y) Frame.GetPlaneY();
if (Y) Frame.CoreFilter(nt, mode);

if (!vi.IsY8())
{
Frame.GetPlaneU();
U ? Frame.CoreFilter(nt, mode) : Frame.CopyPlane();
Frame.GetPlaneV();
V ? Frame.CoreFilter(nt, mode) : Frame.CopyPlane();
}

return Frame.dst;
}
};

const AVS_Linkage* AVS_linkage = 0;

AVSValue __cdecl Create_CSharpen(AVSValue args, void* user_data, IScriptEnvironment* env)
{
return new CSharpen(args[0].AsClip(),args[1].AsInt(3),args[2].AsInt(2),args[3].AsBool(true),args[4].AsBool(false),args[5].AsBool(false),env);
}

extern "C" __declspec(dllexport) const char* __stdcall AvisynthPluginInit3(IScriptEnvironment * env, const AVS_Linkage* const vectors)
{
AVS_linkage = vectors;
env->AddFunction("CSharpen", "c[nt]i[mode]i[Y]b[U]b[V]b", Create_CSharpen, 0);
return "CSharpen";
}

kedautinh12

26th January 2023, 03:17

Ideas from Asd-g
https://github.com/Asd-g/AviSynthPlus-Scripts/issues/15#issuecomment-1404461580

DTL

26th January 2023, 12:21

To make it faster onCPU you need to use SIMD co-processor.

So in the plugin init you put some selector of the main processing function for each combination of input params and SIMD co-processor available. Like bitdepth/SIMDfamily/mode/etc. And call selected function as processing each frame at GetFrame(). Example of selector is https://github.com/pinterf/mvtools/blob/d8bdff7e02c15a28dcc6e9ef2ebeaa9d16cc1f56/Sources/PlaneOfBlocks.cpp#L3756

First you put to SIMD the 3x3 blocks convolution:

i = srcp[j] + srcp[x] * 2 + srcp[k];
i += srcc[j] * 2 + srcc[x] * 4 + srcc[k] * 2;
i += srcn[j] + srcn[x] * 2 + srcn[k];
i = (int)(i / 16.0f + 0.5f);

To use SIMD more effectively you process several blocks in single pass so you modify external loop to:

for (x = 0; x < row_size; x+=iBlocksPerPass)

And process end non-integer x/iBlocksPerPass blocks in ending addition (in SIMD or simple C as today).

3x3 blocks with +1 x stepping (x++ in original program) are overlapping so it is better to use less SIMD register file loads from cache to process several horizontal blocks in single SIMD pass to save time. Also the coefficients for convolution are constants for all blocks. Number of sequential blocks to process depends on the 'width' of dataword for SIMD coprocessor used so different for each SSE2/AVX2/AVX512 functions versions. Also with 'large' architectures like AVX512 with 32 directly addressed by instructions datawords (of 512bits each) available it may be possible to process several sets of blocks at single pass. So iBlocksPerPass for AVX512 may be much larger in compare with possibly 2x difference between SSE2 and AVX2 versions.

At the output of SIMD blocks convolution you get vector of i[n] elements from each block and better to replace AddDiff(srcc[x], i, nt, mode); function to SIMD input and output (final saving) too.

The AddDiff() is much more complex for SIMD co-processor so the only help is approximate SQRT instruction available. No atan(). So mode=atan() will mostly probably will be C-only and much slower. Because saving values depend on bitdepth it is better to put storing to memory inside function so it better to be AddDiffAndStoreResultVector() or even Process_i_vector(). Also the virtual vector or 'return' values of SIMD blocks convolution will be 'real SIMD register' so it is good to check if compiler really use 'register' transfer and not cache store-load or better not use AddDiff call at all and inline it manually in the main SIMD program. Inlining may require to make more versions of separated functions for main selector.
So to support 8/16/32 bitdepth and SSE2/AVX2/AVX512(F) it is required to make at least 3x3=9 versions of main processing to select. If adding 'mode' of 3 - it is 3x3x3 functions. Or 3x3x2 if making condition

i = mode > 0 ? (int)sqrt(j) : i;

inside function (a bit slower).
Your current program loaded with lots of conditional branching - it make program text much smaller but chip execution slower. If possible all non-realtime conditional branching with some rare changing params or even single set params for all frame pass better to make separate program blocks. It make program text much larger but helps to execution speed. Though with compiler analyser like godbolt you can try to check if some wise enough compiler can use runtime conditional instructions for simple enough tasks like min()/max() instead of conditional jumps.

Ceppo

26th January 2023, 17:53

Thanks DTL! I will update with your tips in the evening! I will post the update tomorrow :)

DTL

26th January 2023, 18:05

Also for integer samples

i = (int)(i / 16.0f + 0.5f);

may be replaced with integer shift without converting to float (and slow float division that any good compiler will replace with multiplication to float constant I hope) and back:

i = i >> 4;

Same processing in the SIMD functions in the future - make logical shift 4 bit to the right (if using unsigned ints or arithmetic sign-extending shift to the right if value can be negative). If you make sort of gaussian blur - the sum looks like always be positive so you can use unsigned integers (and have +1 bit to keep from low bits overflow in dense SIMD calculations).
For 8bit unsigned samples up to 255

i = srcp[j] + srcp[x] * 2 + srcp[k];
i += srcc[j] * 2 + srcc[x] * 4 + srcc[k] * 2;
i += srcn[j] + srcn[x] * 2 + srcn[k];

sum looks like may have possible max of (1+2+1+2+4+2+1+2+1)*255=4080 so you can use 16bit unsigned integers as accumulator for the total block convolution in SIMD processing.

Float calculation you will need only in the float32 samples function. Also +0.5f for better rounding not needed for floats.
For float32 samples version it is much better to use something like

const float my1div16=1.0f / 16.0f;
i *= my1div16;

or may be better

i *= 0.0625f; // 1.0f / 16.0f;

Or pray to gods of C compiler to do same thing for you in the optimized release build if you write direct division to float constant like i /= 16.0f.

Ceppo

26th January 2023, 21:36

Almost 3x speed:

#pragma once
#include <windows.h>
#include <avisynth.h>
using namespace std;

struct CSharpenFilter
{
VideoInfo vi;
PVideoFrame dst;
PVideoFrame src;
int height;
int row_size;
int src_pitch;
int dst_pitch;
unsigned char* dstp;
const unsigned char* srcc;
const unsigned char* srcp;
const unsigned char* srcn;

void GetFrame(PClip child, int n, VideoInfo info, IScriptEnvironment* env)
{
vi = info;
src = child->GetFrame(n,env);
dst = env->NewVideoFrame(vi);
}
void GetPlaneY()
{
dstp = dst->GetWritePtr(PLANAR_Y);
dst_pitch = src->GetPitch(PLANAR_Y);

srcc = src->GetReadPtr(PLANAR_Y);
srcp = src->GetReadPtr(PLANAR_Y);
srcn = src->GetReadPtr(PLANAR_Y);
src_pitch = src->GetPitch(PLANAR_Y);
height = src->GetHeight(PLANAR_Y);
row_size = src->GetRowSize(PLANAR_Y);
}
void GetPlaneU()
{
dstp = dst->GetWritePtr(PLANAR_U);
dst_pitch = src->GetPitch(PLANAR_U);

srcc = src->GetReadPtr(PLANAR_U);
srcp = src->GetReadPtr(PLANAR_U);
srcn = src->GetReadPtr(PLANAR_U);
src_pitch = src->GetPitch(PLANAR_U);
height = src->GetHeight(PLANAR_U);
row_size = src->GetRowSize(PLANAR_U);
}
void GetPlaneV()
{
dstp = dst->GetWritePtr(PLANAR_V);
dst_pitch = src->GetPitch(PLANAR_V);

srcc = src->GetReadPtr(PLANAR_V);
srcp = src->GetReadPtr(PLANAR_V);
srcn = src->GetReadPtr(PLANAR_V);
src_pitch = src->GetPitch(PLANAR_V);
height = src->GetHeight(PLANAR_V);
row_size = src->GetRowSize(PLANAR_V);
}
void CopyPlane()
{
memcpy(dstp, srcp, row_size * height);
}
//CORE FILTER FUNCTIONS;
int Clamp(int x)
{
return max(0, min(255, x));
}
int fast_sqrt(const int n)
{
for (int j = 2;;j++)
{
if (j * j > n) return j - 1;
}
}
int fast_atan(const float n)
{
return 1.57 - 1 / n;
}
int AddDiff0(int x, int y, int nt, int str)
{
int i, j, k;

//GETS DIFF, ABS DIFF, SIGN DIFF;
i = x - y;
j = abs(i);
k = (i > 0) - (i < 0);

//SET TO 0 LOW FREQUENCY;
j = j < nt ? 0 : j;
if (j == 0) return x;

//BOOST DIFFERENCE AND CHANGE SIGN;
j = j * str * k;

//CLAMP DIFFERENCE
j = Clamp(x + j);

return j;
}
int AddDiff1(int x, int y, int nt, int str)
{
int i, j, k;

//GETS DIFF, ABS DIFF, SIGN DIFF;
i = x - y;
j = abs(i);
k = (i > 0) - (i < 0);

//SET TO 0 LOW FREQUENCY;
j = j < nt ? 0 : j;
if (j == 0) return x;

//BOOST DIFFERENCE;
j = j * str;

//REDUCE DIFFERENCE;
j = fast_sqrt(j);

//CHANGE SIGN;
j = j * k;

//CLAMP DIFFERENCE
j = Clamp(x + j);

return j;
}
int AddDiff2(int x, int y, int nt, int str)
{
int i, j, k;

//GETS DIFF, ABS DIFF, SIGN DIFF;
i = x - y;
j = abs(i);
k = (i > 0) - (i < 0);

//SET TO 0 LOW FREQUENCY;
j = j < nt ? 0 : j;
if (j == 0) return x;

//BOOST DIFFERENCE;
j = j * str;

//REDUCE DIFFERENCE;
j = int(fast_sqrt(j) * fast_atan(j));

//CHANGE SIGN;
j = j * k;

//CLAMP DIFFERENCE
j = Clamp(x + j);

return j;
}
void BoxBlur()
{
int x, y, i;

//START PITCH;
srcp -= src_pitch;
srcn += src_pitch;

//COPY FIRST ROW;
for (x = 0; x < row_size; x++)
{
dstp[x] = srcc[x];
}
dstp += dst_pitch;
srcp += src_pitch;
srcc += src_pitch;
srcn += src_pitch;

//FILTER MAIN ROWS;
for (y = 1; y < height - 1; y++)
{
dstp[0] = srcc[0];
dstp[row_size - 1] = srcc[row_size - 1];
for (x = 1; x < row_size - 1; x++)
{
i = srcp[x - 1] + srcp[x] + srcp[x + 1];
i += srcc[x - 1] + srcc[x] + srcc[x + 1];
i += srcn[x - 1] + srcn[x] + srcn[x + 1];
dstp[x] = int(i * 0.1111f + 0.5f);
}
dstp += dst_pitch;
srcp += src_pitch;
srcc += src_pitch;
srcn += src_pitch;
}

//COPY LAST ROW;
for (x = 0; x < row_size; x++)
{
dstp[x] = srcc[x];
}
dstp -= dst_pitch * (height - 1);
srcp -= src_pitch * (height - 1);
srcc -= src_pitch * (height - 1);
srcn -= src_pitch * (height - 1);
}
void GaussBlur()
{
int x, y, i;

//START PITCH;
srcp -= src_pitch;
srcn += src_pitch;

//COPY FIRST ROW;
for (x = 0; x < row_size; x++)
{
dstp[x] = srcc[x];
}
dstp += dst_pitch;
srcp += src_pitch;
srcc += src_pitch;
srcn += src_pitch;

//FILTER MAIN ROWS;
for (y = 1; y < height - 1; y++)
{
dstp[0] = srcc[0];
dstp[row_size - 1] = srcc[row_size - 1];
for (x = 1; x < row_size - 1; x++)
{
i = srcp[x - 1] + srcp[x] * 2 + srcp[x + 1];
i += srcc[x - 1] * 2 + srcc[x] * 4 + srcc[x + 1] * 2;
i += srcn[x - 1] + srcn[x] * 2 + srcn[x + 1];
dstp[x] = int(i * 0.0625f + 0.5f);
}
dstp += dst_pitch;
srcp += src_pitch;
srcc += src_pitch;
srcn += src_pitch;
}

//COPY LAST ROW;
for (x = 0; x < row_size; x++)
{
dstp[x] = srcc[x];
}
dstp -= dst_pitch * (height - 1);
srcp -= src_pitch * (height - 1);
srcc -= src_pitch * (height - 1);
srcn -= src_pitch * (height - 1);
}
void CoreFilter(int nt, int str, int smode, int bmode)
{
bmode ? GaussBlur() : BoxBlur();

if (smode == 0)
{
for (int y = 0; y < height; y++)
{
for (int x = 0; x < row_size; x++)
{
dstp[x] = AddDiff0(srcc[x], dstp[x], nt, str);
}
dstp += dst_pitch;
srcc += src_pitch;
}
}
else if (smode == 1)
{
for (int y = 0; y < height; y++)
{
for (int x = 0; x < row_size; x++)
{
dstp[x] = AddDiff1(srcc[x], dstp[x], nt, str);
}
dstp += dst_pitch;
srcc += src_pitch;
}
}
else if (smode == 2)
{
for (int y = 0; y < height; y++)
{
for (int x = 0; x < row_size; x++)
{
dstp[x] = AddDiff2(srcc[x], dstp[x], nt, str);
}
dstp += dst_pitch;
srcc += src_pitch;
}
}
}
};

class CSharpen : public GenericVideoFilter
{
int str, smode, bmode, nt;
bool Y, U, V;
public:
CSharpen(PClip _child, int _str, int _smode, int _bmode, int _nt, bool _Y, bool _U, bool _V, IScriptEnvironment* env) : GenericVideoFilter(_child), str(_str), nt(_nt), smode(_smode), bmode(_bmode), Y(_Y), U(_U), V(_V)
{
if (!vi.IsYUV())
{
env->ThrowError("CSharpen: supported colorspaces are Y8, YV12, YV16, YV24!");
}
else if (nt < 0 || nt > 255)
{
env->ThrowError("CSharpen: nt avaible range is [0, 255]!");
}
else if (str < 1 || str > 255)
{
env->ThrowError("CSharpen: str avaible range is [1, 255]!");
}
else if (smode < 0 || smode > 2)
{
env->ThrowError("CSharpen: mode avaible mode values are 0, 1, 2!");
}
else if (bmode < 0 || bmode > 1)
{
env->ThrowError("CSharpen: mode avaible mode values are 0, 1!");
}
}
PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment* env)
{

CSharpenFilter Frame;
Frame.GetFrame(child, n, vi, env);

Frame.GetPlaneY();
Y ? Frame.CoreFilter(nt, str, smode, bmode) : Frame.CopyPlane();

if (!vi.IsY8())
{
Frame.GetPlaneU();
U ? Frame.CoreFilter(nt, str, smode, bmode) : Frame.CopyPlane();
Frame.GetPlaneV();
V ? Frame.CoreFilter(nt, str, smode, bmode) : Frame.CopyPlane();
}

return Frame.dst;
}
};

const AVS_Linkage* AVS_linkage = 0;

AVSValue __cdecl Create_CSharpen(AVSValue args, void* user_data, IScriptEnvironment* env)
{
return new CSharpen(args[0].AsClip(),args[1].AsInt(10),args[2].AsInt(2),args[3].AsInt(1),args[4].AsInt(3),args[5].AsBool(true),args[6].AsBool(false),args[7].AsBool(false),env);
}

extern "C" __declspec(dllexport) const char* __stdcall AvisynthPluginInit3(IScriptEnvironment * env, const AVS_Linkage* const vectors)
{
AVS_linkage = vectors;
env->AddFunction("CSharpen", "c[str]i[smode]i[bmode]i[nt]i[Y]b[U]b[V]b", Create_CSharpen, 0);
return "CSharpen";
}

However, I can't include <micvec.h>, where can I get it? Also, DTL, if you see something off in this pure C++ code, please say so, it is very appreciated.

DTL

26th January 2023, 23:48

In

dstp[x] = int(i * 0.0625f + 0.5f);

your 'i' variable declared as int, so to make division to 16 integer you can use

dstp[x] = i >> 4;

to avoid slow conversion integer to float and back with rounding.

You will need i * 0.0625f only for float32 version of functions when you will add HBD of float32 samples. And your 'i' accumulator will be of 'float' type.

" if you see something off in this pure C++ code"

I really not like idea of 2 pass large memory read/write with separated 'blur' pass and 'adddiff' pass. It will work about good until all your saved 'blur' data in dstp[] buffer still fit into CPU cache, but may significantly drop speed when your frame size will be large so dstp[] buff will be (mostly) trashed from cache to very slow main RAM. The previous version with single pass processing of blur+adddiff in single pass may work better with large frame sizes and at CPUs with small enouth caches.

For optimization tips it is useful to read very big but still useful intel document named "Intel® 64 and IA-32 Architectures Optimization Reference Manual'. Try to found most new version for new architectures.
May be https://www.intel.com/content/www/us/en/developer/articles/technical/intel-sdm.html and https://cdrdv2.intel.com/v1/dl/getContent/671488

Also depending on your C compiler it may be useful to add forced inline directives for small helper functions like Clamp(), fast_sqrt() and others with frequent calling. See documentation on your compiler for nominal forced inline directives. It may be something like __inline__ before function declaration.

For some set of compilers you can try to use MV_FORCEINLINE macro from mvtools:

#ifndef MV_FORCEINLINE
#if defined(__clang__)
// Check clang first. clang-cl also defines __MSC_VER
// We set MSVC because they are mostly compatible
# define CLANG
#if defined(_MSC_VER)
# define MSVC
# define MV_FORCEINLINE __attribute__((always_inline)) inline
#else
# define MV_FORCEINLINE __attribute__((always_inline)) inline
#endif
#elif defined(_MSC_VER)
# define MSVC
# define MSVC_PURE
# define MV_FORCEINLINE __forceinline
#elif defined(__GNUC__)
# define GCC
# define MV_FORCEINLINE __attribute__((always_inline)) inline
#else
# error Unsupported compiler.
# define MV_FORCEINLINE inline
# undef __forceinline
# define __forceinline inline
#endif

#endif

Ceppo

27th January 2023, 04:50

Visual studio has alreade a __forceinline command as far as microsoft says, so I guess I don't need the macro. Also, visual studio has a #pragma omp simd who gave me a small speed up. I suppose is better if you do it yourself, but better than nothing.
ATM on a bluray:
ffvideosource only about 66fps
CSharpen about 57fps, so it's not going so bad (I guess :confused:)

Here the code:

#pragma once
#include <windows.h>
#include <avisynth.h>
using namespace std;

struct CSharpenFilter
{
VideoInfo vi;
PVideoFrame dst;
PVideoFrame src;
int height;
int row_size;
int src_pitch;
int dst_pitch;
unsigned char* dstp;
const unsigned char* srcc;
const unsigned char* srcp;
const unsigned char* srcn;

void GetFrame(PClip child, int n, VideoInfo info, IScriptEnvironment* env)
{
vi = info;
src = child->GetFrame(n,env);
dst = env->NewVideoFrame(vi);
}
void GetPlaneY()
{
dstp = dst->GetWritePtr(PLANAR_Y);
dst_pitch = src->GetPitch(PLANAR_Y);

srcc = src->GetReadPtr(PLANAR_Y);
srcp = src->GetReadPtr(PLANAR_Y);
srcn = src->GetReadPtr(PLANAR_Y);
src_pitch = src->GetPitch(PLANAR_Y);
height = src->GetHeight(PLANAR_Y);
row_size = src->GetRowSize(PLANAR_Y);
}
void GetPlaneU()
{
dstp = dst->GetWritePtr(PLANAR_U);
dst_pitch = src->GetPitch(PLANAR_U);

srcc = src->GetReadPtr(PLANAR_U);
srcp = src->GetReadPtr(PLANAR_U);
srcn = src->GetReadPtr(PLANAR_U);
src_pitch = src->GetPitch(PLANAR_U);
height = src->GetHeight(PLANAR_U);
row_size = src->GetRowSize(PLANAR_U);
}
void GetPlaneV()
{
dstp = dst->GetWritePtr(PLANAR_V);
dst_pitch = src->GetPitch(PLANAR_V);

srcc = src->GetReadPtr(PLANAR_V);
srcp = src->GetReadPtr(PLANAR_V);
srcn = src->GetReadPtr(PLANAR_V);
src_pitch = src->GetPitch(PLANAR_V);
height = src->GetHeight(PLANAR_V);
row_size = src->GetRowSize(PLANAR_V);
}
void CopyPlane()
{
memcpy(dstp, srcc, src_pitch * height);
}
//CORE FILTER FUNCTIONS;
__forceinline int clamp(int x)
{
return max(0, min(255, x));
}
__forceinline int fast_sqrt(const int n)
{
for (int j = 2;;j++)
{
if (j * j > n) return j - 1;
}
}
__forceinline int fast_atan(const float n)
{
return 1.57f - 1 / n;
}
inline int AddDiff0(int x, int y, int nt, int str)
{
int i, j, k;

//GETS DIFF, ABS DIFF, SIGN DIFF;
i = x - y;
j = abs(i);
k = (i > 0) - (i < 0);

//SET TO 0 LOW FREQUENCY;
j = j < nt ? 0 : j;
if (j == 0) return x;

//BOOST DIFFERENCE AND CHANGE SIGN;
j = j * str * k;

//CLAMP DIFFERENCE
j = clamp(x + j);

return j;
}
inline int AddDiff1(int x, int y, int nt, int str)
{
int i, j, k;

//GETS DIFF, ABS DIFF, SIGN DIFF;
i = x - y;
j = abs(i);
k = (i > 0) - (i < 0);

//SET TO 0 LOW FREQUENCY;
j = j < nt ? 0 : j;
if (j == 0) return x;

//BOOST DIFFERENCE;
j = j * str;

//REDUCE DIFFERENCE;
j = fast_sqrt(j);

//CHANGE SIGN;
j = j * k;

//CLAMP DIFFERENCE
j = clamp(x + j);

return j;
}
inline int AddDiff2(int x, int y, int nt, int str)
{
int i, j, k;

//GETS DIFF, ABS DIFF, SIGN DIFF;
i = x - y;
j = abs(i);
k = (i > 0) - (i < 0);

//SET TO 0 LOW FREQUENCY;
j = j < nt ? 0 : j;
if (j == 0) return x;

//BOOST DIFFERENCE;
j = j * str;

//REDUCE DIFFERENCE;
j = int(fast_sqrt(j) * fast_atan(j));

//CHANGE SIGN;
j = j * k;

//CLAMP DIFFERENCE
j = clamp(x + j);

return j;
}
void BoxBlur(int nt, int str, int smode)
{
int x, y, i;

//START PITCH;
srcp -= src_pitch;
srcn += src_pitch;

//COPY FIRST ROW;
memcpy(dstp, srcc, row_size);
dstp += dst_pitch;
srcp += src_pitch;
srcc += src_pitch;
srcn += src_pitch;

//FILTER MAIN ROWS;
if (smode == 0)
{
for (y = 1; y < height - 1; y++)
{
dstp[0] = srcc[0];
dstp[row_size - 1] = srcc[row_size - 1];
#pragma omp simd
for (x = 1; x < row_size - 1; x++)
{
i = srcp[x - 1] + srcp[x] + srcp[x + 1];
i += srcc[x - 1] + srcc[x] + srcc[x + 1];
i += srcn[x - 1] + srcn[x] + srcn[x + 1];
dstp[x] = AddDiff0(srcc[x], int(i * 0.1111f + 0.5f), nt, str);
}
dstp += dst_pitch;
srcp += src_pitch;
srcc += src_pitch;
srcn += src_pitch;
}
}
else if (smode == 1)
{
for (y = 1; y < height - 1; y++)
{
dstp[0] = srcc[0];
dstp[row_size - 1] = srcc[row_size - 1];
#pragma omp simd
for (x = 1; x < row_size - 1; x++)
{
i = srcp[x - 1] + srcp[x] + srcp[x + 1];
i += srcc[x - 1] + srcc[x] + srcc[x + 1];
i += srcn[x - 1] + srcn[x] + srcn[x + 1];
dstp[x] = AddDiff1(srcc[x], int(i * 0.1111f + 0.5f), nt, str);
}
dstp += dst_pitch;
srcp += src_pitch;
srcc += src_pitch;
srcn += src_pitch;
}
}
else if (smode == 2)
{
for (y = 1; y < height - 1; y++)
{
dstp[0] = srcc[0];
dstp[row_size - 1] = srcc[row_size - 1];
#pragma omp simd
for (x = 1; x < row_size - 1; x++)
{
i = srcp[x - 1] + srcp[x] + srcp[x + 1];
i += srcc[x - 1] + srcc[x] + srcc[x + 1];
i += srcn[x - 1] + srcn[x] + srcn[x + 1];
dstp[x] = AddDiff2(srcc[x], int(i * 0.1111f + 0.5f), nt, str);
}
dstp += dst_pitch;
srcp += src_pitch;
srcc += src_pitch;
srcn += src_pitch;
}
}

//COPY LAST ROW;
for (x = 0; x < row_size; x++)
{
memcpy(dstp, srcc, row_size);
}
}
void GaussBlur(int nt, int str, int smode)
{
int x, y, i;

//START PITCH;
srcp -= src_pitch;
srcn += src_pitch;

//COPY FIRST ROW;
memcpy(dstp, srcc, row_size);
dstp += dst_pitch;
srcp += src_pitch;
srcc += src_pitch;
srcn += src_pitch;

//FILTER MAIN ROWS;
if (smode == 0)
{
for (y = 1; y < height - 1; y++)
{
dstp[0] = srcc[0];
dstp[row_size - 1] = srcc[row_size - 1];
#pragma omp simd
for (x = 1; x < row_size - 1; x++)
{
i = srcp[x - 1] + srcp[x] * 2 + srcp[x + 1];
i += srcc[x - 1] * 2 + srcc[x] * 4 + srcc[x + 1] * 2;
i += srcn[x - 1] + srcn[x] * 2 + srcn[x + 1];
dstp[x] = AddDiff0(srcc[x], i >> 4, nt, str);
}
dstp += dst_pitch;
srcp += src_pitch;
srcc += src_pitch;
srcn += src_pitch;
}
}
else if (smode == 1)
{
for (y = 1; y < height - 1; y++)
{
dstp[0] = srcc[0];
dstp[row_size - 1] = srcc[row_size - 1];
#pragma omp simd
for (x = 1; x < row_size - 1; x++)
{
i = srcp[x - 1] + srcp[x] * 2 + srcp[x + 1];
i += srcc[x - 1] * 2 + srcc[x] * 4 + srcc[x + 1] * 2;
i += srcn[x - 1] + srcn[x] * 2 + srcn[x + 1];
dstp[x] = AddDiff1(srcc[x], i >> 4, nt, str);
}
dstp += dst_pitch;
srcp += src_pitch;
srcc += src_pitch;
srcn += src_pitch;
}
}
else if (smode == 2)
{
for (y = 1; y < height - 1; y++)
{
dstp[0] = srcc[0];
dstp[row_size - 1] = srcc[row_size - 1];
#pragma omp simd
for (x = 1; x < row_size - 1; x++)
{
i = srcp[x - 1] + srcp[x] * 2 + srcp[x + 1];
i += srcc[x - 1] * 2 + srcc[x] * 4 + srcc[x + 1] * 2;
i += srcn[x - 1] + srcn[x] * 2 + srcn[x + 1];
dstp[x] = AddDiff2(srcc[x], i >> 4, nt, str);
}
dstp += dst_pitch;
srcp += src_pitch;
srcc += src_pitch;
srcn += src_pitch;
}
}

//COPY LAST ROW;
memcpy(dstp, srcc, row_size);
}
void CoreFilter(int nt, int str, int smode, int bmode)
{
if (!bmode)
{
BoxBlur(nt, str, smode);
}
else
{
GaussBlur(nt, str, smode);
}

}
};

class CSharpen : public GenericVideoFilter
{
int str, smode, bmode, nt;
bool Y, U, V;
public:
CSharpen(PClip _child, int _str, int _smode, int _bmode, int _nt, bool _Y, bool _U, bool _V, IScriptEnvironment* env) : GenericVideoFilter(_child), str(_str), nt(_nt), smode(_smode), bmode(_bmode), Y(_Y), U(_U), V(_V)
{
if (!vi.IsYUV())
{
env->ThrowError("CSharpen: supported colorspaces are Y8, YV12, YV16, YV24!");
}
else if (nt < 0 || nt > 255)
{
env->ThrowError("CSharpen: nt avaible range is [0, 255]!");
}
else if (str < 1 || str > 255)
{
env->ThrowError("CSharpen: str avaible range is [1, 255]!");
}
else if (smode < 0 || smode > 2)
{
env->ThrowError("CSharpen: mode avaible mode values are 0, 1, 2!");
}
else if (bmode < 0 || bmode > 1)
{
env->ThrowError("CSharpen: mode avaible mode values are 0, 1!");
}
}
PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment* env)
{

CSharpenFilter Frame;
Frame.GetFrame(child, n, vi, env);

Frame.GetPlaneY();
Y ? Frame.CoreFilter(nt, str, smode, bmode) : Frame.CopyPlane();

if (!vi.IsY8())
{
Frame.GetPlaneU();
U ? Frame.CoreFilter(nt, str, smode, bmode) : Frame.CopyPlane();
Frame.GetPlaneV();
V ? Frame.CoreFilter(nt, str, smode, bmode) : Frame.CopyPlane();
}

return Frame.dst;
}
};

const AVS_Linkage* AVS_linkage = 0;

AVSValue __cdecl Create_CSharpen(AVSValue args, void* user_data, IScriptEnvironment* env)
{
return new CSharpen(args[0].AsClip(),args[1].AsInt(10),args[2].AsInt(2),args[3].AsInt(1),args[4].AsInt(3),args[5].AsBool(true),args[6].AsBool(false),args[7].AsBool(false),env);
}

extern "C" __declspec(dllexport) const char* __stdcall AvisynthPluginInit3(IScriptEnvironment * env, const AVS_Linkage* const vectors)
{
AVS_linkage = vectors;
env->AddFunction("CSharpen", "c[str]i[smode]i[bmode]i[nt]i[Y]b[U]b[V]b", Create_CSharpen, 0);
return "CSharpen";
}

Can I ask you the big favor to HBD InvertNeg (http://avisynth.nl/index.php/Filter_SDK/InvertNeg) which is very very simple, so I can learn, since I can't figure it out myself and other users might benefit from it. I need to nail at least HBD so I can HBD my plugin pack and update them. Optimization will get me sometime since I can't even enable visual studio to recognize a _m128 variable :p

kedautinh12

27th January 2023, 06:05

You can try other build for speed clang, gcc,... And after CShapern HBD, optimize. Can you try with CTools HBD, optimize??

Ceppo

27th January 2023, 07:16

Thanks is the whole point of this filter :D

DTL

27th January 2023, 09:07

To add HBD you add copy of your processing functions with 16bit and float32 input and output for source and target planes. The copy over templating may be recommended because of later adding different SIMD to different functions.
At plugin startup you place selector based on input bitdepth to select the function to use. May be the source for selector to detect the input bitdepth may be vi.componentsize of the input clip.
As I see from MAnalyse:

pixelsize = vi.ComponentSize();

So if vi.ComponentSize() == 1 - you use your current 8bit functions,
if vi.ComponentSize() == 2 - you use 16bit (unsigned short pointers),
if vi.ComponentSize() == 4 - you use float32 (float pointers and float processing).
Make copy of your CoreFilter() function like CoreFilter_8() , CoreFilter_16() and CoreFilter_Float(). I think all >8 to 16bit like 10,14,16 may be processed with single 16bit function.

For

__forceinline int clamp(int x)
{
return max(0, min(255, x));
}

it looks you need to read bits_per_pixel = vi.BitsPerComponent(); in your class members and make clamp like

__forceinline int clamp_int(int x)
{
return max(0, min(((1 << bits_per_pixel) - 1), x));
}
__forceinline float clamp_float(float x)
{
return max(0, min(1.0f, x));
}

The GetRead/GetWrite ptrs functions may always return unsigned char pointer so you cast it to unsigned short or float if your ComponentSize not 8 but 10..16 or float.
The Pitch may be 'real' bytes measured so you need to adjust it if use as operand with short and float pointers or C compiler will make datasize calculation. So for short pointer num_of_shorts=pitch/2 and for float pointer num_of_floats=pitch/4. RowSize looks like the same.

So for 16bit you change 'unsigned char' type of input and output pointers to 'unsigned short' and for float32 to 'float'. Also for float version of function change internal processing to float. For 16bit input/output the int is enough I think.

To use SIMD with intrinsics you typically add immintrin.h include and with not very old VisualStudio can use up to AVX2 (or may be including many AVX512) types and pseudofunctions. Example of simple C that I use to design and debug SIMD functions is https://github.com/DTL2020/Sub_shifting/blob/main/Asm_test002.cpp
it have only

#include <immintrin.h> // MS version of immintrin.h covers AVX, AVX2 and FMA3

and it is enough to compile with SIMD intrinsics.
The 128bit integer type is __m128i.
I typically use online (or downloadable offline available) intrinsics web-help like https://www.laruence.com/sse/ .

#pragma omp before for() loop make you local internal multithreading (cut row to threads) but the 'degree of SIMD' need to be checked in disassembler. Also the too small workunits for MT may be not very good so you can MT the sets of rows (as work in internal MT in mvtools) - try to put #pragma omp before y-loop. But for MT you need local pointers and local variables for each thread - something like

#pragma omp simd for (y = 1; y < height - 1; y++)
{
// use local data each thread
int (short, float..) data;

// calculate each thread ptrs from y-var
unsigned char* l_dstp = dstp + y * dst_pitch; // unsigned short* and float* for 16bit and float32
unsigned char* l_srcp = srcp + y * src_pitch;
unsigned char* l_srcc = srcc + y * src_pitch;
unsigned char* l_srcn = srcn + y * src_pitch;

// dstp[0] = srcc[0];
l_dstp[0] = l_srcc[0]; // and so all others ptrs
l_dstp[row_size - 1] = l_srcc[row_size - 1];

for (x = 1; x < row_size - 1; x++)
{
data = l_srcp[x - 1] + l_srcp[x] + l_srcp[x + 1];
data += l_srcc[x - 1] + l_srcc[x] + l_srcc[x + 1];
data += l_srcn[x - 1] + l_srcn[x] + l_srcn[x + 1];
l_dstp[x] = AddDiff0(l_srcc[x], int(i * 0.1111f + 0.5f), nt, str);
}
}

Also you can view 'new OpenMP syntax' in JincResize - it may be more effective - https://github.com/Asd-g/AviSynth-JincResize/blob/master/src/resize_plane_sse41.cpp

Updated: replaced array of ptrs into MT part to locally calculated ptrs.

StainlessS

27th January 2023, 10:18

From the little I remember about C++ (about 4 weeks of study back in 1996), when you fully define a member function inside a class declaration,
it is a hint for the complier that you want it in-lined {the compiler is not compelled to inline it, and may only do it if a reasonably small function}.
However, C++ has changed a bit since the 90's.

Thanks for the thread guys, is quite interesting, and maybe a potential sticky contender for optimising code for HBD and SIMD.

EDIT: "sticky contender", Ideally, it would have been a more simple filter like a simple Average() [or similar], to better concentrate on the optimisation.
EDIT: Or more simple Invert() style filter.

Reel.Deel

27th January 2023, 10:45

I'm not a programmer but I thought I'd share this. While working on the avs+ docs, I had to scroll back through the commit history. From there you can see how the code changed for all of the filters. For example, here is the first change pinterf did to the blur/sharpen filters to support 16-bit: https://github.com/AviSynth/AviSynthPlus/commit/827f370ea0388994216a0bf12c8f4d95b931c7f8. Here's the "luma" mode of Histogram when HBD was added: https://github.com/AviSynth/AviSynthPlus/commit/c945458c6a9198f2471236765a4eaaab12918e81. All of the internals filters have these changes to look at, starting when they were 8-bit only.

Ceppo

27th January 2023, 21:08

This SEEMS to work, but of course it's probably not how you are supposed to do it; BTW, thanks for all the info, when I figured out this HBD stuff, I will treasure them.

#include <windows.h>
#include <avisynth.h>

class InvertNeg : public GenericVideoFilter
{
public:
InvertNeg(PClip _child, IScriptEnvironment* env) : GenericVideoFilter(_child)
{
}
PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment* env)
{
PVideoFrame dst = env->NewVideoFrame(vi);
PVideoFrame src = child->GetFrame(n, env);

auto c = (1 << vi.BitsPerComponent()) - 1;
int planes[] = { PLANAR_Y, PLANAR_V, PLANAR_U };

for (int p = 0; p < 3; p++)
{
auto srcp = src->GetReadPtr(planes[p]);
auto dstp = dst->GetWritePtr(planes[p]);
auto height = src->GetHeight(planes[p]) * vi.ComponentSize();
auto row_size = src->GetRowSize(planes[p]) / vi.ComponentSize();
auto src_pitch = src->GetPitch(planes[p]) / vi.ComponentSize();
auto dst_pitch = dst->GetPitch(planes[p]) / vi.ComponentSize();

for (int y = 0; y < height; y++)
{
for (int x = 0; x < row_size; x++)
{
dstp[x] = srcp[x] ^ c;
}
srcp += src_pitch;
dstp += dst_pitch;
}
}
return dst;
}
};

AVSValue __cdecl Create_InvertNeg(AVSValue args, void* user_data, IScriptEnvironment* env)
{
return new InvertNeg(args[0].AsClip(), env);
}

const AVS_Linkage* AVS_linkage = 0;

extern "C" __declspec(dllexport) const char* __stdcall AvisynthPluginInit3(IScriptEnvironment * env, const AVS_Linkage* const vectors)
{
AVS_linkage = vectors;
env->AddFunction("InvertNeg", "c", Create_InvertNeg, 0);
return "InvertNeg sample plugin";
}

DTL

27th January 2023, 22:32

height = src->GetHeight(planes[p]) * vi.ComponentSize();

this looks like error. Number of lines in a frame and rows in a storage buffer do not depend on bitdepth. Only measured in 8bit bytes row length in memory and pitch.
With this line loop will run out of buffer very far (and will meet hardware memory protection error when run to the next 4kB memory page typically). It may temporarily work with very small frame sizes but cause memory corruption after actual buffer length.

As for 'auto' pointers types - I not sure if compiler really knows how many types do you need to support and may be not compile 'real' 3 different versions of functions. May be it will take only types of functions of unsigned char from AVS headers.

If you want to use 'templating' you need to declare template and 3 real functions of types unsigned char, unsigned short and float somewhere.

As I remember first we declare 'template function' like https://github.com/DTL2020/mvtools/blob/7fdb122d05d7378386e315a558f5786f84499783/Sources/Interpolation.h#L137 in the header. With pixel_t param as our bitdepth.
Next make function implementation - https://github.com/DTL2020/mvtools/blob/7fdb122d05d7378386e315a558f5786f84499783/Sources/Interpolation.cpp#L2791 using pixel_t as param of data type. It is example of 'universal HBD' C-function for all data types. See how src and dst accessed via pixel_t type:

pixel_t* pctDst = reinterpret_cast<pixel_t*>(pDst);
const pixel_t* pSrc;
pSrc = reinterpret_cast<const pixel_t*>(_pSrc) ...

pctDst[j * nDstPitch + i] = (pixel_t)fOut;

with default declared _pSrc and pDst types as unsigned char only.

In your 'one for all' HBD C-function you can use pixel_t type as conditional assignment of types of variables like https://github.com/DTL2020/mvtools/blob/7fdb122d05d7378386e315a558f5786f84499783/Sources/MDegrainN.cpp#L98

And next declare 3 real functions of 3 types to compile - https://github.com/DTL2020/mvtools/blob/7fdb122d05d7378386e315a558f5786f84499783/Sources/Interpolation.cpp#L4766 . So compiler will make 3 real copies of function to use.

Next at the class constructor you select the required type of function depends on 'pixelsize': https://github.com/DTL2020/mvtools/blob/7fdb122d05d7378386e315a558f5786f84499783/Sources/MVPlane.cpp#L86 . When AVS construct filtergraph it call class constructors and provide bitdepth to use. So at this point you can select the required version of function to use.

And call function by its pointer at processing time: https://github.com/DTL2020/mvtools/blob/7fdb122d05d7378386e315a558f5786f84499783/Sources/MVPlane.cpp#L693

Ceppo

28th January 2023, 01:01

This works with all bits depth :)

#include <windows.h>
#include <avisynth.h>

template<typename pixel_t>
void Invert(const unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits)
{
pixel_t* dstp = reinterpret_cast<pixel_t*>(_dstp);
const pixel_t* srcp = reinterpret_cast<const pixel_t*>(_srcp);

if (bits == 32)
{
for (int y = 0; y < height; y++)
{
for (int x = 0; x < row_size; x++)
{
dstp[x] = 1.0f - srcp[x];
}
dstp += dst_pitch;
srcp += src_pitch;
}
}
else
{
int MAX = (1 << bits) - 1;

for (int y = 0; y < height; y++)
{
for (int x = 0; x < row_size; x++)
{
dstp[x] = MAX - srcp[x];
}
dstp += dst_pitch;
srcp += src_pitch;
}
}
}

template void Invert<uint8_t>(const unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits);
template void Invert<uint16_t>(const unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits);
template void Invert<float>(const unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits);

class InvertNeg : public GenericVideoFilter
{
public:
InvertNeg(PClip _child, IScriptEnvironment* env) : GenericVideoFilter(_child)
{
}
PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment* env)
{
PVideoFrame dst = env->NewVideoFrame(vi);
PVideoFrame src = child->GetFrame(n, env);

auto srcp = src->GetReadPtr(PLANAR_Y);
auto dstp = dst->GetWritePtr(PLANAR_Y);
auto height = src->GetHeight(PLANAR_Y);
auto row_size = src->GetRowSize(PLANAR_Y) / vi.ComponentSize();
auto src_pitch = src->GetPitch(PLANAR_Y) / vi.ComponentSize();
auto dst_pitch = dst->GetPitch(PLANAR_Y) / vi.ComponentSize();

if (vi.ComponentSize() == 1)
{
Invert<uint8_t>(srcp, dstp, src_pitch, dst_pitch, height, row_size, vi.BitsPerComponent());
}
if (vi.ComponentSize() == 2)
{
Invert<uint16_t>(srcp, dstp, src_pitch, dst_pitch, height, row_size, vi.BitsPerComponent());
}
if (vi.ComponentSize() == 4)
{
Invert<float>(srcp, dstp, src_pitch, dst_pitch, height, row_size, vi.BitsPerComponent());
}
return dst;
}
};

AVSValue __cdecl Create_InvertNeg(AVSValue args, void* user_data, IScriptEnvironment* env)
{
return new InvertNeg(args[0].AsClip(), env);
}

const AVS_Linkage* AVS_linkage = 0;

extern "C" __declspec(dllexport) const char* __stdcall AvisynthPluginInit3(IScriptEnvironment * env, const AVS_Linkage* const vectors)
{
AVS_linkage = vectors;
env->AddFunction("InvertNeg", "c", Create_InvertNeg, 0);
return "InvertNeg sample plugin";
}

DTL

28th January 2023, 12:14

This is example of OpenMP internal MT ('threads' user control param with default 1) and SIMD up to AVX512 in float32 processing only (to make example shorter).

#include <windows.h>
#include "include\avisynth.h"
#include <immintrin.h>

template<typename pixel_t>
void Invert(unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits, int threads, int cpuFlags)
{
pixel_t* dstp = reinterpret_cast<pixel_t*>(_dstp);
pixel_t* srcp = reinterpret_cast<pixel_t*>(_srcp);

if (bits == 32)
{
#pragma omp parallel for num_threads(threads)
for (int y = 0; y < height; y++)
{
pixel_t* l_dstp = dstp + y * dst_pitch;
pixel_t* l_srcp = srcp + y * src_pitch;

if (cpuFlags & CPUF_AVX512F) // use AVX512
{
float* pf_src = (float*)l_srcp;
float* pf_dst = (float*)l_dstp;
const int col64 = row_size - (row_size % 64); // use 4*16 512bit regs to load/store
__m512 zmm_fone = _mm512_set1_ps(1.0f);

for (int64_t col = 0; col < col64; col += 64)
{
__m512 zmm0 = _mm512_loadu_ps(pf_src); // better align start addr with pre-conversion of 32(64?)-bytes aligned (if exist) and use load_ps
__m512 zmm1 = _mm512_loadu_ps(pf_src + 16);
__m512 zmm2 = _mm512_loadu_ps(pf_src + 32);
__m512 zmm3 = _mm512_loadu_ps(pf_src + 48);

zmm0 = _mm512_sub_ps(zmm_fone, zmm0);
zmm1 = _mm512_sub_ps(zmm_fone, zmm1);
zmm2 = _mm512_sub_ps(zmm_fone, zmm2);
zmm3 = _mm512_sub_ps(zmm_fone, zmm3);

_mm512_storeu_ps(pf_dst, zmm0);
_mm512_storeu_ps(pf_dst + 16, zmm1);
_mm512_storeu_ps(pf_dst + 32, zmm2);
_mm512_storeu_ps(pf_dst + 48, zmm3);

pf_src += 64; // in floats
pf_dst += 64;
}

// last cols
for (int64_t col = col64; col < row_size; ++col)
{
*pf_dst = (pixel_t)(1.0f - *pf_src);
pf_dst++;
pf_src++;
}
}
else
if (cpuFlags & CPUF_AVX) // use AVX
{
float* pf_src = (float*)l_srcp;
float* pf_dst = (float*)l_dstp;
const int col32 = row_size - (row_size % 32); // use 4*8 256bit regs to load/store
__m256 ymm_fone = _mm256_set1_ps(1.0f);

for (int64_t col = 0; col < col32; col += 32)
{
__m256 ymm0 = _mm256_loadu_ps(pf_src); // better align start addr with pre-conversion of 32-bytes aligned (if exist) and use load_ps
__m256 ymm1 = _mm256_loadu_ps(pf_src + 8);
__m256 ymm2 = _mm256_loadu_ps(pf_src + 16);
__m256 ymm3 = _mm256_loadu_ps(pf_src + 24);

ymm0 = _mm256_sub_ps(ymm_fone, ymm0);
ymm1 = _mm256_sub_ps(ymm_fone, ymm1);
ymm2 = _mm256_sub_ps(ymm_fone, ymm2);
ymm3 = _mm256_sub_ps(ymm_fone, ymm3);

_mm256_storeu_ps(pf_dst, ymm0);
_mm256_storeu_ps(pf_dst + 8, ymm1);
_mm256_storeu_ps(pf_dst + 16, ymm2);
_mm256_storeu_ps(pf_dst + 24, ymm3);

pf_src += 32; // in floats
pf_dst += 32;
}

// last cols
for (int64_t col = col32; col < row_size; ++col)
{
*pf_dst = (pixel_t)(1.0f - *pf_src);
pf_dst++;
pf_src++;
}
}
else
if (cpuFlags & CPUF_SSE) // use SSE
{
float* pf_src = (float*)l_srcp;
float* pf_dst = (float*)l_dstp;
const int col16 = row_size - (row_size % 16); // use 4*4 128bit regs to load/store
__m128 xmm_fone = _mm_set1_ps(1.0f);

for (int64_t col = 0; col < col16; col += 16)
{
__m128 xmm0 = _mm_loadu_ps(pf_src); // better align start addr with pre-conversion of 16-bytes aligned (if exist) and use load_ps
__m128 xmm1 = _mm_loadu_ps(pf_src + 4);
__m128 xmm2 = _mm_loadu_ps(pf_src + 8);
__m128 xmm3 = _mm_loadu_ps(pf_src + 12);

xmm0 = _mm_sub_ps(xmm_fone, xmm0);
xmm1 = _mm_sub_ps(xmm_fone, xmm1);
xmm2 = _mm_sub_ps(xmm_fone, xmm2);
xmm3 = _mm_sub_ps(xmm_fone, xmm3);

_mm_storeu_ps(pf_dst, xmm0);
_mm_storeu_ps(pf_dst + 4, xmm1);
_mm_storeu_ps(pf_dst + 8, xmm2);
_mm_storeu_ps(pf_dst + 12, xmm3);

pf_src += 16; // in floats
pf_dst += 16;
}

// last cols
for (int64_t col = col16; col < row_size; ++col)
{
*pf_dst = (pixel_t)(1.0f - *pf_src);
pf_dst++;
pf_src++;
}
}
else // C-reference
for (int x = 0; x < row_size; x++)
{
l_dstp[x] = (pixel_t)(1.0f - l_srcp[x]);
}
}
}
else
{
int MAX = (1 << bits) - 1;

for (int y = 0; y < height; y++)
{
for (int x = 0; x < row_size; x++)
{
dstp[x] = MAX - srcp[x];
}
dstp += dst_pitch;
srcp += src_pitch;
}
}
}

template void Invert<uint8_t>(unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits, int threads, int cpuFlags);
template void Invert<uint16_t>(unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits, int threads, int cpuFlags);
template void Invert<float>(unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits, int threads, int cpuFlags);

class InvertNeg : public GenericVideoFilter
{
int threads;
int _cpuFlags;
public:
InvertNeg(PClip _child, int threads_, IScriptEnvironment* env) : GenericVideoFilter(_child), threads(threads_)
{
_cpuFlags = env->GetCPUFlags();
}
PVideoFrame __stdcall GetFrame(int n, IScriptEnvironment* env)
{
PVideoFrame dst = env->NewVideoFrame(vi);
PVideoFrame src = child->GetFrame(n, env);

auto srcp = src->GetReadPtr(PLANAR_Y);
auto dstp = dst->GetWritePtr(PLANAR_Y);
auto height = src->GetHeight(PLANAR_Y);
auto row_size = src->GetRowSize(PLANAR_Y) / vi.ComponentSize();
auto src_pitch = src->GetPitch(PLANAR_Y) / vi.ComponentSize();
auto dst_pitch = dst->GetPitch(PLANAR_Y) / vi.ComponentSize();

if (vi.ComponentSize() == 1)
{
Invert<uint8_t>((uint8_t*)srcp, dstp, src_pitch, dst_pitch, height, row_size, vi.BitsPerComponent(), threads, _cpuFlags);
}
if (vi.ComponentSize() == 2)
{
Invert<uint16_t>((uint8_t*)srcp, dstp, src_pitch, dst_pitch, height, row_size, vi.BitsPerComponent(), threads, _cpuFlags);
}
if (vi.ComponentSize() == 4)
{
Invert<float>((uint8_t*)srcp, dstp, src_pitch, dst_pitch, height, row_size, vi.BitsPerComponent(), threads, _cpuFlags);
}
return dst;
}
};

AVSValue __cdecl Create_InvertNeg(AVSValue args, void* user_data, IScriptEnvironment* env)
{
return new InvertNeg(args[0].AsClip(), args[1].AsInt(1), env);
}

const AVS_Linkage* AVS_linkage = 0;

extern "C" __declspec(dllexport) const char* __stdcall AvisynthPluginInit3(IScriptEnvironment * env, const AVS_Linkage* const vectors)
{
AVS_linkage = vectors;
env->AddFunction("InvertNeg", "c[threads]i", Create_InvertNeg, 0);
return "InvertNeg sample plugin";
}

SIMD parts uses only 4 'registers' load/proc/store to make text shorter, for best speed you may use up to 16-1 in SSE2 and AVX2 (x64 builds) and 32-1 at AVX512 (x64 builds). 1 is left for 1.0f or MAX member of subtraction.
It process Y-only format, no UV planes copy.

The 'one for all' processing function quickly become very complex to support different bitdepth/SIMDfamily so it is better to make separate functions and apply selector at the plugin init to function startup addr.

Edit: correct Y8 to Y-only format.

Ceppo

28th January 2023, 18:03

Inductive logic is not demostrative, but since I'm ignorant (:p) it's all I got. Now I did the AVX512 template, and implemented all the modes, guessing by induction as I said. Is it... correct?

template<typename pixel_t>
void Invert_AVX512(const unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits, int threads)
{
pixel_t* dstp = reinterpret_cast<pixel_t*>(_dstp);
const pixel_t* srcp = reinterpret_cast<const pixel_t*>(_srcp);

if (bits == 32)
{
#pragma omp parallel for num_threads(threads)
for (auto y = 0; y < height; y++)
{
float* line_dstp = dstp + y * dst_pitch;
float* line_srcp = srcp + y * src_pitch;
float* src_float_ptr = (float*)line_srcp;
float* dst_float_ptr = (float*)line_dstp;

__m512 vector_max_512 = _mm512_set1_ps(1.0f);
auto row_size_mod64 = row_size - (row_size % 64);

for (auto column = 0; column < row_size_mod64; column += 64)
{
__m512 vector_src_00 = _mm512_loadu_ps(src_float_ptr);
__m512 vector_src_16 = _mm512_loadu_ps(src_float_ptr + 16);
__m512 vector_src_32 = _mm512_loadu_ps(src_float_ptr + 32);
__m512 vector_src_48 = _mm512_loadu_ps(src_float_ptr + 48);

vector_src_00 = _mm512_sub_ps(vector_max_512, vector_src_00);
vector_src_16 = _mm512_sub_ps(vector_max_512, vector_src_16);
vector_src_32 = _mm512_sub_ps(vector_max_512, vector_src_32);
vector_src_48 = _mm512_sub_ps(vector_max_512, vector_src_48);

_mm512_storeu_ps(dst_float_ptr , vector_src_00);
_mm512_storeu_ps(dst_float_ptr + 16, vector_src_16);
_mm512_storeu_ps(dst_float_ptr + 32, vector_src_32);
_mm512_storeu_ps(dst_float_ptr + 48, vector_src_48);

src_float_ptr += 64;
dst_float_ptr += 64;
}
for (auto column = row_size_mod64; column < row_size; column++)
{
*dst_float_ptr = (float)(1.0f - *src_float_ptr);
dst_float_ptr++;
src_float_ptr++;
}
}
}
else if(bits == 16 || bits == 14 || bits == 12 || bits == 10)
{
auto MAX = (1 << bits) - 1;

#pragma omp parallel for num_threads(threads)
for (auto y = 0; y < height; y++)
{
uint16_t* line_dstp = dstp + y * dst_pitch;
uint16_t* line_srcp = srcp + y * src_pitch;
uint16_t* src_uint16_t_ptr = (uint16_t*)line_srcp;
uint16_t* dst_uint16_t_ptr = (uint16_t*)line_dstp;

__m512 vector_max_512 = _mm512_set1_ps(MAX);
auto row_size_mod128 = row_size - (row_size % 128);

for (auto column = 0; column < row_size_mod128; column += 128)
{
__m512 vector_src_00 = _mm512_loadu_ps(src_uint16_t_ptr);
__m512 vector_src_32 = _mm512_loadu_ps(src_uint16_t_ptr + 32);
__m512 vector_src_64 = _mm512_loadu_ps(src_uint16_t_ptr + 64);
__m512 vector_src_96 = _mm512_loadu_ps(src_uint16_t_ptr + 96);

vector_src_00 = _mm512_sub_ps(vector_max_512, vector_src_00);
vector_src_32 = _mm512_sub_ps(vector_max_512, vector_src_32);
vector_src_64 = _mm512_sub_ps(vector_max_512, vector_src_64);
vector_src_96 = _mm512_sub_ps(vector_max_512, vector_src_96);

_mm512_storeu_ps(dst_uint16_t_ptr , vector_src_00);
_mm512_storeu_ps(dst_uint16_t_ptr + 32, vector_src_32);
_mm512_storeu_ps(dst_uint16_t_ptr + 64, vector_src_64);
_mm512_storeu_ps(dst_uint16_t_ptr + 96, vector_src_96);

src_uint16_t_ptr += 128;
dst_uint16_t_ptr += 128;
}
for (auto column = row_size_mod128; column < row_size; column++)
{
*dst_uint16_t_ptr = (uint16_t)(*src_uint16_t_ptr ^ MAX);
dst_uint16_t_ptr++;
src_uint16_t_ptr++;
}
}
}
else
{
#pragma omp parallel for num_threads(threads)
for (auto y = 0; y < height; y++)
{
uint8_t* line_dstp = dstp + y * dst_pitch;
uint8_t* line_srcp = srcp + y * src_pitch;
uint8_t* src_uint8_t_ptr = (uint8_t*)line_srcp;
uint8_t* dst_uint8_t_ptr = (uint8_t*)line_dstp;

__m512 vector_max_512 = _mm512_set1_ps(255);
auto row_size_mod256 = row_size - (row_size % 256);

for (auto column = 0; column < row_size_mod128; column += 256)
{
__m512 vector_src_000 = _mm512_loadu_ps(src_uint8_t_ptr);
__m512 vector_src_064 = _mm512_loadu_ps(src_uint8_t_ptr + 64);
__m512 vector_src_128 = _mm512_loadu_ps(src_uint8_t_ptr + 128);
__m512 vector_src_192 = _mm512_loadu_ps(src_uint8_t_ptr + 192);

vector_src_000 = _mm512_sub_ps(vector_max_512, vector_src_000);
vector_src_064 = _mm512_sub_ps(vector_max_512, vector_src_064);
vector_src_128 = _mm512_sub_ps(vector_max_512, vector_src_128);
vector_src_192 = _mm512_sub_ps(vector_max_512, vector_src_192);

_mm512_storeu_ps(dst_uint8_t_ptr , vector_src_000);
_mm512_storeu_ps(dst_uint8_t_ptr + 64 , vector_src_064);
_mm512_storeu_ps(dst_uint8_t_ptr + 128, vector_src_128);
_mm512_storeu_ps(dst_uint8_t_ptr + 192, vector_src_192);

src_uint8_t_ptr += 256;
dst_uint8_t_ptr += 256;
}
for (auto column = row_size_mod256; column < row_size; column++)
{
*dst_uint8_t_ptr = (uint8_t)(*src_uint8_t_ptr ^ 255);
dst_uint8_t_ptr++;
src_uint8_t_ptr++;
}
}
}
}

template void Invert_AVX512<uint8_t>(const unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits, int threads);
template void Invert_AVX512<uint16_t>(const unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits, int threads);
template void Invert_AVX512<float>(const unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits, int threads);

Or like that?

template<typename pixel_t>
void Invert_AVX512(const unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits, int threads)
{
pixel_t* dstp = reinterpret_cast<pixel_t*>(_dstp);
const pixel_t* srcp = reinterpret_cast<const pixel_t*>(_srcp);

if (bits == 32)
{
#pragma omp parallel for num_threads(threads)
for (auto y = 0; y < height; y++)
{
float* line_dstp = dstp + y * dst_pitch;
float* line_srcp = srcp + y * src_pitch;
float* src_float_ptr = (float*)line_srcp;
float* dst_float_ptr = (float*)line_dstp;

__m512 vector_max_512 = _mm512_set1_ps(1.0f);
auto row_size_mod64 = row_size - (row_size % 64);

for (auto column = 0; column < row_size_mod64; column += 64)
{
__m512 vector_src_00 = _mm512_loadu_ps(src_float_ptr);
__m512 vector_src_16 = _mm512_loadu_ps(src_float_ptr + 16);
__m512 vector_src_32 = _mm512_loadu_ps(src_float_ptr + 32);
__m512 vector_src_48 = _mm512_loadu_ps(src_float_ptr + 48);

vector_src_00 = _mm512_sub_ps(vector_max_512, vector_src_00);
vector_src_16 = _mm512_sub_ps(vector_max_512, vector_src_16);
vector_src_32 = _mm512_sub_ps(vector_max_512, vector_src_32);
vector_src_48 = _mm512_sub_ps(vector_max_512, vector_src_48);

_mm512_storeu_ps(dst_float_ptr , vector_src_00);
_mm512_storeu_ps(dst_float_ptr + 16, vector_src_16);
_mm512_storeu_ps(dst_float_ptr + 32, vector_src_32);
_mm512_storeu_ps(dst_float_ptr + 48, vector_src_48);

src_float_ptr += 64;
dst_float_ptr += 64;
}
for (auto column = row_size_mod64; column < row_size; column++)
{
*dst_float_ptr = (float)(1.0f - *src_float_ptr);
dst_float_ptr++;
src_float_ptr++;
}
}
}
else if(bits == 16 || bits == 14 || bits == 12 || bits == 10)
{
uint16_t MAX = (1 << bits) - 1;

#pragma omp parallel for num_threads(threads)
for (auto y = 0; y < height; y++)
{
uint16_t* line_dstp = dstp + y * dst_pitch;
uint16_t* line_srcp = srcp + y * src_pitch;
uint16_t* src_uint16_t_ptr = (uint16_t*)line_srcp;
uint16_t* dst_uint16_t_ptr = (uint16_t*)line_dstp;

__m512 vector_max_512 = _mm512_set1_ps(MAX);
auto row_size_mod64 = row_size - (row_size % 64);

for (auto column = 0; column < row_size_mod64; column += 64)
{
__m512 vector_src_00 = _mm512_loadu_ps(src_uint16_t_ptr);
__m512 vector_src_32 = _mm512_loadu_ps(src_uint16_t_ptr + 32);

vector_src_00 = _mm512_sub_ps(vector_max_512, vector_src_00);
vector_src_32 = _mm512_sub_ps(vector_max_512, vector_src_32);

_mm512_storeu_ps(dst_uint16_t_ptr , vector_src_00);
_mm512_storeu_ps(dst_uint16_t_ptr + 32, vector_src_32);

src_uint16_t_ptr += 64;
dst_uint16_t_ptr += 64;
}
for (auto column = row_size_mod64; column < row_size; column++)
{
*dst_uint16_t_ptr = (uint16_t)(*src_uint16_t_ptr ^ MAX);
dst_uint16_t_ptr++;
src_uint16_t_ptr++;
}
}
}
else
{
#pragma omp parallel for num_threads(threads)
for (auto y = 0; y < height; y++)
{
uint8_t* line_dstp = dstp + y * dst_pitch;
uint8_t* line_srcp = srcp + y * src_pitch;
uint8_t* src_uint8_t_ptr = (uint8_t*)line_srcp;
uint8_t* dst_uint8_t_ptr = (uint8_t*)line_dstp;

__m512 vector_max_512 = _mm512_set1_ps(255);
auto row_size_mod64 = row_size - (row_size % 64);

for (auto column = 0; column < row_size_mod64; column += 64)
{
__m512 vector_src_00 = _mm512_loadu_ps(src_uint8_t_ptr);

vector_src_00 = _mm512_sub_ps(vector_max_512, vector_src_00);

_mm512_storeu_ps(dst_uint8_t_ptr , vector_src_00);

src_uint8_t_ptr += 64;
dst_uint8_t_ptr += 64;
}
for (auto column = row_size_mod64; column < row_size; column++)
{
*dst_uint8_t_ptr = (uint8_t)(*src_uint8_t_ptr ^ 255);
dst_uint8_t_ptr++;
src_uint8_t_ptr++;
}
}
}
}

template void Invert_AVX512<uint8_t>(const unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits, int threads);
template void Invert_AVX512<uint16_t>(const unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits, int threads);
template void Invert_AVX512<float>(const unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits, int threads);

DTL

28th January 2023, 19:48

float* line_dstp = dstp + y * dst_pitch;
float* line_srcp = srcp + y * src_pitch;
float* src_float_ptr = (float*)line_srcp;
float* dst_float_ptr = (float*)line_dstp;

It was duplication of ptrs from initial type conversion to use first l_srcp* and l_dstp* in final C-only part. Now if you make standalone AVX function you can make local prts for threads in single op like

float* local_dstp = (float*)(dstp + y * dst_pitch);
float* local_srcp = (float*)(srcp + y * src_pitch);

and use as addressing in the intrinsics. l_ hints for local-pointer for thread in parallel block of program.

For

else if(bits == 16 || bits == 14 || bits == 12 || bits == 10)

block you cannot use float processing SIMD - you need to use integer.

For 16bit integer AVX512 SIMD op you need to use :
__m512i type,
_mm512_loadu_si512() load 512bit unaligned for load,
_mm512_subs_epu16() for 16bit subtraction
_mm512_storeu_si512() for unaligned store of 512bit dataword
_mm512_set1_epi16() for setting 16bit unsigned short MAX value to all 16bit positions of 512bit dataword.

For 8bit:
_mm512_set1_epi8()
and
_mm512_subs_epu8()
with same integer 512bit type and load/store.

Also columns loaded to one 512bit dataword is 512/16=32 for 10..16bit and 512/8=64. So even 4x512bits storage process 128..256 columns per pass. The residual columns for scalar proc may be too large with such big main SIMD loops so may be good to make some immediate ending process with some like 512 or 256bit single dataword after massive main proc before last columns with scalar operation.
So each row processing will be even 3 stages:
1. Main SIMD processing with large load and store to registerfile (hundreds or even thousands columns per loop pass, with 30 AVX512 'registers' and 64 8bit columns per 'register' load you can load 30*64=1920 columns at single load and store op).
2. Some medium size SIMD post-processing of dozens columns like 1 of 512 or lower register per loop pass. Some residual columns that not as much as required to fit in main 1. loop.
3. Last columns in C-only scalar processing.

It looks I quickly fix broken VisualStudio 2019 installation at my work place with uninstalling SDKs/DDKs after lots of driver development kits installation and will try to make 8 and 10..16 bit example too in a few hours later.

" I did the AVX512 template"

Now when you make separate functions for SIMD architectures it may be no need to make it 'templated' because you have 'bits' param passed and know conversion of bits-values to pointers types. And different bits processing is completely separated inside each SIMD-function. You may left 'templating' only for C-reference/only part of program.

So you can make

void Invert_AVX512(const unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits, int threads)
{
if (bits == 32)
{
float* dstp = reinterpret_cast<float*>(_dstp);
float* srcp = reinterpret_cast<float*>(_srcp);
#pragma omp parallel for num_threads(threads)
<make 32bit load and proc and store>
}
else if(bits == 16 || bits == 14 || bits == 12 || bits == 10)
{
uint16_t* dstp = reinterpret_cast<uint16_t*>(_dstp);
uint16_t* srcp = reinterpret_cast<uint16_t*>(_srcp);
#pragma omp parallel for num_threads(threads)
<make 16bit load and proc and store>
else
{
uint8_t* dstp = reinterpret_cast<uint8_t*>(_dstp);
uint8_t* srcp = reinterpret_cast<uint8_t*>(_srcp);
#pragma omp parallel for num_threads(threads)
<make 8bit load and proc and store>
}
}

Ceppo

28th January 2023, 21:37

Thanks for the tips;
I noticed here https://www.laruence.com/sse/# that for AVX and SSE, some functions have not a _m256i, _m128i one, how do you handle 16bit and 8 bit cases?

DTL

28th January 2023, 21:41

"some functions have not a _m256i, _m128i one"

What are the examples ? If you do not have single op instruction or macro you make a sequence to workaround (and may make/define your own macro to save program text from repeating text).

The instructions sets of SSE and AVX eras are really smaller in compare with the large family of AVX512 instructions sets. So for some operations you need to design a sequence of old instructions (and it will be executed slower).

Also for 128bit and 256bit integer instructions you need to check (enable search and display) SSE2 and AVX2 instructions sets at https://www.laruence.com/sse Intrinsics Guide.

It is really a part of good program: If cpuFlags do not show AVX2 and SSE2 you (mostly probably) can not run integer 16bit and 8bit SIMD functions and need to run slow C-reference only. Though you can also design for 64bit MMX integer if have time. Check

CPUID Flags:

of the each instruction you use to save user from invalid instruction crash. We still have users with AVX-only chips without AVX2 so they can run only SSE2 version of integer processing functions.

To make things a bit shorter I typically check only AVX2 - if present we can run integer and float AVX. If not - check for SSE2, if not present - run C-reference. There is also some SSE3/SSSE3/4.1/4.2 but it is too many versions to make for very old chips if you like to make such degree of support for old chips.

Unfortunately AVX512 also have lots of family members and it is best practice to check presence of all used instructions.
https://i.imgur.com/kNyftsO.jpg

I think current software may be designed to the set of AVX512 covered by Zen4 area - may be most of todays desktop intel/AMD chips will support it.

Ceppo

28th January 2023, 21:56

Ok, I will try to make a workaround as you said. For now the code is this, I removed the template since, as you said, it was not needed in this case.

#include <windows.h>
#include <avisynth.h>
#include <immintrin.h>

void (*CoreFilterPtr)(const unsigned char*, unsigned char*, int, int, int, int, int, int);

void Invert(const unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits, int threads)
{
if (bits == 32)
{
float* dstp = reinterpret_cast<float*>(_dstp);
const float* srcp = reinterpret_cast<const float*>(_srcp);
#pragma omp parallel for num_threads(threads)
for (auto y = 0; y < height; y++)
{
float* local_dstp = dstp + y * dst_pitch;
const float* local_srcp = srcp + y * src_pitch;

for (auto x = 0; x < row_size; x++)
{
local_dstp[x] = (float)(1.0f - local_srcp[x]);
}
}
}
else if (bits == 16 || bits == 14 || bits == 12 || bits == 10)
{
uint16_t max_pixel = (1 << bits) - 1;
uint16_t* dstp = reinterpret_cast<uint16_t*>(_dstp);
const uint16_t* srcp = reinterpret_cast<const uint16_t*>(_srcp);

#pragma omp parallel for num_threads(threads)
for (auto y = 0; y < height; y++)
{
uint16_t* local_dstp = dstp + y * dst_pitch;
const uint16_t* local_srcp = srcp + y * src_pitch;

for (auto x = 0; x < row_size; x++)
{
local_dstp[x] = (uint16_t)(local_srcp[x] ^ max_pixel);
}
}
}
else
{
uint8_t* dstp = reinterpret_cast<uint8_t*>(_dstp);
const uint8_t* srcp = reinterpret_cast<const uint8_t*>(_srcp);

#pragma omp parallel for num_threads(threads)
for (auto y = 0; y < height; y++)
{
uint8_t* local_dstp = dstp + y * dst_pitch;
const uint8_t* local_srcp = srcp + y * src_pitch;

for (auto x = 0; x < row_size; x++)
{
local_dstp[x] = (uint8_t)(local_srcp[x] ^ 255);
}
}
}
}

void Invert_AVX512(const unsigned char* _srcp, unsigned char* _dstp, int src_pitch, int dst_pitch, int height, int row_size, int bits, int threads)
{
if (bits == 32)
{
float* dstp = reinterpret_cast<float*>(_dstp);
const float* srcp = reinterpret_cast<const float*>(_srcp);

#pragma omp parallel for num_threads(threads)
for (auto y = 0; y < height; y++)
{
float* local_dstp = (float*)(dstp + y * dst_pitch);
float* local_srcp = (float*)(srcp + y * src_pitch);

__m512 vector_max_512 = _mm512_set1_ps(1.0f);
auto row_size_mod64 = row_size - (row_size % 64);

for (auto column = 0; column < row_size_mod64; column += 64)
{
__m512 vector_src_00 = _mm512_loadu_ps(local_srcp);
__m512 vector_src_16 = _mm512_loadu_ps(local_srcp + 16);
__m512 vector_src_32 = _mm512_loadu_ps(local_srcp + 32);
__m512 vector_src_48 = _mm512_loadu_ps(local_srcp + 48);

vector_src_00 = _mm512_sub_ps(vector_max_512, vector_src_00);
vector_src_16 = _mm512_sub_ps(vector_max_512, vector_src_16);
vector_src_32 = _mm512_sub_ps(vector_max_512, vector_src_32);
vector_src_48 = _mm512_sub_ps(vector_max_512, vector_src_48);

_mm512_storeu_ps(local_dstp, vector_src_00);
_mm512_storeu_ps(local_dstp + 16, vector_src_16);
_mm512_storeu_ps(local_dstp + 32, vector_src_32);
_mm512_storeu_ps(local_dstp + 48, vector_src_48);

local_srcp += 64;
local_dstp += 64;
}
for (auto column = row_size_mod64; column < row_size; column++)
{
*local_dstp = (float)(1.0f - *local_srcp);
local_dstp++;
local_srcp++;
}
}
}
else if(bits == 16 || bits == 14 || bits == 12 || bits == 10)
{
uint16_t max_pixel = (1 << bits) - 1;
uint16_t* dstp = reinterpret_cast<uint16_t*>(_dstp);
const uint16_t* srcp = reinterpret_cast<const uint16_t*>(_srcp);

#pragma omp parallel for num_threads(threads)
for (auto y = 0; y < height; y++)
{
uint16_t* local_dstp = (uint16_t*)(dstp + y * dst_pitch);
uint16_t* local_srcp = (uint16_t*)(srcp + y * src_pitch);

__m512i vector_max_512 = _mm512_set1_epi16(max_pixel);
auto row_size_mod64 = row_size - (row_size % 64);

for (auto column = 0; column < row_size_mod64; column += 64)
{
__m512i vector_src_00 = _mm512_loadu_si512(local_srcp);
__m512i vector_src_32 = _mm512_loadu_si512(local_srcp + 32);

vector_src_00 = _mm512_subs_epu16(vector_max_512, vector_src_00);
vector_src_32 = _mm512_subs_epu16(vector_max_512, vector_src_32);

_mm512_storeu_si512(local_dstp, vector_src_00);
_mm512_storeu_si512(local_dstp + 32, vector_src_32);

local_srcp += 64;
local_dstp += 64;
}
for (auto column = row_size_mod64; column < row_size; column++)
{
*local_dstp = (uint16_t)(*local_srcp ^ max_pixel);
local_dstp++;
local_srcp++;
}
}
}
else
{
uint8_t* dstp = reinterpret_cast<uint8_t*>(_dstp);
const uint8_t* srcp = reinterpret_cast<const uint8_t*>(_srcp);

#pragma omp parallel for num_threads(threads)
for (auto y = 0; y < height; y++)
{
uint8_t* local_dstp = (uint8_t*)(dstp + y * dst_pitch);
uint8_t* local_srcp = (uint8_t*)(srcp + y * src_pitch);

__m512i vector_max_512 = _mm512_set1_epi8(255);
auto row_size_mod64 = row_size - (row_size % 64);

for (auto column = 0; column < row_size_mod64; column += 64)
{
__m512i vector_src_00 = _mm512_loadu_si512(local_srcp);

vector_src_00 = _mm512_subs_epu8(vector_max_512, vector_src_00);

_mm512_storeu_si512(local_dstp, vector_src_00);

local_srcp += 64;
local_dstp += 64;
}
for (auto column = row_size_mod64; column < row_size; column++)
{
*local_dstp = (uint8_t)(*local_srcp ^ 255);
local_dstp++;
local_srcp++;
}
}
}
}

DTL

28th January 2023, 22:36

For integers:

for (auto column = 0; column < row_size_mod64; column += 64)
{
__m512i vector_src_00 = _mm512_loadu_si512(local_srcp);
__m512i vector_src_32 = _mm512_loadu_si512(local_srcp + 32);

If program compile and run OK - do not reduce usage of register file because it can visibly reduce performance. The unaligned load instruction have significant startup (latency) penalty and good enough throughput. Like 8 clocktics latency at Icelake and only 0.5 CPI Throughput. It mean after 8 clocks delay at least 2 dispatch ports for load unaligned data from memory (cache) can provide 2 datawords of 512bit per cycle. So when you put more load instructions in a sequence you hide startup latency more and more.

So for best performance it is recommended to use as much space of register file as possible - grab and process more data in a loop pass. You can load up to 31 512bit dataholders without registerfile overload of AVX512 chip in x64 mode. The last 32-nd is left for your first subtract member. It is many lines of text but may be visibly faster in execution.

Like:

// process 'big' frames here
auto row_size_mod992 = row_size - (row_size % 992);

for (auto column = 0; column < row_size_mod992; column += 992)
{
// load 31 'registers' of 512bits in x64 build or 15 in x86 build
__m512i vector_src_00 = _mm512_loadu_si512(local_srcp);
__m512i vector_src_32 = _mm512_loadu_si512(local_srcp + 32);
...
__m512i vector_src_(992-32) = _mm512_loadu_si512(local_srcp + ..);

// make 31 subtractions of 512bits
vector_src_00 = _mm512_subs_epu16(vector_max_512, vector_src_00);
vector_src_32 = _mm512_subs_epu16(vector_max_512, vector_src_32);
...
vector_src_(992-32) = _mm512_subs_epu16(vector_max_512, vector_src_32);

// store 31 datawords of 512bit
_mm512_storeu_si512(local_dstp, vector_src_00);
_mm512_storeu_si512(local_dstp + 32, vector_src_32);
...

local_srcp += 992;
local_dstp += 992;
}
// now if frame width < 992 and/or if too much residual columns left - make smaller SIMD processing
auto row_size_mod128 = row_size - (row_size % 128);

for (auto column = row_size_mod992; column < row_size_mod128; column += 128)
{
// load 4 'registers' of 512bits
__m512i vector_src_00 = _mm512_loadu_si512(local_srcp);
__m512i vector_src_32 = _mm512_loadu_si512(local_srcp + 32);
...

// make 4 subtractions of 512bits
vector_src_00 = _mm512_subs_epu16(vector_max_512, vector_src_00);
vector_src_32 = _mm512_subs_epu16(vector_max_512, vector_src_32);
...

// store 4 datawords of 512bit
_mm512_storeu_si512(local_dstp, vector_src_00);
_mm512_storeu_si512(local_dstp + 32, vector_src_32);
...

local_srcp += 128;
local_dstp += 128;
}
// process last > 32 columns in single 512bit op
auto row_size_mod32 = row_size - (row_size % 32);
for (auto column = row_size_mod128; column < row_size_mod32; column += 32)
{
// load 1 'register' of 512bits
__m512i vector_src_00 = _mm512_loadu_si512(local_srcp);
vector_src_00 = _mm512_subs_epu16(vector_max_512, vector_src_00);
_mm512_storeu_si512(local_dstp, vector_src_00);

local_srcp += 32;
local_dstp += 32;
}
// finally process residual columns as scalar C...

So 3840 samples width frame you process with 3 passes of 992 samples load followed by 6 passes of 128 samples load and so on. Not 60 passes of 64 samples per pass loop.

Ceppo

28th January 2023, 22:49

OK, so my 128/256 mode was better than the 64 mode. But we have 720/640/320 width sources so should I stick to 128 and 256, or do you advice even more?

Now I'm going to show you something ugly. For AVX, I couldn't came up with a work around without using pure C++ code. So I thought, "let's convert to float and take half the pixels" does this make sense (will it work/speed up the process)? Also, I'm not sure if, (float*)(local_srcp) will return half the number of pixels, or something evil (I'm the champion on this) will happen.

else if (bits == 16 || bits == 14 || bits == 12 || bits == 10)
{
uint16_t max_pixel = (1 << bits) - 1;
uint16_t* dstp = reinterpret_cast<uint16_t*>(_dstp);
const uint16_t* srcp = reinterpret_cast<const uint16_t*>(_srcp);

#pragma omp parallel for num_threads(threads)
for (auto y = 0; y < height; y++)
{
uint16_t* local_dstp = (uint16_t*)(dstp + y * dst_pitch);
uint16_t* local_srcp = (uint16_t*)(srcp + y * src_pitch);

__m256 vector_max_256 = _mm256_set1_ps((float)max_pixel);
auto row_size_mod16 = row_size - (row_size % 16);

for (auto column = 0; column < row_size_mod16; column += 16)
{
//IF I CAST TO FLOAT* DO I GET HALF THE PIXELS OR SOMETHING UGLY?
//MAKES SENSE TO CONVERT 16bit TO 32bit FOR THE LINEAR ALGEBRA?
__m256 vector_src_00 = _mm256_load_ps((float*)(local_srcp));
__m256 vector_src_08 = _mm256_load_ps((float*)(local_srcp + 8));

vector_src_00 = _mm256_sub_ps(vector_max_256, vector_src_00);
vector_src_08 = _mm256_sub_ps(vector_max_256, vector_src_08);

_mm256_storeu_ps((float*)(local_srcp), vector_src_00);
_mm256_storeu_ps((float*)(local_srcp + 8), vector_src_08);

local_srcp += 16;
local_dstp += 16;
}
for (auto column = row_size_mod16; column < row_size; column++)
{
*local_dstp = (uint16_t)(*local_srcp ^ max_pixel);
local_dstp++;
local_srcp++;
}
}
}

I might make no sense but I'm not a programmer. (This is my excuse :P)

DTL

28th January 2023, 22:59

"do you advice even more?"

Most slowdown happens on something like UHDTV sized frames. As you see even small 4K frame width can not be loaded in the 2KBytes register file of AVX512 in single load sequence. And users may try 8K and more.

"For AVX, I couldn't came up with a work around without using pure C++ code."

For AVX-only chip you can go down to SSE2 and still use 128bit SIMD. The 256bit AVX with 32bits floats will process same data per pass - only 8 floats per 'register'.
With SSE2 and 128bit wide datawords you can load same 8 16bit samples per 'register' but save lots of time on converting to float and back.

For SSE2 integer up to 16bits use
_mm_loadu_si128()
_mm_subs_epu16() for 16bit and _mm_subs_epu8() for 8bit
_mm_storeu_si128()
and __m128i type

__m256 vector_src_00 = _mm256_load_ps((float*)(local_srcp));

You can not convert short integer to float at load. Only load shorts as integers, expand 16bit to 32bit datawidth and use convert integer to float _mm256_cvtepi32_ps() (slow).

Ceppo

29th January 2023, 00:18

DTL, thanks for all your help! All this new knowledge is getting me very exited! To get the best out of your help I'm going to re read everything carefully, and take notes like a good student :) so that I can at least explain myself correctly and in a more detailed way (and hopefully best understand all your inputs, because you might have noticed by my answers that I fail to understand pretty often). I hope to fix my ignorance so that we can better communicate. :)

UPDATE: I have a Intel (R) Core (TM) i7-6500U CPU which supports only SSE4.1, SSE4.2 and AVX2, so I can't debug other optimizations. :( I'm still willing to make AVX512 if someone checks if the dll works as intended ;)

kedautinh12

29th January 2023, 01:37

I think DTL already had AVX-512 CPU :D

guest

29th January 2023, 02:45

I think DTL already had AVX-512 CPU :D

So do I, but I probably wouldn't be too much help :(

Ceppo

29th January 2023, 05:20

Fully working HBD InvertNeg with SSE4 and AVX2 optimization :D

https://pastebin.com/enHEmuq9

DTL

29th January 2023, 05:43

UPDATE: I have a Intel (R) Core (TM) i7-6500U CPU which supports only SSE4.1, SSE4.2 and AVX2, so I can't debug other optimizations. :( I'm still willing to make AVX512 if someone checks if the dll works as intended ;)

For up to VisualStudio 2017 you can install Intel SDE and Visual Studio addon
https://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=&cad=rja&uact=8&ved=2ahUKEwiB-46Q--v8AhU6hP0HHfr-BzsQFnoECBIQAQ&url=https%3A%2F%2Fwww.intel.com%2Fcontent%2Fwww%2Fus%2Fen%2Fdeveloper%2Farticles%2Ftool%2Fsoftware-development-emulator.html&usg=AOvVaw3aV6DNMjZxROFaLjARUV9U

Also may be you can configure SDE run of debug build with newer versions of Visual Studio (not tried yet). Though if Intel company still exist and provide versions from 2022 it may be somehow simply integrated with newer versions of Visual Studio without installing addon.

So you can develop and debug at the CPU without AVX512.

Simple test run may be with standalone SDE install to check if it executes OK.

I think DTL already had AVX-512 CPU :D

I have VS2017 with SDE addon installed at SSE2 CPU and it enough for development. For speed test I have access to AVX512 CPUs at work.

StainlessS

29th January 2023, 06:40

Thanks guys, I'm too stupid to understand most of it, and that is probably a good reason for this to be stickied,
so I [and others, maybe] can come back to it at some future date and try figure it all out.
(Not just me, but loadsa guys. [EDIT: 'Loadsa' = 'Loads of' = "lots of"]), and I'm guessin that I aint the only stupid guy here.
I'd rather hope that there are other stupids here, I dont wanna [EDIT: want to] be unique.

Thanks both of you, we are lovin' this.
Especial thanks to DTL, an artist at work, thank you.

DTL

29th January 2023, 07:15

Fully working HBD InvertNeg with SSE4 and AVX2 optimization :D

https://pastebin.com/enHEmuq9

It is very good to add at least 'single register SIMD' immediate processing after 'massive full-registerfile SIMD' and before pure C scalar ending. As described in https://forum.doom9.org/showthread.php?p=1981979#post1981979 .
In this version the frame widths below 112/224/448 will not be processed with SIMD part at all. Also the frame widths of (448*2)-1 for example will got significant penalty of first 448 columns processed fast with single SIMD pass and last 447 will be slow scalar C with 447 loop spins.
For most smooth performance over arbitrary frame widths it is good to have several SIMD stages like 'full-registerfile / half / may be 1/4 and single dataword'. The Invert with very easy subtraction is sort of rare case - in most of other processing you have much more usage of registerfile with different variables and constants and do not have so much space for different size load/stores.

Your SSE4 function looks not use any instrustions above SSE2 so you can lower CPU limit to SSE2 and use it at wider number of CPUs (having SSE2 but not SSE4).

You not need to double-init pointers now like

float* dstp = reinterpret_cast<float*>(_dstp);
const float* srcp = reinterpret_cast<const float*>(_srcp);

#pragma omp parallel for num_threads(threads)
for (auto y = 0; y < height; y++)
{
float* local_dstp = (float*)(dstp + y * dst_pitch);
float* local_srcp = (float*)(srcp + y * src_pitch);

you can directly init local threads pointers from function arguments like

// float* dstp = reinterpret_cast<float*>(_dstp);
// const float* srcp = reinterpret_cast<const float*>(_srcp);

#pragma omp parallel for num_threads(threads)
for (auto y = 0; y < height; y++)
{
float* local_dstp = (float*)(reinterpret_cast<float*>(_dstp) + y * dst_pitch);
float* local_srcp = (float*)(reinterpret_cast<float*>(_srcp) + y * src_pitch);

and save some lines of text.

if (!vi.IsY()) env->ThrowError("InvertNeg: Only Y8 input, sorry!");

More correct error message is about Y-only. Y8 may be Y-only with 8bit samples only.

You can auto-detect avaialable CPU features with env->GetCPUFlags(); and if user not provide cpu-param (like 0 - default) - use max available SIMD. For debug or other reasons user may lower max CPU features with SetMaxCPU() script command. And env->GetCPUFlags(); will return disabled upper or all flags.

if (threads < 1) env->ThrowError("InvertNeg: threads must be >= 1!");

0 may be 'auto' or 'all avaialble' so the invalid value will be < 0.
To make '0 - auto' example from https://github.com/Asd-g/AviSynth-JincResize/blob/b7fbf5d680a2950dff65b907134e6719efd11916/src/JincResize.cpp#L583

#include <thread>
const int thr = std::thread::hardware_concurrency();
if (threads_ == 0)
threads_ = thr;
else if (threads_ < 0 || threads_ > thr)
{
const std::string msg = ": threads must be between 0.." + std::to_string(thr) + ".";
env->ThrowError(msg.c_str());
}

Reel.Deel

29th January 2023, 07:44

Thanks guys, I'm too stupid to understand most of it, and that is probably a good reason for this to be stickied,
so I [and others, maybe] can come back to it at some future date and try figure it all out.
(Not just me, but loadsa guys. [EDIT: 'Loadsa' = 'Loads of' = "lots of"]), and I'm guessin that I aint the only stupid guy here.
I'd rather hope that there are other stupids here, I dont wanna [EDIT: want to] be unique.

Thanks both of you, we are lovin' this.
Especial thanks to DTL, an artist at work, thank you.

I added a link to this page on the InvertNeg wiki page: http://avisynth.nl/index.php/Filter_SDK/InvertNeg#Optimization_and_HDB_Support

Don't worry StainlessS, you're not alone. I'm right there beside you, I'm probably even a few more levels of stupid :D

Ceppo

29th January 2023, 15:56

DTL, I'm going to update it, as soon as possible! And figure out this emulation stuff :) BTW I have a NVIDIA GeForce GTX 950M, can we GPU InvertNeg for example reference?

kedautinh12

29th January 2023, 15:59

NVIDIA need Cuda ver for speed :D

Reel.Deel

29th January 2023, 16:28

... BTW I have a NVIDIA GeForce GTX 950M, can we GPU InvertNeg for example reference?

Here's InverNegCUDA by nekopanda: https://github.com/nekopanda/AviSynthPlusCUDASample

BUT, it's an example of Nekopanda's interface which is not compatible with current AviSynth+. They used to be compatible some versions ago but I don't know why they're not anymore. Here's a bit more info: https://github.com/AviSynth/AviSynthPlus/issues/296

Ceppo

29th January 2023, 17:22

Well, FFT3DGPU works on my graphic card. So, as far I understand, it's a metter on how you make communicate the GPU with avisynth+... I will look into FFT3dGPU code to see if I can figure out something...

UPDATE: NOPE, I don't understand pinterf codes, it's to much a PROgrammer for me :D

DTL

29th January 2023, 17:25

I have a NVIDIA GeForce GTX 950M, can we GPU InvertNeg for example reference?

I have only experience with DirectX12 and compute shader to process. The compute shader itself for inverting is very simple. But DX12 resources init and data upload/download to/from HWAcc is not very simple.

You can look into MAnalyse: https://github.com/DTL2020/mvtools/blob/mvtools-pfmod/Sources/MVAnalyse.cpp

The DX12 init is in the function https://github.com/DTL2020/mvtools/blob/7fdb122d05d7378386e315a558f5786f84499783/Sources/MVAnalyse.cpp#L1600 (2400-1600= about 800 lines of C, for 2 textures and 1 shader may be about 400..500 lines).

And GetFrame() usage at https://github.com/DTL2020/mvtools/blob/7fdb122d05d7378386e315a558f5786f84499783/Sources/MVAnalyse.cpp#L812

For Invert example you need only 2 texture resources for upload and download and 1 compute shader resource. The shader is simple C-like function for each sample: https://github.com/DTL2020/mvtools/blob/mvtools-pfmod/Sources/Compute.hlsl

All required for DX12 headers are located in DX12_ME ifdef so easy to see in https://github.com/DTL2020/mvtools/blob/mvtools-pfmod/Sources/MVAnalyse.h

Like:

#if defined _WIN32 && defined DX12_ME

#include <initguid.h>
#include <d3d12.h>
#include <dxgi1_6.h>
#include <D3Dcompiler.h>
#include <DirectXMath.h>
#include "d3dx12.h"
#include "d3d12video.h"
#include "DirectXHelpers.h"
#include "ReadData.h"
#include "DescriptorHeap.h"

#include <string>
#include <wrl.h>
#include <shellapi.h>

using Microsoft::WRL::ComPtr;
using namespace DirectX;

#endif

for includes.

Ceppo

29th January 2023, 20:00

I did the code for the AVX512F version, with the progressive reduction of the number of registers, however when I debug I get "illegal instruction". I don't understand if I installed wrong the intel emulator or if it is not compatible with MSVS 2022 or the code is bugged. Here the code:
https://pastebin.com/3hXP4Q9x

DTL

29th January 2023, 20:06

SDE typically install as standalone. And may have addon to VisualStudio. After installing addon you configure it with the path to SDE installation (or may be addon try to do it in auto way). After installing SDE addon to VS you see "SDE Debugger" debug startup avaialble (new item to dropdown list of Local Windows Debugger, Remote Windows debugger and others).
If you install standalone SDE and do not have addon to VS2022 - you may try to set startup of debug executable via command line SDE.

At least SDE should have run compiled executable from command line with all possible emulations enabled (default ?) without 'illegal instruction' error. After it run OK as standalone process - try to load it from VS IDE debug mode.

auto n = 8;
auto row_size_rst = row_size % (n*30);
auto row_size_mod = row_size - row_size_rst;

__m512 vector_max = _mm512_set1_ps(1.0f);

for (auto column = 0; column < row_size_mod; column += (n * 30))
{
__m512 vector_src_00 = _mm512_loadu_ps(local_srcp + n * 0);
__m512 vector_src_01 = _mm512_loadu_ps(local_srcp + n * 1);

For AVX512 n should be 16 - the 512bit dataword hold 16 floats of 32bit each.

Also

auto row_size_rst = row_size % (n*30);
for (auto column = 0; column < row_size_mod; column += (n * 30))
last load - __m512 vector_src_30 = _mm512_loadu_ps(local_srcp + n * 30); - it is 31 loaded dataword starting from 0.

if you load 31 datawords in a pass so may be loop step should be 31 and row_size_rst = row_size % (n*31); . And same for all other loop steps (need +1) and row_size_rst and _mod calculation. And same with local pointers advancing.

So correct for 4 datawords per pass should be

row_size_mod = row_size_rst - (row_size_rst % (n * 4));
row_size_rst = row_size_rst % (n * 4);

for (auto column = 0; column < row_size_mod; column += (n * 4))
{
__m512 vector_src_00 = _mm512_loadu_ps(local_srcp + n * 0);
__m512 vector_src_01 = _mm512_loadu_ps(local_srcp + n * 1);
__m512 vector_src_02 = _mm512_loadu_ps(local_srcp + n * 2);
__m512 vector_src_03 = _mm512_loadu_ps(local_srcp + n * 3);

vector_src_00 = _mm512_sub_ps(vector_max, vector_src_00);
vector_src_01 = _mm512_sub_ps(vector_max, vector_src_01);
vector_src_02 = _mm512_sub_ps(vector_max, vector_src_02);
vector_src_03 = _mm512_sub_ps(vector_max, vector_src_03);

_mm512_storeu_ps(local_dstp + n * 0, vector_src_00);
_mm512_storeu_ps(local_dstp + n * 1, vector_src_01);
_mm512_storeu_ps(local_dstp + n * 2, vector_src_02);
_mm512_storeu_ps(local_dstp + n * 3, vector_src_03);

local_srcp += (n * 4);
local_dstp += (n * 4);
}

"with the progressive reduction of the number of registers"

I think it is better to skip some 'intermediate' step but make last step = 1 SIMD dataword. Now with last SIMD step for AVX512 of 4 dataword (each of 16 samples) you left last up to 63 samples to scalar C processing. Some more perfectionists SIMD programmers even add finer granularity SIMD steps after 'main dataword width' when avaialble. Like

// for AVX512 and float32 samples
// last full-dataword step = 1 and 16 samples per pass
// now use AVX256 (guaranteed supported by AVX512 chip) and process 8 samples per pass
// now use SSE128 (guaranteed supported by AVX512 chip) and process 4 samples per pass
// now process up to 3 residual samples with C-scalar program

Ceppo

29th January 2023, 21:02

I'm sorry to say this, but I must interrupt the discussion, and I'm going to disappear for like a month. Too complicated to explain why. :( (If someone wants to take over, please do).

DTL

29th January 2023, 21:46

It looks intel SDE even latest 9.14.0 only support full debugging integration only with VS2017. As readme says no support for VS2019 yet.

So if running in VS2019 as debug application it run AVSmeter and plugin but not provide modules information to VisualStudio and VS can not load symbols and can not break on breakpoints or crashes.

But you still can run emulation standalone to check application output and execution without crashes. If it crashes SDE report only address and crash type.

kedautinh12

30th January 2023, 03:25

Here Cuda code, hope you understand that :D
https://github.com/WolframRhodium/VapourSynth-BM3DCUDA/tree/avs+

DTL

12th March 2023, 02:31

Some interesting way of SIMD processing of rows without adding special processing for last columns if they are not integer divide to number of columns in SIMD 'workunit' size - https://github.com/Asd-g/AviSynth-vsTTempSmooth/blob/master/src/vsTTempSmooth_SSE2.cpp (same is for AVX2 and AVX512) .

It looks with not very old AVS core (?) the pitch of row is guaranteed to be integer divisor of alignment size so no long epilogue for process unknown count of residual samples at the end of row required. But it may be good to put an assert or even direct check at plugin init (class constructor) for the pitch size provided for buffers to process. If it can not be integer divided to required alignment size for SIMD processing it is better to throw stop error with description ?

DTL

1st May 2023, 10:30

Some longread about superscalar programming for SIMD:

Many new CPU chips (may be after 199x years already) have some limited capability of superscalar computing. It mean in some cases more computing may be performed at the same time (clock count). In chip design it is performed with several dispatch ports capable to execute same instruction. So the total computing performance on CPU is
Number_of_Cores x SIMD_datawidth x Superscalarity_factor

The number of cores and max SIMD dataword width is cleary visible from CPU hardware config and SIMD family (64/128/256/512 bit). The superscalarity factor depend on current chip design and depend on instruction and number of dispatch ports capable to dispatch compute instruction. For some groups of instructions superscalarity factor may be 2 and more directly noted in CPU specs - like 2 FMA units in some Xeons.

Generally for some program the superscalarity factor is >1 and for some instruction and some chips may reach 3. It may be found in the CPU documentation in the list of CPI per instruction (if CPI <1 it mean it is executed at 1 clocktick and 2 or more dispatch ports available, so CPI of 0.5 is 2 dispatch ports and CPI of 0.33 is 3 dispatch ports).

The required conditions for superscalar computing:
1. Data for computing must not have dependancy.
2. Data for computing mostly probably should be located in register file (reading memory even L1D cache is too slow).
3. There should be free to dispatch 2 or more ports supporting this instruction computing.

Example of possible to superscalar computing program:
a=b+c
d=e+f

Not possible (data dependant):
a=b+c
d=a+e

So in the SIMD programming to use possible benefit from superscalarity it is good to group big workunits of data (of several SIMD datawords) and if they are not dependant - group several compute instructions to process this data. So the instructions decode unit of CPU may detect it as superscalar ready part of program and route commands to several free and supporting dispatch ports.

Example of low or not superscalar friendly program processing loop:
for (int i=0; i < N; i++)
{
data_A=load(mem_A+i)
data_B=load(mem_B+i)
result=data_A+data_B
store(dst+i, result)
}

It use SIMD but process only one SIMD dataword per loop spin. If program designer is very lucky with compiler - it may unroll this loop to be more superscalar friendly. But it depends on compiler.

More superscalar-way of explicit programming is:
for (int i=0; i < N; i+=2)
{
dataA1=load(mem_A+i)
dataA2=load(mem_A+i+1)
dataB1=load(mem_B+i)
dataB2=load(mem_B+i+1)
result1=data_A1+data_B1
result2=data_A2+data_B2
store(dst+i, result1)
store(dst+i+1, result2)
}

It uses superscalarity factor of 2 if sum instruction is supported on 2 or more dispatch ports. Also there is less bus direction switches on load and store of data. It is expected with progress of CPU design the superscalarity factor for more and more instrucsions may be increased (may be to 4 and more) so it may be recommended to design SIMD programs supporting up to 4 and more dispatch ports in the same computing (depend on available space in register file and more).

The C-program text for superscalar computing is not very nice with lots of repeating blocks - may be it can be somehow compacted with language tools in more compact form.

DTL

13th May 2023, 03:11

Sample of simple colour-space converting plugin (YV12 to RGB32 decoding) using AVX2 optimization for both memory transfer and SIMD computing:
https://github.com/DTL2020/ConvertYV12toRGB/releases/tag/0.0.3

Can do both RGB planar (commented-out) or RGB32 interleaved store.

Also some addition to high-performance computing programming:

It looks dispatch ports of core are not support all range of instructions directly but designed as sort of FPGA with reloading of compute config to support all required instructions.

So when instruction decoder see some new instruction it performs:

1. Find free dispatch port supporting this instruction.
2. Check if port configured to dispatch.
3. If port not configured - load configuration of FPGA (takes several clocktics).
4. Route data and instruction code to dispatch to the port.

So instruction have 2 performance params: Latency and Throughput. The Latency used when it is first instruction in a sequence and no ready configured dispatch ports available. So first result will be ready only after Latency clockticks. If there are several equal instructions in a sequence - they can be pipelined to ready to dispatch port at Throughput performance. So it may be good to arrange many equal instructions in large sets to use Throughput performance level. Good compilers should do this work from intrinsics and VCL based C-programming if enough data to compute is prepared.