Quote:
Originally Posted by Orf
can you please share your PS and DirectCompute versions of nnedi3?
|
The DirectCompute kernel is shipping with madVR. The PS version for upscaling in X direction is here:
Code:
// HLSL pixel shader source, stored as adjacent C string literals that the
// C compiler concatenates into one string. Per the surrounding post this is
// the pixel-shader version of nnedi3 for upscaling in the X direction.
//
// The identifiers nns, packedSumWeights1Array and packedSumWeights2Array are
// used below but not defined in this text; presumably the host code prepends
// their definitions before compiling the shader -- TODO confirm.
//
// Overall flow of main():
//   1. Sample a 4 (x) by 8 (y) neighbourhood around Tex, convert each tap with
//      the first row of rgbToHd, and store the 32 values in input[].
//   2. Compute mean (mstd0), standard deviation (mstd1) and its reciprocal
//      (mstd2) of those 32 values.
//   3. For each of nns iterations, form two weighted sums of input[] using
//      weights fetched from WeightSampler, and accumulate vsum / wsum.
//   4. Combine mstd0, mstd1, vsum and wsum into one scalar and return it.

// Source image is bound to sampler register s0, the weight texture to s2.
"sampler SourceSampler : register(s0);\n"
"sampler WeightSampler : register(s2);\n"
// Constant register c0: components 0 and 1 are the texture-coordinate step
// between neighbouring taps in x and y (presumably 1/width and 1/height of
// the source -- verify against the host code).
"float4 floatConsts1 : register(c0);\n"
"#define pixSizeX (floatConsts1[0])\n"
"#define pixSizeY (floatConsts1[1])\n"
// Per-iteration additive offsets, one entry per i1 in the second loop.
"static float1 SumWeights1[nns] = (float1[nns]) packedSumWeights1Array;\n"
"static float1 SumWeights2[nns] = (float1[nns]) packedSumWeights2Array;\n"
// Colour conversion matrix. The first row holds the 0.2126 / 0.7152 / 0.0722
// weights (the BT.709 luma coefficients); only component [0] of the product
// is read in main(), so the remaining rows do not affect the result here.
"static float4x4 rgbToHd = {+0.2126000000000000, +0.7152000000000000, +0.0722000000000000, 0,\n"
" -0.1145721060573400, -0.3854278939426600, +0.5000000000000000, 0,\n"
" +0.5000000000000000, -0.4541529083058166, -0.0458470916941834, 0, 0, 0, 0, 0};\n"
"\n"
"float4 main(float2 Tex : TEXCOORD0) : COLOR0\n"
"{\n"
// input[] receives the 32 neighbourhood values; mstd0/1/2 are the mean,
// standard deviation and reciprocal standard deviation computed from them.
" float input[32];\n"
" float mstd0, mstd1, mstd2;\n"
" {\n"
" float sum = 0;\n"
" float sumsq = 0;\n"
" int index = 0;\n"
// Columns run from Tex.x - 1*pixSizeX to Tex.x + 2*pixSizeX (4 columns).
" float xpos = Tex.x - 1.0 * pixSizeX;\n"
" for (int ix = 0; ix < 4; ix++)\n"
" {\n"
// Rows run from Tex.y - 3*pixSizeY to Tex.y + 4*pixSizeY (8 rows). The y
// loop is the inner one, so input[] is filled column by column.
" float ypos = Tex.y - 3.0 * pixSizeY;\n"
" for (int iy = 0; iy < 8; iy++)\n"
" {\n"
" float4 sample = tex2Dlod(SourceSampler, float4(xpos, ypos, 0, 0));\n"
// The line below removes the 16/255 offset and 219/255 scale; the multiply by
// 255.0 on the following line then brings the value to the 0-255 scale.
" sample = (sample - 16.0f / 255.0f) / (219.0f / 255.0f);\n" // d3d9Float8 16-235 -> 0-255
" sample = mul(rgbToHd, sample) * 255.0;\n"
" ypos += pixSizeY;\n"
// Only component [0] (the first matrix row's output) is kept.
" input[index++] = sample[0];\n"
" sum += sample[0];\n"
" sumsq += sample[0] * sample[0];\n"
" }\n"
" xpos += pixSizeX;\n"
" }\n"
// Mean, then variance as E[x^2] - E[x]^2.
" mstd0 = sum / 32.0;\n"
" mstd1 = sumsq / 32.0 - mstd0 * mstd0;\n"
// Variances at or below 1.19209290e-07 (the value of FLT_EPSILON) are treated
// as zero; otherwise mstd1 becomes the standard deviation.
" mstd1 = (mstd1 <= 1.19209290e-07) ? 0.0 : sqrt(mstd1);\n"
// Reciprocal standard deviation, with 0 substituted to avoid dividing by zero.
" mstd2 = (mstd1 > 0) ? (1.0 / mstd1) : 0.0;\n"
" }\n"
// vsum accumulates the weighted outputs, wsum the weights themselves.
" float vsum = 0;\n"
" float wsum = 0;\n"
" {\n"
// Coordinates start at the centre of the first texel (0.5 / size) and advance
// by one texel (1 / size): nns steps in y, 16 steps in x.
" float ypos = 0.5 / nns;\n"
" for (int i1 = 0; i1 < nns; i1++)\n"
" {\n"
" float xpos = 0.5 / 16.0;\n"
" float sum1 = 0;\n"
" float sum2 = 0;\n"
" int index = 0;\n"
// 16 fetches, each consuming two consecutive input[] entries (32 in total).
" for (int i2 = 0; i2 < 16; i2++)\n"
" {\n"
// NOTE(review): tex1Dlod is a 1D lookup, yet a y coordinate that changes with
// i1 is passed here. Confirm the weight texture is addressed as intended
// (tex2Dlod may be what is meant) -- TODO verify against the host-side upload.
" float4 weights = tex1Dlod(WeightSampler, float4(xpos, ypos, 0, 0));\n"
" xpos += 1.0 / 16.0;\n"
// Each texel carries four weights: [0] and [1] apply to the first input of
// the pair (for sum1 and sum2), [2] and [3] to the second input.
" float sample = input[index++];\n"
" sum1 += sample * weights[0];\n"
" sum2 += sample * weights[1];\n"
" sample = input[index++];\n"
" sum1 += sample * weights[2];\n"
" sum2 += sample * weights[3];\n"
" }\n"
" ypos += 1.0 / nns;\n"
// Scale both sums by the reciprocal standard deviation and add the offsets.
" float temp1 = sum1 * mstd2 + SumWeights1[i1];\n"
" float temp2 = sum2 * mstd2 + SumWeights2[i1];\n"
// The exponent argument is clamped to [-80, +80] before exp().
" temp1 = exp(clamp(temp1, -80.0, +80.0));\n"
// temp2 / (1 + |temp2|) squashes temp2 into (-1, 1); temp1 acts as its weight.
" vsum += temp1 * (temp2 / (1.0 + abs(temp2)));\n"
" wsum += temp1;\n"
" }\n"
" }\n"
// Result = mean + 5 * (vsum / wsum) * stddev, or just the mean when wsum is
// not above 1e-10; dividing by 255 undoes the earlier * 255.0 scaling.
" float result = (mstd0 + ((wsum > 1e-10) ? (((5.0 * vsum) / wsum) * mstd1) : 0.0)) / 255.0;\n"
// Re-apply the 219/255 scale and 16/255 offset removed at sampling time. A
// scalar is returned from a float4 function, so every output component
// receives the same value.
" return result * (219.0f / 255.0f) + 16.0f / 255.0f;\n" // d3d9Float8 0-255 -> 16-235
"}";
This kernel needs the NNEDI3 weight "database" uploaded to the "WeightSampler" texture and the "SumWeights1/2" constants in the right order, though, and unfortunately I don't have that information in a form that is easy to share.