The DirectCompute kernel ships with madVR. Here is the pixel shader (PS) version for upscaling in the X direction:

    // HLSL pixel shader source, written as adjacent C string literals.
    //
    // For every output pixel the shader
    //   1. gathers a 4 (x) by 8 (y) neighbourhood of source pixels, converts
    //      each one with the rgbToHd matrix and keeps only component [0],
    //   2. computes mean (mstd0), standard deviation (mstd1) and inverse
    //      standard deviation (mstd2) of those 32 values,
    //   3. evaluates `nns` neurons, each with two sets of 32 weights fetched
    //      from WeightSampler, and blends their outputs with exp() weights,
    //   4. returns the prediction, rescaled back to the 16-235 range.
    //
    // `nns`, `packedSumWeights1Array` and `packedSumWeights2Array` are not
    // defined in this snippet; presumably they are #defines prepended by the
    // host code before compilation -- TODO confirm against the caller.

    // Resources: source image on s0, neuron weight texture on s2.
    "sampler SourceSampler : register(s0);\n"
    "sampler WeightSampler : register(s2);\n"
    // c0 carries the size of one source pixel in texture coordinates.
    "float4 floatConsts1 : register(c0);\n"
    "#define pixSizeX (floatConsts1[0])\n"
    "#define pixSizeY (floatConsts1[1])\n"
    // Per-neuron additive terms for the two weighted sums (one entry per neuron).
    "static float1 SumWeights1[nns] = (float1[nns]) packedSumWeights1Array;\n"
    "static float1 SumWeights2[nns] = (float1[nns]) packedSumWeights2Array;\n"
    // Colour matrix; row 0 holds the luma weights 0.2126 / 0.7152 / 0.0722
    // (the BT.709 values). Only row 0 of the product is used below.
    "static float4x4 rgbToHd = {+0.2126000000000000, +0.7152000000000000, +0.0722000000000000, 0,\n"
    "                           -0.1145721060573400, -0.3854278939426600, +0.5000000000000000, 0,\n"
    "                           +0.5000000000000000, -0.4541529083058166, -0.0458470916941834, 0, 0, 0, 0, 0};\n"
    "float4 main(float2 Tex : TEXCOORD0) : COLOR0\n"
    // Fix: the function body braces were missing from the snippet; without
    // them the HLSL source does not compile.
    "{\n"
    "  float input[32];\n"
    "  float mstd0, mstd1, mstd2;\n"
    // --- Step 1+2: gather the 4x8 neighbourhood and its statistics ---------
    "  {\n"
    "    float sum = 0;\n"
    "    float sumsq = 0;\n"
    "    int index = 0;\n"
    // Window starts one pixel to the left and three pixels above Tex.
    "    float xpos = Tex.x - 1.0 * pixSizeX;\n"
    "    for (int ix = 0; ix < 4; ix++)\n"
    "    {\n"
    "      float ypos = Tex.y - 3.0 * pixSizeY;\n"
    // Inner loop walks down a column, so input[] is stored column by column.
    "      for (int iy = 0; iy < 8; iy++)\n"
    "      {\n"
    "        float4 sample = tex2Dlod(SourceSampler, float4(xpos, ypos, 0, 0));\n"
    "        sample = (sample - 16.0f / 255.0f) / (219.0f / 255.0f);\n"   // d3d9Float8: limited range 16-235 -> normalized full range (scaled to 0-255 on the next line)
    "	       sample = mul(rgbToHd, sample) * 255.0;\n"
    "        ypos += pixSizeY;\n"
    "        input[index++] = sample[0];\n"
    "        sum += sample[0];\n"
    "        sumsq += sample[0] * sample[0];\n"
    "      }\n"
    "      xpos += pixSizeX;\n"
    "    }\n"
    // mstd0 = mean, mstd1 = variance (E[x^2] - mean^2) then standard deviation.
    "    mstd0 = sum / 32.0;\n"
    "    mstd1 = sumsq / 32.0 - mstd0 * mstd0;\n"
    // Variance at or below single-precision epsilon is treated as a flat area.
    "    mstd1 = (mstd1 <= 1.19209290e-07) ? 0.0 : sqrt(mstd1);\n"
    // Inverse standard deviation, with 0 as the safe value for flat areas.
    "    mstd2 = (mstd1 > 0) ? (1.0 / mstd1) : 0.0;\n"
    "  }\n"
    // --- Step 3: evaluate the neurons --------------------------------------
    "  float vsum = 0;\n"
    "  float wsum = 0;\n"
    "  {\n"
    // Texel-centre addressing: one weight-texture row per neuron.
    "    float ypos = 0.5 / nns;\n"
    "    for (int i1 = 0; i1 < nns; i1++)\n"
    "    {\n"
    "      float xpos = 0.5 / 16.0;\n"
    "      float sum1 = 0;\n"
    "      float sum2 = 0;\n"
    "      int index = 0;\n"
    // 16 texels per neuron; each texel packs the weights for two consecutive
    // inputs: [0]/[2] feed sum1, [1]/[3] feed sum2.
    "      for (int i2 = 0; i2 < 16; i2++)\n"
    "      {\n"
    // NOTE(review): tex1Dlod is given a coordinate whose y varies per neuron;
    // whether y is honoured depends on how the weight texture is bound --
    // confirm against the host code before changing this call.
    "        float4 weights = tex1Dlod(WeightSampler, float4(xpos, ypos, 0, 0));\n"
    "        xpos += 1.0 / 16.0;\n"
    "        float sample = input[index++];\n"
    "        sum1 += sample * weights[0];\n"
    "        sum2 += sample * weights[1];\n"
    "        sample = input[index++];\n"
    "        sum1 += sample * weights[2];\n"
    "        sum2 += sample * weights[3];\n"
    "      }\n"
    "      ypos += 1.0 / nns;\n"
    // Scale the dot products by the inverse standard deviation and add the
    // per-neuron terms.
    "      float temp1 = sum1 * mstd2 + SumWeights1[i1];\n"
    "      float temp2 = sum2 * mstd2 + SumWeights2[i1];\n"
    // exp() of the clamped first sum is the blending weight; the clamp keeps
    // exp() within a finite range.
    "      temp1 = exp(clamp(temp1, -80.0, +80.0));\n"
    // Second sum is squashed with x / (1 + |x|) before being accumulated.
    "      vsum += temp1 * (temp2 / (1.0 + abs(temp2)));\n"
    "      wsum += temp1;\n"
    "    }\n"
    "  }\n"
    // --- Step 4: mean plus weighted prediction, back to normalized range ---
    // Falls back to the plain mean when the accumulated weight is (near) zero.
    "  float result = (mstd0 + ((wsum > 1e-10) ? (((5.0 * vsum) / wsum) * mstd1) : 0.0)) / 255.0;\n"
    "  return result * (219.0f / 255.0f) + 16.0f / 255.0f;\n"   // d3d9Float8: 0-255 (already divided by 255 above) -> 16-235
    "}\n"
However, this kernel needs the NNEDI3 weight "database" uploaded to the "WeightSampler" texture and to the "SumWeights1/2" constants in the right order, and unfortunately I don't have that information in a form that is easy to share.
