From b9b497fc1f6bba581d8de31d0f7a21ddac9d2a28 Mon Sep 17 00:00:00 2001 From: kaetemi Date: Mon, 4 Aug 2014 04:44:27 +0200 Subject: [PATCH] 3D: Add FXAA --- code/nel/include/nel/3d/fxaa.h | 4 +- code/nel/src/3d/bloom_effect.cpp | 7 +- code/nel/src/3d/fxaa.cpp | 114 +- code/nel/src/3d/fxaa_program.h | 215 ++ code/nel/src/3d/shaders/compile.bat | 3 + code/nel/src/3d/shaders/fxaa3_11.h | 2042 +++++++++++++++++ code/nel/src/3d/shaders/fxaa_pp.cg | 70 + code/nel/src/3d/shaders/fxaa_pp_arbfp1.txt | 76 + code/nel/src/3d/shaders/fxaa_pp_ps_2_0.txt | 92 + code/nel/src/3d/shaders/fxaa_vp.cg | 20 + code/nel/src/3d/shaders/fxaa_vp_arbvp1.txt | 31 + code/nel/src/3d/shaders/readme.txt | 4 + code/ryzom/client/src/main_loop.cpp | 12 +- .../client/src/snowballs_client.cpp | 5 +- 14 files changed, 2661 insertions(+), 34 deletions(-) create mode 100644 code/nel/src/3d/shaders/compile.bat create mode 100644 code/nel/src/3d/shaders/fxaa3_11.h create mode 100644 code/nel/src/3d/shaders/fxaa_pp.cg create mode 100644 code/nel/src/3d/shaders/fxaa_pp_arbfp1.txt create mode 100644 code/nel/src/3d/shaders/fxaa_pp_ps_2_0.txt create mode 100644 code/nel/src/3d/shaders/fxaa_vp.cg create mode 100644 code/nel/src/3d/shaders/fxaa_vp_arbvp1.txt create mode 100644 code/nel/src/3d/shaders/readme.txt diff --git a/code/nel/include/nel/3d/fxaa.h b/code/nel/include/nel/3d/fxaa.h index b4c710b0c..f7ccf4866 100644 --- a/code/nel/include/nel/3d/fxaa.h +++ b/code/nel/include/nel/3d/fxaa.h @@ -46,6 +46,7 @@ namespace NL3D { class ITexture; class CTextureUser; class CPixelProgram; +class CVertexProgram; /** * \brief CFXAA @@ -66,8 +67,9 @@ private: UDriver *m_Driver; NL3D::UMaterial m_Mat; - NL3D::CVertexBuffer m_VB; + // NL3D::CVertexBuffer m_VB; NLMISC::CQuadUV m_QuadUV; + CVertexProgram *m_VP; CPixelProgram *m_PP; uint m_Width; diff --git a/code/nel/src/3d/bloom_effect.cpp b/code/nel/src/3d/bloom_effect.cpp index 842423661..fe82e43c6 100644 --- a/code/nel/src/3d/bloom_effect.cpp +++ 
b/code/nel/src/3d/bloom_effect.cpp @@ -264,6 +264,7 @@ void CBloomEffect::applyBloom() CRect rect1(0, 0, width, height); CRect rect2(0, 0, _BlurWidth, _BlurHeight); dru->stretchRect(_Scene, txt1, rect1, txt2, rect2); + _Driver->setMatrixMode2D11(); // horizontal blur pass doBlur(true); @@ -319,10 +320,7 @@ void CBloomEffect::applyBlur() matObjectFinal->texConstantColor(0, constCoeff); // display quad - UCamera pCam = _Scene->getCam(); - _Driver->setMatrixMode2D11(); _Driver->drawQuad(_BlurQuad, displayBlurMat); - _Driver->setMatrixMode3D(pCam); // disable vertex program drvInternal->activeVertexProgram(NULL); @@ -404,15 +402,12 @@ void CBloomEffect::doBlur(bool horizontalBlur) matObject->setTexture(3, startTexture); // display - UCamera pCam = _Scene->getCam(); - _Driver->setMatrixMode2D11(); _Driver->drawQuad(_BlurQuad, _BlurMat); // disable render target and vertex program drvInternal->activeVertexProgram(NULL); CTextureUser cu; ((CDriverUser *)_Driver)->setRenderTarget(cu, 0, 0, 0, 0); - _Driver->setMatrixMode3D(pCam); } }; // NL3D diff --git a/code/nel/src/3d/fxaa.cpp b/code/nel/src/3d/fxaa.cpp index d8a145fca..c1eb57607 100644 --- a/code/nel/src/3d/fxaa.cpp +++ b/code/nel/src/3d/fxaa.cpp @@ -54,7 +54,7 @@ namespace { } /* anonymous namespace */ -CFXAA::CFXAA(NL3D::UDriver *driver) : m_Driver(driver), m_PP(NULL), m_Width(~0), m_Height(~0) +CFXAA::CFXAA(NL3D::UDriver *driver) : m_Driver(driver), m_PP(NULL), m_VP(NULL), m_Width(~0), m_Height(~0) { nldebug("3D: Create FXAA"); @@ -82,14 +82,54 @@ CFXAA::CFXAA(NL3D::UDriver *driver) : m_Driver(driver), m_PP(NULL), m_Width(~0), } if (!drv->compilePixelProgram(m_PP)) { - nlwarning("No supported pixel program for FXAA effect"); + nlwarning("3D: No supported pixel program for FXAA effect"); delete m_PP; m_PP = NULL; } + else + { + nldebug("3D: FXAA pixel program available"); + } } - if (m_PP) + if (!m_PP) + { + return; + } + + // create vp + { + m_VP = new CVertexProgram(); + // nelvp + { + IProgram::CSource *source = 
new IProgram::CSource(); + source->Features.MaterialFlags = CProgramFeatures::TextureStages; + source->Profile = IProgram::nelvp; + source->setSourcePtr(a_nelvp); + m_VP->addSource(source); + } + if (!drv->compileVertexProgram(m_VP)) + { + nlwarning("3D: No supported vertex program for FXAA effect"); + + delete m_VP; + m_VP = NULL; + delete m_PP; + m_PP = NULL; + } + else + { + nldebug("3D: FXAA vertex program available"); + } + } + + if (!m_VP) + { + return; + } + + // create material and vb { m_Mat = m_Driver->createMaterial(); m_Mat.initUnlit(); @@ -113,28 +153,14 @@ CFXAA::CFXAA(NL3D::UDriver *driver) : m_Driver(driver), m_PP(NULL), m_Width(~0), m_QuadUV.Uv2 = CUV(1.f, 1.f); m_QuadUV.Uv3 = CUV(0.f, 1.f); - CVertexBuffer &vb = m_VB; + /*CVertexBuffer &vb = m_VB; vb.clearValueEx(); vb.addValueEx(CVertexBuffer::Position, CVertexBuffer::Float3); vb.addValueEx(CVertexBuffer::TexCoord0, CVertexBuffer::Float2); vb.addValueEx(CVertexBuffer::TexCoord1, CVertexBuffer::Float4); vb.initEx(); - vb.setPreferredMemory(CVertexBuffer::AGPPreferred, false); - vb.setNumVertices(4); - /*CVertexBufferReadWrite vba; - vb.lock(vba); - vba.setVertexCoord(0, 0.f, 0.f, 0.5f); - vba.setVertexCoord(1, 1.f, 0.f, 0.5f); - vba.setVertexCoord(2, 1.f, 1.f, 0.5f); - vba.setVertexCoord(3, 0.f, 1.f, 0.5f); - vba.setTexCoord(0, 0, 0.f, 0.f); - vba.setTexCoord(1, 0, 1.f, 0.f); - vba.setTexCoord(2, 0, 1.f, 1.f); - vba.setTexCoord(3, 0, 0.f, 1.f);*/ - /*vba.setTexCoord(0, 1, 0.f, 0.f); - vba.setTexCoord(1, 1, 1.f, 0.f); - vba.setTexCoord(2, 1, 1.f, 1.f); - vba.setTexCoord(3, 1, 0.f, 1.f);*/ + vb.setPreferredMemory(CVertexBuffer::RAMVolatile, false); + vb.setNumVertices(4);*/ } } @@ -147,6 +173,8 @@ CFXAA::~CFXAA() m_Driver->deleteMaterial(m_Mat); } + delete m_VP; + m_VP = NULL; delete m_PP; m_PP = NULL; @@ -172,6 +200,33 @@ void CFXAA::applyEffect() float fwidth = (float)width; float fheight = (float)height; + nldebug("%f, %f", fwidth, fheight); + float pwidth = 1.0f / fwidth; + float pheight = 1.0f 
/ fheight; + float hpwidth = pwidth * 0.5f; + float hpheight = pheight * 0.5f; + float n = 0.5f; + + //if (width != m_Width || height != m_Height) + /*{ + // Build VB + m_Width = width; + m_Height = height; + CVertexBufferReadWrite vba; + m_VB.lock(vba); + vba.setValueFloat3Ex(CVertexBuffer::Position, 0, 0.f, 0.f, 0.5f); // BL + vba.setValueFloat3Ex(CVertexBuffer::Position, 1, 1.f, 0.f, 0.5f); // BR + vba.setValueFloat3Ex(CVertexBuffer::Position, 2, 1.f, 1.f, 0.5f); // TR + vba.setValueFloat3Ex(CVertexBuffer::Position, 3, 0.f, 1.f, 0.5f); // TL + vba.setValueFloat2Ex(CVertexBuffer::TexCoord0, 0, 0.f, 0.f); + vba.setValueFloat2Ex(CVertexBuffer::TexCoord0, 1, 1.f, 0.f); + vba.setValueFloat2Ex(CVertexBuffer::TexCoord0, 2, 1.f, 1.f); + vba.setValueFloat2Ex(CVertexBuffer::TexCoord0, 3, 0.f, 1.f); + vba.setValueFloat4Ex(CVertexBuffer::TexCoord1, 0, 0.f - hpwidth, 0.f - hpheight, 0.f + hpwidth, 0.f + hpheight); + vba.setValueFloat4Ex(CVertexBuffer::TexCoord1, 1, 1.f - hpwidth, 0.f - hpheight, 1.f + hpwidth, 0.f + hpheight); + vba.setValueFloat4Ex(CVertexBuffer::TexCoord1, 2, 1.f - hpwidth, 1.f - hpheight, 1.f + hpwidth, 1.f + hpheight); + vba.setValueFloat4Ex(CVertexBuffer::TexCoord1, 3, 0.f - hpwidth, 1.f - hpheight, 0.f + hpwidth, 1.f + hpheight); + }*/ // create render target CTextureUser *otherRenderTarget = m_Driver->getRenderTargetManager().getRenderTarget(width, height, mode2D); @@ -182,11 +237,30 @@ void CFXAA::applyEffect() drv->swapTextureHandle(*renderTarget, *otherRenderTarget->getITexture()); drv->setRenderTarget(renderTarget); + // debug + m_Driver->clearBuffers(CRGBA(128, 128, 128, 128)); + + // activate program + bool vpok = drv->activeVertexProgram(m_VP); + nlassert(vpok); + bool ppok = drv->activePixelProgram(m_PP); + nlassert(ppok); + drv->setUniform4f(IDriver::PixelProgram, 0, -n / fwidth, -n / fheight, n / fwidth, n / fheight); // fxaaConsoleRcpFrameOpt + drv->setUniform4f(IDriver::PixelProgram, 1, -2.0f / fwidth, -2.0f / fheight, 2.0f / fwidth, 2.0f 
/ fheight); // fxaaConsoleRcpFrameOpt2 + drv->setUniformMatrix(IDriver::VertexProgram, 0, IDriver::ModelViewProjection, IDriver::Identity); + drv->setUniform4f(IDriver::VertexProgram, 9, -hpwidth, -hpheight, hpwidth, hpheight); + // render effect m_Mat.getObjectPtr()->setTexture(0, otherRenderTarget->getITexture()); + /*drv->activeVertexBuffer(m_VB); + drv->renderRawQuads(*m_Mat.getObjectPtr(), 0, 1);*/ m_Driver->drawQuad(m_QuadUV, m_Mat); m_Mat.getObjectPtr()->setTexture(0, NULL); + // deactivate program + drv->activeVertexProgram(NULL); + drv->activePixelProgram(NULL); + // recycle render target m_Driver->getRenderTargetManager().recycleRenderTarget(otherRenderTarget); } diff --git a/code/nel/src/3d/fxaa_program.h b/code/nel/src/3d/fxaa_program.h index 5a7f4a709..6002a09a4 100644 --- a/code/nel/src/3d/fxaa_program.h +++ b/code/nel/src/3d/fxaa_program.h @@ -1,6 +1,101 @@ +const char *a_nelvp = + "!!VP1.0\n" + "DP4 o[HPOS].x, c[0], v[OPOS];\n" + "DP4 o[HPOS].y, c[1], v[OPOS];\n" + "DP4 o[HPOS].z, c[2], v[OPOS];\n" + "DP4 o[HPOS].w, c[3], v[OPOS];\n" + "MOV o[TEX0].xy, v[TEX0];\n" + "ADD o[TEX1], v[TEX0].xyxy, c[9];\n" + "END\n"; + +const char *a_arbfp1_test = + "!!ARBfp1.0\n" + "OPTION ARB_precision_hint_fastest;\n" + "TEX result.color, fragment.texcoord[1].zwzw, texture[0], 2D;\n" + "END\n"; + const char *a_arbfp1 = "!!ARBfp1.0\n" + "OPTION ARB_precision_hint_fastest;\n" + //# cgc version 3.1.0013, build date Apr 18 2012 + //# command line args: -profile arbfp1 -O3 -fastmath -fastprecision + //# source file: fxaa_fp.cg + //#vendor NVIDIA Corporation + //#version 3.1.0.13 + //#profile arbfp1 + //#program fxaa_fp + //#semantic fxaa_fp.fxaaConsoleRcpFrameOpt + //#semantic fxaa_fp.fxaaConsoleRcpFrameOpt2 + //#semantic fxaa_fp.nlTex0 : TEX0 + //#var float2 pos : $vin.TEXCOORD0 : TEX0 : 0 : 1 + //#var float4 fxaaConsolePosPos : $vin.TEXCOORD1 : TEX1 : 1 : 1 + //#var float4 fxaaConsoleRcpFrameOpt : : c[0] : 2 : 1 + //#var float4 fxaaConsoleRcpFrameOpt2 : : c[1] : 3 : 1 + 
//#var sampler2D nlTex0 : TEX0 : texunit 0 : 4 : 1 + //#var float4 oCol : $vout.COLOR : COL : 5 : 1 + //#const c[2] = 0.125 0 -2 2 + //#const c[3] = 0.001953125 0.5 + "PARAM c[4] = { program.env[0..1],\n" + " { 0.125, 0, -2, 2 },\n" + " { 0.001953125, 0.5 } };\n" + "TEMP R0;\n" + "TEMP R1;\n" + "TEMP R2;\n" + "TEMP R3;\n" + "TEMP R4;\n" + "TEMP R5;\n" + "TEX R1.w, fragment.texcoord[1].zyzw, texture[0], 2D;\n" + "ADD R0.x, R1.w, c[3];\n" + "TEX R0.w, fragment.texcoord[1].xwzw, texture[0], 2D;\n" + "TEX R1.w, fragment.texcoord[1], texture[0], 2D;\n" + "ADD R0.y, -R0.x, R0.w;\n" + "ADD R0.z, R1.w, R0.y;\n" + "TEX R2.w, fragment.texcoord[1].zwzw, texture[0], 2D;\n" + "ADD R0.y, -R1.w, R0;\n" + "ADD R1.x, R2.w, R0.y;\n" + "ADD R1.y, R0.z, -R2.w;\n" + "MUL R2.xy, R1, R1;\n" + "ADD R0.y, R2.x, R2;\n" + "RSQ R0.y, R0.y;\n" + "MUL R2.xy, R0.y, R1;\n" + "MAD R3.xy, R2, c[0].zwzw, fragment.texcoord[0];\n" + "ABS R0.z, R2.y;\n" + "ABS R0.y, R2.x;\n" + "MIN R0.y, R0, R0.z;\n" + "RCP R0.y, R0.y;\n" + "MUL R1.xy, R0.y, R2;\n" + "MUL R1.xy, R1, c[2].x;\n" + "MIN R1.xy, R1, c[2].w;\n" + "TEX R4, R3, texture[0], 2D;\n" + "MAD R2.xy, -R2, c[0].zwzw, fragment.texcoord[0];\n" + "TEX R3, R2, texture[0], 2D;\n" + "ADD R3, R3, R4;\n" + "MAX R1.xy, R1, c[2].z;\n" + "MAD R2.xy, R1, c[1].zwzw, fragment.texcoord[0];\n" + "MUL R5, R3, c[3].y;\n" + "MAD R1.xy, -R1, c[1].zwzw, fragment.texcoord[0];\n" + "MIN R0.z, R0.x, R2.w;\n" + "MIN R0.y, R0.w, R1.w;\n" + "MIN R0.y, R0, R0.z;\n" + "MAX R0.z, R0.x, R2.w;\n" + "MAX R0.x, R0.w, R1.w;\n" + "MAX R0.x, R0, R0.z;\n" + "TEX R4, R2, texture[0], 2D;\n" + "TEX R3, R1, texture[0], 2D;\n" + "ADD R3, R3, R4;\n" + "MAD R3, R3, c[3].y, R5;\n" + "MUL R3, R3, c[3].y;\n" + "SLT R0.z, R0.x, R3.w;\n" + "SLT R0.x, R3.w, R0.y;\n" + "ADD_SAT R0.x, R0, R0.z;\n" + "CMP result.color, -R0.x, R5, R3;\n" + "END\n"; + //# 45 instructions, 6 R-regs + +const char *a_arbfp1_earlyexit = + "!!ARBfp1.0\n" + "OPTION ARB_precision_hint_fastest;\n" //"# cgc version 3.1.0013, build 
date Apr 18 2012\n" //"# command line args: -profile arbfp1\n" //"# source file: fxaa_fp.cg\n" @@ -82,7 +177,127 @@ const char *a_arbfp1 = "END\n"; //"# 51 instructions, 6 R-regs\n" +const char *a_ps_2_0_test_t0 = + "ps_2_x\n" + "dcl_2d s0\n" + "dcl t0.xyz\n" + "mov r0.xy, t0.xy\n" + "texld r0, r0, s0\n" + "mov oC0, r0\n"; + +const char *a_ps_2_0_test_avg = + "ps_2_x\n" + "dcl_2d s0\n" + "def c0, 0.25000000, 0, 0, 0\n" + "dcl t1\n" + "mov r0.xy, t1.xwzw\n" + "mov r1.xy, t1.zyzw\n" + "texld r0, r0, s0\n" + "texld r1, r1, s0\n" + "add r2, r1, r0\n" + "mov r0.xy, t1.zwzw\n" + "texld r1, t1, s0\n" + "texld r0, r0, s0\n" + "add r1, r2, r1\n" + "add r0, r1, r0\n" + "mul r0, r0, c0.x\n" + "mov oC0, r0\n"; + const char *a_ps_2_0 = + "ps_2_0\n" + // cgc version 3.1.0013, build date Apr 18 2012 + // command line args: -profile ps_2_0 -O3 -fastmath -fastprecision + // source file: fxaa_pp.cg + //vendor NVIDIA Corporation + //version 3.1.0.13 + //profile ps_2_0 + //program fxaa_pp + //semantic fxaa_pp.fxaaConsoleRcpFrameOpt + //semantic fxaa_pp.fxaaConsoleRcpFrameOpt2 + //semantic fxaa_pp.nlTex0 : TEX0 + //var float2 pos : $vin.TEXCOORD0 : TEX0 : 0 : 1 + //var float4 fxaaConsolePosPos : $vin.TEXCOORD1 : TEX1 : 1 : 1 + //var float4 fxaaConsoleRcpFrameOpt : : c[0] : 2 : 1 + //var float4 fxaaConsoleRcpFrameOpt2 : : c[1] : 3 : 1 + //var sampler2D nlTex0 : TEX0 : texunit 0 : 4 : 1 + //var float4 oCol : $vout.COLOR : COL : 5 : 1 + //const c[2] = 0.001953125 0.125 2 -2 + //const c[3] = 0.5 0 1 + "dcl_2d s0\n" + "def c2, 0.00195313, 0.12500000, 2.00000000, -2.00000000\n" + "def c3, 0.50000000, 0.00000000, 1.00000000, 0\n" + "dcl t1\n" + "dcl t0.xy\n" + "texld r5, t1, s0\n" + "mov r1.y, t1.w\n" + "mov r1.x, t1.z\n" + "mov r2.xy, r1\n" + "mov r0.y, t1.w\n" + "mov r0.x, t1\n" + "mov r1.y, t1\n" + "mov r1.x, t1.z\n" + "texld r1, r1, s0\n" + "texld r0, r0, s0\n" + "texld r6, r2, s0\n" + "add r0.x, r1.w, c2\n" + "add r2.x, -r0, r0.w\n" + "add r1.x, r5.w, r2\n" + "add r2.z, r1.x, -r6.w\n" + 
"add r2.x, -r5.w, r2\n" + "add r2.x, r6.w, r2\n" + "mov r3.x, r2\n" + "mov r3.y, r2.z\n" + "mov r2.y, r2.z\n" + "mov r1.y, r2.z\n" + "mov r1.x, r2\n" + "mul r1.xy, r3, r1\n" + "add r1.x, r1, r1.y\n" + "rsq r1.x, r1.x\n" + "mul r4.xy, r1.x, r2\n" + "abs r2.x, r4.y\n" + "abs r1.x, r4\n" + "min r1.x, r1, r2\n" + "rcp r1.x, r1.x\n" + "mul r1.xy, r1.x, r4\n" + "mul r1.xy, r1, c2.y\n" + "min r1.xy, r1, c2.z\n" + "max r2.xy, r1, c2.w\n" + "mov r1.y, c1.w\n" + "mov r1.x, c1.z\n" + "mad r3.xy, r2, r1, t0\n" + "mov r1.y, c1.w\n" + "mov r1.x, c1.z\n" + "mad r5.xy, -r2, r1, t0\n" + "mov r1.y, c0.w\n" + "mov r1.x, c0.z\n" + "mad r2.xy, -r4, r1, t0\n" + "mov r1.y, c0.w\n" + "mov r1.x, c0.z\n" + "mad r1.xy, r4, r1, t0\n" + "texld r4, r5, s0\n" + "texld r3, r3, s0\n" + "texld r1, r1, s0\n" + "texld r2, r2, s0\n" + "add r1, r2, r1\n" + "mul r2, r1, c3.x\n" + "add r1, r4, r3\n" + "max r3.x, r0, r6.w\n" + "mad r1, r1, c3.x, r2\n" + "mul r4, r1, c3.x\n" + "max r1.x, r0.w, r5.w\n" + "max r1.x, r1, r3\n" + "add r1.x, -r4.w, r1\n" + "min r3.x, r0.w, r5.w\n" + "min r0.x, r0, r6.w\n" + "min r0.x, r3, r0\n" + "add r0.x, r4.w, -r0\n" + "cmp r1.x, r1, c3.y, c3.z\n" + "cmp r0.x, r0, c3.y, c3.z\n" + "add_pp_sat r0.x, r0, r1\n" + "cmp r0, -r0.x, r4, r2\n" + "mov oC0, r0\n"; + +const char *a_ps_2_0_earlyexit = "ps_2_x\n" // cgc version 3.1.0013, build date Apr 18 2012 // command line args: -profile ps_2_x diff --git a/code/nel/src/3d/shaders/compile.bat b/code/nel/src/3d/shaders/compile.bat new file mode 100644 index 000000000..a1d660d9d --- /dev/null +++ b/code/nel/src/3d/shaders/compile.bat @@ -0,0 +1,3 @@ +cgc -entry fxaa_pp fxaa_pp.cg -profile arbfp1 -O3 -fastmath -fastprecision -o fxaa_pp_arbfp1.txt +cgc -entry fxaa_pp fxaa_pp.cg -profile ps_2_0 -O3 -fastmath -fastprecision -o fxaa_pp_ps_2_0.txt +cgc -entry fxaa_vp fxaa_vp.cg -profile arbvp1 -fastmath -fastprecision -o fxaa_vp_arbvp1.txt \ No newline at end of file diff --git a/code/nel/src/3d/shaders/fxaa3_11.h 
b/code/nel/src/3d/shaders/fxaa3_11.h new file mode 100644 index 000000000..0443fd6e2 --- /dev/null +++ b/code/nel/src/3d/shaders/fxaa3_11.h @@ -0,0 +1,2042 @@ +/*============================================================================ + + +NVIDIA FXAA 3.11 by TIMOTHY LOTTES + + +------------------------------------------------------------------------------ +COPYRIGHT (C) 2010, 2011 NVIDIA CORPORATION. ALL RIGHTS RESERVED. +------------------------------------------------------------------------------ +TO THE MAXIMUM EXTENT PERMITTED BY APPLICABLE LAW, THIS SOFTWARE IS PROVIDED +*AS IS* AND NVIDIA AND ITS SUPPLIERS DISCLAIM ALL WARRANTIES, EITHER EXPRESS +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT SHALL NVIDIA +OR ITS SUPPLIERS BE LIABLE FOR ANY SPECIAL, INCIDENTAL, INDIRECT, OR +CONSEQUENTIAL DAMAGES WHATSOEVER (INCLUDING, WITHOUT LIMITATION, DAMAGES FOR +LOSS OF BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS INFORMATION, +OR ANY OTHER PECUNIARY LOSS) ARISING OUT OF THE USE OF OR INABILITY TO USE +THIS SOFTWARE, EVEN IF NVIDIA HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH +DAMAGES. + +------------------------------------------------------------------------------ +INTEGRATION CHECKLIST +------------------------------------------------------------------------------ +(1.) +In the shader source, setup defines for the desired configuration. +When providing multiple shaders (for different presets), +simply setup the defines differently in multiple files. +Example, + +#define FXAA_PC 1 +#define FXAA_HLSL_5 1 +#define FXAA_QUALITY__PRESET 12 + +Or, + +#define FXAA_360 1 +Or, + +#define FXAA_PS3 1 +Etc. + +(2.) +Then include this file, + +#include "Fxaa3_11.h" + +(3.) +Then call the FXAA pixel shader from within your desired shader. +Look at the FXAA Quality FxaaPixelShader() for docs on inputs. 
+As of FXAA 3.11 all inputs for all shaders are the same +to enable easy porting between platforms. + +return FxaaPixelShader(...); + +(4.) +Ensure pass prior to FXAA outputs RGBL (see next section). +Or use, + +#define FXAA_GREEN_AS_LUMA 1 + +(5.) +Setup engine to provide the following constants +which are used in the FxaaPixelShader() inputs, + +FxaaFloat2 fxaaQualityRcpFrame, +FxaaFloat4 fxaaConsoleRcpFrameOpt, +FxaaFloat4 fxaaConsoleRcpFrameOpt2, +FxaaFloat4 fxaaConsole360RcpFrameOpt2, +FxaaFloat fxaaQualitySubpix, +FxaaFloat fxaaQualityEdgeThreshold, +FxaaFloat fxaaQualityEdgeThresholdMin, +FxaaFloat fxaaConsoleEdgeSharpness, +FxaaFloat fxaaConsoleEdgeThreshold, +FxaaFloat fxaaConsoleEdgeThresholdMin, +FxaaFloat4 fxaaConsole360ConstDir + +Look at the FXAA Quality FxaaPixelShader() for docs on inputs. + +(6.) +Have FXAA vertex shader run as a full screen triangle, +and output "pos" and "fxaaConsolePosPos" +such that inputs in the pixel shader provide, + +// {xy} = center of pixel +FxaaFloat2 pos, + +// {xy__} = upper left of pixel +// {__zw} = lower right of pixel +FxaaFloat4 fxaaConsolePosPos, + +(7.) +Ensure the texture sampler(s) used by FXAA are set to bilinear filtering. + + +------------------------------------------------------------------------------ +INTEGRATION - RGBL AND COLORSPACE +------------------------------------------------------------------------------ +FXAA3 requires RGBL as input unless the following is set, + +#define FXAA_GREEN_AS_LUMA 1 + +In which case the engine uses green in place of luma, +and requires that RGB input be in a non-linear colorspace. + +RGB should be LDR (low dynamic range). +Specifically do FXAA after tonemapping. + +RGB data as returned by a texture fetch can be non-linear, +or linear when FXAA_GREEN_AS_LUMA is not set. +Note an "sRGB format" texture counts as linear, +because the result of a texture fetch is linear data. +Regular "RGBA8" textures in the sRGB colorspace are non-linear. 
+ +If FXAA_GREEN_AS_LUMA is not set, +luma must be stored in the alpha channel prior to running FXAA. +This luma should be in a perceptual space (could be gamma 2.0). +Example pass before FXAA where output is gamma 2.0 encoded, + +color.rgb = ToneMap(color.rgb); // linear color output +color.rgb = sqrt(color.rgb); // gamma 2.0 color output +return color; + +To use FXAA, + +color.rgb = ToneMap(color.rgb); // linear color output +color.rgb = sqrt(color.rgb); // gamma 2.0 color output +color.a = dot(color.rgb, FxaaFloat3(0.299, 0.587, 0.114)); // compute luma +return color; + +Another example where output is linear encoded, +say for instance writing to an sRGB formatted render target, +where the render target does the conversion back to sRGB after blending, + +color.rgb = ToneMap(color.rgb); // linear color output +return color; + +To use FXAA, + +color.rgb = ToneMap(color.rgb); // linear color output +color.a = sqrt(dot(color.rgb, FxaaFloat3(0.299, 0.587, 0.114))); // compute luma +return color; + +Getting luma correct is required for the algorithm to work correctly. + + +------------------------------------------------------------------------------ +BEING LINEARLY CORRECT? +------------------------------------------------------------------------------ +Applying FXAA to a framebuffer with linear RGB color will look worse. +This is very counterintuitive, but happens to be true in this case. +The reason is that dithering artifacts will be more visible +in a linear colorspace. + + +------------------------------------------------------------------------------ +COMPLEX INTEGRATION +------------------------------------------------------------------------------ +Q. What if the engine is blending into RGB before wanting to run FXAA? + +A. In the last opaque pass prior to FXAA, +have the pass write out luma into alpha. +Then blend into RGB only. +FXAA should be able to run ok +assuming the blending pass did not add any aliasing. 
+This should be the common case for particles and common blending passes. + +A. Or use FXAA_GREEN_AS_LUMA. + +============================================================================*/ + +/*============================================================================ + +INTEGRATION KNOBS + +============================================================================*/ +// +// FXAA_PS3 and FXAA_360 choose the console algorithm (FXAA3 CONSOLE). +// FXAA_360_OPT is a prototype for the new optimized 360 version. +// +// 1 = Use API. +// 0 = Don't use API. +// +/*--------------------------------------------------------------------------*/ +#ifndef FXAA_PS3 +#define FXAA_PS3 0 +#endif +/*--------------------------------------------------------------------------*/ +#ifndef FXAA_360 +#define FXAA_360 0 +#endif +/*--------------------------------------------------------------------------*/ +#ifndef FXAA_360_OPT +#define FXAA_360_OPT 0 +#endif +/*==========================================================================*/ +#ifndef FXAA_PC +// +// FXAA Quality +// The high quality PC algorithm. +// +#define FXAA_PC 0 +#endif +/*--------------------------------------------------------------------------*/ +#ifndef FXAA_PC_CONSOLE +// +// The console algorithm for PC is included +// for developers targeting really low spec machines. +// Likely better to just run FXAA_PC, and use a really low preset. 
+// +#define FXAA_PC_CONSOLE 0 +#endif +/*--------------------------------------------------------------------------*/ +#ifndef FXAA_GLSL_120 +#define FXAA_GLSL_120 0 +#endif +/*--------------------------------------------------------------------------*/ +#ifndef FXAA_GLSL_130 +#define FXAA_GLSL_130 0 +#endif +/*--------------------------------------------------------------------------*/ +#ifndef FXAA_HLSL_3 +#define FXAA_HLSL_3 0 +#endif +/*--------------------------------------------------------------------------*/ +#ifndef FXAA_HLSL_4 +#define FXAA_HLSL_4 0 +#endif +/*--------------------------------------------------------------------------*/ +#ifndef FXAA_HLSL_5 +#define FXAA_HLSL_5 0 +#endif +/*==========================================================================*/ +#ifndef FXAA_GREEN_AS_LUMA +// +// For those using non-linear color, +// and either not able to get luma in alpha, or not wanting to, +// this enables FXAA to run using green as a proxy for luma. +// So with this enabled, no need to pack luma in alpha. +// +// This will turn off AA on anything which lacks some amount of green. +// Pure red and blue or combination of only R and B, will get no AA. +// +// Might want to lower the settings for both, +// fxaaConsoleEdgeThresholdMin +// fxaaQualityEdgeThresholdMin +// In order to insure AA does not get turned off on colors +// which contain a minor amount of green. +// +// 1 = On. +// 0 = Off. +// +#define FXAA_GREEN_AS_LUMA 0 +#endif +/*--------------------------------------------------------------------------*/ +#ifndef FXAA_EARLY_EXIT +// +// Controls algorithm's early exit path. +// On PS3 turning this ON adds 2 cycles to the shader. +// On 360 turning this OFF adds 10ths of a millisecond to the shader. +// Turning this off on console will result in a more blurry image. +// So this defaults to on. +// +// 1 = On. +// 0 = Off. 
+// +#define FXAA_EARLY_EXIT 1 +#endif +/*--------------------------------------------------------------------------*/ +#ifndef FXAA_DISCARD +// +// Only valid for PC OpenGL currently. +// Probably will not work when FXAA_GREEN_AS_LUMA = 1. +// +// 1 = Use discard on pixels which don't need AA. +// For APIs which enable concurrent TEX+ROP from same surface. +// 0 = Return unchanged color on pixels which don't need AA. +// +#define FXAA_DISCARD 0 +#endif +/*--------------------------------------------------------------------------*/ +#ifndef FXAA_FAST_PIXEL_OFFSET +// +// Used for GLSL 120 only. +// +// 1 = GL API supports fast pixel offsets +// 0 = do not use fast pixel offsets +// +#ifdef GL_EXT_gpu_shader4 +#define FXAA_FAST_PIXEL_OFFSET 1 +#endif +#ifdef GL_NV_gpu_shader5 +#define FXAA_FAST_PIXEL_OFFSET 1 +#endif +#ifdef GL_ARB_gpu_shader5 +#define FXAA_FAST_PIXEL_OFFSET 1 +#endif +#ifndef FXAA_FAST_PIXEL_OFFSET +#define FXAA_FAST_PIXEL_OFFSET 0 +#endif +#endif +/*--------------------------------------------------------------------------*/ +#ifndef FXAA_GATHER4_ALPHA +// +// 1 = API supports gather4 on alpha channel. +// 0 = API does not support gather4 on alpha channel. +// +#if (FXAA_HLSL_5 == 1) +#define FXAA_GATHER4_ALPHA 1 +#endif +#ifdef GL_ARB_gpu_shader5 +#define FXAA_GATHER4_ALPHA 1 +#endif +#ifdef GL_NV_gpu_shader5 +#define FXAA_GATHER4_ALPHA 1 +#endif +#ifndef FXAA_GATHER4_ALPHA +#define FXAA_GATHER4_ALPHA 0 +#endif +#endif + +/*============================================================================ +FXAA CONSOLE PS3 - TUNING KNOBS +============================================================================*/ +#ifndef FXAA_CONSOLE__PS3_EDGE_SHARPNESS +// +// Consoles the sharpness of edges on PS3 only. +// Non-PS3 tuning is done with shader input. +// +// Due to the PS3 being ALU bound, +// there are only two safe values here: 4 and 8. +// These options use the shaders ability to a free *|/ by 2|4|8. 
+// +// 8.0 is sharper +// 4.0 is softer +// 2.0 is really soft (good for vector graphics inputs) +// +#if 1 +#define FXAA_CONSOLE__PS3_EDGE_SHARPNESS 8.0 +#endif +#if 0 +#define FXAA_CONSOLE__PS3_EDGE_SHARPNESS 4.0 +#endif +#if 0 +#define FXAA_CONSOLE__PS3_EDGE_SHARPNESS 2.0 +#endif +#endif +/*--------------------------------------------------------------------------*/ +#ifndef FXAA_CONSOLE__PS3_EDGE_THRESHOLD +// +// Only effects PS3. +// Non-PS3 tuning is done with shader input. +// +// The minimum amount of local contrast required to apply algorithm. +// The console setting has a different mapping than the quality setting. +// +// This only applies when FXAA_EARLY_EXIT is 1. +// +// Due to the PS3 being ALU bound, +// there are only two safe values here: 0.25 and 0.125. +// These options use the shaders ability to a free *|/ by 2|4|8. +// +// 0.125 leaves less aliasing, but is softer +// 0.25 leaves more aliasing, and is sharper +// +#if 1 +#define FXAA_CONSOLE__PS3_EDGE_THRESHOLD 0.125 +#else +#define FXAA_CONSOLE__PS3_EDGE_THRESHOLD 0.25 +#endif +#endif + +/*============================================================================ +FXAA QUALITY - TUNING KNOBS +------------------------------------------------------------------------------ +NOTE the other tuning knobs are now in the shader function inputs! +============================================================================*/ +#ifndef FXAA_QUALITY__PRESET +// +// Choose the quality preset. +// This needs to be compiled into the shader as it effects code. +// Best option to include multiple presets is to +// in each shader define the preset, then include this file. 
+// +// OPTIONS +// ----------------------------------------------------------------------- +// 10 to 15 - default medium dither (10=fastest, 15=highest quality) +// 20 to 29 - less dither, more expensive (20=fastest, 29=highest quality) +// 39 - no dither, very expensive +// +// NOTES +// ----------------------------------------------------------------------- +// 12 = slightly faster then FXAA 3.9 and higher edge quality (default) +// 13 = about same speed as FXAA 3.9 and better than 12 +// 23 = closest to FXAA 3.9 visually and performance wise +// _ = the lowest digit is directly related to performance +// _ = the highest digit is directly related to style +// +#define FXAA_QUALITY__PRESET 12 +#endif + + +/*============================================================================ + +FXAA QUALITY - PRESETS + +============================================================================*/ + +/*============================================================================ +FXAA QUALITY - MEDIUM DITHER PRESETS +============================================================================*/ +#if (FXAA_QUALITY__PRESET == 10) +#define FXAA_QUALITY__PS 3 +#define FXAA_QUALITY__P0 1.5 +#define FXAA_QUALITY__P1 3.0 +#define FXAA_QUALITY__P2 12.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PRESET == 11) +#define FXAA_QUALITY__PS 4 +#define FXAA_QUALITY__P0 1.0 +#define FXAA_QUALITY__P1 1.5 +#define FXAA_QUALITY__P2 3.0 +#define FXAA_QUALITY__P3 12.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PRESET == 12) +#define FXAA_QUALITY__PS 5 +#define FXAA_QUALITY__P0 1.0 +#define FXAA_QUALITY__P1 1.5 +#define FXAA_QUALITY__P2 2.0 +#define FXAA_QUALITY__P3 4.0 +#define FXAA_QUALITY__P4 12.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PRESET == 13) +#define FXAA_QUALITY__PS 6 +#define 
FXAA_QUALITY__P0 1.0 +#define FXAA_QUALITY__P1 1.5 +#define FXAA_QUALITY__P2 2.0 +#define FXAA_QUALITY__P3 2.0 +#define FXAA_QUALITY__P4 4.0 +#define FXAA_QUALITY__P5 12.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PRESET == 14) +#define FXAA_QUALITY__PS 7 +#define FXAA_QUALITY__P0 1.0 +#define FXAA_QUALITY__P1 1.5 +#define FXAA_QUALITY__P2 2.0 +#define FXAA_QUALITY__P3 2.0 +#define FXAA_QUALITY__P4 2.0 +#define FXAA_QUALITY__P5 4.0 +#define FXAA_QUALITY__P6 12.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PRESET == 15) +#define FXAA_QUALITY__PS 8 +#define FXAA_QUALITY__P0 1.0 +#define FXAA_QUALITY__P1 1.5 +#define FXAA_QUALITY__P2 2.0 +#define FXAA_QUALITY__P3 2.0 +#define FXAA_QUALITY__P4 2.0 +#define FXAA_QUALITY__P5 2.0 +#define FXAA_QUALITY__P6 4.0 +#define FXAA_QUALITY__P7 12.0 +#endif + +/*============================================================================ +FXAA QUALITY - LOW DITHER PRESETS +============================================================================*/ +#if (FXAA_QUALITY__PRESET == 20) +#define FXAA_QUALITY__PS 3 +#define FXAA_QUALITY__P0 1.5 +#define FXAA_QUALITY__P1 2.0 +#define FXAA_QUALITY__P2 8.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PRESET == 21) +#define FXAA_QUALITY__PS 4 +#define FXAA_QUALITY__P0 1.0 +#define FXAA_QUALITY__P1 1.5 +#define FXAA_QUALITY__P2 2.0 +#define FXAA_QUALITY__P3 8.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PRESET == 22) +#define FXAA_QUALITY__PS 5 +#define FXAA_QUALITY__P0 1.0 +#define FXAA_QUALITY__P1 1.5 +#define FXAA_QUALITY__P2 2.0 +#define FXAA_QUALITY__P3 2.0 +#define FXAA_QUALITY__P4 8.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PRESET == 23) +#define FXAA_QUALITY__PS 
6 +#define FXAA_QUALITY__P0 1.0 +#define FXAA_QUALITY__P1 1.5 +#define FXAA_QUALITY__P2 2.0 +#define FXAA_QUALITY__P3 2.0 +#define FXAA_QUALITY__P4 2.0 +#define FXAA_QUALITY__P5 8.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PRESET == 24) +#define FXAA_QUALITY__PS 7 +#define FXAA_QUALITY__P0 1.0 +#define FXAA_QUALITY__P1 1.5 +#define FXAA_QUALITY__P2 2.0 +#define FXAA_QUALITY__P3 2.0 +#define FXAA_QUALITY__P4 2.0 +#define FXAA_QUALITY__P5 3.0 +#define FXAA_QUALITY__P6 8.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PRESET == 25) +#define FXAA_QUALITY__PS 8 +#define FXAA_QUALITY__P0 1.0 +#define FXAA_QUALITY__P1 1.5 +#define FXAA_QUALITY__P2 2.0 +#define FXAA_QUALITY__P3 2.0 +#define FXAA_QUALITY__P4 2.0 +#define FXAA_QUALITY__P5 2.0 +#define FXAA_QUALITY__P6 4.0 +#define FXAA_QUALITY__P7 8.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PRESET == 26) +#define FXAA_QUALITY__PS 9 +#define FXAA_QUALITY__P0 1.0 +#define FXAA_QUALITY__P1 1.5 +#define FXAA_QUALITY__P2 2.0 +#define FXAA_QUALITY__P3 2.0 +#define FXAA_QUALITY__P4 2.0 +#define FXAA_QUALITY__P5 2.0 +#define FXAA_QUALITY__P6 2.0 +#define FXAA_QUALITY__P7 4.0 +#define FXAA_QUALITY__P8 8.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PRESET == 27) +#define FXAA_QUALITY__PS 10 +#define FXAA_QUALITY__P0 1.0 +#define FXAA_QUALITY__P1 1.5 +#define FXAA_QUALITY__P2 2.0 +#define FXAA_QUALITY__P3 2.0 +#define FXAA_QUALITY__P4 2.0 +#define FXAA_QUALITY__P5 2.0 +#define FXAA_QUALITY__P6 2.0 +#define FXAA_QUALITY__P7 2.0 +#define FXAA_QUALITY__P8 4.0 +#define FXAA_QUALITY__P9 8.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PRESET == 28) +#define FXAA_QUALITY__PS 11 +#define FXAA_QUALITY__P0 1.0 +#define 
FXAA_QUALITY__P1 1.5 +#define FXAA_QUALITY__P2 2.0 +#define FXAA_QUALITY__P3 2.0 +#define FXAA_QUALITY__P4 2.0 +#define FXAA_QUALITY__P5 2.0 +#define FXAA_QUALITY__P6 2.0 +#define FXAA_QUALITY__P7 2.0 +#define FXAA_QUALITY__P8 2.0 +#define FXAA_QUALITY__P9 4.0 +#define FXAA_QUALITY__P10 8.0 +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PRESET == 29) +#define FXAA_QUALITY__PS 12 +#define FXAA_QUALITY__P0 1.0 +#define FXAA_QUALITY__P1 1.5 +#define FXAA_QUALITY__P2 2.0 +#define FXAA_QUALITY__P3 2.0 +#define FXAA_QUALITY__P4 2.0 +#define FXAA_QUALITY__P5 2.0 +#define FXAA_QUALITY__P6 2.0 +#define FXAA_QUALITY__P7 2.0 +#define FXAA_QUALITY__P8 2.0 +#define FXAA_QUALITY__P9 2.0 +#define FXAA_QUALITY__P10 4.0 +#define FXAA_QUALITY__P11 8.0 +#endif + +/*============================================================================ +FXAA QUALITY - EXTREME QUALITY +============================================================================*/ +#if (FXAA_QUALITY__PRESET == 39) +#define FXAA_QUALITY__PS 12 +#define FXAA_QUALITY__P0 1.0 +#define FXAA_QUALITY__P1 1.0 +#define FXAA_QUALITY__P2 1.0 +#define FXAA_QUALITY__P3 1.0 +#define FXAA_QUALITY__P4 1.0 +#define FXAA_QUALITY__P5 1.5 +#define FXAA_QUALITY__P6 2.0 +#define FXAA_QUALITY__P7 2.0 +#define FXAA_QUALITY__P8 2.0 +#define FXAA_QUALITY__P9 2.0 +#define FXAA_QUALITY__P10 4.0 +#define FXAA_QUALITY__P11 8.0 +#endif + + + +/*============================================================================ + +API PORTING + +============================================================================*/ +#if (FXAA_GLSL_120 == 1) || (FXAA_GLSL_130 == 1) +#define FxaaBool bool +#define FxaaDiscard discard +#define FxaaFloat float +#define FxaaFloat2 vec2 +#define FxaaFloat3 vec3 +#define FxaaFloat4 vec4 +#define FxaaHalf float +#define FxaaHalf2 vec2 +#define FxaaHalf3 vec3 +#define FxaaHalf4 vec4 +#define FxaaInt2 ivec2 +#define FxaaSat(x) clamp(x, 0.0, 1.0) +#define 
FxaaTex sampler2D +#else +#define FxaaBool bool +#define FxaaDiscard clip(-1) +#define FxaaFloat float +#define FxaaFloat2 float2 +#define FxaaFloat3 float3 +#define FxaaFloat4 float4 +#define FxaaHalf half +#define FxaaHalf2 half2 +#define FxaaHalf3 half3 +#define FxaaHalf4 half4 +#define FxaaSat(x) saturate(x) +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_GLSL_120 == 1) +// Requires, +// #version 120 +// And at least, +// #extension GL_EXT_gpu_shader4 : enable +// (or set FXAA_FAST_PIXEL_OFFSET 1 to work like DX9) +#define FxaaTexTop(t, p) texture2DLod(t, p, 0.0) +#if (FXAA_FAST_PIXEL_OFFSET == 1) +#define FxaaTexOff(t, p, o, r) texture2DLodOffset(t, p, 0.0, o) +#else +#define FxaaTexOff(t, p, o, r) texture2DLod(t, p + (o * r), 0.0) +#endif +#if (FXAA_GATHER4_ALPHA == 1) +// use #extension GL_ARB_gpu_shader5 : enable +#define FxaaTexAlpha4(t, p) textureGather(t, p, 3) +#define FxaaTexOffAlpha4(t, p, o) textureGatherOffset(t, p, o, 3) +#define FxaaTexGreen4(t, p) textureGather(t, p, 1) +#define FxaaTexOffGreen4(t, p, o) textureGatherOffset(t, p, o, 1) +#endif +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_GLSL_130 == 1) +// Requires "#version 130" or better +#define FxaaTexTop(t, p) textureLod(t, p, 0.0) +#define FxaaTexOff(t, p, o, r) textureLodOffset(t, p, 0.0, o) +#if (FXAA_GATHER4_ALPHA == 1) +// use #extension GL_ARB_gpu_shader5 : enable +#define FxaaTexAlpha4(t, p) textureGather(t, p, 3) +#define FxaaTexOffAlpha4(t, p, o) textureGatherOffset(t, p, o, 3) +#define FxaaTexGreen4(t, p) textureGather(t, p, 1) +#define FxaaTexOffGreen4(t, p, o) textureGatherOffset(t, p, o, 1) +#endif +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_HLSL_3 == 1) || (FXAA_360 == 1) || (FXAA_PS3 == 1) +#define FxaaInt2 float2 +#define FxaaTex sampler2D +#define FxaaTexTop(t, p) tex2Dlod(t, float4(p, 0.0, 0.0)) +#define 
FxaaTexOff(t, p, o, r) tex2Dlod(t, float4(p + (o * r), 0, 0)) +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_HLSL_4 == 1) +#define FxaaInt2 int2 +struct FxaaTex { SamplerState smpl; Texture2D tex; }; +#define FxaaTexTop(t, p) t.tex.SampleLevel(t.smpl, p, 0.0) +#define FxaaTexOff(t, p, o, r) t.tex.SampleLevel(t.smpl, p, 0.0, o) +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_HLSL_5 == 1) +#define FxaaInt2 int2 +struct FxaaTex { SamplerState smpl; Texture2D tex; }; +#define FxaaTexTop(t, p) t.tex.SampleLevel(t.smpl, p, 0.0) +#define FxaaTexOff(t, p, o, r) t.tex.SampleLevel(t.smpl, p, 0.0, o) +#define FxaaTexAlpha4(t, p) t.tex.GatherAlpha(t.smpl, p) +#define FxaaTexOffAlpha4(t, p, o) t.tex.GatherAlpha(t.smpl, p, o) +#define FxaaTexGreen4(t, p) t.tex.GatherGreen(t.smpl, p) +#define FxaaTexOffGreen4(t, p, o) t.tex.GatherGreen(t.smpl, p, o) +#endif + + +/*============================================================================ +GREEN AS LUMA OPTION SUPPORT FUNCTION: FxaaLuma(rgba) returns the luma channel of a sample -- rgba.w (alpha) when FXAA_GREEN_AS_LUMA is 0, rgba.y (green) otherwise. +============================================================================*/ +#if (FXAA_GREEN_AS_LUMA == 0) +FxaaFloat FxaaLuma(FxaaFloat4 rgba) { return rgba.w; } +#else +FxaaFloat FxaaLuma(FxaaFloat4 rgba) { return rgba.y; } +#endif + + + + +/*============================================================================ + +FXAA3 QUALITY - PC + +============================================================================*/ +#if (FXAA_PC == 1) +/*--------------------------------------------------------------------------*/ +FxaaFloat4 FxaaPixelShader( +// +// Use noperspective interpolation here (turn off perspective interpolation). +// {xy} = center of pixel +FxaaFloat2 pos, +// +// Used only for FXAA Console, and not used on the 360 version. +// Use noperspective interpolation here (turn off perspective interpolation). 
+// {xy__} = upper left of pixel +// {__zw} = lower right of pixel +FxaaFloat4 fxaaConsolePosPos, +// +// Input color texture. +// {rgb_} = color in linear or perceptual color space +// if (FXAA_GREEN_AS_LUMA == 0) +// {___a} = luma in perceptual color space (not linear) +FxaaTex tex, +// +// Only used on the optimized 360 version of FXAA Console. +// For everything but 360, just use the same input here as for "tex". +// For 360, same texture, just alias with a 2nd sampler. +// This sampler needs to have an exponent bias of -1. +FxaaTex fxaaConsole360TexExpBiasNegOne, +// +// Only used on the optimized 360 version of FXAA Console. +// For everything but 360, just use the same input here as for "tex". +// For 360, same texture, just alias with a 3rd sampler. +// This sampler needs to have an exponent bias of -2. +FxaaTex fxaaConsole360TexExpBiasNegTwo, +// +// Only used on FXAA Quality. +// This must be from a constant/uniform. +// {x_} = 1.0/screenWidthInPixels +// {_y} = 1.0/screenHeightInPixels +FxaaFloat2 fxaaQualityRcpFrame, +// +// Only used on FXAA Console. +// This must be from a constant/uniform. +// This affects sub-pixel AA quality and inversely sharpness. +// Where N ranges between, +// N = 0.50 (default) +// N = 0.33 (sharper) +// {x___} = -N/screenWidthInPixels +// {_y__} = -N/screenHeightInPixels +// {__z_} = N/screenWidthInPixels +// {___w} = N/screenHeightInPixels +FxaaFloat4 fxaaConsoleRcpFrameOpt, +// +// Only used on FXAA Console. +// Not used on 360, but used on PS3 and PC. +// This must be from a constant/uniform. +// {x___} = -2.0/screenWidthInPixels +// {_y__} = -2.0/screenHeightInPixels +// {__z_} = 2.0/screenWidthInPixels +// {___w} = 2.0/screenHeightInPixels +FxaaFloat4 fxaaConsoleRcpFrameOpt2, +// +// Only used on FXAA Console. +// Only used on 360 in place of fxaaConsoleRcpFrameOpt2. +// This must be from a constant/uniform. 
+// {x___} = 8.0/screenWidthInPixels +// {_y__} = 8.0/screenHeightInPixels +// {__z_} = -4.0/screenWidthInPixels +// {___w} = -4.0/screenHeightInPixels +FxaaFloat4 fxaaConsole360RcpFrameOpt2, +// +// Only used on FXAA Quality. +// This used to be the FXAA_QUALITY__SUBPIX define. +// It is here now to allow easier tuning. +// Choose the amount of sub-pixel aliasing removal. +// This can affect sharpness. +// 1.00 - upper limit (softer) +// 0.75 - default amount of filtering +// 0.50 - lower limit (sharper, less sub-pixel aliasing removal) +// 0.25 - almost off +// 0.00 - completely off +FxaaFloat fxaaQualitySubpix, +// +// Only used on FXAA Quality. +// This used to be the FXAA_QUALITY__EDGE_THRESHOLD define. +// It is here now to allow easier tuning. +// The minimum amount of local contrast required to apply algorithm. +// 0.333 - too little (faster) +// 0.250 - low quality +// 0.166 - default +// 0.125 - high quality +// 0.063 - overkill (slower) +FxaaFloat fxaaQualityEdgeThreshold, +// +// Only used on FXAA Quality. +// This used to be the FXAA_QUALITY__EDGE_THRESHOLD_MIN define. +// It is here now to allow easier tuning. +// Trims the algorithm from processing darks. +// 0.0833 - upper limit (default, the start of visible unfiltered edges) +// 0.0625 - high quality (faster) +// 0.0312 - visible limit (slower) +// Special notes when using FXAA_GREEN_AS_LUMA, +// Likely want to set this to zero. +// As colors that are mostly not-green +// will appear very dark in the green channel! +// Tune by looking at mostly non-green content, +// then start at zero and increase until aliasing is a problem. +FxaaFloat fxaaQualityEdgeThresholdMin, +// +// Only used on FXAA Console. +// This used to be the FXAA_CONSOLE__EDGE_SHARPNESS define. +// It is here now to allow easier tuning. +// This does not affect PS3, as this needs to be compiled in. +// Use FXAA_CONSOLE__PS3_EDGE_SHARPNESS for PS3. 
+// Due to the PS3 being ALU bound, +// there are only three safe values here: 2 and 4 and 8. +// These options use the shader's ability to do a free *|/ by 2|4|8. +// For all other platforms can be a non-power of two. +// 8.0 is sharper (default!!!) +// 4.0 is softer +// 2.0 is really soft (good only for vector graphics inputs) +FxaaFloat fxaaConsoleEdgeSharpness, +// +// Only used on FXAA Console. +// This used to be the FXAA_CONSOLE__EDGE_THRESHOLD define. +// It is here now to allow easier tuning. +// This does not affect PS3, as this needs to be compiled in. +// Use FXAA_CONSOLE__PS3_EDGE_THRESHOLD for PS3. +// Due to the PS3 being ALU bound, +// there are only two safe values here: 1/4 and 1/8. +// These options use the shader's ability to do a free *|/ by 2|4|8. +// The console setting has a different mapping than the quality setting. +// Other platforms can use other values. +// 0.125 leaves less aliasing, but is softer (default!!!) +// 0.25 leaves more aliasing, and is sharper +FxaaFloat fxaaConsoleEdgeThreshold, +// +// Only used on FXAA Console. +// This used to be the FXAA_CONSOLE__EDGE_THRESHOLD_MIN define. +// It is here now to allow easier tuning. +// Trims the algorithm from processing darks. +// The console setting has a different mapping than the quality setting. +// This only applies when FXAA_EARLY_EXIT is 1. +// This does not apply to PS3, +// PS3 was simplified to avoid more shader instructions. +// 0.06 - faster but more aliasing in darks +// 0.05 - default +// 0.04 - slower and less aliasing in darks +// Special notes when using FXAA_GREEN_AS_LUMA, +// Likely want to set this to zero. +// As colors that are mostly not-green +// will appear very dark in the green channel! +// Tune by looking at mostly non-green content, +// then start at zero and increase until aliasing is a problem. +FxaaFloat fxaaConsoleEdgeThresholdMin, +// +// Extra constants for 360 FXAA Console only. +// Use zeros or anything else for other platforms. 
+// These must be in physical constant registers and NOT immediates. +// Immediates will result in compiler un-optimizing. +// {xyzw} = float4(1.0, -1.0, 0.25, -0.25) +FxaaFloat4 fxaaConsole360ConstDir +) { +/*--------------------------------------------------------------------------*/ +FxaaFloat2 posM; +posM.x = pos.x; +posM.y = pos.y; +#if (FXAA_GATHER4_ALPHA == 1) +#if (FXAA_DISCARD == 0) +FxaaFloat4 rgbyM = FxaaTexTop(tex, posM); +#if (FXAA_GREEN_AS_LUMA == 0) +#define lumaM rgbyM.w +#else +#define lumaM rgbyM.y +#endif +#endif +#if (FXAA_GREEN_AS_LUMA == 0) +FxaaFloat4 luma4A = FxaaTexAlpha4(tex, posM); +FxaaFloat4 luma4B = FxaaTexOffAlpha4(tex, posM, FxaaInt2(-1, -1)); +#else +FxaaFloat4 luma4A = FxaaTexGreen4(tex, posM); +FxaaFloat4 luma4B = FxaaTexOffGreen4(tex, posM, FxaaInt2(-1, -1)); +#endif +#if (FXAA_DISCARD == 1) +#define lumaM luma4A.w +#endif +#define lumaE luma4A.z +#define lumaS luma4A.x +#define lumaSE luma4A.y +#define lumaNW luma4B.w +#define lumaN luma4B.z +#define lumaW luma4B.x +#else +FxaaFloat4 rgbyM = FxaaTexTop(tex, posM); +#if (FXAA_GREEN_AS_LUMA == 0) +#define lumaM rgbyM.w +#else +#define lumaM rgbyM.y +#endif +FxaaFloat lumaS = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2( 0, 1), fxaaQualityRcpFrame.xy)); +FxaaFloat lumaE = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2( 1, 0), fxaaQualityRcpFrame.xy)); +FxaaFloat lumaN = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2( 0,-1), fxaaQualityRcpFrame.xy)); +FxaaFloat lumaW = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2(-1, 0), fxaaQualityRcpFrame.xy)); +#endif +/*--------------------------------------------------------------------------*/ +FxaaFloat maxSM = max(lumaS, lumaM); +FxaaFloat minSM = min(lumaS, lumaM); +FxaaFloat maxESM = max(lumaE, maxSM); +FxaaFloat minESM = min(lumaE, minSM); +FxaaFloat maxWN = max(lumaN, lumaW); +FxaaFloat minWN = min(lumaN, lumaW); +FxaaFloat rangeMax = max(maxWN, maxESM); +FxaaFloat rangeMin = min(minWN, minESM); +FxaaFloat rangeMaxScaled = rangeMax * fxaaQualityEdgeThreshold; 
+FxaaFloat range = rangeMax - rangeMin; +FxaaFloat rangeMaxClamped = max(fxaaQualityEdgeThresholdMin, rangeMaxScaled); +FxaaBool earlyExit = range < rangeMaxClamped; +/*--------------------------------------------------------------------------*/ +if(earlyExit) +#if (FXAA_DISCARD == 1) +FxaaDiscard; +#else +return rgbyM; +#endif +/*--------------------------------------------------------------------------*/ +#if (FXAA_GATHER4_ALPHA == 0) +FxaaFloat lumaNW = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2(-1,-1), fxaaQualityRcpFrame.xy)); +FxaaFloat lumaSE = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2( 1, 1), fxaaQualityRcpFrame.xy)); +FxaaFloat lumaNE = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2( 1,-1), fxaaQualityRcpFrame.xy)); +FxaaFloat lumaSW = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2(-1, 1), fxaaQualityRcpFrame.xy)); +#else +FxaaFloat lumaNE = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2(1, -1), fxaaQualityRcpFrame.xy)); +FxaaFloat lumaSW = FxaaLuma(FxaaTexOff(tex, posM, FxaaInt2(-1, 1), fxaaQualityRcpFrame.xy)); +#endif +/*--------------------------------------------------------------------------*/ +FxaaFloat lumaNS = lumaN + lumaS; +FxaaFloat lumaWE = lumaW + lumaE; +FxaaFloat subpixRcpRange = 1.0/range; +FxaaFloat subpixNSWE = lumaNS + lumaWE; +FxaaFloat edgeHorz1 = (-2.0 * lumaM) + lumaNS; +FxaaFloat edgeVert1 = (-2.0 * lumaM) + lumaWE; +/*--------------------------------------------------------------------------*/ +FxaaFloat lumaNESE = lumaNE + lumaSE; +FxaaFloat lumaNWNE = lumaNW + lumaNE; +FxaaFloat edgeHorz2 = (-2.0 * lumaE) + lumaNESE; +FxaaFloat edgeVert2 = (-2.0 * lumaN) + lumaNWNE; +/*--------------------------------------------------------------------------*/ +FxaaFloat lumaNWSW = lumaNW + lumaSW; +FxaaFloat lumaSWSE = lumaSW + lumaSE; +FxaaFloat edgeHorz4 = (abs(edgeHorz1) * 2.0) + abs(edgeHorz2); +FxaaFloat edgeVert4 = (abs(edgeVert1) * 2.0) + abs(edgeVert2); +FxaaFloat edgeHorz3 = (-2.0 * lumaW) + lumaNWSW; +FxaaFloat edgeVert3 = (-2.0 * lumaS) + lumaSWSE; 
+FxaaFloat edgeHorz = abs(edgeHorz3) + edgeHorz4; +FxaaFloat edgeVert = abs(edgeVert3) + edgeVert4; +/*--------------------------------------------------------------------------*/ +FxaaFloat subpixNWSWNESE = lumaNWSW + lumaNESE; +FxaaFloat lengthSign = fxaaQualityRcpFrame.x; +FxaaBool horzSpan = edgeHorz >= edgeVert; +FxaaFloat subpixA = subpixNSWE * 2.0 + subpixNWSWNESE; +/*--------------------------------------------------------------------------*/ +if(!horzSpan) lumaN = lumaW; +if(!horzSpan) lumaS = lumaE; +if(horzSpan) lengthSign = fxaaQualityRcpFrame.y; +FxaaFloat subpixB = (subpixA * (1.0/12.0)) - lumaM; +/*--------------------------------------------------------------------------*/ +FxaaFloat gradientN = lumaN - lumaM; +FxaaFloat gradientS = lumaS - lumaM; +FxaaFloat lumaNN = lumaN + lumaM; +FxaaFloat lumaSS = lumaS + lumaM; +FxaaBool pairN = abs(gradientN) >= abs(gradientS); +FxaaFloat gradient = max(abs(gradientN), abs(gradientS)); +if(pairN) lengthSign = -lengthSign; +FxaaFloat subpixC = FxaaSat(abs(subpixB) * subpixRcpRange); +/*--------------------------------------------------------------------------*/ +FxaaFloat2 posB; +posB.x = posM.x; +posB.y = posM.y; +FxaaFloat2 offNP; +offNP.x = (!horzSpan) ? 0.0 : fxaaQualityRcpFrame.x; +offNP.y = ( horzSpan) ? 
0.0 : fxaaQualityRcpFrame.y; +if(!horzSpan) posB.x += lengthSign * 0.5; +if( horzSpan) posB.y += lengthSign * 0.5; +/*--------------------------------------------------------------------------*/ +FxaaFloat2 posN; +posN.x = posB.x - offNP.x * FXAA_QUALITY__P0; +posN.y = posB.y - offNP.y * FXAA_QUALITY__P0; +FxaaFloat2 posP; +posP.x = posB.x + offNP.x * FXAA_QUALITY__P0; +posP.y = posB.y + offNP.y * FXAA_QUALITY__P0; +FxaaFloat subpixD = ((-2.0)*subpixC) + 3.0; +FxaaFloat lumaEndN = FxaaLuma(FxaaTexTop(tex, posN)); +FxaaFloat subpixE = subpixC * subpixC; +FxaaFloat lumaEndP = FxaaLuma(FxaaTexTop(tex, posP)); +/*--------------------------------------------------------------------------*/ +if(!pairN) lumaNN = lumaSS; +FxaaFloat gradientScaled = gradient * 1.0/4.0; +FxaaFloat lumaMM = lumaM - lumaNN * 0.5; +FxaaFloat subpixF = subpixD * subpixE; +FxaaBool lumaMLTZero = lumaMM < 0.0; +/*--------------------------------------------------------------------------*/ +lumaEndN -= lumaNN * 0.5; +lumaEndP -= lumaNN * 0.5; +FxaaBool doneN = abs(lumaEndN) >= gradientScaled; +FxaaBool doneP = abs(lumaEndP) >= gradientScaled; +if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P1; +if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P1; +FxaaBool doneNP = (!doneN) || (!doneP); +if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P1; +if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P1; +/*--------------------------------------------------------------------------*/ +if(doneNP) { +if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); +if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); +if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; +if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; +doneN = abs(lumaEndN) >= gradientScaled; +doneP = abs(lumaEndP) >= gradientScaled; +if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P2; +if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P2; +doneNP = (!doneN) || (!doneP); +if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P2; +if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P2; 
+/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PS > 3) +if(doneNP) { +if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); +if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); +if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; +if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; +doneN = abs(lumaEndN) >= gradientScaled; +doneP = abs(lumaEndP) >= gradientScaled; +if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P3; +if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P3; +doneNP = (!doneN) || (!doneP); +if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P3; +if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P3; +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PS > 4) +if(doneNP) { +if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); +if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); +if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; +if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; +doneN = abs(lumaEndN) >= gradientScaled; +doneP = abs(lumaEndP) >= gradientScaled; +if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P4; +if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P4; +doneNP = (!doneN) || (!doneP); +if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P4; +if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P4; +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PS > 5) +if(doneNP) { +if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); +if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); +if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; +if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; +doneN = abs(lumaEndN) >= gradientScaled; +doneP = abs(lumaEndP) >= gradientScaled; +if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P5; +if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P5; +doneNP = (!doneN) || (!doneP); +if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P5; +if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P5; 
+/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PS > 6) +if(doneNP) { +if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); +if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); +if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; +if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; +doneN = abs(lumaEndN) >= gradientScaled; +doneP = abs(lumaEndP) >= gradientScaled; +if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P6; +if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P6; +doneNP = (!doneN) || (!doneP); +if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P6; +if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P6; +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PS > 7) +if(doneNP) { +if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); +if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); +if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; +if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; +doneN = abs(lumaEndN) >= gradientScaled; +doneP = abs(lumaEndP) >= gradientScaled; +if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P7; +if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P7; +doneNP = (!doneN) || (!doneP); +if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P7; +if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P7; +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PS > 8) +if(doneNP) { +if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); +if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); +if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; +if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; +doneN = abs(lumaEndN) >= gradientScaled; +doneP = abs(lumaEndP) >= gradientScaled; +if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P8; +if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P8; +doneNP = (!doneN) || (!doneP); +if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P8; +if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P8; 
+/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PS > 9) +if(doneNP) { +if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); +if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); +if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; +if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; +doneN = abs(lumaEndN) >= gradientScaled; +doneP = abs(lumaEndP) >= gradientScaled; +if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P9; +if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P9; +doneNP = (!doneN) || (!doneP); +if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P9; +if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P9; +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PS > 10) +if(doneNP) { +if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); +if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); +if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; +if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; +doneN = abs(lumaEndN) >= gradientScaled; +doneP = abs(lumaEndP) >= gradientScaled; +if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P10; +if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P10; +doneNP = (!doneN) || (!doneP); +if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P10; +if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P10; +/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PS > 11) +if(doneNP) { +if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); +if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); +if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; +if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; +doneN = abs(lumaEndN) >= gradientScaled; +doneP = abs(lumaEndP) >= gradientScaled; +if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P11; +if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P11; +doneNP = (!doneN) || (!doneP); +if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P11; +if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P11; 
+/*--------------------------------------------------------------------------*/ +#if (FXAA_QUALITY__PS > 12) +if(doneNP) { +if(!doneN) lumaEndN = FxaaLuma(FxaaTexTop(tex, posN.xy)); +if(!doneP) lumaEndP = FxaaLuma(FxaaTexTop(tex, posP.xy)); +if(!doneN) lumaEndN = lumaEndN - lumaNN * 0.5; +if(!doneP) lumaEndP = lumaEndP - lumaNN * 0.5; +doneN = abs(lumaEndN) >= gradientScaled; +doneP = abs(lumaEndP) >= gradientScaled; +if(!doneN) posN.x -= offNP.x * FXAA_QUALITY__P12; +if(!doneN) posN.y -= offNP.y * FXAA_QUALITY__P12; +doneNP = (!doneN) || (!doneP); +if(!doneP) posP.x += offNP.x * FXAA_QUALITY__P12; +if(!doneP) posP.y += offNP.y * FXAA_QUALITY__P12; +/*--------------------------------------------------------------------------*/ +} +#endif +/*--------------------------------------------------------------------------*/ +} +#endif +/*--------------------------------------------------------------------------*/ +} +#endif +/*--------------------------------------------------------------------------*/ +} +#endif +/*--------------------------------------------------------------------------*/ +} +#endif +/*--------------------------------------------------------------------------*/ +} +#endif +/*--------------------------------------------------------------------------*/ +} +#endif +/*--------------------------------------------------------------------------*/ +} +#endif +/*--------------------------------------------------------------------------*/ +} +#endif +/*--------------------------------------------------------------------------*/ +} +#endif +/*--------------------------------------------------------------------------*/ +} +/*--------------------------------------------------------------------------*/ +FxaaFloat dstN = posM.x - posN.x; +FxaaFloat dstP = posP.x - posM.x; +if(!horzSpan) dstN = posM.y - posN.y; +if(!horzSpan) dstP = posP.y - posM.y; +/*--------------------------------------------------------------------------*/ +FxaaBool goodSpanN = (lumaEndN < 0.0) != 
lumaMLTZero; +FxaaFloat spanLength = (dstP + dstN); +FxaaBool goodSpanP = (lumaEndP < 0.0) != lumaMLTZero; +FxaaFloat spanLengthRcp = 1.0/spanLength; +/*--------------------------------------------------------------------------*/ +FxaaBool directionN = dstN < dstP; +FxaaFloat dst = min(dstN, dstP); +FxaaBool goodSpan = directionN ? goodSpanN : goodSpanP; +FxaaFloat subpixG = subpixF * subpixF; +FxaaFloat pixelOffset = (dst * (-spanLengthRcp)) + 0.5; +FxaaFloat subpixH = subpixG * fxaaQualitySubpix; +/*--------------------------------------------------------------------------*/ +FxaaFloat pixelOffsetGood = goodSpan ? pixelOffset : 0.0; +FxaaFloat pixelOffsetSubpix = max(pixelOffsetGood, subpixH); +if(!horzSpan) posM.x += pixelOffsetSubpix * lengthSign; +if( horzSpan) posM.y += pixelOffsetSubpix * lengthSign; +#if (FXAA_DISCARD == 1) +return FxaaTexTop(tex, posM); +#else +return FxaaFloat4(FxaaTexTop(tex, posM).xyz, lumaM); +#endif +} +/*==========================================================================*/ +#endif + + + + +/*============================================================================ + +FXAA3 CONSOLE - PC VERSION +------------------------------------------------------------------------------ +Instead of using this on PC, I'd suggest just using FXAA Quality with +#define FXAA_QUALITY__PRESET 10 +Or +#define FXAA_QUALITY__PRESET 20 +Either are higher quality and almost as fast as this on modern PC GPUs. +============================================================================*/ +#if (FXAA_PC_CONSOLE == 1) +/*--------------------------------------------------------------------------*/ +FxaaFloat4 FxaaPixelShader( +// See FXAA Quality FxaaPixelShader() source for docs on Inputs! 
+FxaaFloat2 pos, +FxaaFloat4 fxaaConsolePosPos, +FxaaTex tex, +FxaaTex fxaaConsole360TexExpBiasNegOne, +FxaaTex fxaaConsole360TexExpBiasNegTwo, +FxaaFloat2 fxaaQualityRcpFrame, +FxaaFloat4 fxaaConsoleRcpFrameOpt, +FxaaFloat4 fxaaConsoleRcpFrameOpt2, +FxaaFloat4 fxaaConsole360RcpFrameOpt2, +FxaaFloat fxaaQualitySubpix, +FxaaFloat fxaaQualityEdgeThreshold, +FxaaFloat fxaaQualityEdgeThresholdMin, +FxaaFloat fxaaConsoleEdgeSharpness, +FxaaFloat fxaaConsoleEdgeThreshold, +FxaaFloat fxaaConsoleEdgeThresholdMin, +FxaaFloat4 fxaaConsole360ConstDir +) { +/*--------------------------------------------------------------------------*/ +FxaaFloat lumaNw = FxaaLuma(FxaaTexTop(tex, fxaaConsolePosPos.xy)); +FxaaFloat lumaSw = FxaaLuma(FxaaTexTop(tex, fxaaConsolePosPos.xw)); +FxaaFloat lumaNe = FxaaLuma(FxaaTexTop(tex, fxaaConsolePosPos.zy)); +FxaaFloat lumaSe = FxaaLuma(FxaaTexTop(tex, fxaaConsolePosPos.zw)); +/*--------------------------------------------------------------------------*/ +FxaaFloat4 rgbyM = FxaaTexTop(tex, pos.xy); +#if (FXAA_GREEN_AS_LUMA == 0) +FxaaFloat lumaM = rgbyM.w; +#else +FxaaFloat lumaM = rgbyM.y; +#endif +/*--------------------------------------------------------------------------*/ +FxaaFloat lumaMaxNwSw = max(lumaNw, lumaSw); +lumaNe += 1.0/384.0; +FxaaFloat lumaMinNwSw = min(lumaNw, lumaSw); +/*--------------------------------------------------------------------------*/ +FxaaFloat lumaMaxNeSe = max(lumaNe, lumaSe); +FxaaFloat lumaMinNeSe = min(lumaNe, lumaSe); +/*--------------------------------------------------------------------------*/ +FxaaFloat lumaMax = max(lumaMaxNeSe, lumaMaxNwSw); +FxaaFloat lumaMin = min(lumaMinNeSe, lumaMinNwSw); +/*--------------------------------------------------------------------------*/ +FxaaFloat lumaMaxScaled = lumaMax * fxaaConsoleEdgeThreshold; +/*--------------------------------------------------------------------------*/ +FxaaFloat lumaMinM = min(lumaMin, lumaM); +FxaaFloat lumaMaxScaledClamped = 
max(fxaaConsoleEdgeThresholdMin, lumaMaxScaled); +FxaaFloat lumaMaxM = max(lumaMax, lumaM); +FxaaFloat dirSwMinusNe = lumaSw - lumaNe; +FxaaFloat lumaMaxSubMinM = lumaMaxM - lumaMinM; +FxaaFloat dirSeMinusNw = lumaSe - lumaNw; +if(lumaMaxSubMinM < lumaMaxScaledClamped) return rgbyM; +/*--------------------------------------------------------------------------*/ +FxaaFloat2 dir; +dir.x = dirSwMinusNe + dirSeMinusNw; +dir.y = dirSwMinusNe - dirSeMinusNw; +/*--------------------------------------------------------------------------*/ +FxaaFloat2 dir1 = normalize(dir.xy); +FxaaFloat4 rgbyN1 = FxaaTexTop(tex, pos.xy - dir1 * fxaaConsoleRcpFrameOpt.zw); +FxaaFloat4 rgbyP1 = FxaaTexTop(tex, pos.xy + dir1 * fxaaConsoleRcpFrameOpt.zw); +/*--------------------------------------------------------------------------*/ +FxaaFloat dirAbsMinTimesC = min(abs(dir1.x), abs(dir1.y)) * fxaaConsoleEdgeSharpness; +FxaaFloat2 dir2 = clamp(dir1.xy / dirAbsMinTimesC, -2.0, 2.0); +/*--------------------------------------------------------------------------*/ +FxaaFloat4 rgbyN2 = FxaaTexTop(tex, pos.xy - dir2 * fxaaConsoleRcpFrameOpt2.zw); +FxaaFloat4 rgbyP2 = FxaaTexTop(tex, pos.xy + dir2 * fxaaConsoleRcpFrameOpt2.zw); +/*--------------------------------------------------------------------------*/ +FxaaFloat4 rgbyA = rgbyN1 + rgbyP1; +FxaaFloat4 rgbyB = ((rgbyN2 + rgbyP2) * 0.25) + (rgbyA * 0.25); +/*--------------------------------------------------------------------------*/ +#if (FXAA_GREEN_AS_LUMA == 0) +FxaaBool twoTap = (rgbyB.w < lumaMin) || (rgbyB.w > lumaMax); +#else +FxaaBool twoTap = (rgbyB.y < lumaMin) || (rgbyB.y > lumaMax); +#endif +if(twoTap) rgbyB.xyz = rgbyA.xyz * 0.5; +return rgbyB; } +/*==========================================================================*/ +#endif + + + +/*============================================================================ + +FXAA3 CONSOLE - 360 PIXEL SHADER + +------------------------------------------------------------------------------ 
+This optimized version thanks to suggestions from Andy Luedke.
+Should be fully tex bound in all cases.
+As of the FXAA 3.11 release, I have still not tested this code,
+however I fixed a bug which was in both FXAA 3.9 and FXAA 3.10.
+And note this is replacing the old unoptimized version.
+If it does not work, please let me know so I can fix it.
+============================================================================*/
+#if (FXAA_360 == 1)
+/*-- 360 console variant of FxaaPixelShader(): returns the centre texel on low local contrast, else a blend of four taps along dir --*/
+[reduceTempRegUsage(4)]
+float4 FxaaPixelShader(
+// See FXAA Quality FxaaPixelShader() source for docs on Inputs!
+FxaaFloat2 pos,
+FxaaFloat4 fxaaConsolePosPos,
+FxaaTex tex,
+FxaaTex fxaaConsole360TexExpBiasNegOne,
+FxaaTex fxaaConsole360TexExpBiasNegTwo,
+FxaaFloat2 fxaaQualityRcpFrame,
+FxaaFloat4 fxaaConsoleRcpFrameOpt,
+FxaaFloat4 fxaaConsoleRcpFrameOpt2,
+FxaaFloat4 fxaaConsole360RcpFrameOpt2,
+FxaaFloat fxaaQualitySubpix,
+FxaaFloat fxaaQualityEdgeThreshold,
+FxaaFloat fxaaQualityEdgeThresholdMin,
+FxaaFloat fxaaConsoleEdgeSharpness,
+FxaaFloat fxaaConsoleEdgeThreshold,
+FxaaFloat fxaaConsoleEdgeThresholdMin,
+FxaaFloat4 fxaaConsole360ConstDir
+) {
+/*-- Fetch four taps at (+/-0.5, +/-0.5) offsets around pos; luma comes from .w, or from .y when FXAA_GREEN_AS_LUMA is set --*/
+float4 lumaNwNeSwSe;
+#if (FXAA_GREEN_AS_LUMA == 0)
+asm {
+tfetch2D lumaNwNeSwSe.w___, tex, pos.xy, OffsetX = -0.5, OffsetY = -0.5, UseComputedLOD=false
+tfetch2D lumaNwNeSwSe._w__, tex, pos.xy, OffsetX = 0.5, OffsetY = -0.5, UseComputedLOD=false
+tfetch2D lumaNwNeSwSe.__w_, tex, pos.xy, OffsetX = -0.5, OffsetY = 0.5, UseComputedLOD=false
+tfetch2D lumaNwNeSwSe.___w, tex, pos.xy, OffsetX = 0.5, OffsetY = 0.5, UseComputedLOD=false
+};
+#else
+asm {
+tfetch2D lumaNwNeSwSe.y___, tex, pos.xy, OffsetX = -0.5, OffsetY = -0.5, UseComputedLOD=false
+tfetch2D lumaNwNeSwSe._y__, tex, pos.xy, OffsetX = 0.5, OffsetY = -0.5, UseComputedLOD=false
+tfetch2D lumaNwNeSwSe.__y_, tex, pos.xy, OffsetX = -0.5, OffsetY = 0.5, UseComputedLOD=false
+tfetch2D lumaNwNeSwSe.___y, tex, pos.xy, OffsetX = 0.5, OffsetY = 0.5, UseComputedLOD=false
+};
+#endif
+/*-- Bias the .y tap by 1/384, then reduce the four taps to lumaMin / lumaMax --*/
+lumaNwNeSwSe.y += 1.0/384.0;
+float2 lumaMinTemp = min(lumaNwNeSwSe.xy, lumaNwNeSwSe.zw);
+float2 lumaMaxTemp = max(lumaNwNeSwSe.xy, lumaNwNeSwSe.zw);
+float lumaMin = min(lumaMinTemp.x, lumaMinTemp.y);
+float lumaMax = max(lumaMaxTemp.x, lumaMaxTemp.y);
+/*-- Centre texel; return it unchanged when (lumaMaxM - lumaMinM) is below max(fxaaConsoleEdgeThresholdMin, lumaMax * fxaaConsoleEdgeThreshold) --*/
+float4 rgbyM = tex2Dlod(tex, float4(pos.xy, 0.0, 0.0));
+#if (FXAA_GREEN_AS_LUMA == 0)
+float lumaMinM = min(lumaMin, rgbyM.w);
+float lumaMaxM = max(lumaMax, rgbyM.w);
+#else
+float lumaMinM = min(lumaMin, rgbyM.y);
+float lumaMaxM = max(lumaMax, rgbyM.y);
+#endif
+if((lumaMaxM - lumaMinM) < max(fxaaConsoleEdgeThresholdMin, lumaMax * fxaaConsoleEdgeThreshold)) return rgbyM;
+/*-- dir: weighted sums of the four tap lumas (weights from fxaaConsole360ConstDir), normalized --*/
+float2 dir;
+dir.x = dot(lumaNwNeSwSe, fxaaConsole360ConstDir.yyxx);
+dir.y = dot(lumaNwNeSwSe, fxaaConsole360ConstDir.xyxy);
+dir = normalize(dir);
+/*-- dir1: dir scaled by fxaaConsoleRcpFrameOpt; .xy and .zw give the two near tap offsets --*/
+float4 dir1 = dir.xyxy * fxaaConsoleRcpFrameOpt.xyzw;
+/*-- dir2: dir divided by its smaller absolute component times fxaaConsoleEdgeSharpness, saturated, then remapped through fxaaConsole360RcpFrameOpt2 --*/
+float4 dir2;
+float dirAbsMinTimesC = min(abs(dir.x), abs(dir.y)) * fxaaConsoleEdgeSharpness;
+dir2 = saturate(fxaaConsole360ConstDir.zzww * dir.xyxy / dirAbsMinTimesC + 0.5);
+dir2 = dir2 * fxaaConsole360RcpFrameOpt2.xyxy + fxaaConsole360RcpFrameOpt2.zwzw;
+/*-- Near taps read fxaaConsole360TexExpBiasNegOne, far taps read fxaaConsole360TexExpBiasNegTwo --*/
+float4 rgbyN1 = tex2Dlod(fxaaConsole360TexExpBiasNegOne, float4(pos.xy + dir1.xy, 0.0, 0.0));
+float4 rgbyP1 = tex2Dlod(fxaaConsole360TexExpBiasNegOne, float4(pos.xy + dir1.zw, 0.0, 0.0));
+float4 rgbyN2 = tex2Dlod(fxaaConsole360TexExpBiasNegTwo, float4(pos.xy + dir2.xy, 0.0, 0.0));
+float4 rgbyP2 = tex2Dlod(fxaaConsole360TexExpBiasNegTwo, float4(pos.xy + dir2.zw, 0.0, 0.0));
+/*-- rgbyA = near pair sum, rgbyB = far pair sum + half of rgbyA (no division here; NOTE(review): presumably the ExpBias samplers pre-scale - confirm) --*/
+float4 rgbyA = rgbyN1 + rgbyP1;
+float4 rgbyB = rgbyN2 + rgbyP2 + rgbyA * 0.5;
+/*-- Use rgbyB only while its luma is above lumaMin and not above lumaMax; otherwise use rgbyA --*/
+float4 rgbyR = ((FxaaLuma(rgbyB) - lumaMax) > 0.0) ? rgbyA : rgbyB;
+rgbyR = ((FxaaLuma(rgbyB) - lumaMin) > 0.0) ? rgbyR : rgbyA;
+return rgbyR; }
+/*==========================================================================*/
+#endif
+
+
+
+/*============================================================================
+
+FXAA3 CONSOLE - OPTIMIZED PS3 PIXEL SHADER (NO EARLY EXIT)
+
+==============================================================================
+The code below does not exactly match the assembly.
+I have a feeling that 12 cycles is possible, but was not able to get there.
+Might have to increase register count to get full performance.
+Note this shader does not use perspective interpolation.
+ +Use the following cgc options, + +--fenable-bx2 --fastmath --fastprecision --nofloatbindings + +------------------------------------------------------------------------------ +NVSHADERPERF OUTPUT +------------------------------------------------------------------------------ +For reference and to aid in debug, output of NVShaderPerf should match this, + +Shader to schedule: +0: texpkb h0.w(TRUE), v5.zyxx, #0 +2: addh h2.z(TRUE), h0.w, constant(0.001953, 0.000000, 0.000000, 0.000000).x +4: texpkb h0.w(TRUE), v5.xwxx, #0 +6: addh h0.z(TRUE), -h2, h0.w +7: texpkb h1.w(TRUE), v5, #0 +9: addh h0.x(TRUE), h0.z, -h1.w +10: addh h3.w(TRUE), h0.z, h1 +11: texpkb h2.w(TRUE), v5.zwzz, #0 +13: addh h0.z(TRUE), h3.w, -h2.w +14: addh h0.x(TRUE), h2.w, h0 +15: nrmh h1.xz(TRUE), h0_n +16: minh_m8 h0.x(TRUE), |h1|, |h1.z| +17: maxh h4.w(TRUE), h0, h1 +18: divx h2.xy(TRUE), h1_n.xzzw, h0_n +19: movr r1.zw(TRUE), v4.xxxy +20: madr r2.xz(TRUE), -h1, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).zzww, r1.zzww +22: minh h5.w(TRUE), h0, h1 +23: texpkb h0(TRUE), r2.xzxx, #0 +25: madr r0.zw(TRUE), h1.xzxz, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w), r1 +27: maxh h4.x(TRUE), h2.z, h2.w +28: texpkb h1(TRUE), r0.zwzz, #0 +30: addh_d2 h1(TRUE), h0, h1 +31: madr r0.xy(TRUE), -h2, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).xyxx, r1.zwzz +33: texpkb h0(TRUE), r0, #0 +35: minh h4.z(TRUE), h2, h2.w +36: fenct TRUE +37: madr r1.xy(TRUE), h2, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).xyxx, r1.zwzz +39: texpkb h2(TRUE), r1, #0 +41: addh_d2 h0(TRUE), h0, h2 +42: maxh h2.w(TRUE), h4, h4.x +43: minh h2.x(TRUE), h5.w, h4.z +44: addh_d2 h0(TRUE), h0, h1 +45: slth h2.x(TRUE), h0.w, h2 +46: sgth h2.w(TRUE), h0, h2 +47: movh h0(TRUE), h0 +48: addx.c0 rc(TRUE), h2, h2.w +49: movh h0(c0.NE.x), h1 + +IPU0 ------ Simplified schedule: -------- +Pass | Unit | uOp | PC: Op +-----+--------+------+------------------------- +1 | SCT0/1 | mov | 0: TXLr h0.w, g[TEX1].zyxx, 
const.xxxx, TEX0; +| TEX | txl | 0: TXLr h0.w, g[TEX1].zyxx, const.xxxx, TEX0; +| SCB1 | add | 2: ADDh h2.z, h0.--w-, const.--x-; +| | | +2 | SCT0/1 | mov | 4: TXLr h0.w, g[TEX1].xwxx, const.xxxx, TEX0; +| TEX | txl | 4: TXLr h0.w, g[TEX1].xwxx, const.xxxx, TEX0; +| SCB1 | add | 6: ADDh h0.z,-h2, h0.--w-; +| | | +3 | SCT0/1 | mov | 7: TXLr h1.w, g[TEX1], const.xxxx, TEX0; +| TEX | txl | 7: TXLr h1.w, g[TEX1], const.xxxx, TEX0; +| SCB0 | add | 9: ADDh h0.x, h0.z---,-h1.w---; +| SCB1 | add | 10: ADDh h3.w, h0.---z, h1; +| | | +4 | SCT0/1 | mov | 11: TXLr h2.w, g[TEX1].zwzz, const.xxxx, TEX0; +| TEX | txl | 11: TXLr h2.w, g[TEX1].zwzz, const.xxxx, TEX0; +| SCB0 | add | 14: ADDh h0.x, h2.w---, h0; +| SCB1 | add | 13: ADDh h0.z, h3.--w-,-h2.--w-; +| | | +5 | SCT1 | mov | 15: NRMh h1.xz, h0; +| SRB | nrm | 15: NRMh h1.xz, h0; +| SCB0 | min | 16: MINh*8 h0.x, |h1|, |h1.z---|; +| SCB1 | max | 17: MAXh h4.w, h0, h1; +| | | +6 | SCT0 | div | 18: DIVx h2.xy, h1.xz--, h0; +| SCT1 | mov | 19: MOVr r1.zw, g[TEX0].--xy; +| SCB0 | mad | 20: MADr r2.xz,-h1, const.z-w-, r1.z-w-; +| SCB1 | min | 22: MINh h5.w, h0, h1; +| | | +7 | SCT0/1 | mov | 23: TXLr h0, r2.xzxx, const.xxxx, TEX0; +| TEX | txl | 23: TXLr h0, r2.xzxx, const.xxxx, TEX0; +| SCB0 | max | 27: MAXh h4.x, h2.z---, h2.w---; +| SCB1 | mad | 25: MADr r0.zw, h1.--xz, const, r1; +| | | +8 | SCT0/1 | mov | 28: TXLr h1, r0.zwzz, const.xxxx, TEX0; +| TEX | txl | 28: TXLr h1, r0.zwzz, const.xxxx, TEX0; +| SCB0/1 | add | 30: ADDh/2 h1, h0, h1; +| | | +9 | SCT0 | mad | 31: MADr r0.xy,-h2, const.xy--, r1.zw--; +| SCT1 | mov | 33: TXLr h0, r0, const.zzzz, TEX0; +| TEX | txl | 33: TXLr h0, r0, const.zzzz, TEX0; +| SCB1 | min | 35: MINh h4.z, h2, h2.--w-; +| | | +10 | SCT0 | mad | 37: MADr r1.xy, h2, const.xy--, r1.zw--; +| SCT1 | mov | 39: TXLr h2, r1, const.zzzz, TEX0; +| TEX | txl | 39: TXLr h2, r1, const.zzzz, TEX0; +| SCB0/1 | add | 41: ADDh/2 h0, h0, h2; +| | | +11 | SCT0 | min | 43: MINh h2.x, h5.w---, h4.z---; +| SCT1 | max | 
42: MAXh h2.w, h4, h4.---x;
+| SCB0/1 | add | 44: ADDh/2 h0, h0, h1;
+| | |
+12 | SCT0 | set | 45: SLTh h2.x, h0.w---, h2;
+| SCT1 | set | 46: SGTh h2.w, h0, h2;
+| SCB0/1 | mul | 47: MOVh h0, h0;
+| | |
+13 | SCT0 | mad | 48: ADDxc0_s rc, h2, h2.w---;
+| SCB0/1 | mul | 49: MOVh h0(NE0.xxxx), h1;
+Pass SCT TEX SCB
+1: 0% 100% 25%
+2: 0% 100% 25%
+3: 0% 100% 50%
+4: 0% 100% 50%
+5: 0% 0% 50%
+6: 100% 0% 75%
+7: 0% 100% 75%
+8: 0% 100% 100%
+9: 0% 100% 25%
+10: 0% 100% 100%
+11: 50% 0% 100%
+12: 50% 0% 100%
+13: 25% 0% 100%
+
+MEAN: 17% 61% 67%
+
+Pass SCT0 SCT1 TEX SCB0 SCB1
+1: 0% 0% 100% 0% 100%
+2: 0% 0% 100% 0% 100%
+3: 0% 0% 100% 100% 100%
+4: 0% 0% 100% 100% 100%
+5: 0% 0% 0% 100% 100%
+6: 100% 100% 0% 100% 100%
+7: 0% 0% 100% 100% 100%
+8: 0% 0% 100% 100% 100%
+9: 0% 0% 100% 0% 100%
+10: 0% 0% 100% 100% 100%
+11: 100% 100% 0% 100% 100%
+12: 100% 100% 0% 100% 100%
+13: 100% 0% 0% 100% 100%
+
+MEAN: 30% 23% 61% 76% 100%
+Fragment Performance Setup: Driver RSX Compiler, GPU RSX, Flags 0x5
+Results 13 cycles, 3 r regs, 923,076,923 pixels/s
+============================================================================*/
+#if (FXAA_PS3 == 1) && (FXAA_EARLY_EXIT == 0)
+/*--------------------------------------------------------------------------*/
+#pragma regcount 7
+#pragma disablepc all
+//#pragma option O3
+//#pragma option OutColorPrec=fp16
+#pragma texformat default RGBA8
+/*== PS3 console variant of FxaaPixelShader() without early exit: 4-tap blend along dir, or the 2-tap near average when the blend's luma leaves the corner luma range ==*/
+half4 FxaaPixelShader(
+// See FXAA Quality FxaaPixelShader() source for docs on Inputs!
+FxaaFloat2 pos,
+FxaaFloat4 fxaaConsolePosPos,
+FxaaTex tex,
+//FxaaTex fxaaConsole360TexExpBiasNegOne,
+//FxaaTex fxaaConsole360TexExpBiasNegTwo,
+//FxaaFloat2 fxaaQualityRcpFrame,
+FxaaFloat4 fxaaConsoleRcpFrameOpt,
+FxaaFloat4 fxaaConsoleRcpFrameOpt2
+//FxaaFloat4 fxaaConsole360RcpFrameOpt2,
+//FxaaFloat fxaaQualitySubpix,
+//FxaaFloat fxaaQualityEdgeThreshold,
+//FxaaFloat fxaaQualityEdgeThresholdMin,
+//FxaaFloat fxaaConsoleEdgeSharpness,
+//FxaaFloat fxaaConsoleEdgeThreshold,
+//FxaaFloat fxaaConsoleEdgeThresholdMin,
+//FxaaFloat4 fxaaConsole360ConstDir
+) {
+/*--------------------------------------------------------------------------*/
+// (1) NE tap: bias its luma by 1/512 and seed dir.x and dir.z with -lumaNe
+half4 dir;
+half4 lumaNe = tex2D(tex, fxaaConsolePosPos.zy); // h4tex2Dlod(tex, half4(fxaaConsolePosPos.zy, 0, 0));
+#if (FXAA_GREEN_AS_LUMA == 0)
+lumaNe.w += half(1.0/512.0);
+dir.x = -lumaNe.w;
+dir.z = -lumaNe.w;
+#else
+lumaNe.y += half(1.0/512.0);
+dir.x = -lumaNe.y;
+dir.z = -lumaNe.y;
+#endif
+/*--------------------------------------------------------------------------*/
+// (2) SW tap: add its luma to both dir.x and dir.z
+half4 lumaSw = tex2D(tex, fxaaConsolePosPos.xw); // h4tex2Dlod(tex, half4(fxaaConsolePosPos.xw, 0, 0));
+#if (FXAA_GREEN_AS_LUMA == 0)
+dir.x += lumaSw.w;
+dir.z += lumaSw.w;
+#else
+dir.x += lumaSw.y;
+dir.z += lumaSw.y;
+#endif
+/*--------------------------------------------------------------------------*/
+// (3) NW tap: subtract its luma from dir.x, add it to dir.z
+half4 lumaNw = tex2D(tex, fxaaConsolePosPos.xy); // h4tex2Dlod(tex, half4(fxaaConsolePosPos.xy, 0, 0));
+#if (FXAA_GREEN_AS_LUMA == 0)
+dir.x -= lumaNw.w;
+dir.z += lumaNw.w;
+#else
+dir.x -= lumaNw.y;
+dir.z += lumaNw.y;
+#endif
+/*--------------------------------------------------------------------------*/
+// (4) SE tap: add its luma to dir.x, subtract it from dir.z
+half4 lumaSe = tex2D(tex, fxaaConsolePosPos.zw); // h4tex2Dlod(tex, half4(fxaaConsolePosPos.zw, 0, 0));
+#if (FXAA_GREEN_AS_LUMA == 0)
+dir.x += lumaSe.w;
+dir.z -= lumaSe.w;
+#else
+dir.x += lumaSe.y;
+dir.z -= lumaSe.y;
+#endif
+/*--------------------------------------------------------------------------*/
+// (5) Normalize (dir.x, dir.z); scale its smaller absolute component by FXAA_CONSOLE__PS3_EDGE_SHARPNESS
+half4 dir1_pos;
+dir1_pos.xy = normalize(dir.xz);
+half dirAbsMinTimesC = min(abs(dir1_pos.x), abs(dir1_pos.y)) * half(FXAA_CONSOLE__PS3_EDGE_SHARPNESS);
+/*--------------------------------------------------------------------------*/
+// (6) dir2 = dir1 / dirAbsMinTimesC clamped to [-2, 2]; stash pos in .zw; texcoord of the near negative tap
+half4 dir2_pos;
+dir2_pos.xy = clamp(dir1_pos.xy / dirAbsMinTimesC, half(-2.0), half(2.0));
+dir1_pos.zw = pos.xy;
+dir2_pos.zw = pos.xy;
+half4 temp1N;
+temp1N.xy = dir1_pos.zw - dir1_pos.xy * fxaaConsoleRcpFrameOpt.zw;
+/*--------------------------------------------------------------------------*/
+// (7) Fetch the near negative tap; texcoord of the near positive tap
+temp1N = tex2D(tex, temp1N.xy); // h4tex2Dlod(tex, half4(temp1N.xy, 0.0, 0.0));
+half4 rgby1;
+rgby1.xy = dir1_pos.zw + dir1_pos.xy * fxaaConsoleRcpFrameOpt.zw;
+/*--------------------------------------------------------------------------*/
+// (8) Fetch the near positive tap; rgby1 = average of the near pair
+rgby1 = tex2D(tex, rgby1.xy); // h4tex2Dlod(tex, half4(rgby1.xy, 0.0, 0.0));
+rgby1 = (temp1N + rgby1) * 0.5;
+/*--------------------------------------------------------------------------*/
+// (9) Far negative tap, offset by dir2 * fxaaConsoleRcpFrameOpt2.zw
+half4 temp2N;
+temp2N.xy = dir2_pos.zw - dir2_pos.xy * fxaaConsoleRcpFrameOpt2.zw;
+temp2N = tex2D(tex, temp2N.xy); // h4tex2Dlod(tex, half4(temp2N.xy, 0.0, 0.0));
+/*--------------------------------------------------------------------------*/
+// (10) Far positive tap; rgby2 = average of the far pair
+half4 rgby2;
+rgby2.xy = dir2_pos.zw + dir2_pos.xy * fxaaConsoleRcpFrameOpt2.zw;
+rgby2 = tex2D(tex, rgby2.xy); // h4tex2Dlod(tex, half4(rgby2.xy, 0.0, 0.0));
+rgby2 = (temp2N + rgby2) * 0.5;
+/*--------------------------------------------------------------------------*/
+// (11) Min / max luma over the four corner taps; rgby2 = average of the far and near averages
+// compiler moves these scalar ops up to other cycles
+#if (FXAA_GREEN_AS_LUMA == 0)
+half lumaMin = min(min(lumaNw.w, lumaSw.w), min(lumaNe.w, lumaSe.w));
+half lumaMax = max(max(lumaNw.w, lumaSw.w), max(lumaNe.w, lumaSe.w));
+#else
+half lumaMin = min(min(lumaNw.y, lumaSw.y), min(lumaNe.y, lumaSe.y));
+half lumaMax = max(max(lumaNw.y, lumaSw.y), max(lumaNe.y, lumaSe.y));
+#endif
+rgby2 = (rgby2 + rgby1) * 0.5;
+/*--------------------------------------------------------------------------*/
+// (12) Is the blended luma outside [lumaMin, lumaMax]?
+#if (FXAA_GREEN_AS_LUMA == 0)
+bool twoTapLt = rgby2.w < lumaMin;
+bool twoTapGt = rgby2.w > lumaMax;
+#else
+bool twoTapLt = rgby2.y < lumaMin;
+bool twoTapGt = rgby2.y > lumaMax;
+#endif
+/*--------------------------------------------------------------------------*/
+// (13) If so, fall back to the near (2-tap) average
+if(twoTapLt || twoTapGt) rgby2 = rgby1;
+/*--------------------------------------------------------------------------*/
+return rgby2; }
+/*==========================================================================*/
+#endif
+
+
+
+/*============================================================================
+
+FXAA3 CONSOLE - OPTIMIZED PS3 PIXEL SHADER (WITH EARLY EXIT)
+
+==============================================================================
+The code mostly matches the assembly.
+I have a feeling that 14 cycles is possible, but was not able to get there.
+Might have to increase register count to get full performance.
+Note this shader does not use perspective interpolation.
+
+Use the following cgc options,
+
+--fenable-bx2 --fastmath --fastprecision --nofloatbindings
+
+Use of FXAA_GREEN_AS_LUMA currently adds a cycle (16 clks).
+Will look at fixing this for FXAA 3.12.
+------------------------------------------------------------------------------ +NVSHADERPERF OUTPUT +------------------------------------------------------------------------------ +For reference and to aid in debug, output of NVShaderPerf should match this, + +Shader to schedule: +0: texpkb h0.w(TRUE), v5.zyxx, #0 +2: addh h2.y(TRUE), h0.w, constant(0.001953, 0.000000, 0.000000, 0.000000).x +4: texpkb h1.w(TRUE), v5.xwxx, #0 +6: addh h0.x(TRUE), h1.w, -h2.y +7: texpkb h2.w(TRUE), v5.zwzz, #0 +9: minh h4.w(TRUE), h2.y, h2 +10: maxh h5.x(TRUE), h2.y, h2.w +11: texpkb h0.w(TRUE), v5, #0 +13: addh h3.w(TRUE), -h0, h0.x +14: addh h0.x(TRUE), h0.w, h0 +15: addh h0.z(TRUE), -h2.w, h0.x +16: addh h0.x(TRUE), h2.w, h3.w +17: minh h5.y(TRUE), h0.w, h1.w +18: nrmh h2.xz(TRUE), h0_n +19: minh_m8 h2.w(TRUE), |h2.x|, |h2.z| +20: divx h4.xy(TRUE), h2_n.xzzw, h2_n.w +21: movr r1.zw(TRUE), v4.xxxy +22: maxh h2.w(TRUE), h0, h1 +23: fenct TRUE +24: madr r0.xy(TRUE), -h2.xzzw, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).zwzz, r1.zwzz +26: texpkb h0(TRUE), r0, #0 +28: maxh h5.x(TRUE), h2.w, h5 +29: minh h5.w(TRUE), h5.y, h4 +30: madr r1.xy(TRUE), h2.xzzw, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).zwzz, r1.zwzz +32: texpkb h2(TRUE), r1, #0 +34: addh_d2 h2(TRUE), h0, h2 +35: texpkb h1(TRUE), v4, #0 +37: maxh h5.y(TRUE), h5.x, h1.w +38: minh h4.w(TRUE), h1, h5 +39: madr r0.xy(TRUE), -h4, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).xyxx, r1.zwzz +41: texpkb h0(TRUE), r0, #0 +43: addh_m8 h5.z(TRUE), h5.y, -h4.w +44: madr r2.xy(TRUE), h4, constant(cConst5.x, cConst5.y, cConst5.z, cConst5.w).xyxx, r1.zwzz +46: texpkb h3(TRUE), r2, #0 +48: addh_d2 h0(TRUE), h0, h3 +49: addh_d2 h3(TRUE), h0, h2 +50: movh h0(TRUE), h3 +51: slth h3.x(TRUE), h3.w, h5.w +52: sgth h3.w(TRUE), h3, h5.x +53: addx.c0 rc(TRUE), h3.x, h3 +54: slth.c0 rc(TRUE), h5.z, h5 +55: movh h0(c0.NE.w), h2 +56: movh h0(c0.NE.x), h1 + +IPU0 ------ Simplified schedule: -------- +Pass | Unit | uOp | PC: Op 
+-----+--------+------+------------------------- +1 | SCT0/1 | mov | 0: TXLr h0.w, g[TEX1].zyxx, const.xxxx, TEX0; +| TEX | txl | 0: TXLr h0.w, g[TEX1].zyxx, const.xxxx, TEX0; +| SCB0 | add | 2: ADDh h2.y, h0.-w--, const.-x--; +| | | +2 | SCT0/1 | mov | 4: TXLr h1.w, g[TEX1].xwxx, const.xxxx, TEX0; +| TEX | txl | 4: TXLr h1.w, g[TEX1].xwxx, const.xxxx, TEX0; +| SCB0 | add | 6: ADDh h0.x, h1.w---,-h2.y---; +| | | +3 | SCT0/1 | mov | 7: TXLr h2.w, g[TEX1].zwzz, const.xxxx, TEX0; +| TEX | txl | 7: TXLr h2.w, g[TEX1].zwzz, const.xxxx, TEX0; +| SCB0 | max | 10: MAXh h5.x, h2.y---, h2.w---; +| SCB1 | min | 9: MINh h4.w, h2.---y, h2; +| | | +4 | SCT0/1 | mov | 11: TXLr h0.w, g[TEX1], const.xxxx, TEX0; +| TEX | txl | 11: TXLr h0.w, g[TEX1], const.xxxx, TEX0; +| SCB0 | add | 14: ADDh h0.x, h0.w---, h0; +| SCB1 | add | 13: ADDh h3.w,-h0, h0.---x; +| | | +5 | SCT0 | mad | 16: ADDh h0.x, h2.w---, h3.w---; +| SCT1 | mad | 15: ADDh h0.z,-h2.--w-, h0.--x-; +| SCB0 | min | 17: MINh h5.y, h0.-w--, h1.-w--; +| | | +6 | SCT1 | mov | 18: NRMh h2.xz, h0; +| SRB | nrm | 18: NRMh h2.xz, h0; +| SCB1 | min | 19: MINh*8 h2.w, |h2.---x|, |h2.---z|; +| | | +7 | SCT0 | div | 20: DIVx h4.xy, h2.xz--, h2.ww--; +| SCT1 | mov | 21: MOVr r1.zw, g[TEX0].--xy; +| SCB1 | max | 22: MAXh h2.w, h0, h1; +| | | +8 | SCT0 | mad | 24: MADr r0.xy,-h2.xz--, const.zw--, r1.zw--; +| SCT1 | mov | 26: TXLr h0, r0, const.xxxx, TEX0; +| TEX | txl | 26: TXLr h0, r0, const.xxxx, TEX0; +| SCB0 | max | 28: MAXh h5.x, h2.w---, h5; +| SCB1 | min | 29: MINh h5.w, h5.---y, h4; +| | | +9 | SCT0 | mad | 30: MADr r1.xy, h2.xz--, const.zw--, r1.zw--; +| SCT1 | mov | 32: TXLr h2, r1, const.xxxx, TEX0; +| TEX | txl | 32: TXLr h2, r1, const.xxxx, TEX0; +| SCB0/1 | add | 34: ADDh/2 h2, h0, h2; +| | | +10 | SCT0/1 | mov | 35: TXLr h1, g[TEX0], const.xxxx, TEX0; +| TEX | txl | 35: TXLr h1, g[TEX0], const.xxxx, TEX0; +| SCB0 | max | 37: MAXh h5.y, h5.-x--, h1.-w--; +| SCB1 | min | 38: MINh h4.w, h1, h5; +| | | +11 | SCT0 | mad | 39: 
MADr r0.xy,-h4, const.xy--, r1.zw--; +| SCT1 | mov | 41: TXLr h0, r0, const.zzzz, TEX0; +| TEX | txl | 41: TXLr h0, r0, const.zzzz, TEX0; +| SCB0 | mad | 44: MADr r2.xy, h4, const.xy--, r1.zw--; +| SCB1 | add | 43: ADDh*8 h5.z, h5.--y-,-h4.--w-; +| | | +12 | SCT0/1 | mov | 46: TXLr h3, r2, const.xxxx, TEX0; +| TEX | txl | 46: TXLr h3, r2, const.xxxx, TEX0; +| SCB0/1 | add | 48: ADDh/2 h0, h0, h3; +| | | +13 | SCT0/1 | mad | 49: ADDh/2 h3, h0, h2; +| SCB0/1 | mul | 50: MOVh h0, h3; +| | | +14 | SCT0 | set | 51: SLTh h3.x, h3.w---, h5.w---; +| SCT1 | set | 52: SGTh h3.w, h3, h5.---x; +| SCB0 | set | 54: SLThc0 rc, h5.z---, h5; +| SCB1 | add | 53: ADDxc0_s rc, h3.---x, h3; +| | | +15 | SCT0/1 | mul | 55: MOVh h0(NE0.wwww), h2; +| SCB0/1 | mul | 56: MOVh h0(NE0.xxxx), h1; +Pass SCT TEX SCB +1: 0% 100% 25% +2: 0% 100% 25% +3: 0% 100% 50% +4: 0% 100% 50% +5: 50% 0% 25% +6: 0% 0% 25% +7: 100% 0% 25% +8: 0% 100% 50% +9: 0% 100% 100% +10: 0% 100% 50% +11: 0% 100% 75% +12: 0% 100% 100% +13: 100% 0% 100% +14: 50% 0% 50% +15: 100% 0% 100% + +MEAN: 26% 60% 56% + +Pass SCT0 SCT1 TEX SCB0 SCB1 +1: 0% 0% 100% 100% 0% +2: 0% 0% 100% 100% 0% +3: 0% 0% 100% 100% 100% +4: 0% 0% 100% 100% 100% +5: 100% 100% 0% 100% 0% +6: 0% 0% 0% 0% 100% +7: 100% 100% 0% 0% 100% +8: 0% 0% 100% 100% 100% +9: 0% 0% 100% 100% 100% +10: 0% 0% 100% 100% 100% +11: 0% 0% 100% 100% 100% +12: 0% 0% 100% 100% 100% +13: 100% 100% 0% 100% 100% +14: 100% 100% 0% 100% 100% +15: 100% 100% 0% 100% 100% + +MEAN: 33% 33% 60% 86% 80% +Fragment Performance Setup: Driver RSX Compiler, GPU RSX, Flags 0x5 +Results 15 cycles, 3 r regs, 800,000,000 pixels/s +============================================================================*/ +#if (FXAA_PS3 == 1) && (FXAA_EARLY_EXIT == 1) +/*--------------------------------------------------------------------------*/ +#pragma regcount 7 +#pragma disablepc all +//#pragma option O2 +//#pragma option OutColorPrec=fp16 +#pragma texformat default RGBA8 
+/*== PS3 console variant of FxaaPixelShader() with early exit: like the no-early-exit variant, but returns the centre texel when the local luma range is small ==*/
+half4 FxaaPixelShader(
+// See FXAA Quality FxaaPixelShader() source for docs on Inputs!
+FxaaFloat2 pos,
+FxaaFloat4 fxaaConsolePosPos,
+FxaaTex tex,
+//FxaaTex fxaaConsole360TexExpBiasNegOne,
+//FxaaTex fxaaConsole360TexExpBiasNegTwo,
+//FxaaFloat2 fxaaQualityRcpFrame,
+FxaaFloat4 fxaaConsoleRcpFrameOpt,
+FxaaFloat4 fxaaConsoleRcpFrameOpt2
+//FxaaFloat4 fxaaConsole360RcpFrameOpt2,
+//FxaaFloat fxaaQualitySubpix,
+//FxaaFloat fxaaQualityEdgeThreshold,
+//FxaaFloat fxaaQualityEdgeThresholdMin,
+//FxaaFloat fxaaConsoleEdgeSharpness,
+//FxaaFloat fxaaConsoleEdgeThreshold,
+//FxaaFloat fxaaConsoleEdgeThresholdMin,
+//FxaaFloat4 fxaaConsole360ConstDir
+) {
+/*--------------------------------------------------------------------------*/
+// (1) NE tap; lumaNe = its luma + 1/512
+half4 rgbyNe = tex2D(tex, fxaaConsolePosPos.zy); // h4tex2Dlod(tex, half4(fxaaConsolePosPos.zy, 0, 0));
+#if (FXAA_GREEN_AS_LUMA == 0)
+half lumaNe = rgbyNe.w + half(1.0/512.0);
+#else
+half lumaNe = rgbyNe.y + half(1.0/512.0);
+#endif
+/*--------------------------------------------------------------------------*/
+// (2) SW tap; lumaSwNegNe = lumaSw - lumaNe
+half4 lumaSw = tex2D(tex, fxaaConsolePosPos.xw); // h4tex2Dlod(tex, half4(fxaaConsolePosPos.xw, 0, 0));
+#if (FXAA_GREEN_AS_LUMA == 0)
+half lumaSwNegNe = lumaSw.w - lumaNe;
+#else
+half lumaSwNegNe = lumaSw.y - lumaNe;
+#endif
+/*--------------------------------------------------------------------------*/
+// (3) NW tap; min / max luma of the NW-SW pair
+half4 lumaNw = tex2D(tex, fxaaConsolePosPos.xy); // h4tex2Dlod(tex, half4(fxaaConsolePosPos.xy, 0, 0));
+#if (FXAA_GREEN_AS_LUMA == 0)
+half lumaMaxNwSw = max(lumaNw.w, lumaSw.w);
+half lumaMinNwSw = min(lumaNw.w, lumaSw.w);
+#else
+half lumaMaxNwSw = max(lumaNw.y, lumaSw.y);
+half lumaMinNwSw = min(lumaNw.y, lumaSw.y);
+#endif
+/*--------------------------------------------------------------------------*/
+// (4) SE tap; fold lumaNw into the two partial direction terms
+half4 lumaSe = tex2D(tex, fxaaConsolePosPos.zw); // h4tex2Dlod(tex, half4(fxaaConsolePosPos.zw, 0, 0));
+#if (FXAA_GREEN_AS_LUMA == 0)
+half dirZ = lumaNw.w + lumaSwNegNe;
+half dirX = -lumaNw.w + lumaSwNegNe;
+#else
+half dirZ = lumaNw.y + lumaSwNegNe;
+half dirX = -lumaNw.y + lumaSwNegNe;
+#endif
+/*--------------------------------------------------------------------------*/
+// (5) Finish dir.x and dir.z with lumaSe (dir.y stays 0); min luma of the NE-SE pair
+half3 dir;
+dir.y = 0.0;
+#if (FXAA_GREEN_AS_LUMA == 0)
+dir.x = lumaSe.w + dirX;
+dir.z = -lumaSe.w + dirZ;
+half lumaMinNeSe = min(lumaNe, lumaSe.w);
+#else
+dir.x = lumaSe.y + dirX;
+dir.z = -lumaSe.y + dirZ;
+half lumaMinNeSe = min(lumaNe, lumaSe.y);
+#endif
+/*--------------------------------------------------------------------------*/
+// (6) Normalize dir and keep (x, z); scale the smaller absolute component by FXAA_CONSOLE__PS3_EDGE_SHARPNESS
+half4 dir1_pos;
+dir1_pos.xy = normalize(dir).xz;
+half dirAbsMinTimes8 = min(abs(dir1_pos.x), abs(dir1_pos.y)) * half(FXAA_CONSOLE__PS3_EDGE_SHARPNESS);
+/*--------------------------------------------------------------------------*/
+// (7) dir2 = dir1 / dirAbsMinTimes8 clamped to [-2, 2]; stash pos in .zw; max luma of the NE-SE pair
+half4 dir2_pos;
+dir2_pos.xy = clamp(dir1_pos.xy / dirAbsMinTimes8, half(-2.0), half(2.0));
+dir1_pos.zw = pos.xy;
+dir2_pos.zw = pos.xy;
+#if (FXAA_GREEN_AS_LUMA == 0)
+half lumaMaxNeSe = max(lumaNe, lumaSe.w);
+#else
+half lumaMaxNeSe = max(lumaNe, lumaSe.y);
+#endif
+/*--------------------------------------------------------------------------*/
+// (8) Fetch the near negative tap; lumaMin / lumaMax over the four corner taps
+half4 temp1N;
+temp1N.xy = dir1_pos.zw - dir1_pos.xy * fxaaConsoleRcpFrameOpt.zw;
+temp1N = tex2D(tex, temp1N.xy); // h4tex2Dlod(tex, half4(temp1N.xy, 0.0, 0.0));
+half lumaMax = max(lumaMaxNwSw, lumaMaxNeSe);
+half lumaMin = min(lumaMinNwSw, lumaMinNeSe);
+/*--------------------------------------------------------------------------*/
+// (9) Fetch the near positive tap; rgby1 = average of the near pair
+half4 rgby1;
+rgby1.xy = dir1_pos.zw + dir1_pos.xy * fxaaConsoleRcpFrameOpt.zw;
+rgby1 = tex2D(tex, rgby1.xy); // h4tex2Dlod(tex, half4(rgby1.xy, 0.0, 0.0));
+rgby1 = (temp1N + rgby1) * 0.5;
+/*--------------------------------------------------------------------------*/
+// (10) Centre texel; extend the corner luma range with its luma
+half4 rgbyM = tex2D(tex, pos.xy); // h4tex2Dlod(tex, half4(pos.xy, 0.0, 0.0));
+#if (FXAA_GREEN_AS_LUMA == 0)
+half lumaMaxM = max(lumaMax, rgbyM.w);
+half lumaMinM = min(lumaMin, rgbyM.w);
+#else
+half lumaMaxM = max(lumaMax, rgbyM.y);
+half lumaMinM = min(lumaMin, rgbyM.y);
+#endif
+/*--------------------------------------------------------------------------*/
+// (11) Fetch the far negative tap; texcoord of the far positive tap; lumaRangeM = (lumaMaxM - lumaMinM) / FXAA_CONSOLE__PS3_EDGE_THRESHOLD
+half4 temp2N;
+temp2N.xy = dir2_pos.zw - dir2_pos.xy * fxaaConsoleRcpFrameOpt2.zw;
+temp2N = tex2D(tex, temp2N.xy); // h4tex2Dlod(tex, half4(temp2N.xy, 0.0, 0.0));
+half4 rgby2;
+rgby2.xy = dir2_pos.zw + dir2_pos.xy * fxaaConsoleRcpFrameOpt2.zw;
+half lumaRangeM = (lumaMaxM - lumaMinM) / FXAA_CONSOLE__PS3_EDGE_THRESHOLD;
+/*--------------------------------------------------------------------------*/
+// (12) Fetch the far positive tap; rgby2 = average of the far pair
+rgby2 = tex2D(tex, rgby2.xy); // h4tex2Dlod(tex, half4(rgby2.xy, 0.0, 0.0));
+rgby2 = (temp2N + rgby2) * 0.5;
+/*--------------------------------------------------------------------------*/
+// (13) rgby2 = average of the far and near averages
+rgby2 = (rgby2 + rgby1) * 0.5;
+/*--------------------------------------------------------------------------*/
+// (14) twoTap: blended luma outside [lumaMin, lumaMax]; earlyExit: lumaRangeM < lumaMax
+#if (FXAA_GREEN_AS_LUMA == 0)
+bool twoTapLt = rgby2.w < lumaMin;
+bool twoTapGt = rgby2.w > lumaMax;
+#else
+bool twoTapLt = rgby2.y < lumaMin;
+bool twoTapGt = rgby2.y > lumaMax;
+#endif
+bool earlyExit = lumaRangeM < lumaMax;
+bool twoTap = twoTapLt || twoTapGt;
+/*--------------------------------------------------------------------------*/
+// (15) earlyExit is applied last, so it overrides the twoTap fallback
+if(twoTap) rgby2 = rgby1;
+if(earlyExit) rgby2 = rgbyM;
+/*--------------------------------------------------------------------------*/
+return rgby2; }
+/*==========================================================================*/
+#endif
\ No newline at end of file
diff --git a/code/nel/src/3d/shaders/fxaa_pp.cg b/code/nel/src/3d/shaders/fxaa_pp.cg
new file mode 100644
index 000000000..e4993ead8
--- /dev/null
+++ b/code/nel/src/3d/shaders/fxaa_pp.cg
@@ -0,0 +1,70 @@
+
+#define FXAA_PS3 1
+#define FXAA_HLSL_3 1
+#define FXAA_QUALITY__PRESET 12
+#define FXAA_EARLY_EXIT 0
+
+#define h4tex2Dlod tex2Dlod
+#define half4 float4
+#define half3 float3
+#define half2 float2
+#define half float
+// fxaa3_11.h picks its FxaaPixelShader() variant from the FXAA_* defines above (#if on FXAA_PS3 / FXAA_EARLY_EXIT).
+#include "fxaa3_11.h"
+// Pixel program entry point: forwards its inputs to FxaaPixelShader() from fxaa3_11.h and writes the result to oCol.
+void fxaa_pp(
+	// Per fragment parameters (texture coordinate sets 0 and 1, see the note below for their expected contents)
+	float2 pos : TEXCOORD0,
+	float4 fxaaConsolePosPos : TEXCOORD1,
+
+	// Fragment program constants (layouts of the two float4 constants are described at the end of this file)
+	uniform float4 fxaaConsoleRcpFrameOpt,
+	uniform float4 fxaaConsoleRcpFrameOpt2,
+	uniform sampler2D nlTex0 : TEX0,
+
+	// Output color
+	out float4 oCol : COLOR
+)
+{
+	oCol = FxaaPixelShader(
+		pos,
+		fxaaConsolePosPos,
+		nlTex0,
+		fxaaConsoleRcpFrameOpt,
+		fxaaConsoleRcpFrameOpt2
+	);
+}
+
+/*
+Have FXAA vertex shader run as a full screen triangle,
+and output "pos" and "fxaaConsolePosPos"
+such that inputs in the pixel shader provide,
+
+// {xy} = center of pixel
+FxaaFloat2 pos,
+
+// {xy__} = upper left of pixel
+// {__zw} = lower right of pixel
+FxaaFloat4 fxaaConsolePosPos,
+*/
+
+// fxaaConsoleRcpFrameOpt:
+// Only used on FXAA Console.
+// This must be from a constant/uniform.
+// This affects sub-pixel AA quality and inversely sharpness.
+// Where N ranges between,
+// N = 0.50 (default)
+// N = 0.33 (sharper)
+// {x___} = -N/screenWidthInPixels
+// {_y__} = -N/screenHeightInPixels
+// {__z_} = N/screenWidthInPixels
+// {___w} = N/screenHeightInPixels
+
+// fxaaConsoleRcpFrameOpt2:
+// Only used on FXAA Console.
+// Not used on 360, but used on PS3 and PC.
+// This must be from a constant/uniform.
+// {x___} = -2.0/screenWidthInPixels +// {_y__} = -2.0/screenHeightInPixels +// {__z_} = 2.0/screenWidthInPixels +// {___w} = 2.0/screenHeightInPixels diff --git a/code/nel/src/3d/shaders/fxaa_pp_arbfp1.txt b/code/nel/src/3d/shaders/fxaa_pp_arbfp1.txt new file mode 100644 index 000000000..73ecb767c --- /dev/null +++ b/code/nel/src/3d/shaders/fxaa_pp_arbfp1.txt @@ -0,0 +1,76 @@ +!!ARBfp1.0 +OPTION ARB_precision_hint_fastest; +# cgc version 3.1.0013, build date Apr 18 2012 +# command line args: -profile arbfp1 -O3 -fastmath -fastprecision +# source file: fxaa_pp.cg +#vendor NVIDIA Corporation +#version 3.1.0.13 +#profile arbfp1 +#program fxaa_pp +#semantic fxaa_pp.fxaaConsoleRcpFrameOpt +#semantic fxaa_pp.fxaaConsoleRcpFrameOpt2 +#semantic fxaa_pp.nlTex0 : TEX0 +#var float2 pos : $vin.TEXCOORD0 : TEX0 : 0 : 1 +#var float4 fxaaConsolePosPos : $vin.TEXCOORD1 : TEX1 : 1 : 1 +#var float4 fxaaConsoleRcpFrameOpt : : c[0] : 2 : 1 +#var float4 fxaaConsoleRcpFrameOpt2 : : c[1] : 3 : 1 +#var sampler2D nlTex0 : TEX0 : texunit 0 : 4 : 1 +#var float4 oCol : $vout.COLOR : COL : 5 : 1 +#const c[2] = 0.125 0 -2 2 +#const c[3] = 0.001953125 0.5 +PARAM c[4] = { program.local[0..1], + { 0.125, 0, -2, 2 }, + { 0.001953125, 0.5 } }; +TEMP R0; +TEMP R1; +TEMP R2; +TEMP R3; +TEMP R4; +TEMP R5; +TEX R1.w, fragment.texcoord[1].zyzw, texture[0], 2D; +ADD R0.x, R1.w, c[3]; +TEX R0.w, fragment.texcoord[1].xwzw, texture[0], 2D; +TEX R1.w, fragment.texcoord[1], texture[0], 2D; +ADD R0.y, -R0.x, R0.w; +ADD R0.z, R1.w, R0.y; +TEX R2.w, fragment.texcoord[1].zwzw, texture[0], 2D; +ADD R0.y, -R1.w, R0; +ADD R1.x, R2.w, R0.y; +ADD R1.y, R0.z, -R2.w; +MUL R2.xy, R1, R1; +ADD R0.y, R2.x, R2; +RSQ R0.y, R0.y; +MUL R2.xy, R0.y, R1; +MAD R3.xy, R2, c[0].zwzw, fragment.texcoord[0]; +ABS R0.z, R2.y; +ABS R0.y, R2.x; +MIN R0.y, R0, R0.z; +RCP R0.y, R0.y; +MUL R1.xy, R0.y, R2; +MUL R1.xy, R1, c[2].x; +MIN R1.xy, R1, c[2].w; +TEX R4, R3, texture[0], 2D; +MAD R2.xy, -R2, c[0].zwzw, fragment.texcoord[0]; +TEX R3, 
R2, texture[0], 2D; +ADD R3, R3, R4; +MAX R1.xy, R1, c[2].z; +MAD R2.xy, R1, c[1].zwzw, fragment.texcoord[0]; +MUL R5, R3, c[3].y; +MAD R1.xy, -R1, c[1].zwzw, fragment.texcoord[0]; +MIN R0.z, R0.x, R2.w; +MIN R0.y, R0.w, R1.w; +MIN R0.y, R0, R0.z; +MAX R0.z, R0.x, R2.w; +MAX R0.x, R0.w, R1.w; +MAX R0.x, R0, R0.z; +TEX R4, R2, texture[0], 2D; +TEX R3, R1, texture[0], 2D; +ADD R3, R3, R4; +MAD R3, R3, c[3].y, R5; +MUL R3, R3, c[3].y; +SLT R0.z, R0.x, R3.w; +SLT R0.x, R3.w, R0.y; +ADD_SAT R0.x, R0, R0.z; +CMP result.color, -R0.x, R5, R3; +END +# 45 instructions, 6 R-regs diff --git a/code/nel/src/3d/shaders/fxaa_pp_ps_2_0.txt b/code/nel/src/3d/shaders/fxaa_pp_ps_2_0.txt new file mode 100644 index 000000000..fcd16fcd0 --- /dev/null +++ b/code/nel/src/3d/shaders/fxaa_pp_ps_2_0.txt @@ -0,0 +1,92 @@ +ps_2_0 +// cgc version 3.1.0013, build date Apr 18 2012 +// command line args: -profile ps_2_0 -O3 -fastmath -fastprecision +// source file: fxaa_pp.cg +//vendor NVIDIA Corporation +//version 3.1.0.13 +//profile ps_2_0 +//program fxaa_pp +//semantic fxaa_pp.fxaaConsoleRcpFrameOpt +//semantic fxaa_pp.fxaaConsoleRcpFrameOpt2 +//semantic fxaa_pp.nlTex0 : TEX0 +//var float2 pos : $vin.TEXCOORD0 : TEX0 : 0 : 1 +//var float4 fxaaConsolePosPos : $vin.TEXCOORD1 : TEX1 : 1 : 1 +//var float4 fxaaConsoleRcpFrameOpt : : c[0] : 2 : 1 +//var float4 fxaaConsoleRcpFrameOpt2 : : c[1] : 3 : 1 +//var sampler2D nlTex0 : TEX0 : texunit 0 : 4 : 1 +//var float4 oCol : $vout.COLOR : COL : 5 : 1 +//const c[2] = 0.001953125 0.125 2 -2 +//const c[3] = 0.5 0 1 +dcl_2d s0 +def c2, 0.00195313, 0.12500000, 2.00000000, -2.00000000 +def c3, 0.50000000, 0.00000000, 1.00000000, 0 +dcl t1 +dcl t0.xy +texld r5, t1, s0 +mov r1.y, t1.w +mov r1.x, t1.z +mov r2.xy, r1 +mov r0.y, t1.w +mov r0.x, t1 +mov r1.y, t1 +mov r1.x, t1.z +texld r1, r1, s0 +texld r0, r0, s0 +texld r6, r2, s0 +add r0.x, r1.w, c2 +add r2.x, -r0, r0.w +add r1.x, r5.w, r2 +add r2.z, r1.x, -r6.w +add r2.x, -r5.w, r2 +add r2.x, r6.w, r2 +mov r3.x, r2 
+mov r3.y, r2.z
+mov r2.y, r2.z
+mov r1.y, r2.z
+mov r1.x, r2
+mul r1.xy, r3, r1
+add r1.x, r1, r1.y
+rsq r1.x, r1.x
+mul r4.xy, r1.x, r2
+abs r2.x, r4.y
+abs r1.x, r4
+min r1.x, r1, r2
+rcp r1.x, r1.x
+mul r1.xy, r1.x, r4
+mul r1.xy, r1, c2.y
+min r1.xy, r1, c2.z
+max r2.xy, r1, c2.w
+mov r1.y, c1.w
+mov r1.x, c1.z
+mad r3.xy, r2, r1, t0
+mov r1.y, c1.w
+mov r1.x, c1.z
+mad r5.xy, -r2, r1, t0
+mov r1.y, c0.w
+mov r1.x, c0.z
+mad r2.xy, -r4, r1, t0
+mov r1.y, c0.w
+mov r1.x, c0.z
+mad r1.xy, r4, r1, t0
+texld r4, r5, s0
+texld r3, r3, s0
+texld r1, r1, s0
+texld r2, r2, s0
+add r1, r2, r1
+mul r2, r1, c3.x
+add r1, r4, r3
+max r3.x, r0, r6.w
+mad r1, r1, c3.x, r2
+mul r4, r1, c3.x
+max r1.x, r0.w, r5.w
+max r1.x, r1, r3
+add r1.x, -r4.w, r1
+min r3.x, r0.w, r5.w
+min r0.x, r0, r6.w
+min r0.x, r3, r0
+add r0.x, r4.w, -r0
+cmp r1.x, r1, c3.y, c3.z
+cmp r0.x, r0, c3.y, c3.z
+add_pp_sat r0.x, r0, r1
+cmp r0, -r0.x, r4, r2
+mov oC0, r0
diff --git a/code/nel/src/3d/shaders/fxaa_vp.cg b/code/nel/src/3d/shaders/fxaa_vp.cg
new file mode 100644
index 000000000..13c9c9bcb
--- /dev/null
+++ b/code/nel/src/3d/shaders/fxaa_vp.cg
@@ -0,0 +1,20 @@
+// FXAA vertex program: transforms the quad position and prepares the two texcoord sets read by fxaa_pp.
+void fxaa_vp(
+	// Per vertex parameters
+	float3 position : POSITION,
+	float2 texCoord0 : TEXCOORD0,
+
+	// Vertex program constants
+	uniform float4x4 modelViewProjection,
+	uniform float4 fxaaConsolePosPos, // per-component offsets added to texCoord0.xyxy below
+
+	// Output position
+	out float4 oPosition : POSITION,
+	out float2 oTexCoord0 : TEXCOORD0, // unmodified copy of texCoord0 (fxaa_pp input 'pos')
+	out float4 oTexCoord1 : TEXCOORD1 // offset coordinates (fxaa_pp input 'fxaaConsolePosPos')
+)
+{
+	oPosition = mul(modelViewProjection, float4(position, 1.0)); // w = 1: a point, not a direction; w = 0 drops the matrix translation and gives clip w = 0 under an affine projection
+	oTexCoord0 = texCoord0;
+	oTexCoord1 = texCoord0.xyxy + fxaaConsolePosPos;
+}
diff --git a/code/nel/src/3d/shaders/fxaa_vp_arbvp1.txt b/code/nel/src/3d/shaders/fxaa_vp_arbvp1.txt
new file mode 100644
index 000000000..6c8530ae0
--- /dev/null
+++ b/code/nel/src/3d/shaders/fxaa_vp_arbvp1.txt
@@ -0,0 +1,31 @@
+!!ARBvp1.0
+# cgc version 3.1.0013, build date Apr 18 2012
+# command line args: -profile arbvp1 -fastmath -fastprecision
+# source file: fxaa_vp.cg
+#vendor NVIDIA Corporation
+#version 3.1.0.13
+#profile arbvp1
+#program fxaa_vp
+#semantic fxaa_vp.modelViewProjection
+#semantic fxaa_vp.fxaaConsolePosPos
+#var float3 position : $vin.POSITION : POSITION : 0 : 1
+#var float2 texCoord0 : $vin.TEXCOORD0 : TEXCOORD0 : 1 : 1
+#var float4x4 modelViewProjection : : c[1], 4 : 2 : 1
+#var float4 fxaaConsolePosPos : : c[5] : 3 : 1
+#var float4 oPosition : $vout.POSITION : HPOS : 4 : 1
+#var float2 oTexCoord0 : $vout.TEXCOORD0 : TEX0 : 5 : 1
+#var float4 oTexCoord1 : $vout.TEXCOORD1 : TEX1 : 6 : 1
+#const c[0] = 1
+PARAM c[6] = { { 1 },
+		program.local[1..5] };
+TEMP R0;
+MOV R0.w, c[0].x;
+MOV R0.xyz, vertex.position;
+DP4 result.position.w, R0, c[4];
+DP4 result.position.z, R0, c[3];
+DP4 result.position.y, R0, c[2];
+DP4 result.position.x, R0, c[1];
+ADD result.texcoord[1], vertex.texcoord[0].xyxy, c[5];
+MOV result.texcoord[0].xy, vertex.texcoord[0];
+END
+# 8 instructions, 1 R-regs
diff --git a/code/nel/src/3d/shaders/readme.txt b/code/nel/src/3d/shaders/readme.txt
new file mode 100644
index 000000000..fe657f6e5
--- /dev/null
+++ b/code/nel/src/3d/shaders/readme.txt
@@ -0,0 +1,4 @@
+Compiled shaders are embedded in the source.
+Must compile and re-embed manually.
+
+FXAA is in public domain.
\ No newline at end of file diff --git a/code/ryzom/client/src/main_loop.cpp b/code/ryzom/client/src/main_loop.cpp index 4dfb2870a..ad1cb403e 100644 --- a/code/ryzom/client/src/main_loop.cpp +++ b/code/ryzom/client/src/main_loop.cpp @@ -1730,12 +1730,12 @@ bool mainLoop() { if (effectRender) { - if (ClientCfg.Bloom) - { - if (StereoDisplay) Driver->setViewport(NL3D::CViewport()); - CBloomEffect::instance().applyBloom(); - if (StereoDisplay) Driver->setViewport(StereoDisplay->getCurrentViewport()); - } + if (StereoDisplay) Driver->setViewport(NL3D::CViewport()); + UCamera pCam = Scene->getCam(); + Driver->setMatrixMode2D11(); + if (ClientCfg.Bloom) CBloomEffect::instance().applyBloom(); + Driver->setMatrixMode3D(pCam); + if (StereoDisplay) Driver->setViewport(StereoDisplay->getCurrentViewport()); effectRender = false; } diff --git a/code/snowballs2/client/src/snowballs_client.cpp b/code/snowballs2/client/src/snowballs_client.cpp index 2d53bf65e..68cdc47fa 100644 --- a/code/snowballs2/client/src/snowballs_client.cpp +++ b/code/snowballs2/client/src/snowballs_client.cpp @@ -801,8 +801,11 @@ void loopIngame() if (effectRender) { if (StereoDisplay) Driver->setViewport(NL3D::CViewport()); - if (s_EnableBloom) CBloomEffect::instance().applyBloom(); + UCamera pCam = Scene->getCam(); + Driver->setMatrixMode2D11(); if (s_FXAA) s_FXAA->applyEffect(); + if (s_EnableBloom) CBloomEffect::instance().applyBloom(); + Driver->setMatrixMode3D(pCam); if (StereoDisplay) Driver->setViewport(StereoDisplay->getCurrentViewport()); effectRender = false; }