// NeL - MMORPG Framework <http://dev.ryzom.com/projects/nel/>
// Copyright (C) 2010  Winch Gate Property Limited
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program.  If not, see <http://www.gnu.org/licenses/>.

#include "std3d.h"

#include "nel/3d/texture_far.h"
#include "nel/3d/tile_far_bank.h"
#include "nel/3d/patch.h"
#include "nel/3d/tile_color.h"
#include "nel/3d/zone.h"
#include "nel/3d/landscape.h"
#include "nel/misc/system_info.h"


using namespace NLMISC;
using namespace NL3D;
using namespace std;

namespace NL3D {

// Static scratch buffers of CTextureFar, sized for the largest patch
// (NL_MAX_TILES_BY_PATCH_EDGE tiles per edge). rebuildPatch() overwrites them for each patch.

// Expanded lightmap of the patch being rebuilt (destination of NL3D_expandLightmap in rebuildPatch()).
CRGBA CTextureFar::_LightmapExpanded[NL_NUM_PIXELS_ON_FAR_TILE_EDGE*NL_MAX_TILES_BY_PATCH_EDGE*NL_NUM_PIXELS_ON_FAR_TILE_EDGE*NL_MAX_TILES_BY_PATCH_EDGE];
// Lumels of the patch being rebuilt (filled by CPatch::unpackShadowMap in rebuildPatch()).
uint8 CTextureFar::_LumelExpanded[(NL_MAX_TILES_BY_PATCH_EDGE*NL_LUMEL_BY_TILE+1)*(NL_MAX_TILES_BY_PATCH_EDGE*NL_LUMEL_BY_TILE+1)];
// TLI colors of the patch being rebuilt (filled by CPatch::computeCurrentTLILightmapDiv2 in rebuildPatch()).
CRGBA CTextureFar::_TileTLIColors[(NL_MAX_TILES_BY_PATCH_EDGE+1)*(NL_MAX_TILES_BY_PATCH_EDGE+1)];

// ***************************************************************************
CTextureFar::CTextureFar()
{
	// The far patch edge / rectangle ratio defines are only valid when a patch
	// has at most 16 tiles per edge: fail the build if that ever changes.
	nlctassert(NL_MAX_TILES_BY_PATCH_EDGE==16);

	// Texture setup: releasable, uploaded as RGB565, bilinear without mipmaps, clamped on both axes.
	setReleasable (true);
	setUploadFormat(RGB565);
	setFilterMode (Linear, LinearMipMapOff);
	setWrapS (Clamp);
	setWrapT (Clamp);

	// Update-lighting list: a texture alone in its list points to itself both ways.
	_ULPrec= this;
	_ULNext= this;

	// At start the whole texture is free: register every biggest (square) sub-rectangle
	// in the free list of that size, column after column.
	const uint	biggestListId= getFreeListId(NL_MAX_FAR_PATCH_EDGE, NL_MAX_FAR_PATCH_EDGE);
	for(uint col=0; col<NL_NUM_FAR_BIGGEST_PATCH_PER_EDGE; ++col)
	{
		for(uint row=0; row<NL_NUM_FAR_BIGGEST_PATCH_PER_EDGE; ++row)
		{
			CVector2s	corner;
			corner.x= col*NL_MAX_FAR_PATCH_EDGE;
			corner.y= row*NL_MAX_FAR_PATCH_EDGE;
			_FreeSpaces[biggestListId].push_back(corner);
		}
	}

	// No update-lighting pass in progress.
	_ItULPatch= _PatchToPosMap.end();
}

// ***************************************************************************
CTextureFar::~CTextureFar()
{
	// Verify the textureFar is correctly unlinked from any circular list:
	// an unlinked texture points to itself in both directions (see unlinkUL()).
	nlassert(_ULPrec==this && _ULNext==this);
}


// ***************************************************************************
// Insert this texture just before textNext in textNext's circular update-lighting list.
// The texture is first removed from the list it currently belongs to.
// NB: each statement re-reads _ULPrec/_ULNext after the previous writes; do not reorder them.
void CTextureFar::linkBeforeUL(CTextureFar *textNext)
{
	// textNext must not be NULL.
	nlassert(textNext);

	// first, unlink others from me. NB: works even if _ULPrec==_ULNext==this.
	_ULNext->_ULPrec= _ULPrec;
	_ULPrec->_ULNext= _ULNext;
	// link to textNext: my next is textNext, my previous is its (current) previous.
	_ULNext= textNext;
	_ULPrec= textNext->_ULPrec;
	// link others to me.
	_ULNext->_ULPrec= this;
	_ULPrec->_ULNext= this;
}

// ***************************************************************************
void CTextureFar::unlinkUL()
{
	// first, unlink others from me. NB: works even if _ULPrec==_ULNext==this.
	_ULNext->_ULPrec= _ULPrec;
	_ULPrec->_ULNext= _ULNext;
	// reset
	_ULPrec= this;
	_ULNext= this;
}


// ***************************************************************************
uint	CTextureFar::getFreeListId(uint width, uint height)
{
	// Preconditions: power-of-2 sizes, width is the biggest edge and fits in the max far patch edge.
	nlassert(width>=height);
	nlassert(isPowerOf2(width));
	nlassert(isPowerOf2(height));
	nlassert(width<=NL_MAX_FAR_PATCH_EDGE);

	// Level of the width: 0 for the biggest edge, +1 each time the width is halved.
	const uint	sizeIndex= getPowerOf2(NL_MAX_FAR_PATCH_EDGE / width);
	nlassert(sizeIndex < NL_NUM_FAR_PATCH_EDGE_LEVEL);

	// Aspect ratio: 0 for a square, +1 each time the height is halved relative to the width.
	const uint	aspectRatioIndex= getPowerOf2(width/height);
	nlassert(aspectRatioIndex < NL_NUM_FAR_RECTANGLE_RATIO );

	// Free lists are grouped by size level, then by aspect ratio.
	const uint	levelBase= sizeIndex*NL_NUM_FAR_RECTANGLE_RATIO;
	return levelBase + aspectRatioIndex;
}


// ***************************************************************************
bool	CTextureFar::getUpperSize(uint &width, uint &height)
{
	nlassert(width>=height);
	nlassert(isPowerOf2(width));
	nlassert(isPowerOf2(height));

	// Rectangle: the parent keeps the same width with twice the height.
	if(width>height)
	{
		height<<= 1;
		return true;
	}

	// Square already at the biggest size: there is no parent, sizes are left untouched.
	if(!(width<NL_MAX_FAR_PATCH_EDGE))
		return false;

	// Square: the parent is the next bigger square.
	width<<= 1;
	height<<= 1;
	return true;
}


// ***************************************************************************
sint	CTextureFar::tryAllocatePatch (CPatch *pPatch, uint farIndex)
{
	// Size of the sub-texture needed by this patch at this far level.
	const uint	shift= farIndex-1;
	uint width=(pPatch->getOrderS ()*NL_NUM_PIXELS_ON_FAR_TILE_EDGE)>>shift;
	uint height=(pPatch->getOrderT ()*NL_NUM_PIXELS_ON_FAR_TILE_EDGE)>>shift;

	// Free lists are indexed with width as the biggest edge.
	if(height>width)
		std::swap(width, height);

	// A rectangle of the exact size is free: nothing needs to be split.
	if(!_FreeSpaces[getFreeListId(width, height)].empty())
		return 0;

	// Otherwise walk up the parent sizes until one of them has a free rectangle.
	while( getUpperSize(width, height) )
	{
		// Found: return the area of the rectangle that would have to be split.
		if(!_FreeSpaces[getFreeListId(width, height)].empty())
			return width*height;
	}

	// No room at all in this texture.
	return -1;
}

// ***************************************************************************
void	CTextureFar::recursSplitNext(uint wson, uint hson)
{
	// Size of the parent rectangle; one must exist.
	uint	wup= wson, hup= hson;
	nlverify( getUpperSize(wup, hup) );

	uint	fatherListId= getFreeListId(wup, hup);

	// No free parent: first split a bigger rectangle to create one.
	if(_FreeSpaces[fatherListId].empty())
		recursSplitNext(wup, hup);

	// A free parent must be available now.
	nlassert( !_FreeSpaces[fatherListId].empty() );

	// Take the first free parent out of its list: it is replaced by its sons.
	const CVector2s	origin= _FreeSpaces[fatherListId].front();
	_FreeSpaces[fatherListId].pop_front();

	// A rectangular son (wson>hson) splits the parent in 2 rows of 1 son,
	// a square son splits it in 2 rows of 2 sons. Sons are appended row by row.
	const uint	targetListId= getFreeListId(wson, hson);
	const uint	columns= (wson>hson) ? 1 : 2;
	for(uint row=0; row<2; ++row)
	{
		for(uint col=0; col<columns; ++col)
		{
			CVector2s	cell;
			cell.x= origin.x+col*wson;
			cell.y= origin.y+row*hson;
			_FreeSpaces[targetListId].push_back(cell);
		}
	}
}


// ***************************************************************************
void	CTextureFar::allocatePatch (CPatch *pPatch, uint farIndex, float& farUScale, float& farVScale, float& farUBias, float& farVBias, bool& bRot)
{
	// Size of the sub-texture needed by this patch at this far level, width being the biggest edge.
	const uint	shift= farIndex-1;
	uint width=(pPatch->getOrderS ()*NL_NUM_PIXELS_ON_FAR_TILE_EDGE)>>shift;
	uint height=(pPatch->getOrderT ()*NL_NUM_PIXELS_ON_FAR_TILE_EDGE)>>shift;
	if(height>width)
		std::swap(width, height);

	// Free list for that size; refill it by splitting a bigger rectangle when it is empty.
	uint	freeListId= getFreeListId(width, height);
	if(_FreeSpaces[freeListId].empty())
		recursSplitNext(width, height);

	// Take (a copy of) the first free position.
	nlassert( !_FreeSpaces[freeListId].empty() );
	CVector2s	pos= _FreeSpaces[freeListId].front();

	// Register the (patch, far index) <-> position association; it must not exist yet.
	CPatchIdent	pid;
	pid.Patch= pPatch;
	pid.FarIndex= farIndex;
	nlassert( _PatchToPosMap.find(pid) == _PatchToPosMap.end() );
	_PatchToPosMap[pid]= pos;
	_PosToPatchMap[pos]= pid;

	// The position is no longer free.
	_FreeSpaces[freeListId].pop_front();

	// Mark the rectangle as invalid.
	CRect	dirtyRect (pos.x, pos.y, width, height);
	ITexture::touchRect (dirtyRect);

	// ** Output values

	// The patch is flagged as rotated when it has fewer tiles along S than along T.
	bRot = ( pPatch->getOrderS() < pPatch->getOrderT() );

	// U: scale over (width-1) pixels, bias at the center of the first pixel.
	farUScale=(float)(width-1)/(float)NL_FAR_TEXTURE_EDGE_SIZE;
	farUBias=((float)pos.x+0.5f)/(float)NL_FAR_TEXTURE_EDGE_SIZE;

	// V: scale over (height-1) pixels, bias at the center of the first pixel.
	farVScale=(float)(height-1)/(float)NL_FAR_TEXTURE_EDGE_SIZE;
	farVBias=((float)pos.y+0.5f)/(float)NL_FAR_TEXTURE_EDGE_SIZE;
}


// ***************************************************************************
// Remove a patch (at a given far index) from this far texture and free its rectangle.
void	CTextureFar::removePatch (CPatch *pPatch, uint farIndex)
{
	// The (patch, far index) pair must have been allocated in this texture.
	CPatchIdent	pid;
	pid.Patch= pPatch;
	pid.FarIndex= farIndex;
	TPatchToPosMap::iterator	it= _PatchToPosMap.find(pid);
	nlassert( it != _PatchToPosMap.end() );

	// Position of the rectangle used by this patch.
	const CVector2s	storedPos= it->second;

	// If the update-lighting cursor is on this entry, move it forward before the entry is erased.
	if( it == _ItULPatch )
		++_ItULPatch;

	// Forget the association in both maps.
	_PatchToPosMap.erase(it);
	_PosToPatchMap.erase(storedPos);

	// Give the rectangle back to the free list of its size (width being the biggest edge).
	const uint	shift= farIndex-1;
	uint width=(pPatch->getOrderS ()*NL_NUM_PIXELS_ON_FAR_TILE_EDGE)>>shift;
	uint height=(pPatch->getOrderT ()*NL_NUM_PIXELS_ON_FAR_TILE_EDGE)>>shift;
	if(height>width)
		std::swap(width, height);
	_FreeSpaces[getFreeListId(width, height)].push_back(storedPos);
}


// ***************************************************************************
uint	CTextureFar::touchPatchULAndNext()
{
	// if there is still a patch here
	if( _ItULPatch!=_PatchToPosMap.end() )
	{
		// Position of the invalide rectangle
		int x = _ItULPatch->second.x;
		int y = _ItULPatch->second.y;
		uint	farIndex= _ItULPatch->first.FarIndex;
		CPatch	*pPatch= _ItULPatch->first.Patch;

		// recompute the correct size.
		uint width=(pPatch->getOrderS ()*NL_NUM_PIXELS_ON_FAR_TILE_EDGE)>>(farIndex-1);
		uint height=(pPatch->getOrderT ()*NL_NUM_PIXELS_ON_FAR_TILE_EDGE)>>(farIndex-1);
		if(width<height)
			std::swap(width, height);

		// Invalidate the associated rectangle
		CRect rect (x, y, width, height);
		ITexture::touchRect (rect);

		// Go next.
		_ItULPatch++;

		// return number of pixels touched
		return width * height;
	}
	else
	{
		// no touch
		return 0;
	}
}


// ***************************************************************************
// Start an update-lighting pass: put the cursor used by touchPatchULAndNext()
// on the first entry of the patch -> position map.
void	CTextureFar::startPatchULTouch()
{
	_ItULPatch= _PatchToPosMap.begin();
}


// ***************************************************************************
// Return true when the update-lighting cursor has reached the end of the
// patch -> position map, i.e. touchPatchULAndNext() has nothing left to touch.
bool	CTextureFar::endPatchULTouch() const
{
	return _ItULPatch == _PatchToPosMap.end();
}



// ***************************************************************************
// Generate the texture. See ITexture::doGenerate().
void CTextureFar::doGenerate (bool async)
{
	// Resize. But don't need to fill with 0!!
	CBitmap::resize (NL_FAR_TEXTURE_EDGE_SIZE, NL_FAR_TEXTURE_EDGE_SIZE, RGBA, false);

	// Rectangle invalidate ?
	if (_ListInvalidRect.begin()!=_ListInvalidRect.end())
	{
		// Yes, rebuild only those rectangles.

		// For each rectangle to compute
		std::list<NLMISC::CRect>::iterator ite=_ListInvalidRect.begin();
		while (ite!=_ListInvalidRect.end())
		{
			// Get the PatchIdent.
			CVector2s	pos((uint16)ite->left(), (uint16)ite->top());
			TPosToPatchMap::iterator	itPosToPid= _PosToPatchMap.find( pos );
			// If the patch is still here...
			if( itPosToPid!=_PosToPatchMap.end() )
			{
				// ReBuild the rectangle.
				rebuildPatch (pos, itPosToPid->second);
			}

			// Next rectangle
			ite++;
		}
	}
	else
	{
		// Parse all existing Patchs.
		TPosToPatchMap::iterator	itPosToPid= _PosToPatchMap.begin();
		while( itPosToPid!= _PosToPatchMap.end() )
		{
			// ReBuild the rectangle.
			rebuildPatch (itPosToPid->first, itPosToPid->second);

			itPosToPid++;
		}
	}
}


// ***************************************************************************
// Rebuild the pixels of one patch inside the far texture.
//	texturePos: position (in pixels) of the patch rectangle in the texture.
//	pid: the patch to draw and its far index (must be 1, 2 or 3).
// The patch lightmap is first expanded into _LightmapExpanded, then every layer of
// every tile is drawn with one of the NL3D_drawFarTileInFarTexture* functions.
void CTextureFar::rebuildPatch (const CVector2s texturePos, const CPatchIdent &pid)
{
	uint x= texturePos.x;
	uint y= texturePos.y;

	// Patch pointer
	CPatch* patch= pid.Patch;

	// Check it exists
	nlassert (patch);

	// get the order
	uint nS=patch->getOrderS();
	uint nT=patch->getOrderT();

	// get the size (along S) of the texture to compute
	uint subTextWidth=(nS*NL_NUM_PIXELS_ON_FAR_TILE_EDGE)>>(pid.FarIndex-1);

	// Check the bitmap is in RGBA format (pixels are written below as CRGBA)
	nlassert (getPixelFormat()==RGBA);

	// Check pixels exist
	nlassert (getPixels().size()!=0);

	// Base offset of the first pixel of the patch's texture
	uint	nBaseOffset;

	// Delta to add to the destination offset when walk for a pixel to the right in the source tile
	sint dstDeltaX;

	// Delta to add to the destination offset when walk for a pixel to the bottom in the source tile
	sint dstDeltaY;

	// larger than higher  (regular)
	if (nS>=nT)
	{
		// Regular offset, top left
		nBaseOffset= x + y*_Width;

		// Regular deltaX, to the right
		dstDeltaX=1;

		// Regular deltaY, to the bottom
		dstDeltaY=_Width;
	}
	// higher than larger (goofy), the patch is stored with a rotation of 1 (to the left of course)
	else
	{
		// Goofy offset, bottom left
		nBaseOffset= x + y*_Width;
		nBaseOffset+=(subTextWidth-1)*_Width;

		// Goofy deltaX, to the top
		dstDeltaX=-(sint)_Width;

		// Goofy deltaY, to the right
		dstDeltaY=1;
	}

	// Compute the order of the patch: far tile order and far tile edge size for this far index
	CTileFarBank::TFarOrder orderX=CTileFarBank::order0;
	uint tileSize=0;
	switch (pid.FarIndex)
	{
	case 3:
		// Ratio 1:4
		orderX=CTileFarBank::order2;
		tileSize=NL_NUM_PIXELS_ON_FAR_TILE_EDGE>>2;
		break;
	case 2:
		// Ratio 1:2
		orderX=CTileFarBank::order1;
		tileSize=NL_NUM_PIXELS_ON_FAR_TILE_EDGE>>1;
		break;
	case 1:
		// Ratio 1:1
		orderX=CTileFarBank::order0;
		tileSize=NL_NUM_PIXELS_ON_FAR_TILE_EDGE;
		break;
	default:
		// no!: must be one of the previous values
		nlassert (0);
	}

	// Must have a far tile bank pointer set in the CFarTexture
	nlassert (_Bank);

	// Index of the current tile in patch->Tiles (incremented for each tile, line by line)
	sint nTileInPatch=0;

	// ** Fill the struct for the tile fill method for each layers
	NL3D_CComputeTileFar TileFar;
	TileFar.SrcDiffusePixels = NULL;
	TileFar.SrcAdditivePixels = NULL;
	TileFar.SrcDeltaX = 0;
	TileFar.SrcDeltaY = 0;
	// MMX path only when the inline asm is compiled in and the CPU reports MMX
	TileFar.AsmMMX= false;
#if defined(NL_OS_WINDOWS) && !defined(NL_NO_ASM)
	TileFar.AsmMMX= NLMISC::CSystemInfo::hasMMX();
#endif

	// Destination pointer: set per tile in the loop below

	// Destination delta
	TileFar.DstDeltaX=dstDeltaX;
	TileFar.DstDeltaY=dstDeltaY;

	// ** Build expand lightmap..
	NL3D_CExpandLightmap lightMap;

	// Fill the structure
	lightMap.MulFactor=tileSize;
	lightMap.ColorTile=&patch->TileColors[0];
	lightMap.Width=nS+1;
	lightMap.Height=nT+1;
	lightMap.StaticLightColor=patch->getZone()->getLandscape()->getStaticLight();
	lightMap.DstPixels=_LightmapExpanded;
	// Compute current TLI colors.
	patch->computeCurrentTLILightmapDiv2(_TileTLIColors);
	lightMap.TLIColor= _TileTLIColors;

	// Expand the shadowmap
	patch->unpackShadowMap (_LumelExpanded);
	lightMap.LumelTile=_LumelExpanded;

	// Expand the patch lightmap now
	NL3D_expandLightmap (&lightMap);

	// DeltaY for lightmap: one expanded lightmap line is nS*tileSize pixels
	TileFar.SrcLightingDeltaY=nS*tileSize;

	// Base Dst pointer on the tile line
	uint nBaseDstTileLine=nBaseOffset;
	for (uint t=0; t<nT; t++)
	{
		// Base Dst pointer on the tile
		uint nBaseDstTilePixels=nBaseDstTileLine;

		// For each tile of the line
		for (uint s=0; s<nS; s++)
		{
			// Base pointer of the destination texture
			TileFar.DstPixels=(CRGBA*)&(getPixels()[0])+nBaseDstTilePixels;

			// Lightmap pointer: first lightmap pixel of tile (s, t)
			TileFar.SrcLightingPixels=_LightmapExpanded+(s*tileSize)+(t*nS*tileSize*tileSize);

			// For each layer of the tile
			for (sint l=0; l<3; l++)
			{
				// Use of additive in this layer ?
				bool bAdditive=false;

				// Size of the edge far tile
				TileFar.Size=tileSize;

				// Get a tile element reference for this tile.
				const CTileElement &tileElm=patch->Tiles[nTileInPatch];

				// Check for 256 tiles...
				bool	is256x256;
				uint8	uvOff;
				tileElm.getTile256Info(is256x256, uvOff);

				// Get the tile number
				sint tile=tileElm.Tile[l];

				// Is the last layer ? (l+1 is read only when l<2)
				bool lastLayer = ( (l == 2) || (tileElm.Tile[l+1] == NL_TILE_ELM_LAYER_EMPTY) );

				// Is an non-empty layer ?
				if (tile!=NL_TILE_ELM_LAYER_EMPTY)
				{
					// Get the read only pointer on the far tile
					const CTileFarBank::CTileFar*	pTile=_Bank->getTile (tile);

					// This pointer must not be null, else the farBank is not valid!
					if (pTile==NULL)
						nlwarning ("FarBank is not valid!");

					// If the tile exist (else the layer is simply not drawn)
					if (pTile)
					{
						// Tile exist ?
						if (pTile->isFill (CTileFarBank::diffuse))
						{
							// Get rotation of the tile in this layer
							sint nRot=tileElm.getTileOrient(l);

							// Source pointer
							const CRGBA*	pSrcDiffusePixels=pTile->getPixels (CTileFarBank::diffuse, orderX);
							const CRGBA*	pSrcAdditivePixels=NULL;

							// Additive ?
							if (pTile->isFill (CTileFarBank::additive))
							{
								// Use it
								bAdditive=true;

								// Get additive pointer
								pSrcAdditivePixels=pTile->getPixels (CTileFarBank::additive, orderX);
							}

							// Source size
							sint sourceSize;

							// Source offset (for 256)
							uint sourceOffset=0;

							// 256 ?
							if (is256x256)
							{
								// Bit 1 of uvOff set: skip tileSize pixels horizontally in the source
								if (uvOff&0x02)
									sourceOffset+=tileSize;

								// uvOff 1 or 2: skip tileSize source lines (a source line is 2*tileSize pixels)
								if ((uvOff==1)||(uvOff==2))
									sourceOffset+=2*tileSize*tileSize;

								// Yes, 256: the source line is twice the tile size
								sourceSize=tileSize<<1;
							}
							else
							{
								// No, 128
								sourceSize=tileSize;
							}

							// Compute offset and deltas
							// NB: nRot values other than 0..3 leave the previous source pointers/deltas unchanged.
							switch (nRot)
							{
							case 0:
								// Source pointers
								TileFar.SrcDiffusePixels=pSrcDiffusePixels+sourceOffset;
								TileFar.SrcAdditivePixels=pSrcAdditivePixels+sourceOffset;

								// Source delta
								TileFar.SrcDeltaX=1;
								TileFar.SrcDeltaY=sourceSize;
								break;
							case 1:
								{
									// Source pointers: start at the last pixel of the first line
									uint newOffset=sourceOffset+(tileSize-1);
									TileFar.SrcDiffusePixels=pSrcDiffusePixels+newOffset;
									TileFar.SrcAdditivePixels=pSrcAdditivePixels+newOffset;

									// Source delta
									TileFar.SrcDeltaX=sourceSize;
									TileFar.SrcDeltaY=-1;
								}
								break;
							case 2:
								{
									// Source pointers: start at the last pixel of the last line
									uint newOffset=sourceOffset+(tileSize-1)*sourceSize+tileSize-1;
									TileFar.SrcDiffusePixels=pSrcDiffusePixels+newOffset;
									TileFar.SrcAdditivePixels=pSrcAdditivePixels+newOffset;

									// Source delta
									TileFar.SrcDeltaX=-1;
									TileFar.SrcDeltaY=-sourceSize;
								}
								break;
							case 3:
								{
									// Source pointers: start at the first pixel of the last line
									uint newOffset=sourceOffset+(tileSize-1)*sourceSize;
									TileFar.SrcDiffusePixels=pSrcDiffusePixels+newOffset;
									TileFar.SrcAdditivePixels=pSrcAdditivePixels+newOffset;

									// Source delta
									TileFar.SrcDeltaX=-sourceSize;
									TileFar.SrcDeltaY=1;
								}
								break;
							}

							// *** Draw the layer

							// Alpha layer ? (every layer but the first one)
							if (l>0)
							{
								// Additive layer ? (additive is only used on the last layer)
								if (bAdditive && lastLayer)
									NL3D_drawFarTileInFarTextureAdditiveAlpha (&TileFar);
								else	// No additive layer
									NL3D_drawFarTileInFarTextureAlpha (&TileFar);
							}
							else	// no alpha
							{
								// Additive layer ? (additive is only used on the last layer)
								if (bAdditive && lastLayer)
									NL3D_drawFarTileInFarTextureAdditive (&TileFar);
								else	// No additive layer
									NL3D_drawFarTileInFarTexture (&TileFar);
							}
						}
					}
				}
				else
					// Stop, no more layer
					break;
			}

			// Next tile
			nTileInPatch++;

			// Next tile on the line
			nBaseDstTilePixels+=dstDeltaX*tileSize;
		}

		// Next line of tiles
		nBaseDstTileLine+=dstDeltaY*tileSize;
	}

}

} // NL3D


// ***************************************************************************
// ***************************************************************************
// NL3D_ExpandLightmap. C and Asm Part
// ***************************************************************************
// ***************************************************************************

#if defined(NL_OS_WINDOWS) && !defined(NL_NO_ASM)


// Disable warning 4799 (MMX function without EMMS): EMMS is not issued in each __asm block but separately, by NL3D_asmEndMMX().
#  pragma warning (disable : 4799)


// ***************************************************************************
/** End a sequence of MMX computations by executing the EMMS instruction.
 *	None of the other NL3D_asm* functions of this file executes EMMS itself.
 */
inline	void	NL3D_asmEndMMX()
{
	__asm
	{
		// close MMX computation
		emms
	}
}


// ***************************************************************************
/** Expand a line of 565 colors into a line of 8888 colors with MMX, with linear interpolation.
 *	NB: start to write at pixel 1.
 *	\param src source line of 16 bits 565 colors
 *	\param dst destination line; dst[0] is not written, len pixels are written from dst[1]
 *	\param du step in the source for each destination pixel, in 24.8 fixed point
 *	\param len number of pixels to write. Nothing is done if 0.
 *	For each pixel, the colors src[u>>8] and src[(u>>8)+1] are read and blended with factor (u&0xFF)/256.
 *	NB: EMMS is not executed here (see NL3D_asmEndMMX()).
 */
inline	void	NL3D_asmExpandLineColor565(const uint16 *src, CRGBA *dst, uint du, uint len)
{
	// Masks applied on 2 colors at once (one 565 color in the low word of each dword).
	static	uint64 blank = 0;
	// 5 bits of R, and their 3 highest bits (replicated in the low bits of the 8 bits value)
	static	uint64 cF800 = INT64_CONSTANT (0x0000F8000000F800);
	static	uint64 cE000 = INT64_CONSTANT (0x0000E0000000E000);
	// 6 bits of G, and their 2 highest bits
	static	uint64 c07E0 = INT64_CONSTANT (0x000007E0000007E0);
	static	uint64 c0600 = INT64_CONSTANT (0x0000060000000600);
	// 5 bits of B, and their 3 highest bits
	static	uint64 c001F = INT64_CONSTANT (0x0000001F0000001F);
	static	uint64 c001C = INT64_CONSTANT (0x0000001C0000001C);
	// The loop below is a do/while on len: it must not be entered with len==0
	if(len==0)
		return;


	// Loop for pix.
	__asm
	{
		// mm7= 0 for the whole loop
		movq	mm7, blank

		// start at pixel 1 => increment dst, and start u= du
		mov		esi, src
		mov		edi, dst
		add		edi, 4
		mov		ecx, len
		mov		edx, du

		// Loop
	myLoop:


		// Read 565 colors
		//----------
		// index u.
		mov		ebx, edx
		shr		ebx, 8

		// pack the 2 colors in eax: high word of eax= color0 (src[u]), low word of eax= color1 (src[u+1])
		xor		eax, eax			// avoid partial stall.
		mov		ax, [esi + ebx*2]
		shl		eax, 16
		mov		ax, [esi + ebx*2 +2]

		// store and unpack in mm2: Hmm2= color0, Lmm2= color1
		movd	mm2, eax
		punpcklwd	mm2, mm7

		// reset accumulator mm3 to black
		movq	mm3, mm7

		// Expand 565 to 888: color0 and color1 in parrallel
		// R: goes to bits 0-7 of each dword
		movq	mm0, mm2
		movq	mm1, mm2
		pand	mm0, cF800
		pand	mm1, cE000
		psrld	mm0, 8
		psrld	mm1, 13
		por		mm3, mm0
		por		mm3, mm1
		// G: goes to bits 8-15 of each dword
		movq	mm0, mm2
		movq	mm1, mm2
		pand	mm0, c07E0
		pand	mm1, c0600
		pslld	mm0, 5
		psrld	mm1, 1
		por		mm3, mm0
		por		mm3, mm1
		// B: goes to bits 16-23 of each dword (bits 24-31 stay 0)
		movq	mm0, mm2
		movq	mm1, mm2
		pand	mm0, c001F
		pand	mm1, c001C
		pslld	mm0, 19
		pslld	mm1, 14
		por		mm3, mm0
		por		mm3, mm1

		// unpack mm3 quad to mm0=color0 and mm1=color1.
		movq	mm0, mm3
		movq	mm1, mm3
		psrlq	mm0, 32


		// Blend.
		//----------
		// blend factors: ebx= factor= low 8 bits of u, eax= 256-factor
		mov		ebx, edx
		mov		eax, 256

		and		ebx, 0xFF
		sub		eax, ebx

		movd	mm2, ebx		// mm2= factor
		movd	mm3, eax		// mm3= 1-factor
		// replicate to the	4 words.
		punpckldq	mm2, mm2	// mm2= 0000 00AA 0000 00AA
		punpckldq	mm3, mm3	// mm3= same layout with (256-factor)
		packssdw	mm2, mm2	// mm2= 00AA 00AA 00AA 00AA
		packssdw	mm3, mm3	// mm3= same layout with (256-factor)

		// mul: expand the 4 bytes of each color to 4 words first
		punpcklbw	mm0, mm7
		punpcklbw	mm1, mm7
		pmullw		mm0, mm3	// color0*(1-factor)
		pmullw		mm1, mm2	// color1*factor
		// add, and unpack (divide by 256, pack the words back to bytes)
		paddusw		mm0, mm1
		psrlw       mm0, 8
		packuswb    mm0, mm0

		// store
		movd        [edi], mm0


		// next pix
		add	edx, du
		add	edi, 4
		dec ecx
		jnz myLoop
	}
}


// ***************************************************************************
/** Expand a line of 8888 colors with MMX, with linear interpolation.
 *	NB: start to write at pixel 1.
 *	\param src source line of 32 bits colors
 *	\param dst destination line; dst[0] is not written, len pixels are written from dst[1]
 *	\param du step in the source for each destination pixel, in 24.8 fixed point
 *	\param len number of pixels to write. Nothing is done if 0.
 *	For each pixel, the colors src[u>>8] and src[(u>>8)+1] are read and blended with factor (u&0xFF)/256.
 *	NB: EMMS is not executed here (see NL3D_asmEndMMX()).
 */
inline	void	NL3D_asmExpandLineColor8888(const CRGBA *src, CRGBA *dst, uint du, uint len)
{
	static	uint64 blank = 0;
	// The loop below is a do/while on len: it must not be entered with len==0
	if(len==0)
		return;


	// Loop for pix.
	__asm
	{
		// mm7= 0 for the whole loop
		movq	mm7, blank

		// start at pixel 1 => increment dst, and start u= du
		mov		esi, src
		mov		edi, dst
		add		edi, 4
		mov		ecx, len
		mov		edx, du

		// Loop
	myLoop:


		// Read 8888 colors
		//----------
		// index u.
		mov		ebx, edx
		shr		ebx, 8

		// read the 2 colors: mm0= color0, mm1= color1
		movd	mm0 , [esi + ebx*4]
		movd	mm1 , [esi + ebx*4 + 4]


		// Blend.
		//----------
		// blend factors: ebx= factor= low 8 bits of u, eax= 256-factor
		mov		ebx, edx
		mov		eax, 256

		and		ebx, 0xFF
		sub		eax, ebx

		movd	mm2, ebx		// mm2= factor
		movd	mm3, eax		// mm3= 1-factor
		// replicate to the	4 words.
		punpckldq	mm2, mm2	// mm2= 0000 00AA 0000 00AA
		punpckldq	mm3, mm3	// mm3= same layout with (256-factor)
		packssdw	mm2, mm2	// mm2= 00AA 00AA 00AA 00AA
		packssdw	mm3, mm3	// mm3= same layout with (256-factor)

		// mul: expand the 4 bytes of each color to 4 words first
		punpcklbw	mm0, mm7
		punpcklbw	mm1, mm7
		pmullw		mm0, mm3	// color0*(1-factor)
		pmullw		mm1, mm2	// color1*factor
		// add, and unpack (divide by 256, pack the words back to bytes)
		paddusw		mm0, mm1
		psrlw       mm0, 8
		packuswb    mm0, mm0

		// store
		movd        [edi], mm0


		// next pix
		add	edx, du
		add	edi, 4
		dec ecx
		jnz myLoop
	}
}


// ***************************************************************************
/** Blend 2 lines of color into one line, with MMX.
 *	NB: start at pix 0 here
 *	\param dst destination line (len pixels written)
 *	\param src0 first source line
 *	\param src1 second source line
 *	\param index only its low 8 bits are used: blend factor of src1 (src0 is weighted by 256-factor)
 *	\param len number of pixels to blend. Nothing is done if 0.
 *	NB: EMMS is not executed here (see NL3D_asmEndMMX()).
 */
inline	void	NL3D_asmBlendLines(CRGBA *dst, const CRGBA *src0, const CRGBA *src1, uint index, uint len)
{
	static	uint64 blank = 0;
	// The loop below is a do/while on len: it must not be entered with len==0
	if(len==0)
		return;


	// Loop for pix.
	__asm
	{
		// mm7= 0 for the whole loop
		movq	mm7, blank

		// read the factor and expand it to 4 words. The factor is the same for the whole line.
		mov		ebx, index
		mov		eax, 256
		and		ebx, 0xFF
		sub		eax, ebx
		movd	mm2, ebx		// mm2= factor
		movd	mm3, eax		// mm3= 1-factor
		punpckldq	mm2, mm2	// mm2= 0000 00AA 0000 00AA
		punpckldq	mm3, mm3	// mm3= same layout with (256-factor)
		packssdw	mm2, mm2	// mm2= 00AA 00AA 00AA 00AA
		packssdw	mm3, mm3	// mm3= same layout with (256-factor)

		// setup ptrs
		mov		esi, src0
		mov		edx, src1
		sub		edx, esi	// difference between 2 src: [esi+edx] addresses the src1 pixel
		mov		edi, dst
		mov		ecx, len

		// Loop
	myLoop:

		// Read
		movd	mm0, [esi]
		movd	mm1, [esi+edx]

		// mul: expand the 4 bytes of each color to 4 words first
		punpcklbw	mm0, mm7
		punpcklbw	mm1, mm7
		pmullw		mm0, mm3	// color0*(1-factor)
		pmullw		mm1, mm2	// color1*factor
		// add, and unpack (divide by 256, pack the words back to bytes)
		paddusw		mm0, mm1
		psrlw       mm0, 8
		packuswb    mm0, mm0

		// store
		movd        [edi], mm0


		// next pix
		add	esi, 4
		add	edi, 4
		dec ecx
		jnz myLoop
	}
}


// ***************************************************************************
/**	Lightmap Combining for Far level 2 (farthest)
 *	Average 16 lumels, and deals with UserColor and TLI
 *	\param lumels first lumel of the first of the 4 lumel lines to average
 *	\param colorMap table giving the RGBA color of an (averaged) lumel value
 *	\param srcTLIs line of TLI colors, added (with saturation) to the shade color
 *	\param srcUSCs line of user colors, multiplied with the result (then shifted right by 7)
 *	\param dst destination line of texels
 *	\param lineWidth number of bytes between 2 lumel lines
 *	\param nbTexel number of texels to write. Nothing is done if 0.
 *	NB: pointers are handled as 32 bits values, and EMMS is not executed here (see NL3D_asmEndMMX()).
 */
static void		NL3D_asmAssembleShading1x1(const uint8 *lumels, const CRGBA *colorMap,
	const CRGBA *srcTLIs, const CRGBA *srcUSCs, CRGBA *dst, uint lineWidth, uint nbTexel)
{
	static	uint64 blank = 0;
	// The loop below is a do/while on nbTexel: it must not be entered with nbTexel==0
	if(nbTexel==0)
		return;

	// local var: byte offsets from dst to the 2 sources, so that edi alone walks the 3 lines
	uint	offsetTLIs= ((uint)srcTLIs-(uint)dst);
	uint	offsetUSCs= ((uint)srcUSCs-(uint)dst);

	// Loop for pix.
	__asm
	{
		// mm7= 0 for the whole loop
		movq		mm7, blank

		// setup ptrs
		mov			esi, lumels
		mov			edi, dst
		mov			ecx, nbTexel

		// Loop
	myLoop:

		// Average shade part
		//------------
		mov			ebx, colorMap
		mov			edx, lineWidth

		// read and accumulate shade: 16 bits sum in ax (al + carry propagated to ah)
		xor			eax,eax			// avoid partial stall
		// add with line 0
		mov			al, [esi + 0]
		add			al, [esi + 1]
		adc			ah, 0
		add			al, [esi + 2]
		adc			ah, 0
		add			al, [esi + 3]
		adc			ah, 0
		// add with line 1
		add			al, [esi + edx + 0]
		adc			ah, 0
		add			al, [esi + edx + 1]
		adc			ah, 0
		add			al, [esi + edx + 2]
		adc			ah, 0
		add			al, [esi + edx + 3]
		adc			ah, 0
		// add with line 2
		add			al, [esi + edx*2 + 0]
		adc			ah, 0
		add			al, [esi + edx*2 + 1]
		adc			ah, 0
		add			al, [esi + edx*2 + 2]
		adc			ah, 0
		add			al, [esi + edx*2 + 3]
		adc			ah, 0
		// add with line 3 (edx= 3*lineWidth)
		lea			edx, [edx + edx*2]
		add			al, [esi + edx + 0]
		adc			ah, 0
		add			al, [esi + edx + 1]
		adc			ah, 0
		add			al, [esi + edx + 2]
		adc			ah, 0
		add			al, [esi + edx + 3]
		adc			ah, 0
		// average of the 16 lumels
		shr			eax, 4

		// convert to RGBA from the color Map
		movd		mm0, [ebx + eax*4]

		// Assemble part
		//------------
		mov			edx, offsetTLIs
		mov			ebx, offsetUSCs

		// Add with TLI, and clamp. [edi + edx] is the TLI color of the current texel
		paddusb		mm0, [edi + edx]

		// mul with USC. [edi + ebx] is the user color of the current texel
		movd		mm1, [edi + ebx]
		punpcklbw	mm0, mm7
		punpcklbw	mm1, mm7
		pmullw		mm0, mm1
		// unpack: shift by 7 (a user color component of 128 leaves the color unchanged), then clamp to bytes
		psrlw       mm0, 7
		packuswb    mm0, mm0

		// store
		movd        [edi], mm0


		// next pix
		add			esi, 4		// skip 4 lumels
		add			edi, 4		// next texel
		dec			ecx
		jnz			myLoop
	}
}


// ***************************************************************************
/**	Lightmap Combining for Far level 1 (middle)
 *	Average 4 lumels, and deals with UserColor and TLI
 *	\param lumels first lumel of the first of the 2 lumel lines to average
 *	\param colorMap table giving the RGBA color of an (averaged) lumel value
 *	\param srcTLIs line of TLI colors, added (with saturation) to the shade color
 *	\param srcUSCs line of user colors, multiplied with the result (then shifted right by 7)
 *	\param dst destination line of texels
 *	\param lineWidth number of bytes between 2 lumel lines
 *	\param nbTexel number of texels to write. Nothing is done if 0.
 *	NB: pointers are handled as 32 bits values, and EMMS is not executed here (see NL3D_asmEndMMX()).
 */
static void		NL3D_asmAssembleShading2x2(const uint8 *lumels, const CRGBA *colorMap,
	const CRGBA *srcTLIs, const CRGBA *srcUSCs, CRGBA *dst, uint lineWidth, uint nbTexel)
{
	static	uint64 blank = 0;
	// The loop below is a do/while on nbTexel: it must not be entered with nbTexel==0
	if(nbTexel==0)
		return;

	// local var: byte offsets from dst to the 2 sources, so that edi alone walks the 3 lines
	uint	offsetTLIs= ((uint)srcTLIs-(uint)dst);
	uint	offsetUSCs= ((uint)srcUSCs-(uint)dst);

	// Loop for pix.
	__asm
	{
		// mm7= 0 for the whole loop
		movq		mm7, blank

		// setup ptrs
		mov			esi, lumels
		mov			edi, dst
		mov			ecx, nbTexel

		// Loop
	myLoop:

		// Average shade part
		//------------
		mov			ebx, colorMap
		mov			edx, lineWidth

		// read and accumulate shade: 16 bits sum in ax (al + carry propagated to ah)
		xor			eax,eax			// avoid partial stall
		mov			al, [esi]		// read lumel
		// add with nbors
		add			al, [esi + 1]
		adc			ah, 0
		add			al, [esi + edx]
		adc			ah, 0
		add			al, [esi + edx + 1]
		adc			ah, 0
		// average of the 4 lumels
		shr			eax, 2

		// convert to RGBA from the color Map
		movd		mm0, [ebx + eax*4]

		// Assemble part
		//------------
		mov			edx, offsetTLIs
		mov			ebx, offsetUSCs

		// Add with TLI, and clamp. [edi + edx] is the TLI color of the current texel
		paddusb		mm0, [edi + edx]

		// mul with USC. [edi + ebx] is the user color of the current texel
		movd		mm1, [edi + ebx]
		punpcklbw	mm0, mm7
		punpcklbw	mm1, mm7
		pmullw		mm0, mm1
		// unpack: shift by 7 (a user color component of 128 leaves the color unchanged), then clamp to bytes
		psrlw       mm0, 7
		packuswb    mm0, mm0

		// store
		movd        [edi], mm0


		// next pix
		add			esi, 2		// skip 2 lumels
		add			edi, 4		// next texel
		dec			ecx
		jnz			myLoop
	}
}


// ***************************************************************************
#  pragma warning (disable : 4731)			// frame pointer register 'ebp' modified by inline assembly code
/**	Lightmap Combining for Far level 0 (nearest)
 *	read 1 lumel, and deals with UserColor and TLI
 *	\param lumels line of lumels, one per texel
 *	\param colorMap table giving the RGBA color of a lumel value
 *	\param srcTLIs line of TLI colors, added (with saturation) to the shade color
 *	\param srcUSCs line of user colors, multiplied with the result (then shifted right by 7)
 *	\param dst destination line of texels
 *	\param nbTexel number of texels to write. Nothing is done if 0.
 *	NB: ebp is used as a general register inside the loop (hence warning 4731 is disabled around
 *	this function), and EMMS is not executed here (see NL3D_asmEndMMX()).
 */
static void		NL3D_asmAssembleShading4x4(const uint8 *lumels, const CRGBA *colorMap,
	const CRGBA *srcTLIs, const CRGBA *srcUSCs, CRGBA *dst, uint nbTexel)
{
	static	uint64 blank = 0;
	// The loop below is a do/while on nbTexel: it must not be entered with nbTexel==0
	if(nbTexel==0)
		return;

	// Loop for pix.
	__asm
	{
		// Use ebp as a register for faster access...
		push		ebp

		// mm7= 0 for the whole loop
		movq		mm7, blank

		// setup ptrs
		mov			esi, lumels
		mov			edi, dst
		mov			edx, srcTLIs
		sub			edx, edi	// difference src and dest
		mov			ebx, srcUSCs
		sub			ebx, edi	// difference src and dest
		mov			ecx, nbTexel

		// set ebp after reading locals...
		// (no parameter/local is accessed between this point and the "pop ebp" below)
		mov			ebp, colorMap

		// Loop
	myLoop:

		// read shade RGBA into the color Map
		xor			eax,eax			// avoid partial stall
		mov			al,[esi]		// read lumel
		movd		mm0, [ebp + eax*4]

		// Add with TLI, and clamp. [edi + edx] is the TLI color of the current texel
		paddusb		mm0, [edi + edx]

		// mul with USC. [edi + ebx] is the user color of the current texel
		movd		mm1, [edi + ebx]
		punpcklbw	mm0, mm7
		punpcklbw	mm1, mm7
		pmullw		mm0, mm1
		// unpack: shift by 7 (a user color component of 128 leaves the color unchanged), then clamp to bytes
		psrlw       mm0, 7
		packuswb    mm0, mm0

		// store
		movd        [edi], mm0


		// next pix
		add			esi, 1		// next lumel
		add			edi, 4		// next texel
		dec			ecx
		jnz			myLoop

		// restore
		pop			ebp
	}

}

#  pragma warning (default : 4731)			// frame pointer register 'ebp' modified by inline assembly code


#else // NL_OS_WINDOWS

// Dummy for non-windows platforms
// Empty stand-ins for the inline-assembly helpers, so that the portable code still compiles when
// the asm versions are not built. They do nothing: NL3D_expandLightmap only calls them when its
// local asmMMX flag is set, and that flag stays false unless NL_OS_WINDOWS is defined and
// NL_NO_ASM is not.
// NOTE(review): presumably this branch is compiled exactly in that case -- the matching #if is
// above this part of the file, confirm its condition.
inline	void	NL3D_asmEndMMX() {}
inline	void	NL3D_asmExpandLineColor565(const uint16 *src, CRGBA *dst, uint du, uint len) {}
inline	void	NL3D_asmExpandLineColor8888(const CRGBA *src, CRGBA *dst, uint du, uint len) {}
inline	void	NL3D_asmBlendLines(CRGBA *dst, const CRGBA *src0, const CRGBA *src1, uint index, uint len) {}
// Stub of the "4x4 lumels -> 1 texel" shading (MulFactor==1 in NL3D_expandLightmap).
static void		NL3D_asmAssembleShading1x1(const uint8 *lumels, const CRGBA *colorMap,
	const CRGBA *srcTLIs, const CRGBA *srcUSCs, CRGBA *dst, uint lineWidth, uint nbTexel)
{
}
// Stub of the "2x2 lumels -> 1 texel" shading (MulFactor==2 in NL3D_expandLightmap).
static void		NL3D_asmAssembleShading2x2(const uint8 *lumels, const CRGBA *colorMap,
	const CRGBA *srcTLIs, const CRGBA *srcUSCs, CRGBA *dst, uint lineWidth, uint nbTexel)
{
}
// Stub of the "1 lumel -> 1 texel" shading (MulFactor==4 in NL3D_expandLightmap).
static void		NL3D_asmAssembleShading4x4(const uint8 *lumels, const CRGBA *colorMap,
	const CRGBA *srcTLIs, const CRGBA *srcUSCs, CRGBA *dst, uint nbTexel)
{
}

#endif // NL_OS_WINDOWS


// ***************************************************************************
/**	Build the lit texels of a far texture area from the data described by \a pLightmap.
 *
 *	Steps, in order:
 *	 - the user colors (ColorTile, stored as 565) and the TLI colors (TLIColor), given as
 *	   Width x Height samples, are linearly expanded to dstWidth x dstHeight texels, with
 *	   dstWidth=(Width-1)*MulFactor and dstHeight=(Height-1)*MulFactor. The expansion is done
 *	   along U first (into the "Line" buffers), then along V.
 *	 - each destination texel is then shaded from LumelTile. The shading value (average of a
 *	   4x4 lumel block when MulFactor==1, of a 2x2 block when MulFactor==2, a single lumel when
 *	   MulFactor==4) indexes StaticLightColor. In the C path the TLI color is added to it
 *	   (CRGBA::addRGBOnly), the sum is multiplied by the user color
 *	   (CRGBA::modulateFromColorRGBOnly), and R, G, B are doubled and clamped to 255.
 *
 *	The MMX helpers are used instead of the C loops only when asmMMX is true, ie on builds with
 *	NL_OS_WINDOWS defined and NL_NO_ASM not defined, and if CSystemInfo::hasMMX() returns true.
 *
 *	NB: all the scratch buffers are function-static: concurrent calls would share them.
 *	NB: only MulFactor 1, 2 and 4 are handled. For any other value DstPixels is not written.
 *	NOTE(review): the expansion factors divide by (dstWidth-1) and (dstHeight-1), so this code
 *	assumes dstWidth>=2 and dstHeight>=2 -- confirm that callers never pass a smaller lightmap.
 */
extern "C" void NL3D_expandLightmap (const NL3D_CExpandLightmap* pLightmap)
{
	bool	asmMMX= false;
#if defined(NL_OS_WINDOWS) && !defined(NL_NO_ASM)
	asmMMX= CSystemInfo::hasMMX();
	// A CTileColor must be a 565 only.
	// (the MMX U expansion below reads the CTileColor array through a pointer on Color565)
	nlassert(sizeof(CTileColor)==2);
#endif

	// Expanded width
	uint dstWidth=(pLightmap->Width-1)*pLightmap->MulFactor;
	uint dstHeight=(pLightmap->Height-1)*pLightmap->MulFactor;

	// *** First expand user color and TLI colors
	// First pass, expand on U
	// These 2 buffers receive pLightmap->Height rows of dstWidth texels.
	static CRGBA expandedUserColorLine[ (NL_MAX_TILES_BY_PATCH_EDGE+1)*
		(NL_MAX_TILES_BY_PATCH_EDGE+1)*NL_LUMEL_BY_TILE ];
	static CRGBA expandedTLIColorLine[ (NL_MAX_TILES_BY_PATCH_EDGE+1)*
		(NL_MAX_TILES_BY_PATCH_EDGE+1)*NL_LUMEL_BY_TILE ];
	// Second pass, expand on V.
	// These 2 buffers receive dstHeight rows of dstWidth texels.
	static CRGBA expandedUserColor[ (NL_MAX_TILES_BY_PATCH_EDGE+1)*NL_LUMEL_BY_TILE *
		(NL_MAX_TILES_BY_PATCH_EDGE+1)*NL_LUMEL_BY_TILE ];
	static CRGBA expandedTLIColor[ (NL_MAX_TILES_BY_PATCH_EDGE+1)*NL_LUMEL_BY_TILE *
		(NL_MAX_TILES_BY_PATCH_EDGE+1)*NL_LUMEL_BY_TILE ];


	// ** Expand on U
	//=========
	uint u, v;

	// Expansion factor
	// Fixed point with 8 fractional bits: source step for 1 destination texel.
	uint expandFactor=((pLightmap->Width-1)<<8)/(dstWidth-1);

	// Destination  pointer
	CRGBA *expandedUserColorLinePtr= expandedUserColorLine;
	CRGBA *expandedTLIColorLinePtr= expandedTLIColorLine;

	// Source pointer
	const NL3D::CTileColor	*colorTilePtr=pLightmap->ColorTile;
	const NLMISC::CRGBA		*colorTLIPtr= pLightmap->TLIColor;

	// Go for U
	for (v=0; v<pLightmap->Height; v++)
	{
		// First pixel
		// Copied as is from the first source sample: only texels 1..dstWidth-2 are interpolated.
		expandedUserColorLinePtr[0].set565 (colorTilePtr[0].Color565);
		expandedTLIColorLinePtr[0]= colorTLIPtr[0];

		// MMX implementation.
		//-------------
		if(asmMMX)
		{
			// dstWidth-2 is the count of interior texels (first and last are set separately).
			NL3D_asmExpandLineColor565(&colorTilePtr->Color565, expandedUserColorLinePtr, expandFactor, dstWidth-2);
			NL3D_asmExpandLineColor8888(colorTLIPtr, expandedTLIColorLinePtr, expandFactor, dstWidth-2);
		}
		// C implementation
		//-------------
		else
		{
			// Index next pixel
			// Fixed point source position of the destination texel u (8 fractional bits).
			uint srcIndexPixel=expandFactor;

			for (u=1; u<dstWidth-1; u++)
			{
				// Check
				nlassert ( (u+v*dstWidth) < (sizeof(expandedUserColorLine)/sizeof(CRGBA)) );

				// Color index
				// Integer part: left source sample. The right one is srcIndex+1.
				uint srcIndex=srcIndexPixel>>8;
				//nlassert (srcIndex>=0); // uint => always >= 0
				nlassert (srcIndex<pLightmap->Width-1);

				// Compute current color
				CRGBA color0;
				CRGBA color1;
				color0.A = 255;
				color0.set565 (colorTilePtr[srcIndex].Color565);
				color1.A = 255;
				color1.set565 (colorTilePtr[srcIndex+1].Color565);
				// Blend factor is the fractional part of the source position.
				expandedUserColorLinePtr[u].blendFromui (color0, color1, srcIndexPixel&0xff);
				// Compute current TLI color
				color0= colorTLIPtr[srcIndex];
				color1= colorTLIPtr[srcIndex+1];
				expandedTLIColorLinePtr[u].blendFromui (color0, color1, srcIndexPixel&0xff);

				// Next index
				srcIndexPixel+=expandFactor;
			}
		}

		// Last pixel
		// Copied as is from the last source sample.
		expandedUserColorLinePtr[dstWidth-1].set565 (colorTilePtr[pLightmap->Width-1].Color565);
		expandedTLIColorLinePtr[dstWidth-1]= colorTLIPtr[pLightmap->Width-1];

		// Next line
		expandedUserColorLinePtr+= dstWidth;
		expandedTLIColorLinePtr+= dstWidth;
		colorTilePtr+=pLightmap->Width;
		colorTLIPtr+=pLightmap->Width;
	}

	// stop MMX if used
	if(asmMMX)
		NL3D_asmEndMMX();

	// ** Expand on V
	//=========

	// Expansion factor
	// Same fixed point format as for U, but between source rows.
	expandFactor=((pLightmap->Height-1)<<8)/(dstHeight-1);

	// Destination  pointer
	CRGBA *expandedUserColorPtr= expandedUserColor;
	CRGBA *expandedTLIColorPtr= expandedTLIColor;

	// Src pointer
	expandedUserColorLinePtr= expandedUserColorLine;
	expandedTLIColorLinePtr= expandedTLIColorLine;

	// Copy first row
	memcpy(expandedUserColorPtr, expandedUserColorLinePtr, dstWidth*sizeof(CRGBA));
	memcpy(expandedTLIColorPtr, expandedTLIColorLinePtr, dstWidth*sizeof(CRGBA));

	// Next line
	expandedUserColorPtr+=dstWidth;
	expandedTLIColorPtr+=dstWidth;

	// Index next pixel
	// Fixed point source row of the destination row v (8 fractional bits).
	uint indexPixel=expandFactor;

	// Go for V
	for (v=1; v<dstHeight-1; v++)
	{
		// Color index
		uint index=indexPixel>>8;

		// Source pointer
		// The 2 U-expanded rows surrounding the destination row.
		CRGBA *colorTilePtr0= expandedUserColorLine + index*dstWidth;
		CRGBA *colorTilePtr1= expandedUserColorLine + (index+1)*dstWidth;
		CRGBA *colorTLIPtr0= expandedTLIColorLine + index*dstWidth;
		CRGBA *colorTLIPtr1= expandedTLIColorLine + (index+1)*dstWidth;

		// MMX implementation.
		//-------------
		if(asmMMX)
		{
			// NB: the whole fixed point index is passed here, while the C path passes indexPixel&0xff.
			NL3D_asmBlendLines(expandedUserColorPtr, colorTilePtr0, colorTilePtr1, indexPixel, dstWidth);
			NL3D_asmBlendLines(expandedTLIColorPtr, colorTLIPtr0, colorTLIPtr1, indexPixel, dstWidth);
		}
		// C implementation
		//-------------
		else
		{
			// Copy the row
			// (actually a blend of the 2 source rows, weighted by the fractional part of indexPixel)
			for (u=0; u<dstWidth; u++)
			{
				expandedUserColorPtr[u].blendFromui (colorTilePtr0[u], colorTilePtr1[u], indexPixel&0xff);
				expandedTLIColorPtr[u].blendFromui (colorTLIPtr0[u], colorTLIPtr1[u],  indexPixel&0xff);
			}
		}

		// Next index
		indexPixel+=expandFactor;

		// Next line
		expandedUserColorPtr+=dstWidth;
		expandedTLIColorPtr+=dstWidth;
	}

	// stop MMX if used
	if(asmMMX)
		NL3D_asmEndMMX();

	// Last row
	// Destination  pointer
	expandedUserColorPtr= expandedUserColor + dstWidth*(dstHeight-1);
	expandedTLIColorPtr= expandedTLIColor + dstWidth*(dstHeight-1);
	// Src pointer
	expandedUserColorLinePtr= expandedUserColorLine + dstWidth*(pLightmap->Height-1);
	expandedTLIColorLinePtr= expandedTLIColorLine + dstWidth*(pLightmap->Height-1);

	// Copy last row
	memcpy(expandedUserColorPtr, expandedUserColorLinePtr, dstWidth*sizeof(CRGBA));
	memcpy(expandedTLIColorPtr, expandedTLIColorLinePtr, dstWidth*sizeof(CRGBA));

	// *** Now combine with shading
	//=========

	// Switch to the optimal method for each expansion value
	// NB: no default case. A MulFactor other than 1, 2 or 4 writes nothing in DstPixels.
	switch (pLightmap->MulFactor)
	{
	case 1:
		{
			// Make 4x4 -> 1x1 blend
			CRGBA *lineUSCPtr= expandedUserColor;
			CRGBA *lineTLIPtr= expandedTLIColor;
			CRGBA *lineDestPtr=pLightmap->DstPixels;
			const uint8 *lineLumelPtr=pLightmap->LumelTile;
			// A row of lumels is 4 times wider than a row of texels.
			uint lineWidth=dstWidth<<2;
			uint lineWidthx2=lineWidth<<1;
			uint lineWidthx3=lineWidthx2+lineWidth;
			uint lineWidthx4=lineWidth<<2;

			// For each line
			for (v=0; v<dstHeight; v++)
			{
				// MMX implementation.
				//-------------
				if(asmMMX)
				{
					NL3D_asmAssembleShading1x1(lineLumelPtr, pLightmap->StaticLightColor, lineTLIPtr, lineUSCPtr, lineDestPtr,
						lineWidth, dstWidth);
				}
				// C implementation
				//-------------
				else
				{
					// For each lumel block
					for (u=0; u<dstWidth; u++)
					{
						// index
						uint lumelIndex=u<<2;

						// Shading is filtered
						// Average of the 16 lumels (4 columns on 4 lumel rows) covered by this texel.
						uint shading=
							 ((uint)lineLumelPtr[lumelIndex]+(uint)lineLumelPtr[lumelIndex+1]+(uint)lineLumelPtr[lumelIndex+2]+(uint)lineLumelPtr[lumelIndex+3]
							+(uint)lineLumelPtr[lumelIndex+lineWidth]+(uint)lineLumelPtr[lumelIndex+1+lineWidth]+(uint)lineLumelPtr[lumelIndex+2+lineWidth]+(uint)lineLumelPtr[lumelIndex+3+lineWidth]
							+(uint)lineLumelPtr[lumelIndex+lineWidthx2]+(uint)lineLumelPtr[lumelIndex+1+lineWidthx2]+(uint)lineLumelPtr[lumelIndex+2+lineWidthx2]+(uint)lineLumelPtr[lumelIndex+3+lineWidthx2]
							+(uint)lineLumelPtr[lumelIndex+lineWidthx3]+(uint)lineLumelPtr[lumelIndex+1+lineWidthx3]+(uint)lineLumelPtr[lumelIndex+2+lineWidthx3]+(uint)lineLumelPtr[lumelIndex+3+lineWidthx3]
							)>>4;

						// Add shading with TLI color.
						CRGBA	col;
						col.addRGBOnly(pLightmap->StaticLightColor[shading], lineTLIPtr[u]);

						// Mul by the userColor
						lineDestPtr[u].modulateFromColorRGBOnly(col, lineUSCPtr[u]);

						// Double the result, clamped to 255.
						lineDestPtr[u].R = min(((uint)lineDestPtr[u].R)*2, 255U);
						lineDestPtr[u].G = min(((uint)lineDestPtr[u].G)*2, 255U);
						lineDestPtr[u].B = min(((uint)lineDestPtr[u].B)*2, 255U);
					}
				}

				// Next line
				lineUSCPtr+=dstWidth;
				lineTLIPtr+=dstWidth;
				lineDestPtr+=dstWidth;
				// 1 row of texels consumes 4 rows of lumels.
				lineLumelPtr+=lineWidthx4;
			}
			break;
		}
	case 2:
		{
			// Make 2x2 -> 1x1 blend
			CRGBA *lineUSCPtr= expandedUserColor;
			CRGBA *lineTLIPtr= expandedTLIColor;
			CRGBA *lineDestPtr=pLightmap->DstPixels;
			const uint8 *lineLumelPtr=pLightmap->LumelTile;
			// A row of lumels is 2 times wider than a row of texels.
			uint lineWidth=dstWidth*2;
			uint lineWidthx2=lineWidth<<1;

			// For each line
			for (v=0; v<dstHeight; v++)
			{
				// MMX implementation.
				//-------------
				if(asmMMX)
				{
					NL3D_asmAssembleShading2x2(lineLumelPtr, pLightmap->StaticLightColor, lineTLIPtr, lineUSCPtr, lineDestPtr,
						lineWidth, dstWidth);
				}
				// C implementation
				//-------------
				else
				{
					// For each lumel block
					for (u=0; u<dstWidth; u++)
					{
						// index
						uint lumelIndex=u<<1;

						// Shading is filtered
						// Average of the 4 lumels (2 columns on 2 lumel rows) covered by this texel.
						uint shading=
							((uint)lineLumelPtr[lumelIndex]+(uint)lineLumelPtr[lumelIndex+1]+(uint)lineLumelPtr[lumelIndex+lineWidth]+(uint)lineLumelPtr[lumelIndex+1+lineWidth])>>2;

						// Add shading with TLI color.
						CRGBA	col;
						col.addRGBOnly(pLightmap->StaticLightColor[shading], lineTLIPtr[u]);

						// Mul by the userColor
						lineDestPtr[u].modulateFromColorRGBOnly(col, lineUSCPtr[u]);

						// Double the result, clamped to 255.
						lineDestPtr[u].R = min(((uint)lineDestPtr[u].R)*2, 255U);
						lineDestPtr[u].G = min(((uint)lineDestPtr[u].G)*2, 255U);
						lineDestPtr[u].B = min(((uint)lineDestPtr[u].B)*2, 255U);
					}
				}

				// Next line
				lineUSCPtr+=dstWidth;
				lineTLIPtr+=dstWidth;
				lineDestPtr+=dstWidth;
				// 1 row of texels consumes 2 rows of lumels.
				lineLumelPtr+=lineWidthx2;
			}
			break;
		}

	case 4:
			// Make copy
			// 1 lumel per texel: the whole area is processed as a single run of nbTexel texels.
			// NB: these locals are declared without braces. This is legal only because no other
			// case label follows them.
			CRGBA *lineUSCPtr= expandedUserColor;
			CRGBA *lineTLIPtr= expandedTLIColor;
			CRGBA *lineDestPtr=pLightmap->DstPixels;
			const uint8 *lineLumelPtr=pLightmap->LumelTile;
			uint nbTexel=dstWidth*dstHeight;

			// MMX implementation.
			//-------------
			if(asmMMX)
			{
				NL3D_asmAssembleShading4x4(lineLumelPtr, pLightmap->StaticLightColor, lineTLIPtr, lineUSCPtr, lineDestPtr,
					nbTexel);
			}
			// C implementation
			//-------------
			else
			{
				// For each pixel
				for (u=0; u<nbTexel; u++)
				{
					// Shading is read directly (no filtering at this level)
					uint shading=lineLumelPtr[u];

					// Add shading with TLI color.
					CRGBA	col;
					col.addRGBOnly(pLightmap->StaticLightColor[shading], lineTLIPtr[u]);

					// Mul by the userColor
					lineDestPtr[u].modulateFromColorRGBOnly(col, lineUSCPtr[u]);

					// Double the result, clamped to 255.
					lineDestPtr[u].R = min(((uint)lineDestPtr[u].R)*2, 255U);
					lineDestPtr[u].G = min(((uint)lineDestPtr[u].G)*2, 255U);
					lineDestPtr[u].B = min(((uint)lineDestPtr[u].B)*2, 255U);
				}
			}
			break;
	}

	// stop MMX if used
	if(asmMMX)
		NL3D_asmEndMMX();

}


// ***************************************************************************
// ***************************************************************************
// NL3D_drawFarTileInFar*. C and Asm Part
// ***************************************************************************
// ***************************************************************************


#if defined(NL_OS_WINDOWS) && !defined(NL_NO_ASM)


// ***************************************************************************
/**	MMX: modulate a line of \a len texels of \a src0 by \a src1, and write it in \a dst.
 *	For each texel, on the 4 bytes of the CRGBA at once: dst= (src0 * src1) >> 8.
 *	NB: the alpha byte is multiplied and written like the 3 others.
 *
 *	\param dst			destination texels, advanced by \a dstDeltaX texels per iteration.
 *	\param src0			first source, advanced by \a src0DeltaX texels per iteration.
 *	\param src1			second source, advanced by 1 texel per iteration.
 *	\param len			number of texels to process. 0 returns immediately.
 *	\param src0DeltaX	step on \a src0, in texels (multiplied by 4 to get bytes).
 *	\param dstDeltaX	step on \a dst, in texels (multiplied by 4 to get bytes).
 *
 *	NB: no emms is executed here; the caller is expected to call NL3D_asmEndMMX().
 */
inline	void	NL3D_asmModulateLineColors(CRGBA *dst, const CRGBA *src0, const CRGBA *src1,
	uint len, uint	src0DeltaX, uint dstDeltaX)
{
	// 64 bits of zero, loaded in mm7 to unpack bytes into words.
	static	uint64	blank= 0;
	// The loop below is a do/while (dec ecx / jnz): it must not be entered with a count of 0.
	if(len==0)
		return;

	__asm
	{
		movq		mm7, blank

		mov			esi, src0	// esi point to src Pixels
		mov			edx, src1	// edx point to src lighting pixels
		mov			edi, dst
		mov			ecx, len
		// compute increments for esi and edi
		// (deltas are in texels: *4 to get bytes)
		mov			eax, src0DeltaX
		mov			ebx, dstDeltaX
		sal			eax, 2
		sal			ebx, 2

	myLoop:
		// read colors
		movd		mm0, [esi]
		movd		mm1, [edx]

		// mul mm0 and mm1
		// bytes are first expanded to words, so each product fits in 16 bits
		punpcklbw	mm0, mm7
		punpcklbw	mm1, mm7
		pmullw		mm0, mm1
		psrlw       mm0, 8
		// pack
		packuswb    mm0, mm0

		// out
		movd		[edi], mm0

		// increment
		add			esi, eax
		add			edi, ebx
		add			edx, 4
		dec			ecx
		jnz			myLoop
	}
}


// ***************************************************************************
/**	MMX: modulate a line of \a len texels of \a src0 by \a src1, then alpha blend it over \a dst.
 *	For each texel, with A the alpha byte of the \a src0 texel, on the 4 bytes of the CRGBA at once:
 *		dst= ( ((src0 * src1) >> 8) * A  +  dst * (256-A) ) >> 8
 *	NB: the alpha byte of \a dst is computed and written like the 3 others.
 *
 *	\param dst			destination texels (read and written), advanced by \a dstDeltaX texels per iteration.
 *	\param src0			first source, which also gives the alpha. Advanced by \a src0DeltaX texels per iteration.
 *	\param src1			second source, advanced by 1 texel per iteration.
 *	\param len			number of texels to process. 0 returns immediately.
 *	\param src0DeltaX	step on \a src0, in texels (multiplied by 4 to get bytes).
 *	\param dstDeltaX	step on \a dst, in texels (multiplied by 4 to get bytes).
 *
 *	NB: no emms is executed here; the caller is expected to call NL3D_asmEndMMX().
 */
inline	void	NL3D_asmModulateAndBlendLineColors(CRGBA *dst, const CRGBA *src0, const CRGBA *src1,
	uint len, uint	src0DeltaX, uint dstDeltaX)
{
	// 64 bits of zero, loaded in mm7 to unpack bytes into words.
	static	uint64	blank= 0;
	// 256 in each of the 4 words, loaded in mm6 to compute (256-A).
	static	uint64	one= INT64_CONSTANT (0x0100010001000100);
	// The loop below is a do/while (dec ecx / jnz): it must not be entered with a count of 0.
	if(len==0)
		return;

	__asm
	{
		movq		mm7, blank
		movq		mm6, one

		mov			esi, src0	// esi point to src Pixels
		mov			edx, src1	// edx point to src lighting pixels
		mov			edi, dst
		mov			ecx, len
		// compute increments for esi and edi
		// (deltas are in texels: *4 to get bytes)
		mov			eax, src0DeltaX
		mov			ebx, dstDeltaX
		sal			eax, 2
		sal			ebx, 2

	myLoop:
		// read colors
		movd		mm0, [esi]
		movd		mm1, [edx]

		// save and unpack Alpha. NB: ABGR
		// (the alpha is the most significant byte of the 32 bits texel)
		movq		mm2, mm0
		psrld		mm2, 24		// mm2= 0000 0000 0000 00AA
		punpckldq	mm2, mm2	// mm2= 0000 00AA 0000 00AA
		packssdw	mm2, mm2	// mm2= 00AA 00AA 00AA 00AA
		// negate with 256.
		// mm3= 256-A in each word
		movq		mm3, mm6
		psubusw		mm3, mm2

		// mul mm0 and mm1
		punpcklbw	mm0, mm7
		punpcklbw	mm1, mm7
		pmullw		mm0, mm1
		psrlw       mm0, 8

		// Alpha Blend with mm3 and mm2
		movd		mm1, [edi]	// read dest
		punpcklbw	mm1, mm7
		pmullw		mm0, mm2	// mm0= srcColor*A
		pmullw		mm1, mm3	// mm1= dstColor*(1-A)

		// add and pack
		paddusw		mm0, mm1
		psrlw       mm0, 8
		packuswb    mm0, mm0

		// out
		movd		[edi], mm0

		// increment
		add			esi, eax
		add			edi, ebx
		add			edx, 4
		dec			ecx
		jnz			myLoop
	}
}


#else // NL_OS_WINDOWS

// Dummy for non-windows platforms
// Empty stand-ins compiled when the inline-assembly versions are not (ie unless NL_OS_WINDOWS is
// defined and NL_NO_ASM is not), so that the C functions below still compile. They do nothing.
// The functions below call them only when pTileFar->AsmMMX is set.
// NOTE(review): presumably AsmMMX is never set on such builds -- confirm where it is assigned.
inline	void	NL3D_asmModulateLineColors(CRGBA *dst, const CRGBA *src0, const CRGBA *src1,
	uint len, uint	src0DeltaX, uint dstDeltaX)
{
}
inline	void	NL3D_asmModulateAndBlendLineColors(CRGBA *dst, const CRGBA *src0, const CRGBA *src1,
	uint len, uint	src0DeltaX, uint dstDeltaX)
{
}

#endif

// ***************************************************************************
void NL3D_drawFarTileInFarTexture (const NL3D_CComputeTileFar* pTileFar)
{
	// Pointer of the Src diffuse pixels
	const CRGBA* pSrcPixels=pTileFar->SrcDiffusePixels;

	// Pointer of the Dst pixels
	const CRGBA* pSrcLightPixels=pTileFar->SrcLightingPixels;

	// Pointer of the Dst pixels
	CRGBA* pDstPixels=pTileFar->DstPixels;

	// For each pixels
	int x, y;
	for (y=0; y<pTileFar->Size; y++)
	{
		// MMX implementation
		//---------
		if(pTileFar->AsmMMX)
		{
			NL3D_asmModulateLineColors(pDstPixels, pSrcPixels, pSrcLightPixels,
				pTileFar->Size, pTileFar->SrcDeltaX, pTileFar->DstDeltaX);
		}
		// C Implementation.
		//---------
		else
		{
			// Pointer of the source line
			const CRGBA* pSrcLine=pSrcPixels;

			// Pointer of the source lighting line
			const CRGBA* pSrcLightingLine=pSrcLightPixels;

			// Pointer of the destination line
			CRGBA* pDstLine=pDstPixels;

			// For each pixels on the line
			for (x=0; x<pTileFar->Size; x++)
			{
				// Read and write a pixel
				pDstLine->R=(uint8)(((uint)pSrcLine->R*(uint)pSrcLightingLine->R)>>8);
				pDstLine->G=(uint8)(((uint)pSrcLine->G*(uint)pSrcLightingLine->G)>>8);
				pDstLine->B=(uint8)(((uint)pSrcLine->B*(uint)pSrcLightingLine->B)>>8);

				// Next pixel
				pSrcLine+=pTileFar->SrcDeltaX;
				pSrcLightingLine++;
				pDstLine+=pTileFar->DstDeltaX;
			}
		}

		// Next line
		pSrcPixels+=pTileFar->SrcDeltaY;
		pSrcLightPixels+=pTileFar->SrcLightingDeltaY;
		pDstPixels+=pTileFar->DstDeltaY;
	}

	// stop MMX if used
	if(pTileFar->AsmMMX)
		NL3D_asmEndMMX();
}


// ***************************************************************************
/**	Alpha blend a Size x Size block of lit texels over the destination.
 *
 *	For each texel, with A the alpha of the diffuse texel:
 *		dst.RGB= ( ((diffuse.RGB * lighting.RGB) >> 8) * A  +  dst.RGB * (256-A) ) >> 8
 *	The diffuse source is walked with SrcDeltaX / SrcDeltaY, the lighting source with
 *	1 / SrcLightingDeltaY, and the destination with DstDeltaX / DstDeltaY (all in texels).
 *	A whole row is delegated to NL3D_asmModulateAndBlendLineColors when pTileFar->AsmMMX is set.
 *
 *	NB: the destination weight is (256-A), as in NL3D_asmModulateAndBlendLineColors, so that the
 *	2 weights sum to 256 (the ">>8" normalisation) and A==0 leaves the destination unchanged.
 *	NB: the C path writes only R, G and B of the destination.
 */
void NL3D_drawFarTileInFarTextureAlpha (const NL3D_CComputeTileFar* pTileFar)
{
	// Pointer of the Src diffuse pixels (they also give the alpha)
	const CRGBA* pSrcPixels=pTileFar->SrcDiffusePixels;

	// Pointer of the Src lighting pixels
	const CRGBA* pSrcLightPixels=pTileFar->SrcLightingPixels;

	// Pointer of the Dst pixels
	CRGBA* pDstPixels=pTileFar->DstPixels;

	// For each line
	int x, y;
	for (y=0; y<pTileFar->Size; y++)
	{
		// MMX implementation
		//---------
		if(pTileFar->AsmMMX)
		{
			NL3D_asmModulateAndBlendLineColors(pDstPixels, pSrcPixels, pSrcLightPixels,
				pTileFar->Size, pTileFar->SrcDeltaX, pTileFar->DstDeltaX);
		}
		// C Implementation.
		//---------
		else
		{
			// Pointer of the source line
			const CRGBA* pSrcLine=pSrcPixels;

			// Pointer of the source lighting line
			const CRGBA* pSrcLightingLine=pSrcLightPixels;

			// Pointer of the Dst pixels
			CRGBA* pDstLine=pDstPixels;

			// For each pixels on the line
			for (x=0; x<pTileFar->Size; x++)
			{
				// Blend weights. They sum to 256, which the final >>8 normalises.
				// NB: no "register" keyword: it has been removed from the language in C++17.
				const uint alpha=pSrcLine->A;
				const uint oneLessAlpha=256-alpha;

				// Read and write a pixel
				pDstLine->R=(uint8)(((((uint)pSrcLine->R*(uint)pSrcLightingLine->R)>>8)*alpha+(uint)pDstLine->R*oneLessAlpha)>>8);
				pDstLine->G=(uint8)(((((uint)pSrcLine->G*(uint)pSrcLightingLine->G)>>8)*alpha+(uint)pDstLine->G*oneLessAlpha)>>8);
				pDstLine->B=(uint8)(((((uint)pSrcLine->B*(uint)pSrcLightingLine->B)>>8)*alpha+(uint)pDstLine->B*oneLessAlpha)>>8);

				// Next pixel
				pSrcLine+=pTileFar->SrcDeltaX;
				pSrcLightingLine++;
				pDstLine+=pTileFar->DstDeltaX;
			}
		}

		// Next line
		pSrcPixels+=pTileFar->SrcDeltaY;
		pSrcLightPixels+=pTileFar->SrcLightingDeltaY;
		pDstPixels+=pTileFar->DstDeltaY;
	}

	// stop MMX if used
	if(pTileFar->AsmMMX)
		NL3D_asmEndMMX();
}


// ***************************************************************************
// TODO: asm implementation of this function \\//
//#ifdef NL_NO_ASM
void NL3D_drawFarTileInFarTextureAdditive (const NL3D_CComputeTileFar* pTileFar)
{
	// Pointer of the Src diffuse pixels
	const CRGBA* pSrcPixels=pTileFar->SrcDiffusePixels;

	// Pointer of the Src additive pixels
	const CRGBA* pSrcAddPixels=pTileFar->SrcAdditivePixels;

	// Pointer of the Dst pixels
	const CRGBA* pSrcLightPixels=pTileFar->SrcLightingPixels;

	// Pointer of the Dst pixels
	CRGBA* pDstPixels=pTileFar->DstPixels;

	// For each pixels
	int x, y;
	for (y=0; y<pTileFar->Size; y++)
	{
		// Pointer of the source line
		const CRGBA* pSrcLine=pSrcPixels;

		// Pointer of the source line
		const CRGBA* pSrcAddLine=pSrcAddPixels;

		// Pointer of the source lighting line
		const CRGBA* pSrcLightingLine=pSrcLightPixels;

		// Pointer of the destination line
		CRGBA* pDstLine=pDstPixels;

		// For each pixels on the line
		for (x=0; x<pTileFar->Size; x++)
		{
			// Read and write a pixel
			uint nTmp=(((uint)pSrcLine->R*(uint)pSrcLightingLine->R)>>8)+(uint)pSrcAddLine->R;
			if (nTmp>255)
				nTmp=255;
			pDstLine->R=(uint8)nTmp;
			nTmp=(((uint)pSrcLine->G*(uint)pSrcLightingLine->G)>>8)+(uint)pSrcAddLine->G;
			if (nTmp>255)
				nTmp=255;
			pDstLine->G=(uint8)nTmp;
			nTmp=(((uint)pSrcLine->B*(uint)pSrcLightingLine->B)>>8)+(uint)pSrcAddLine->B;
			if (nTmp>255)
				nTmp=255;
			pDstLine->B=(uint8)nTmp;

			// Next pixel
			pSrcLine+=pTileFar->SrcDeltaX;
			pSrcAddLine+=pTileFar->SrcDeltaX;
			pSrcLightingLine++;
			pDstLine+=pTileFar->DstDeltaX;
		}

		// Next line
		pSrcPixels+=pTileFar->SrcDeltaY;
		pSrcAddPixels+=pTileFar->SrcDeltaY;
		pSrcLightPixels+=pTileFar->SrcLightingDeltaY;
		pDstPixels+=pTileFar->DstDeltaY;
	}
}
//#endif // NL_NO_ASM


// ***************************************************************************
// TODO: asm implementation of this function \\//
//#ifdef NL_NO_ASM
void NL3D_drawFarTileInFarTextureAdditiveAlpha (const NL3D_CComputeTileFar* pTileFar)
{
	// Pointer of the Src pixels
	const CRGBA* pSrcPixels=pTileFar->SrcDiffusePixels;

	// Pointer of the Src pixels
	const CRGBA* pSrcAddPixels=pTileFar->SrcAdditivePixels;

	// Pointer of the Src pixels
	const CRGBA* pSrcLightPixels=pTileFar->SrcLightingPixels;

	// Pointer of the Dst pixels
	CRGBA* pDstPixels=pTileFar->DstPixels;

	// Fill the buffer with layer 0
	int x, y;
	for (y=0; y<pTileFar->Size; y++)
	{
		// Pointer of the source line
		const CRGBA* pSrcLine=pSrcPixels;

		// Pointer of the source line
		const CRGBA* pSrcAddLine=pSrcAddPixels;

		// Pointer of the source lighting line
		const CRGBA* pSrcLightingLine=pSrcLightPixels;

		// Pointer of the Dst pixels
		CRGBA* pDstLine=pDstPixels;

		// For each pixels on the line
		for (x=0; x<pTileFar->Size; x++)
		{
			// Read and write a pixel
			register uint alpha=pSrcLine->A;
			register uint oneLessAlpha=255-pSrcLine->A;

			// Read and write a pixel
			uint nTmp=(((uint)pSrcLine->R*(uint)pSrcLightingLine->R)>>8)+(uint)pSrcAddLine->R;
			if (nTmp>255)
				nTmp=255;
			pDstLine->R=(uint8)((nTmp*alpha+pDstLine->R*oneLessAlpha)>>8);
			nTmp=(((uint)pSrcLine->G*(uint)pSrcLightingLine->G)>>8)+(uint)pSrcAddLine->G;
			if (nTmp>255)
				nTmp=255;
			pDstLine->G=(uint8)((nTmp*alpha+pDstLine->G*oneLessAlpha)>>8);
			nTmp=(((uint)pSrcLine->B*(uint)pSrcLightingLine->B)>>8)+(uint)pSrcAddLine->B;
			if (nTmp>255)
				nTmp=255;
			pDstLine->B=(uint8)((nTmp*alpha+pDstLine->B*oneLessAlpha)>>8);

			// Next pixel
			pSrcLine+=pTileFar->SrcDeltaX;
			pSrcAddLine+=pTileFar->SrcDeltaX;
			pSrcLightingLine++;
			pDstLine+=pTileFar->DstDeltaX;
		}

		// Next line
		pSrcPixels+=pTileFar->SrcDeltaY;
		pSrcAddPixels+=pTileFar->SrcDeltaY;
		pSrcLightPixels+=pTileFar->SrcLightingDeltaY;
		pDstPixels+=pTileFar->DstDeltaY;
	}
}
//#endif // NL_NO_ASM