///////////////////////////////////////////////////////////////////////////////////
// File : CG_SOLVER_SSE.cpp
///////////////////////////////////////////////////////////////////////////////////
//
// LumosQuad - A Lightning Generator
// Copyright 2007
// The University of North Carolina at Chapel Hill
// 
///////////////////////////////////////////////////////////////////////////////////
//
//  This program is free software; you can redistribute it and/or modify
//  it under the terms of the GNU General Public License as published by
//  the Free Software Foundation; either version 2 of the License, or
//  (at your option) any later version.
//
//  This program is distributed in the hope that it will be useful,
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//  GNU General Public License for more details.
// 
//  You should have received a copy of the GNU General Public License
//  along with this program; if not, write to the Free Software
//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
//
//  The University of North Carolina at Chapel Hill makes no representations 
//  about the suitability of this software for any purpose. It is provided 
//  "as is" without express or implied warranty.
//
//  Permission to use, copy, modify and distribute this software and its
//  documentation for educational, research and non-profit purposes, without
//  fee, and without a written agreement is hereby granted, provided that the
//  above copyright notice and the following three paragraphs appear in all
//  copies.
//
//  THE UNIVERSITY OF NORTH CAROLINA SPECIFICALLY DISCLAIM ANY WARRANTIES,
//  INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
//  FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS ON AN
//  "AS IS" BASIS, AND THE UNIVERSITY OF NORTH CAROLINA HAS NO OBLIGATION TO
//  PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
//
//  Please send questions and comments about LumosQuad to kim@cs.unc.edu.
//
///////////////////////////////////////////////////////////////////////////////////
//
//  This program uses OpenEXR, which has the following restrictions:
// 
//  Copyright (c) 2002, Industrial Light & Magic, a division of Lucas
//  Digital Ltd. LLC
// 
//  All rights reserved.
// 
//  Redistribution and use in source and binary forms, with or without
//  modification, are permitted provided that the following conditions are
//  met:
//  *       Redistributions of source code must retain the above copyright
//  notice, this list of conditions and the following disclaimer.
//  *       Redistributions in binary form must reproduce the above
//  copyright notice, this list of conditions and the following disclaimer
//  in the documentation and/or other materials provided with the
//  distribution.
//  *       Neither the name of Industrial Light & Magic nor the names of
//  its contributors may be used to endorse or promote products derived
//  from this software without specific prior written permission. 
// 

#include "CG_SOLVER_SSE.h"

//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////

// Builds an SSE solver by handing maxDepth, iterations and digits, unchanged,
// to the CG_SOLVER base constructor. No additional setup happens here.
CG_SOLVER_SSE::CG_SOLVER_SSE(int maxDepth, int iterations, int digits)
  : CG_SOLVER(maxDepth, iterations, digits)
{}

// Releases the four scratch vectors with free() and resets each pointer to
// NULL afterwards.
// NOTE(review): the NULL reset presumably stops the base-class destructor
// from releasing the same buffers a second time -- confirm against CG_SOLVER.
CG_SOLVER_SSE::~CG_SOLVER_SSE()
{
  // free(NULL) is a no-op, so no per-pointer guard is required
  free(_direction);
  _direction = NULL;

  free(_potential);
  _potential = NULL;

  free(_residual);
  _residual = NULL;

  free(_q);
  _q = NULL;
}

//////////////////////////////////////////////////////////////////////
// reallocate the sse arrays
//////////////////////////////////////////////////////////////////////
// Grows the four scratch vectors when they can no longer hold _listSize
// entries. The new capacity is twice _listSize, rounded up to a multiple of
// four floats so every SSE pass can work on whole 16-byte blocks.
// Previous contents are discarded, not copied; the new memory is
// uninitialized.
void CG_SOLVER_SSE::reallocate()
{
  // existing buffers are already large enough
  if (_arraySize >= _listSize) return;

  // over-allocate by 2x, then pad to a whole number of 4-float blocks
  _arraySize = _listSize * 2;
  if (_arraySize % 4)
    _arraySize += 4 - _arraySize % 4;

  if (_direction) free(_direction);
  if (_potential) free(_potential);
  if (_residual)  free(_residual);
  if (_q)         free(_q);

  // aligned_alloc takes (alignment, size) -- alignment first. The byte count
  // is a multiple of 16 because _arraySize is a multiple of 4, which
  // satisfies the function's size requirement.
  _direction = static_cast<float*>(aligned_alloc(16, _arraySize * sizeof(float)));
  _potential = static_cast<float*>(aligned_alloc(16, _arraySize * sizeof(float)));
  _residual  = static_cast<float*>(aligned_alloc(16, _arraySize * sizeof(float)));
  _q         = static_cast<float*>(aligned_alloc(16, _arraySize * sizeof(float)));
}

//////////////////////////////////////////////////////////////////////
// solve the linear system
//////////////////////////////////////////////////////////////////////
// Runs conjugate gradient over the given cells.
//
// Each cell's potential is copied into a flat array, iterated on, and written
// back to the cell at the end. Cells are numbered by their position in the
// list (stored in cell->index) so neighbor lookups can index the flat arrays.
//
// Iteration stops when either _iterations passes have run or the value
// returned by maxSSE() for the residual is no longer greater than
// eps = 10^(-_digits).
//
// Returns the number of iterations performed.
int CG_SOLVER_SSE::solve(list<CELL*> cells)
{
  list<CELL*>::iterator cellIterator;

  // i = 0
  int i = 0;

  // precalculate stencils
  calcStencils(cells);

  // reallocate scratch arrays if necessary, then zero all of them so the
  // padding beyond _listSize contributes nothing to the SSE reductions
  _listSize = cells.size();
  reallocate();
  wipeSSE(_potential);
  wipeSSE(_direction);
  wipeSSE(_residual);
  wipeSSE(_q);

  // compute a new lexicographical order
  cellIterator = cells.begin();
  for (int x = 0; x < _listSize; ++x, ++cellIterator)
  {
    CELL* cell = *cellIterator;
    cell->index = x;
    _potential[x] = cell->potential;
  }

  // r = b - Ax
  calcResidual(cells);

  // d = r
  copySSE(_direction, _residual);

  // deltaNew = r^T r
  float deltaNew = dotSSE(_residual, _residual);

  // loop while iterations remain and the residual maximum still exceeds eps
  float eps  = pow(10.0f, (float)-_digits);
  float maxR = 2.0f * eps;  // seeded above eps so the first pass always runs
  while ((i < _iterations) && (maxR > eps))
  {
    // q = Ad
    cellIterator = cells.begin();
    for (int y = 0; y < _listSize; ++y, ++cellIterator)
    {
      CELL* currentCell = *cellIterator;
      CELL** neighbors = currentCell->neighbors;
      float* stencil = currentCell->stencil;

      // stencil[0..7] weight the (possibly missing) neighbors,
      // stencil[8] weights the cell itself
      float neighborSum = 0.0f;
      for (int n = 0; n < 8; n++)
      {
        if (neighbors[n])
          neighborSum += _direction[neighbors[n]->index] * stencil[n];
      }
      _q[y] = -neighborSum + _direction[y] * currentCell->stencil[8];
    }

    // alpha = deltaNew / (transpose(d) * q)
    // when the denominator is exactly zero, alpha stays zero and the
    // updates below leave x and r untouched
    float alpha = dotSSE(_q, _direction);
    if (fabs(alpha) > 0.0f)
      alpha = deltaNew / alpha;

    // x = x + alpha * d
    saxpySSE(alpha, _direction, _potential);

    // r = r - alpha * q
    saxpySSE(-alpha, _q, _residual);
    maxR = maxSSE(_residual);

    // deltaOld = deltaNew
    float deltaOld = deltaNew;

    // deltaNew = transpose(r) * r
    deltaNew = dotSSE(_residual, _residual);

    // beta = deltaNew / deltaOld
    // NOTE(review): no guard against deltaOld == 0 -- confirm that case
    // cannot feed a non-finite direction into a further iteration
    float beta = deltaNew / deltaOld;

    // d = r + beta * d
    saypxSSE(beta, _residual, _direction);

    // i = i + 1
    i++;
  }

  // copy back into the tree
  cellIterator = cells.begin();
  for (int x = 0; x < _listSize; ++x, ++cellIterator)
    (*cellIterator)->potential = _potential[x];

  return i;
}

//////////////////////////////////////////////////////////////////////
// dot product of two vectors
//////////////////////////////////////////////////////////////////////
// Returns the dot product of x and y over the first _arraySize floats.
// Both arrays are read with aligned 16-byte loads. Four per-lane running sums
// are kept and added together, lane 0 through lane 3, at the end.
float CG_SOLVER_SSE::dotSSE(float* x, float* y)
{
  // number of floats covered by whole 4-wide blocks
  const int end = 4 * (_arraySize / 4);

  __m128 lanes = _mm_setzero_ps();
  for (int i = 0; i < end; i += 4)
  {
    const __m128 product = _mm_mul_ps(_mm_load_ps(x + i), _mm_load_ps(y + i));
    lanes = _mm_add_ps(lanes, product);
  }

  // spill the four partial sums and reduce them in lane order
  float partial[4];
  _mm_storeu_ps(partial, lanes);
  return partial[0] + partial[1] + partial[2] + partial[3];
}

//////////////////////////////////////////////////////////////////////
// scalar 'a' x + y
// Y = aX + Y
//////////////////////////////////////////////////////////////////////
// In-place update y = s * x + y over the first _arraySize floats.
// x is only read; y is read and overwritten. Both are accessed with aligned
// 16-byte loads and stores.
void CG_SOLVER_SSE::saxpySSE(float s, float* x, float* y)
{
  const __m128 scale = _mm_set_ps1(s);
  const int end = 4 * (_arraySize / 4);

  for (int i = 0; i < end; i += 4)
  {
    const __m128 scaled = _mm_mul_ps(_mm_load_ps(x + i), scale);
    _mm_store_ps(y + i, _mm_add_ps(_mm_load_ps(y + i), scaled));
  }
}

//////////////////////////////////////////////////////////////////////
// scalar 'a' y + x
// Y = aY + X
//////////////////////////////////////////////////////////////////////
// In-place update y = s * y + x over the first _arraySize floats.
// Note the scalar multiplies y, not x. x is only read; y is read and
// overwritten. Both are accessed with aligned 16-byte loads and stores.
void CG_SOLVER_SSE::saypxSSE(float s, float* x, float* y)
{
  const __m128 scale = _mm_set_ps1(s);
  const int end = 4 * (_arraySize / 4);

  for (int i = 0; i < end; i += 4)
  {
    const __m128 scaled = _mm_mul_ps(_mm_load_ps(y + i), scale);
    _mm_store_ps(y + i, _mm_add_ps(_mm_load_ps(x + i), scaled));
  }
}

//////////////////////////////////////////////////////////////////////
// largest entry of a vector, never less than zero
// returns max(0, max(X))
//////////////////////////////////////////////////////////////////////
// Returns the largest value among the first _arraySize floats of x.
// The running maximum starts at 0, so the result is never negative: entries
// below zero are ignored. This is a signed maximum, not a maximum of
// absolute values.
float CG_SOLVER_SSE::maxSSE(float* x)
{
  const int end = 4 * (_arraySize / 4);

  // per-lane running maxima, seeded with zero
  __m128 best = _mm_setzero_ps();
  for (int i = 0; i < end; i += 4)
    best = _mm_max_ps(_mm_load_ps(x + i), best);

  // reduce the four lanes to a single value
  float lane[4];
  _mm_storeu_ps(lane, best);

  float result = lane[1];
  if (lane[0] > lane[1])
    result = lane[0];
  for (int k = 2; k < 4; k++)
  {
    if (!(result > lane[k]))
      result = lane[k];
  }
  return result;
}

//////////////////////////////////////////////////////////////////////
// SSE add
// Y = X + Y
//////////////////////////////////////////////////////////////////////
// In-place element-wise sum y = y + x over the first _arraySize floats.
// x is only read; both arrays are accessed with aligned 16-byte operations.
void CG_SOLVER_SSE::addSSE(float* x, float* y)
{
  const int end = 4 * (_arraySize / 4);
  for (int i = 0; i < end; i += 4)
    _mm_store_ps(y + i, _mm_add_ps(_mm_load_ps(y + i), _mm_load_ps(x + i)));
}

//////////////////////////////////////////////////////////////////////
// SSE multiply
// Y = X * Y
//////////////////////////////////////////////////////////////////////
// In-place element-wise product y = y * x over the first _arraySize floats.
// x is only read; both arrays are accessed with aligned 16-byte operations.
void CG_SOLVER_SSE::multiplySSE(float* x, float* y)
{
  const int end = 4 * (_arraySize / 4);
  for (int i = 0; i < end; i += 4)
    _mm_store_ps(y + i, _mm_mul_ps(_mm_load_ps(y + i), _mm_load_ps(x + i)));
}

//////////////////////////////////////////////////////////////////////
// SSE multiply
// Z = X * Y
//////////////////////////////////////////////////////////////////////
// Element-wise product z = x * y over the first _arraySize floats.
// x and y are only read; z receives the result. All three arrays are
// accessed with aligned 16-byte operations.
void CG_SOLVER_SSE::multiplySSE(float* x, float* y, float* z)
{
  const int end = 4 * (_arraySize / 4);
  for (int i = 0; i < end; i += 4)
    _mm_store_ps(z + i, _mm_mul_ps(_mm_load_ps(y + i), _mm_load_ps(x + i)));
}

//////////////////////////////////////////////////////////////////////
// SSE multiply-subtract
// Z = W - X * Y
//////////////////////////////////////////////////////////////////////
// Element-wise z = w - x * y over the first _arraySize floats.
// w, x and y are only read; z receives the result. All four arrays are
// accessed with aligned 16-byte operations.
//
// The product stays in a register until the subtraction is done, so z is
// written exactly once per block. The call is therefore correct even when z
// is the same array as w.
void CG_SOLVER_SSE::multiplySubtractSSE(float* w, float* x, float* y, float* z)
{
  const int end = 4 * (_arraySize / 4);
  for (int i = 0; i < end; i += 4)
  {
    const __m128 product = _mm_mul_ps(_mm_load_ps(y + i), _mm_load_ps(x + i));
    // every input for this block is read before the store below
    const __m128 minuend = _mm_load_ps(w + i);
    _mm_store_ps(z + i, _mm_sub_ps(minuend, product));
  }
}

//////////////////////////////////////////////////////////////////////
// SSE set
// X = val
//////////////////////////////////////////////////////////////////////
// Fills the first _arraySize floats of x with val, using aligned 16-byte
// stores.
void CG_SOLVER_SSE::setSSE(float* x, float val)
{
  // broadcast once; the same register is stored to every block
  const __m128 fill = _mm_set_ps1(val);
  const int end = 4 * (_arraySize / 4);

  for (int i = 0; i < end; i += 4)
    _mm_store_ps(x + i, fill);
}

//////////////////////////////////////////////////////////////////////
// SSE wipe
// X = 0
//////////////////////////////////////////////////////////////////////
// Zeroes the first _arraySize floats of x, using aligned 16-byte stores.
void CG_SOLVER_SSE::wipeSSE(float* x)
{
  const __m128 zero = _mm_setzero_ps();
  const int end = 4 * (_arraySize / 4);

  for (int i = 0; i < end; i += 4)
    _mm_store_ps(x + i, zero);
}

//////////////////////////////////////////////////////////////////////
// SSE copy
// X = Y
//////////////////////////////////////////////////////////////////////
// Copies the first _arraySize floats of y into x. The destination is the
// FIRST argument. Both arrays are accessed with aligned 16-byte operations.
void CG_SOLVER_SSE::copySSE(float* x, float* y)
{
  const int end = 4 * (_arraySize / 4);
  for (int i = 0; i < end; i += 4)
    _mm_store_ps(x + i, _mm_load_ps(y + i));
}