416 lines
		
	
	
	
		
			12 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			416 lines
		
	
	
	
		
			12 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| ///////////////////////////////////////////////////////////////////////////////////
 | |
| // File : CG_SOLVER_SSE.cpp
 | |
| ///////////////////////////////////////////////////////////////////////////////////
 | |
| //
 | |
| // LumosQuad - A Lightning Generator
 | |
| // Copyright 2007
 | |
| // The University of North Carolina at Chapel Hill
 | |
| // 
 | |
| ///////////////////////////////////////////////////////////////////////////////////
 | |
| //
 | |
| //  This program is free software; you can redistribute it and/or modify
 | |
| //  it under the terms of the GNU General Public License as published by
 | |
| //  the Free Software Foundation; either version 2 of the License, or
 | |
| //  (at your option) any later version.
 | |
| //
 | |
| //  This program is distributed in the hope that it will be useful,
 | |
| //  but WITHOUT ANY WARRANTY; without even the implied warranty of
 | |
| //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | |
| //  GNU General Public License for more details.
 | |
| // 
 | |
| //  You should have received a copy of the GNU General Public License
 | |
| //  along with this program; if not, write to the Free Software
 | |
| //  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 | |
| //
 | |
| //  The University of North Carolina at Chapel Hill makes no representations 
 | |
| //  about the suitability of this software for any purpose. It is provided 
 | |
| //  "as is" without express or implied warranty.
 | |
| //
 | |
| //  Permission to use, copy, modify and distribute this software and its
 | |
| //  documentation for educational, research and non-profit purposes, without
 | |
| //  fee, and without a written agreement is hereby granted, provided that the
 | |
| //  above copyright notice and the following three paragraphs appear in all
 | |
| //  copies.
 | |
| //
 | |
| //  THE UNIVERSITY OF NORTH CAROLINA SPECIFICALLY DISCLAIM ANY WARRANTIES,
 | |
| //  INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 | |
| //  FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS ON AN
 | |
| //  "AS IS" BASIS, AND THE UNIVERSITY OF NORTH CAROLINA HAS NO OBLIGATION TO
 | |
| //  PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
 | |
| //
 | |
| //  Please send questions and comments about LumosQuad to kim@cs.unc.edu.
 | |
| //
 | |
| ///////////////////////////////////////////////////////////////////////////////////
 | |
| //
 | |
| //  This program uses OpenEXR, which has the following restrictions:
 | |
| // 
 | |
| //  Copyright (c) 2002, Industrial Light & Magic, a division of Lucas
 | |
| //  Digital Ltd. LLC
 | |
| // 
 | |
| //  All rights reserved.
 | |
| // 
 | |
| //  Redistribution and use in source and binary forms, with or without
 | |
| //  modification, are permitted provided that the following conditions are
 | |
| //  met:
 | |
| //  *       Redistributions of source code must retain the above copyright
 | |
| //  notice, this list of conditions and the following disclaimer.
 | |
| //  *       Redistributions in binary form must reproduce the above
 | |
| //  copyright notice, this list of conditions and the following disclaimer
 | |
| //  in the documentation and/or other materials provided with the
 | |
| //  distribution.
 | |
| //  *       Neither the name of Industrial Light & Magic nor the names of
 | |
| //  its contributors may be used to endorse or promote products derived
 | |
| //  from this software without specific prior written permission. 
 | |
| // 
 | |
| 
 | |
| #include "CG_SOLVER_SSE.h"
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////
 | |
| // Construction/Destruction
 | |
| //////////////////////////////////////////////////////////////////////
 | |
| 
 | |
| CG_SOLVER_SSE::CG_SOLVER_SSE(int maxDepth, int iterations, int digits) :
 | |
|   CG_SOLVER(maxDepth, iterations, digits)
 | |
| {
 | |
| }
 | |
| 
 | |
| CG_SOLVER_SSE::~CG_SOLVER_SSE()
 | |
| {
 | |
|   if (_direction) free(_direction);
 | |
|   if (_potential) free(_potential);
 | |
|   if (_residual)  free(_residual);
 | |
|   if (_q)         free(_q);
 | |
| 
 | |
|   _direction = NULL;
 | |
|   _residual  = NULL;
 | |
|   _q         = NULL;
 | |
|   _potential = NULL;
 | |
| }
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////
 | |
| // reallocate the sse arrays
 | |
| //////////////////////////////////////////////////////////////////////
 | |
| void CG_SOLVER_SSE::reallocate()
 | |
| {
 | |
|   if (_arraySize >= _listSize) return;
 | |
|   _arraySize = _listSize * 2;
 | |
| 
 | |
|   if (_arraySize % 4)
 | |
|     _arraySize += 4 - _arraySize % 4;
 | |
|   
 | |
|   if (_direction) free(_direction);
 | |
|   if (_potential) free(_potential);
 | |
|   if (_residual)  free(_residual);
 | |
|   if (_q)         free(_q);
 | |
| 
 | |
|   _direction = (float*) aligned_alloc(_arraySize * sizeof(float), 16); 
 | |
|   _potential = (float*) aligned_alloc(_arraySize * sizeof(float), 16); 
 | |
|   _residual  = (float*) aligned_alloc(_arraySize * sizeof(float), 16); 
 | |
|   _q         = (float*) aligned_alloc(_arraySize * sizeof(float), 16); 
 | |
| 
 | |
|   return;
 | |
| }
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////
 | |
| // solve the linear system
 | |
| //////////////////////////////////////////////////////////////////////
 | |
| int CG_SOLVER_SSE::solve(list<CELL*> cells)
 | |
| {
 | |
|   // counters
 | |
|   int x, y, index;
 | |
|   list<CELL*>::iterator cellIterator;
 | |
| 
 | |
|   // i = 0
 | |
|   int i = 0;
 | |
| 
 | |
|   // precalculate stencils
 | |
|   calcStencils(cells);
 | |
|  
 | |
|   // reallocate scratch arrays if necessary
 | |
|   _listSize = cells.size();
 | |
|   reallocate();
 | |
|   wipeSSE(_potential);
 | |
|   wipeSSE(_direction);
 | |
|   wipeSSE(_residual);
 | |
|   wipeSSE(_q);
 | |
|   
 | |
|   // compute a new lexicographical order
 | |
|   cellIterator = cells.begin();
 | |
|   for (x = 0; x < _listSize; x++, cellIterator++)
 | |
|   {
 | |
|     CELL* cell = *cellIterator;
 | |
|     cell->index = x;
 | |
|     _potential[x] = cell->potential;
 | |
|   }
 | |
| 
 | |
|   // r = b - Ax
 | |
|   calcResidual(cells);
 | |
| 
 | |
|   // d = r
 | |
|   copySSE(_direction, _residual);
 | |
|   
 | |
|   // deltaNew = r^T r
 | |
|   float deltaNew = dotSSE(_residual, _residual);
 | |
| 
 | |
|   // delta0 = deltaNew
 | |
|   float delta0 = deltaNew;
 | |
| 
 | |
|   // While deltaNew > (eps^2) * delta0
 | |
|   float eps  = pow(10.0f, (float)-_digits);
 | |
|   float maxR = 2.0f * eps;
 | |
|   while ((i < _iterations) && (maxR > eps))
 | |
|   {
 | |
|     // q = Ad
 | |
|     cellIterator = cells.begin();
 | |
|     for (y = 0; y < _listSize; y++, cellIterator++)
 | |
|     {
 | |
|       CELL* currentCell = *cellIterator;
 | |
|       CELL** neighbors = currentCell->neighbors;
 | |
|       float* stencil = currentCell->stencil;
 | |
|      
 | |
|       float neighborSum = 0.0f;
 | |
|       for (int x = 0; x < 8; x++)
 | |
|       {
 | |
|         if (neighbors[x])
 | |
|           neighborSum += _direction[neighbors[x]->index] * stencil[x];
 | |
|       }
 | |
|       _q[y] = -neighborSum + _direction[y] * currentCell->stencil[8];
 | |
|     }
 | |
| 
 | |
|     // alpha = deltaNew / (transpose(d) * q)
 | |
|     float alpha = dotSSE(_q, _direction);
 | |
|     if (fabs(alpha) > 0.0f)
 | |
|       alpha = deltaNew / alpha;
 | |
| 
 | |
|     // x = x + alpha * d
 | |
|     saxpySSE(alpha, _direction, _potential);
 | |
| 
 | |
|     // r = r - alpha * q
 | |
|     saxpySSE(-alpha, _q, _residual);
 | |
|     maxR = maxSSE(_residual);
 | |
|     
 | |
|     // deltaOld = deltaNew
 | |
|     float deltaOld = deltaNew;
 | |
| 
 | |
|     // deltaNew = transpose(r) * r
 | |
|     deltaNew = dotSSE(_residual, _residual);
 | |
| 
 | |
|     // beta = deltaNew / deltaOld
 | |
|     float beta = deltaNew / deltaOld;
 | |
| 
 | |
|     // d = r + beta * d
 | |
|     saypxSSE(beta, _residual, _direction);
 | |
| 
 | |
|     // i = i + 1
 | |
|     i++;
 | |
|   }
 | |
| 
 | |
|   // copy back into the tree
 | |
|   cellIterator = cells.begin();
 | |
|   for (x = 0; x < _listSize; x++, cellIterator++)
 | |
|     (*cellIterator)->potential = _potential[x];
 | |
| 
 | |
|   return i;
 | |
| }
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////
 | |
| // dot product of two vectors
 | |
| //////////////////////////////////////////////////////////////////////
 | |
| float CG_SOLVER_SSE::dotSSE(float* x, float* y)
 | |
| {
 | |
|   __m128 sum = _mm_set_ps1(0.0f);
 | |
|   __m128* xSSE = (__m128*)x;
 | |
|   __m128* ySSE = (__m128*)y;
 | |
|   __m128 temp;
 | |
|   for (int index = 0; index < _arraySize / 4; index++)
 | |
|   {
 | |
|     temp = _mm_mul_ps(*xSSE, *ySSE);
 | |
|     sum  = _mm_add_ps(sum, temp);
 | |
|     xSSE++;
 | |
|     ySSE++;
 | |
|   }
 | |
|   union u {
 | |
|     __m128 m;
 | |
|     float f[4];
 | |
|   } extract;
 | |
|   extract.m = sum;
 | |
|   return extract.f[0] + extract.f[1] + extract.f[2] + extract.f[3];
 | |
| }
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////
 | |
| // scalar 'a' x + y
 | |
| // Y = aX + Y
 | |
| //////////////////////////////////////////////////////////////////////
 | |
| void CG_SOLVER_SSE::saxpySSE(float s, float* x, float* y)
 | |
| {
 | |
|   __m128* ySSE = (__m128*)y;
 | |
|   __m128* xSSE = (__m128*)x;
 | |
|   __m128 sSSE = _mm_set_ps1(s);
 | |
|   __m128 temp;
 | |
|   for (int index = 0; index < _arraySize / 4; index++)
 | |
|   {
 | |
|     temp = _mm_mul_ps(*xSSE, sSSE);
 | |
|     *ySSE = _mm_add_ps(*ySSE, temp);
 | |
| 
 | |
|     xSSE++;
 | |
|     ySSE++;
 | |
|   }
 | |
| }
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////
 | |
| // scalar 'a' y + x
 | |
| // Y = aY + X
 | |
| //////////////////////////////////////////////////////////////////////
 | |
| void CG_SOLVER_SSE::saypxSSE(float s, float* x, float* y)
 | |
| {
 | |
|   __m128* ySSE = (__m128*)y;
 | |
|   __m128* xSSE = (__m128*)x;
 | |
|   __m128 sSSE = _mm_set_ps1(s);
 | |
|   __m128 temp;
 | |
|   for (int index = 0; index < _arraySize / 4; index++)
 | |
|   {
 | |
|     temp = _mm_mul_ps(*ySSE, sSSE);
 | |
|     *ySSE = _mm_add_ps(*xSSE, temp);
 | |
| 
 | |
|     xSSE++;
 | |
|     ySSE++;
 | |
|   }
 | |
| }
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////
 | |
| // scalar 'a' y + x
 | |
| // Y = aY + X
 | |
| //////////////////////////////////////////////////////////////////////
 | |
| float CG_SOLVER_SSE::maxSSE(float* x)
 | |
| {
 | |
|   __m128 maxFoundSSE= _mm_set_ps1(0.0f);
 | |
|   __m128* xSSE = (__m128*)x;
 | |
|   for (int index = 0; index < _arraySize / 4; index++)
 | |
|   {
 | |
|     maxFoundSSE = _mm_max_ps(*xSSE, maxFoundSSE);
 | |
|     xSSE++;
 | |
|   }
 | |
|   union u {
 | |
|     __m128 m;
 | |
|     float f[4];
 | |
|   } extract;
 | |
|   extract.m = maxFoundSSE;
 | |
|   float maxFound = extract.f[0] > extract.f[1] ? extract.f[0] : extract.f[1];
 | |
|   maxFound = maxFound > extract.f[2] ? maxFound : extract.f[2];
 | |
|   return maxFound > extract.f[3] ? maxFound : extract.f[3];
 | |
| }
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////
 | |
| // SSE add
 | |
| // Y = X + Y
 | |
| //////////////////////////////////////////////////////////////////////
 | |
| void CG_SOLVER_SSE::addSSE(float* x, float* y)
 | |
| {
 | |
|   __m128* ySSE = (__m128*)y;
 | |
|   __m128* xSSE = (__m128*)x;
 | |
|   for (int index = 0; index < _arraySize / 4; index++)
 | |
|   {
 | |
|     *ySSE = _mm_add_ps(*ySSE, *xSSE);
 | |
|     xSSE++;
 | |
|     ySSE++;
 | |
|   }
 | |
| }
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////
 | |
| // SSE multiply
 | |
| // Y = X * Y
 | |
| //////////////////////////////////////////////////////////////////////
 | |
| void CG_SOLVER_SSE::multiplySSE(float* x, float* y)
 | |
| {
 | |
|   __m128* ySSE = (__m128*)y;
 | |
|   __m128* xSSE = (__m128*)x;
 | |
|   for (int index = 0; index < _arraySize / 4; index++)
 | |
|   {
 | |
|     *ySSE = _mm_mul_ps(*ySSE, *xSSE);
 | |
|     xSSE++;
 | |
|     ySSE++;
 | |
|   }
 | |
| }
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////
 | |
| // SSE multiply
 | |
| // Z = X * Y
 | |
| //////////////////////////////////////////////////////////////////////
 | |
| void CG_SOLVER_SSE::multiplySSE(float* x, float* y, float* z)
 | |
| {
 | |
|   __m128* zSSE = (__m128*)z;
 | |
|   __m128* ySSE = (__m128*)y;
 | |
|   __m128* xSSE = (__m128*)x;
 | |
|   for (int index = 0; index < _arraySize / 4; index++)
 | |
|   {
 | |
|     *zSSE = _mm_mul_ps(*ySSE, *xSSE);
 | |
|     xSSE++;
 | |
|     ySSE++;
 | |
|     zSSE++;
 | |
|   }
 | |
| }
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////
 | |
| // SSE multiply
 | |
| // Z = W - X * Y
 | |
| //////////////////////////////////////////////////////////////////////
 | |
| void CG_SOLVER_SSE::multiplySubtractSSE(float* w, float* x, float* y, float* z)
 | |
| {
 | |
|   __m128* zSSE = (__m128*)z;
 | |
|   __m128* ySSE = (__m128*)y;
 | |
|   __m128* xSSE = (__m128*)x;
 | |
|   __m128* wSSE = (__m128*)w;
 | |
|   for (int index = 0; index < _arraySize / 4; index++)
 | |
|   {
 | |
|     *zSSE = _mm_mul_ps(*ySSE, *xSSE);
 | |
|     *zSSE = _mm_sub_ps(*wSSE, *zSSE);
 | |
|     
 | |
|     xSSE++;
 | |
|     ySSE++;
 | |
|     zSSE++;
 | |
|     wSSE++;
 | |
|   }
 | |
| }
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////
 | |
| // SSE set
 | |
| // X = val
 | |
| //////////////////////////////////////////////////////////////////////
 | |
| void CG_SOLVER_SSE::setSSE(float* x, float val)
 | |
| {
 | |
|   __m128* xSSE = (__m128*)x;
 | |
|   for (int index = 0; index < _arraySize / 4; index++)
 | |
|   {
 | |
|     *xSSE = _mm_set_ps1(val);
 | |
|     xSSE++;
 | |
|   }
 | |
| }
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////
 | |
| // SSE set
 | |
| // X = 0
 | |
| //////////////////////////////////////////////////////////////////////
 | |
| void CG_SOLVER_SSE::wipeSSE(float* x)
 | |
| {
 | |
|   __m128* xSSE = (__m128*)x;
 | |
|   for (int index = 0; index < _arraySize / 4; index++)
 | |
|   {
 | |
|     *xSSE = _mm_setzero_ps();
 | |
|     xSSE++;
 | |
|   }
 | |
| }
 | |
| 
 | |
| //////////////////////////////////////////////////////////////////////
 | |
| // SSE set
 | |
| // X = Y
 | |
| //////////////////////////////////////////////////////////////////////
 | |
| void CG_SOLVER_SSE::copySSE(float* x, float* y)
 | |
| {
 | |
|   __m128* ySSE = (__m128*)y;
 | |
|   for (int index = 0; index < _arraySize / 4; index++)
 | |
|   {
 | |
|     _mm_store_ps(x,*ySSE);    
 | |
|     x += 4;
 | |
|     ySSE++;
 | |
|   }
 | |
| }
 |