Initial commit
This commit is contained in:
		
				commit
				
					
						7cdac8ded9
					
				
			
		
					 45 changed files with 7275 additions and 0 deletions
				
			
		
							
								
								
									
										416
									
								
								CG_SOLVER_SSE.cpp
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										416
									
								
								CG_SOLVER_SSE.cpp
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,416 @@ | |||
| ///////////////////////////////////////////////////////////////////////////////////
 | ||||
| // File : CG_SOLVER_SSE.cpp
 | ||||
| ///////////////////////////////////////////////////////////////////////////////////
 | ||||
| //
 | ||||
| // LumosQuad - A Lightning Generator
 | ||||
| // Copyright 2007
 | ||||
| // The University of North Carolina at Chapel Hill
 | ||||
| // 
 | ||||
| ///////////////////////////////////////////////////////////////////////////////////
 | ||||
| //
 | ||||
| //  This program is free software; you can redistribute it and/or modify
 | ||||
| //  it under the terms of the GNU General Public License as published by
 | ||||
| //  the Free Software Foundation; either version 2 of the License, or
 | ||||
| //  (at your option) any later version.
 | ||||
| //
 | ||||
| //  This program is distributed in the hope that it will be useful,
 | ||||
| //  but WITHOUT ANY WARRANTY; without even the implied warranty of
 | ||||
| //  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 | ||||
| //  GNU General Public License for more details.
 | ||||
| // 
 | ||||
| //  You should have received a copy of the GNU General Public License
 | ||||
| //  along with this program; if not, write to the Free Software
 | ||||
| //  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 | ||||
| //
 | ||||
| //  The University of North Carolina at Chapel Hill makes no representations 
 | ||||
| //  about the suitability of this software for any purpose. It is provided 
 | ||||
| //  "as is" without express or implied warranty.
 | ||||
| //
 | ||||
| //  Permission to use, copy, modify and distribute this software and its
 | ||||
| //  documentation for educational, research and non-profit purposes, without
 | ||||
| //  fee, and without a written agreement is hereby granted, provided that the
 | ||||
| //  above copyright notice and the following three paragraphs appear in all
 | ||||
| //  copies.
 | ||||
| //
 | ||||
| //  THE UNIVERSITY OF NORTH CAROLINA SPECIFICALLY DISCLAIM ANY WARRANTIES,
 | ||||
| //  INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
 | ||||
| //  FITNESS FOR A PARTICULAR PURPOSE.  THE SOFTWARE PROVIDED HEREUNDER IS ON AN
 | ||||
| //  "AS IS" BASIS, AND THE UNIVERSITY OF NORTH CAROLINA HAS NO OBLIGATION TO
 | ||||
| //  PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
 | ||||
| //
 | ||||
| //  Please send questions and comments about LumosQuad to kim@cs.unc.edu.
 | ||||
| //
 | ||||
| ///////////////////////////////////////////////////////////////////////////////////
 | ||||
| //
 | ||||
| //  This program uses OpenEXR, which has the following restrictions:
 | ||||
| // 
 | ||||
| //  Copyright (c) 2002, Industrial Light & Magic, a division of Lucas
 | ||||
| //  Digital Ltd. LLC
 | ||||
| // 
 | ||||
| //  All rights reserved.
 | ||||
| // 
 | ||||
| //  Redistribution and use in source and binary forms, with or without
 | ||||
| //  modification, are permitted provided that the following conditions are
 | ||||
| //  met:
 | ||||
| //  *       Redistributions of source code must retain the above copyright
 | ||||
| //  notice, this list of conditions and the following disclaimer.
 | ||||
| //  *       Redistributions in binary form must reproduce the above
 | ||||
| //  copyright notice, this list of conditions and the following disclaimer
 | ||||
| //  in the documentation and/or other materials provided with the
 | ||||
| //  distribution.
 | ||||
| //  *       Neither the name of Industrial Light & Magic nor the names of
 | ||||
| //  its contributors may be used to endorse or promote products derived
 | ||||
| //  from this software without specific prior written permission. 
 | ||||
| // 
 | ||||
| 
 | ||||
| #include "CG_SOLVER_SSE.h" | ||||
| 
 | ||||
| //////////////////////////////////////////////////////////////////////
 | ||||
| // Construction/Destruction
 | ||||
| //////////////////////////////////////////////////////////////////////
 | ||||
| 
 | ||||
| CG_SOLVER_SSE::CG_SOLVER_SSE(int maxDepth, int iterations, int digits) : | ||||
|   CG_SOLVER(maxDepth, iterations, digits) | ||||
| { | ||||
| } | ||||
| 
 | ||||
| CG_SOLVER_SSE::~CG_SOLVER_SSE() | ||||
| { | ||||
|   if (_direction) free(_direction); | ||||
|   if (_potential) free(_potential); | ||||
|   if (_residual)  free(_residual); | ||||
|   if (_q)         free(_q); | ||||
| 
 | ||||
|   _direction = NULL; | ||||
|   _residual  = NULL; | ||||
|   _q         = NULL; | ||||
|   _potential = NULL; | ||||
| } | ||||
| 
 | ||||
| //////////////////////////////////////////////////////////////////////
 | ||||
| // reallocate the sse arrays
 | ||||
| //////////////////////////////////////////////////////////////////////
 | ||||
| void CG_SOLVER_SSE::reallocate() | ||||
| { | ||||
|   if (_arraySize >= _listSize) return; | ||||
|   _arraySize = _listSize * 2; | ||||
| 
 | ||||
|   if (_arraySize % 4) | ||||
|     _arraySize += 4 - _arraySize % 4; | ||||
|    | ||||
|   if (_direction) free(_direction); | ||||
|   if (_potential) free(_potential); | ||||
|   if (_residual)  free(_residual); | ||||
|   if (_q)         free(_q); | ||||
| 
 | ||||
|   _direction = (float*) aligned_alloc(_arraySize * sizeof(float), 16);  | ||||
|   _potential = (float*) aligned_alloc(_arraySize * sizeof(float), 16);  | ||||
|   _residual  = (float*) aligned_alloc(_arraySize * sizeof(float), 16);  | ||||
|   _q         = (float*) aligned_alloc(_arraySize * sizeof(float), 16);  | ||||
| 
 | ||||
|   return; | ||||
| } | ||||
| 
 | ||||
| //////////////////////////////////////////////////////////////////////
 | ||||
| // solve the linear system
 | ||||
| //////////////////////////////////////////////////////////////////////
 | ||||
| int CG_SOLVER_SSE::solve(list<CELL*> cells) | ||||
| { | ||||
|   // counters
 | ||||
|   int x, y, index; | ||||
|   list<CELL*>::iterator cellIterator; | ||||
| 
 | ||||
|   // i = 0
 | ||||
|   int i = 0; | ||||
| 
 | ||||
|   // precalculate stencils
 | ||||
|   calcStencils(cells); | ||||
|   | ||||
|   // reallocate scratch arrays if necessary
 | ||||
|   _listSize = cells.size(); | ||||
|   reallocate(); | ||||
|   wipeSSE(_potential); | ||||
|   wipeSSE(_direction); | ||||
|   wipeSSE(_residual); | ||||
|   wipeSSE(_q); | ||||
|    | ||||
|   // compute a new lexicographical order
 | ||||
|   cellIterator = cells.begin(); | ||||
|   for (x = 0; x < _listSize; x++, cellIterator++) | ||||
|   { | ||||
|     CELL* cell = *cellIterator; | ||||
|     cell->index = x; | ||||
|     _potential[x] = cell->potential; | ||||
|   } | ||||
| 
 | ||||
|   // r = b - Ax
 | ||||
|   calcResidual(cells); | ||||
| 
 | ||||
|   // d = r
 | ||||
|   copySSE(_direction, _residual); | ||||
|    | ||||
|   // deltaNew = r^T r
 | ||||
|   float deltaNew = dotSSE(_residual, _residual); | ||||
| 
 | ||||
|   // delta0 = deltaNew
 | ||||
|   float delta0 = deltaNew; | ||||
| 
 | ||||
|   // While deltaNew > (eps^2) * delta0
 | ||||
|   float eps  = pow(10.0f, (float)-_digits); | ||||
|   float maxR = 2.0f * eps; | ||||
|   while ((i < _iterations) && (maxR > eps)) | ||||
|   { | ||||
|     // q = Ad
 | ||||
|     cellIterator = cells.begin(); | ||||
|     for (y = 0; y < _listSize; y++, cellIterator++) | ||||
|     { | ||||
|       CELL* currentCell = *cellIterator; | ||||
|       CELL** neighbors = currentCell->neighbors; | ||||
|       float* stencil = currentCell->stencil; | ||||
|       | ||||
|       float neighborSum = 0.0f; | ||||
|       for (int x = 0; x < 8; x++) | ||||
|       { | ||||
|         if (neighbors[x]) | ||||
|           neighborSum += _direction[neighbors[x]->index] * stencil[x]; | ||||
|       } | ||||
|       _q[y] = -neighborSum + _direction[y] * currentCell->stencil[8]; | ||||
|     } | ||||
| 
 | ||||
|     // alpha = deltaNew / (transpose(d) * q)
 | ||||
|     float alpha = dotSSE(_q, _direction); | ||||
|     if (fabs(alpha) > 0.0f) | ||||
|       alpha = deltaNew / alpha; | ||||
| 
 | ||||
|     // x = x + alpha * d
 | ||||
|     saxpySSE(alpha, _direction, _potential); | ||||
| 
 | ||||
|     // r = r - alpha * q
 | ||||
|     saxpySSE(-alpha, _q, _residual); | ||||
|     maxR = maxSSE(_residual); | ||||
|      | ||||
|     // deltaOld = deltaNew
 | ||||
|     float deltaOld = deltaNew; | ||||
| 
 | ||||
|     // deltaNew = transpose(r) * r
 | ||||
|     deltaNew = dotSSE(_residual, _residual); | ||||
| 
 | ||||
|     // beta = deltaNew / deltaOld
 | ||||
|     float beta = deltaNew / deltaOld; | ||||
| 
 | ||||
|     // d = r + beta * d
 | ||||
|     saypxSSE(beta, _residual, _direction); | ||||
| 
 | ||||
|     // i = i + 1
 | ||||
|     i++; | ||||
|   } | ||||
| 
 | ||||
|   // copy back into the tree
 | ||||
|   cellIterator = cells.begin(); | ||||
|   for (x = 0; x < _listSize; x++, cellIterator++) | ||||
|     (*cellIterator)->potential = _potential[x]; | ||||
| 
 | ||||
|   return i; | ||||
| } | ||||
| 
 | ||||
| //////////////////////////////////////////////////////////////////////
 | ||||
| // dot product of two vectors
 | ||||
| //////////////////////////////////////////////////////////////////////
 | ||||
| float CG_SOLVER_SSE::dotSSE(float* x, float* y) | ||||
| { | ||||
|   __m128 sum = _mm_set_ps1(0.0f); | ||||
|   __m128* xSSE = (__m128*)x; | ||||
|   __m128* ySSE = (__m128*)y; | ||||
|   __m128 temp; | ||||
|   for (int index = 0; index < _arraySize / 4; index++) | ||||
|   { | ||||
|     temp = _mm_mul_ps(*xSSE, *ySSE); | ||||
|     sum  = _mm_add_ps(sum, temp); | ||||
|     xSSE++; | ||||
|     ySSE++; | ||||
|   } | ||||
|   union u { | ||||
|     __m128 m; | ||||
|     float f[4]; | ||||
|   } extract; | ||||
|   extract.m = sum; | ||||
|   return extract.f[0] + extract.f[1] + extract.f[2] + extract.f[3]; | ||||
| } | ||||
| 
 | ||||
| //////////////////////////////////////////////////////////////////////
 | ||||
| // scalar 'a' x + y
 | ||||
| // Y = aX + Y
 | ||||
| //////////////////////////////////////////////////////////////////////
 | ||||
| void CG_SOLVER_SSE::saxpySSE(float s, float* x, float* y) | ||||
| { | ||||
|   __m128* ySSE = (__m128*)y; | ||||
|   __m128* xSSE = (__m128*)x; | ||||
|   __m128 sSSE = _mm_set_ps1(s); | ||||
|   __m128 temp; | ||||
|   for (int index = 0; index < _arraySize / 4; index++) | ||||
|   { | ||||
|     temp = _mm_mul_ps(*xSSE, sSSE); | ||||
|     *ySSE = _mm_add_ps(*ySSE, temp); | ||||
| 
 | ||||
|     xSSE++; | ||||
|     ySSE++; | ||||
|   } | ||||
| } | ||||
| 
 | ||||
| //////////////////////////////////////////////////////////////////////
 | ||||
| // scalar 'a' y + x
 | ||||
| // Y = aY + X
 | ||||
| //////////////////////////////////////////////////////////////////////
 | ||||
| void CG_SOLVER_SSE::saypxSSE(float s, float* x, float* y) | ||||
| { | ||||
|   __m128* ySSE = (__m128*)y; | ||||
|   __m128* xSSE = (__m128*)x; | ||||
|   __m128 sSSE = _mm_set_ps1(s); | ||||
|   __m128 temp; | ||||
|   for (int index = 0; index < _arraySize / 4; index++) | ||||
|   { | ||||
|     temp = _mm_mul_ps(*ySSE, sSSE); | ||||
|     *ySSE = _mm_add_ps(*xSSE, temp); | ||||
| 
 | ||||
|     xSSE++; | ||||
|     ySSE++; | ||||
|   } | ||||
| } | ||||
| 
 | ||||
| //////////////////////////////////////////////////////////////////////
 | ||||
| // scalar 'a' y + x
 | ||||
| // Y = aY + X
 | ||||
| //////////////////////////////////////////////////////////////////////
 | ||||
| float CG_SOLVER_SSE::maxSSE(float* x) | ||||
| { | ||||
|   __m128 maxFoundSSE= _mm_set_ps1(0.0f); | ||||
|   __m128* xSSE = (__m128*)x; | ||||
|   for (int index = 0; index < _arraySize / 4; index++) | ||||
|   { | ||||
|     maxFoundSSE = _mm_max_ps(*xSSE, maxFoundSSE); | ||||
|     xSSE++; | ||||
|   } | ||||
|   union u { | ||||
|     __m128 m; | ||||
|     float f[4]; | ||||
|   } extract; | ||||
|   extract.m = maxFoundSSE; | ||||
|   float maxFound = extract.f[0] > extract.f[1] ? extract.f[0] : extract.f[1]; | ||||
|   maxFound = maxFound > extract.f[2] ? maxFound : extract.f[2]; | ||||
|   return maxFound > extract.f[3] ? maxFound : extract.f[3]; | ||||
| } | ||||
| 
 | ||||
| //////////////////////////////////////////////////////////////////////
 | ||||
| // SSE add
 | ||||
| // Y = X + Y
 | ||||
| //////////////////////////////////////////////////////////////////////
 | ||||
| void CG_SOLVER_SSE::addSSE(float* x, float* y) | ||||
| { | ||||
|   __m128* ySSE = (__m128*)y; | ||||
|   __m128* xSSE = (__m128*)x; | ||||
|   for (int index = 0; index < _arraySize / 4; index++) | ||||
|   { | ||||
|     *ySSE = _mm_add_ps(*ySSE, *xSSE); | ||||
|     xSSE++; | ||||
|     ySSE++; | ||||
|   } | ||||
| } | ||||
| 
 | ||||
| //////////////////////////////////////////////////////////////////////
 | ||||
| // SSE multiply
 | ||||
| // Y = X * Y
 | ||||
| //////////////////////////////////////////////////////////////////////
 | ||||
| void CG_SOLVER_SSE::multiplySSE(float* x, float* y) | ||||
| { | ||||
|   __m128* ySSE = (__m128*)y; | ||||
|   __m128* xSSE = (__m128*)x; | ||||
|   for (int index = 0; index < _arraySize / 4; index++) | ||||
|   { | ||||
|     *ySSE = _mm_mul_ps(*ySSE, *xSSE); | ||||
|     xSSE++; | ||||
|     ySSE++; | ||||
|   } | ||||
| } | ||||
| 
 | ||||
| //////////////////////////////////////////////////////////////////////
 | ||||
| // SSE multiply
 | ||||
| // Z = X * Y
 | ||||
| //////////////////////////////////////////////////////////////////////
 | ||||
| void CG_SOLVER_SSE::multiplySSE(float* x, float* y, float* z) | ||||
| { | ||||
|   __m128* zSSE = (__m128*)z; | ||||
|   __m128* ySSE = (__m128*)y; | ||||
|   __m128* xSSE = (__m128*)x; | ||||
|   for (int index = 0; index < _arraySize / 4; index++) | ||||
|   { | ||||
|     *zSSE = _mm_mul_ps(*ySSE, *xSSE); | ||||
|     xSSE++; | ||||
|     ySSE++; | ||||
|     zSSE++; | ||||
|   } | ||||
| } | ||||
| 
 | ||||
| //////////////////////////////////////////////////////////////////////
 | ||||
| // SSE multiply
 | ||||
| // Z = W - X * Y
 | ||||
| //////////////////////////////////////////////////////////////////////
 | ||||
| void CG_SOLVER_SSE::multiplySubtractSSE(float* w, float* x, float* y, float* z) | ||||
| { | ||||
|   __m128* zSSE = (__m128*)z; | ||||
|   __m128* ySSE = (__m128*)y; | ||||
|   __m128* xSSE = (__m128*)x; | ||||
|   __m128* wSSE = (__m128*)w; | ||||
|   for (int index = 0; index < _arraySize / 4; index++) | ||||
|   { | ||||
|     *zSSE = _mm_mul_ps(*ySSE, *xSSE); | ||||
|     *zSSE = _mm_sub_ps(*wSSE, *zSSE); | ||||
|      | ||||
|     xSSE++; | ||||
|     ySSE++; | ||||
|     zSSE++; | ||||
|     wSSE++; | ||||
|   } | ||||
| } | ||||
| 
 | ||||
| //////////////////////////////////////////////////////////////////////
 | ||||
| // SSE set
 | ||||
| // X = val
 | ||||
| //////////////////////////////////////////////////////////////////////
 | ||||
| void CG_SOLVER_SSE::setSSE(float* x, float val) | ||||
| { | ||||
|   __m128* xSSE = (__m128*)x; | ||||
|   for (int index = 0; index < _arraySize / 4; index++) | ||||
|   { | ||||
|     *xSSE = _mm_set_ps1(val); | ||||
|     xSSE++; | ||||
|   } | ||||
| } | ||||
| 
 | ||||
| //////////////////////////////////////////////////////////////////////
 | ||||
| // SSE set
 | ||||
| // X = 0
 | ||||
| //////////////////////////////////////////////////////////////////////
 | ||||
| void CG_SOLVER_SSE::wipeSSE(float* x) | ||||
| { | ||||
|   __m128* xSSE = (__m128*)x; | ||||
|   for (int index = 0; index < _arraySize / 4; index++) | ||||
|   { | ||||
|     *xSSE = _mm_setzero_ps(); | ||||
|     xSSE++; | ||||
|   } | ||||
| } | ||||
| 
 | ||||
| //////////////////////////////////////////////////////////////////////
 | ||||
| // SSE set
 | ||||
| // X = Y
 | ||||
| //////////////////////////////////////////////////////////////////////
 | ||||
| void CG_SOLVER_SSE::copySSE(float* x, float* y) | ||||
| { | ||||
|   __m128* ySSE = (__m128*)y; | ||||
|   for (int index = 0; index < _arraySize / 4; index++) | ||||
|   { | ||||
|     _mm_store_ps(x,*ySSE);     | ||||
|     x += 4; | ||||
|     ySSE++; | ||||
|   } | ||||
| } | ||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue