lumosquad/CG_SOLVER_SSE.cpp
2025-07-26 22:14:37 +02:00

416 lines
12 KiB
C++

///////////////////////////////////////////////////////////////////////////////////
// File : CG_SOLVER_SSE.cpp
///////////////////////////////////////////////////////////////////////////////////
//
// LumosQuad - A Lightning Generator
// Copyright 2007
// The University of North Carolina at Chapel Hill
//
///////////////////////////////////////////////////////////////////////////////////
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// The University of North Carolina at Chapel Hill makes no representations
// about the suitability of this software for any purpose. It is provided
// "as is" without express or implied warranty.
//
// Permission to use, copy, modify and distribute this software and its
// documentation for educational, research and non-profit purposes, without
// fee, and without a written agreement is hereby granted, provided that the
// above copyright notice and the following three paragraphs appear in all
// copies.
//
// THE UNIVERSITY OF NORTH CAROLINA SPECIFICALLY DISCLAIM ANY WARRANTIES,
// INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
// FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN
// "AS IS" BASIS, AND THE UNIVERSITY OF NORTH CAROLINA HAS NO OBLIGATION TO
// PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
//
// Please send questions and comments about LumosQuad to kim@cs.unc.edu.
//
///////////////////////////////////////////////////////////////////////////////////
//
// This program uses OpenEXR, which has the following restrictions:
//
// Copyright (c) 2002, Industrial Light & Magic, a division of Lucas
// Digital Ltd. LLC
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Industrial Light & Magic nor the names of
// its contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
#include "CG_SOLVER_SSE.h"
//////////////////////////////////////////////////////////////////////
// Construction/Destruction
//////////////////////////////////////////////////////////////////////
CG_SOLVER_SSE::CG_SOLVER_SSE(int maxDepth, int iterations, int digits) :
CG_SOLVER(maxDepth, iterations, digits)
{
}
CG_SOLVER_SSE::~CG_SOLVER_SSE()
{
if (_direction) free(_direction);
if (_potential) free(_potential);
if (_residual) free(_residual);
if (_q) free(_q);
_direction = NULL;
_residual = NULL;
_q = NULL;
_potential = NULL;
}
//////////////////////////////////////////////////////////////////////
// reallocate the sse arrays
//////////////////////////////////////////////////////////////////////
void CG_SOLVER_SSE::reallocate()
{
if (_arraySize >= _listSize) return;
_arraySize = _listSize * 2;
if (_arraySize % 4)
_arraySize += 4 - _arraySize % 4;
if (_direction) free(_direction);
if (_potential) free(_potential);
if (_residual) free(_residual);
if (_q) free(_q);
_direction = (float*) aligned_alloc(_arraySize * sizeof(float), 16);
_potential = (float*) aligned_alloc(_arraySize * sizeof(float), 16);
_residual = (float*) aligned_alloc(_arraySize * sizeof(float), 16);
_q = (float*) aligned_alloc(_arraySize * sizeof(float), 16);
return;
}
//////////////////////////////////////////////////////////////////////
// solve the linear system
//////////////////////////////////////////////////////////////////////
int CG_SOLVER_SSE::solve(list<CELL*> cells)
{
// counters
int x, y, index;
list<CELL*>::iterator cellIterator;
// i = 0
int i = 0;
// precalculate stencils
calcStencils(cells);
// reallocate scratch arrays if necessary
_listSize = cells.size();
reallocate();
wipeSSE(_potential);
wipeSSE(_direction);
wipeSSE(_residual);
wipeSSE(_q);
// compute a new lexicographical order
cellIterator = cells.begin();
for (x = 0; x < _listSize; x++, cellIterator++)
{
CELL* cell = *cellIterator;
cell->index = x;
_potential[x] = cell->potential;
}
// r = b - Ax
calcResidual(cells);
// d = r
copySSE(_direction, _residual);
// deltaNew = r^T r
float deltaNew = dotSSE(_residual, _residual);
// delta0 = deltaNew
float delta0 = deltaNew;
// While deltaNew > (eps^2) * delta0
float eps = pow(10.0f, (float)-_digits);
float maxR = 2.0f * eps;
while ((i < _iterations) && (maxR > eps))
{
// q = Ad
cellIterator = cells.begin();
for (y = 0; y < _listSize; y++, cellIterator++)
{
CELL* currentCell = *cellIterator;
CELL** neighbors = currentCell->neighbors;
float* stencil = currentCell->stencil;
float neighborSum = 0.0f;
for (int x = 0; x < 8; x++)
{
if (neighbors[x])
neighborSum += _direction[neighbors[x]->index] * stencil[x];
}
_q[y] = -neighborSum + _direction[y] * currentCell->stencil[8];
}
// alpha = deltaNew / (transpose(d) * q)
float alpha = dotSSE(_q, _direction);
if (fabs(alpha) > 0.0f)
alpha = deltaNew / alpha;
// x = x + alpha * d
saxpySSE(alpha, _direction, _potential);
// r = r - alpha * q
saxpySSE(-alpha, _q, _residual);
maxR = maxSSE(_residual);
// deltaOld = deltaNew
float deltaOld = deltaNew;
// deltaNew = transpose(r) * r
deltaNew = dotSSE(_residual, _residual);
// beta = deltaNew / deltaOld
float beta = deltaNew / deltaOld;
// d = r + beta * d
saypxSSE(beta, _residual, _direction);
// i = i + 1
i++;
}
// copy back into the tree
cellIterator = cells.begin();
for (x = 0; x < _listSize; x++, cellIterator++)
(*cellIterator)->potential = _potential[x];
return i;
}
//////////////////////////////////////////////////////////////////////
// dot product of two vectors
//////////////////////////////////////////////////////////////////////
float CG_SOLVER_SSE::dotSSE(float* x, float* y)
{
__m128 sum = _mm_set_ps1(0.0f);
__m128* xSSE = (__m128*)x;
__m128* ySSE = (__m128*)y;
__m128 temp;
for (int index = 0; index < _arraySize / 4; index++)
{
temp = _mm_mul_ps(*xSSE, *ySSE);
sum = _mm_add_ps(sum, temp);
xSSE++;
ySSE++;
}
union u {
__m128 m;
float f[4];
} extract;
extract.m = sum;
return extract.f[0] + extract.f[1] + extract.f[2] + extract.f[3];
}
//////////////////////////////////////////////////////////////////////
// scalar 'a' x + y
// Y = aX + Y
//////////////////////////////////////////////////////////////////////
void CG_SOLVER_SSE::saxpySSE(float s, float* x, float* y)
{
__m128* ySSE = (__m128*)y;
__m128* xSSE = (__m128*)x;
__m128 sSSE = _mm_set_ps1(s);
__m128 temp;
for (int index = 0; index < _arraySize / 4; index++)
{
temp = _mm_mul_ps(*xSSE, sSSE);
*ySSE = _mm_add_ps(*ySSE, temp);
xSSE++;
ySSE++;
}
}
//////////////////////////////////////////////////////////////////////
// scalar 'a' y + x
// Y = aY + X
//////////////////////////////////////////////////////////////////////
void CG_SOLVER_SSE::saypxSSE(float s, float* x, float* y)
{
__m128* ySSE = (__m128*)y;
__m128* xSSE = (__m128*)x;
__m128 sSSE = _mm_set_ps1(s);
__m128 temp;
for (int index = 0; index < _arraySize / 4; index++)
{
temp = _mm_mul_ps(*ySSE, sSSE);
*ySSE = _mm_add_ps(*xSSE, temp);
xSSE++;
ySSE++;
}
}
//////////////////////////////////////////////////////////////////////
// scalar 'a' y + x
// Y = aY + X
//////////////////////////////////////////////////////////////////////
float CG_SOLVER_SSE::maxSSE(float* x)
{
__m128 maxFoundSSE= _mm_set_ps1(0.0f);
__m128* xSSE = (__m128*)x;
for (int index = 0; index < _arraySize / 4; index++)
{
maxFoundSSE = _mm_max_ps(*xSSE, maxFoundSSE);
xSSE++;
}
union u {
__m128 m;
float f[4];
} extract;
extract.m = maxFoundSSE;
float maxFound = extract.f[0] > extract.f[1] ? extract.f[0] : extract.f[1];
maxFound = maxFound > extract.f[2] ? maxFound : extract.f[2];
return maxFound > extract.f[3] ? maxFound : extract.f[3];
}
//////////////////////////////////////////////////////////////////////
// SSE add
// Y = X + Y
//////////////////////////////////////////////////////////////////////
void CG_SOLVER_SSE::addSSE(float* x, float* y)
{
__m128* ySSE = (__m128*)y;
__m128* xSSE = (__m128*)x;
for (int index = 0; index < _arraySize / 4; index++)
{
*ySSE = _mm_add_ps(*ySSE, *xSSE);
xSSE++;
ySSE++;
}
}
//////////////////////////////////////////////////////////////////////
// SSE multiply
// Y = X * Y
//////////////////////////////////////////////////////////////////////
void CG_SOLVER_SSE::multiplySSE(float* x, float* y)
{
__m128* ySSE = (__m128*)y;
__m128* xSSE = (__m128*)x;
for (int index = 0; index < _arraySize / 4; index++)
{
*ySSE = _mm_mul_ps(*ySSE, *xSSE);
xSSE++;
ySSE++;
}
}
//////////////////////////////////////////////////////////////////////
// SSE multiply
// Z = X * Y
//////////////////////////////////////////////////////////////////////
void CG_SOLVER_SSE::multiplySSE(float* x, float* y, float* z)
{
__m128* zSSE = (__m128*)z;
__m128* ySSE = (__m128*)y;
__m128* xSSE = (__m128*)x;
for (int index = 0; index < _arraySize / 4; index++)
{
*zSSE = _mm_mul_ps(*ySSE, *xSSE);
xSSE++;
ySSE++;
zSSE++;
}
}
//////////////////////////////////////////////////////////////////////
// SSE multiply
// Z = W - X * Y
//////////////////////////////////////////////////////////////////////
void CG_SOLVER_SSE::multiplySubtractSSE(float* w, float* x, float* y, float* z)
{
__m128* zSSE = (__m128*)z;
__m128* ySSE = (__m128*)y;
__m128* xSSE = (__m128*)x;
__m128* wSSE = (__m128*)w;
for (int index = 0; index < _arraySize / 4; index++)
{
*zSSE = _mm_mul_ps(*ySSE, *xSSE);
*zSSE = _mm_sub_ps(*wSSE, *zSSE);
xSSE++;
ySSE++;
zSSE++;
wSSE++;
}
}
//////////////////////////////////////////////////////////////////////
// SSE set
// X = val
//////////////////////////////////////////////////////////////////////
void CG_SOLVER_SSE::setSSE(float* x, float val)
{
__m128* xSSE = (__m128*)x;
for (int index = 0; index < _arraySize / 4; index++)
{
*xSSE = _mm_set_ps1(val);
xSSE++;
}
}
//////////////////////////////////////////////////////////////////////
// SSE set
// X = 0
//////////////////////////////////////////////////////////////////////
void CG_SOLVER_SSE::wipeSSE(float* x)
{
__m128* xSSE = (__m128*)x;
for (int index = 0; index < _arraySize / 4; index++)
{
*xSSE = _mm_setzero_ps();
xSSE++;
}
}
//////////////////////////////////////////////////////////////////////
// SSE set
// X = Y
//////////////////////////////////////////////////////////////////////
void CG_SOLVER_SSE::copySSE(float* x, float* y)
{
__m128* ySSE = (__m128*)y;
for (int index = 0; index < _arraySize / 4; index++)
{
_mm_store_ps(x,*ySSE);
x += 4;
ySSE++;
}
}