I need some ideas on how to write a cross-platform C++ implementation of a few parallelizable problems so that I can take advantage of SIMD (SSE, SPU, etc.) when it is available. As well…
If anyone is interested, this is the quick-and-dirty code I came up with to test a new idea that occurred to me while reading about the library that Paul posted.
Thanks Paul!
// This is just a conceptual test.
// I haven't profiled the code and I haven't verified that the results are correct.
#include <xmmintrin.h>
// Per-mode building blocks for cStreamF32: the element type the stream steps
// over, how its storage is obtained and released, and the element-wise math.
template <bool SIMD>
struct cStreamF32Traits;

// SIMD flavour: one element is an SSE register holding four packed floats.
template <>
struct cStreamF32Traits<true>
{
    typedef __m128 value_type;

    // Number of __m128 elements needed to hold floatCount floats (rounded up
    // so a count that is not a multiple of 4 never overruns the buffer).
    static int ElementCount(int floatCount) { return (floatCount + 3) / 4; }

    // Returns 16-byte aligned, zero-filled storage, or 0 if _mm_malloc fails.
    static value_type* Allocate(int count)
    {
        value_type* data =
            static_cast<value_type*>(_mm_malloc(sizeof(value_type) * count, 16));
        for (int i = 0; data != 0 && i < count; ++i)
            data[i] = _mm_setzero_ps();
        return data;
    }

    static void Release(value_type* data) { _mm_free(data); }

    // Packed (_ps) operations: all four lanes are computed. The scalar (_ss)
    // variants would only touch the lowest lane of each register.
    static value_type Add(value_type a, value_type b) { return _mm_add_ps(a, b); }
    static value_type Mul(value_type a, value_type b) { return _mm_mul_ps(a, b); }
};

// Scalar flavour: one element is a single float.
template <>
struct cStreamF32Traits<false>
{
    typedef float value_type;

    static int ElementCount(int floatCount) { return floatCount; }

    // Returns zero-filled storage (value-initialised array).
    static value_type* Allocate(int count) { return new value_type[count](); }

    static void Release(value_type* data) { delete[] data; }

    static value_type Add(value_type a, value_type b) { return a + b; }
    static value_type Mul(value_type a, value_type b) { return a * b; }
};

// This class is doing all the math.
//
// A stream of 32-bit floats with a cursor. With SIMD == true the cursor steps
// over __m128 registers (four floats at a time); with SIMD == false it steps
// over single floats. The arithmetic operators act on the element under the
// cursor and return a plain value (value_type), which can then be stored into
// another stream with operator=.
//
// Typical use:
//     s.Begin();
//     do { /* read or write the current element */ } while (s.Next());
template <bool SIMD>
class cStreamF32
{
private:
    typedef cStreamF32Traits<SIMD> Traits;

public:
    // __m128 when SIMD is true, float otherwise.
    typedef typename Traits::value_type value_type;

private:
    value_type* m_data;     // start of the owned buffer
    value_type* m_dataEnd;  // one past the last element
    value_type* m_current;  // cursor; valid between Begin() and the last Next()

    // The stream owns its buffer, so copying would free it twice.
    // Declared but not defined to make the class non-copyable.
    cStreamF32(const cStreamF32&);
    cStreamF32& operator=(const cStreamF32&);

public:
    // Allocates zero-initialised storage for `size` floats.
    // `size` must be positive: the element accessors dereference the cursor
    // unconditionally. In SIMD mode the capacity is rounded up to a multiple
    // of four floats.
    explicit cStreamF32(int size)
        : m_data(0), m_dataEnd(0), m_current(0)
    {
        const int count = Traits::ElementCount(size);
        m_data = Traits::Allocate(count);
        // If the allocation failed the stream is left empty (begin == end).
        m_dataEnd = (m_data != 0) ? m_data + count : m_data;
        m_current = m_data;
    }

    ~cStreamF32()
    {
        Traits::Release(m_data);
    }

    // Rewinds the cursor to the first element.
    inline void Begin()
    {
        m_current = m_data;
    }

    // Advances the cursor by one element.
    // Returns true while the cursor still points inside the buffer.
    inline bool Next()
    {
        ++m_current;
        return m_current < m_dataEnd;
    }

    // Stores x into the element under the cursor.
    inline void operator=(const value_type x)
    {
        *m_current = x;
    }

    // Current element of this stream + current element of x.
    inline value_type operator+(const cStreamF32& x) const
    {
        return Traits::Add(*m_current, *x.m_current);
    }

    // Current element of this stream + x.
    inline value_type operator+(const value_type x) const
    {
        return Traits::Add(*m_current, x);
    }

    // Current element of this stream * current element of x.
    inline value_type operator*(const cStreamF32& x) const
    {
        return Traits::Mul(*m_current, *x.m_current);
    }

    // Current element of this stream * x.
    inline value_type operator*(const value_type x) const
    {
        return Traits::Mul(*m_current, x);
    }
};
// Executes both functors, one after the other.
//
// Each functor type must provide:
//     void Begin();  // rewind to the first element
//     void Exec();   // process the current element
//     bool Next();   // advance; returns false once the end has been passed
//
// Exec() always runs at least once per functor (do/while), so each functor
// must have at least one element to process.
template <typename T1, typename T2>
void Execute(T1& functor1, T2& functor2)
{
    functor1.Begin();
    do
    {
        functor1.Exec();
    }
    while (functor1.Next());

    functor2.Begin();
    do
    {
        functor2.Exec();
    }
    while (functor2.Next());
}
// This is the implementation of the problem
template
class cTestFunctor
{
private:
cStreamF32 a;
cStreamF32 b;
cStreamF32 c;
public:
cTestFunctor() : a(1024), b(1024), c(1024) { }
inline void Exec()
{
c = a + b * a;
}
inline void Begin()
{
a.Begin();
b.Begin();
c.Begin();
}
inline bool Next()
{
a.Next();
b.Next();
return c.Next();
}
};
// Runs the test problem twice through Execute().
// NOTE(review): the template arguments were missing from the posted code;
// one SIMD and one scalar instance is assumed here - confirm against the
// author's intent.
int main (int argc, char * const argv[])
{
    // The command-line arguments are not used.
    (void)argc;
    (void)argv;

    cTestFunctor<true> functor1;
    cTestFunctor<false> functor2;
    Execute(functor1, functor2);
    return 0;
}