SIMD or not SIMD - cross platform

后端未结

关注

 6  1622

别跟我提以往 2021-02-04 08:59

I need some ideas on how to write a cross-platform C++ implementation of a few parallelizable problems, in such a way that I can take advantage of SIMD (SSE, SPU, etc.) if available. As well…

6条回答

一生所求 (楼主)

2021-02-04 09:17

If anyone is interested, this is the quick-and-dirty code I came up with to test a new idea that I had while reading about the library that Paul posted.

Thanks Paul!

// This is just a conceptual test.
// I haven't profiled the code and I haven't verified that the result is correct.
#include <xmmintrin.h>

#include <new>


// Backend primitives for cStreamF32, selected by the SIMD flag.
// Primary template: scalar fallback, one float per stream element.
template <bool SIMD>
struct cStreamF32Backend
{
    typedef float value_type;
    enum { kLanes = 1 };    // floats carried by one stream element

    // Value-initialised, so every element starts at 0.0f.
    static value_type* Allocate(std::size_t count) { return new value_type[count](); }
    static void Release(value_type* data) { delete[] data; }

    static value_type Add(value_type lhs, value_type rhs) { return lhs + rhs; }
    static value_type Mul(value_type lhs, value_type rhs) { return lhs * rhs; }
};

// SSE backend: one __m128 (four packed floats) per stream element.
template <>
struct cStreamF32Backend<true>
{
    typedef __m128 value_type;
    enum { kLanes = 4 };    // floats carried by one stream element

    // 16-byte aligned storage, zero-filled so no lane is read uninitialised.
    static value_type* Allocate(std::size_t count)
    {
        void* raw = _mm_malloc(sizeof(value_type) * count, 16);
        if (!raw)
            throw std::bad_alloc();     // mirror what new[] does on failure

        value_type* data = static_cast<value_type*>(raw);
        for (std::size_t i = 0; i < count; ++i)
            data[i] = _mm_setzero_ps();
        return data;
    }
    static void Release(value_type* data) { _mm_free(data); }

    // Packed (_ps) forms operate on all four lanes; the scalar (_ss) forms
    // would only touch lane 0 and pass the other lanes through unchanged.
    static value_type Add(value_type lhs, value_type rhs) { return _mm_add_ps(lhs, rhs); }
    static value_type Mul(value_type lhs, value_type rhs) { return _mm_mul_ps(lhs, rhs); }
};

// This class is doing all the math.
//
// A stream owns a buffer of floats and exposes a cursor over it:
//   Begin()  rewinds the cursor to the first element,
//   Next()   advances it and returns false once the end has been reached,
//   = + *    read or write the element under the cursor.
//
// With SIMD == true one element is an __m128 (four floats), otherwise it is a
// single float; value_type names whichever applies, so each instantiation has
// exactly one overload of every operator.
template <bool SIMD>
class cStreamF32
{
public:
    typedef cStreamF32Backend<SIMD>         backend;
    typedef typename backend::value_type    value_type;

private:
    value_type*     m_data;
    value_type*     m_dataEnd;
    value_type*     m_current;

    // The buffer is uniquely owned; copying would free it twice.
    // Declared but not defined on purpose.
    cStreamF32(const cStreamF32&);
    cStreamF32& operator=(const cStreamF32&);

    // Number of value_type elements needed to hold `size` floats, rounded up.
    // Never less than one: the cursor is dereferenced before the first Next(),
    // so even an empty or negative size needs one valid element.
    static std::size_t ElementCount(int size)
    {
        const int lanes = backend::kLanes;
        if (size <= lanes)
            return 1;
        return static_cast<std::size_t>(size / lanes + (size % lanes != 0 ? 1 : 0));
    }

public:
    // size: number of floats the stream must be able to hold.
    // Throws std::bad_alloc if the buffer cannot be allocated.
    cStreamF32(int size)
        : m_data(0), m_dataEnd(0), m_current(0)
    {
        const std::size_t count = ElementCount(size);
        m_data    = backend::Allocate(count);
        m_dataEnd = m_data + count;
        m_current = m_data;     // usable even if Begin() is never called
    }

    ~cStreamF32()
    {
        backend::Release(m_data);
    }

    // Rewinds the cursor to the first element.
    void Begin()
    {
        m_current = m_data;
    }

    // Advances the cursor; returns true while it still points at an element.
    bool Next()
    {
        ++m_current;
        return m_current < m_dataEnd;
    }

    // Stores x into the element under the cursor.
    void operator=(const value_type x)
    {
        *m_current = x;
    }

    // Sum of the current elements of this stream and x.
    value_type operator+(const cStreamF32& x) const
    {
        return backend::Add(*m_current, *x.m_current);
    }

    // Sum of the current element and the value x.
    value_type operator+(const value_type x) const
    {
        return backend::Add(*m_current, x);
    }

    // Product of the current elements of this stream and x.
    value_type operator*(const cStreamF32& x) const
    {
        return backend::Mul(*m_current, *x.m_current);
    }

    // Product of the current element and the value x.
    value_type operator*(const value_type x) const
    {
        return backend::Mul(*m_current, x);
    }
};

// Executes both functors, one after the other.
//
// Each functor is driven through the same protocol: Begin() once, then Exec()
// followed by Next() until Next() returns false. Exec() therefore always runs
// at least once per functor. functor1 is run to completion before functor2 is
// started.
//
// T1, T2: any types providing Begin(), Exec() and a Next() whose result is
// usable as a loop condition.
template <typename T1, typename T2>
void Execute(T1& functor1, T2& functor2)
{
    functor1.Begin();
    do
    {
        functor1.Exec();
    }
    while (functor1.Next());

    functor2.Begin();
    do
    {
        functor2.Exec();
    }
    while (functor2.Next());
}

// This is the implementation of the problem
template 
class cTestFunctor
{
private:
    cStreamF32 a;
    cStreamF32 b;
    cStreamF32 c;

public:
    cTestFunctor() : a(1024), b(1024), c(1024) { }

    inline void Exec()
    {
        c = a + b * a;
    }

    inline void Begin()
    {
        a.Begin();
        b.Begin();
        c.Begin();
    }

    inline bool Next()
    {
        a.Next();
        b.Next();
        return c.Next();
    }
};


// Entry point of the conceptual test: builds one functor per code path and
// runs both through Execute(). Command-line arguments are not used.
int main (int argc, char * const argv[]) 
{
    cTestFunctor<true>  functor1;   // SIMD-backed streams
    cTestFunctor<false> functor2;   // scalar fallback streams

    Execute(functor1, functor2);

    return 0;
}

0 讨论(0)

查看其它6个回答