SIMD or not SIMD - cross platform

后端 未结 6 1609
别跟我提以往
别跟我提以往 2021-02-04 08:59

I need some ideas on how to write a cross-platform C++ implementation of a few parallelizable problems in such a way that I can take advantage of SIMD (SSE, SPU, etc.) if available. As well…

6条回答
  •  一生所求
    2021-02-04 09:17

    If anyone is interested, this is the quick-and-dirty code I came up with to test a new idea that I had while reading about the library that Paul posted.

    Thanks Paul!

    // This is just a conceptual test
    // I haven't profiled the code and I haven't verified that the result is correct
    #include <cstddef>
    #include <new>
    #include <xmmintrin.h>
    
    
    // Maps the SIMD flag to the element type a stream steps over:
    // one float in scalar mode, one 4-float SSE register in SIMD mode.
    template <bool SIMD> struct cStreamF32Lane       { typedef float  type; };
    template <>          struct cStreamF32Lane<true> { typedef __m128 type; };

    // This class is doing all the math.
    //
    // A fixed-size buffer of floats with a cursor. Begin() rewinds the cursor,
    // Next() advances it by one element, and the arithmetic operators combine
    // the element under the cursor with another stream's current element or
    // with a plain value. The assignment operator stores into the element
    // under the cursor.
    //
    // SIMD == true : elements are __m128 (four floats per step), storage is
    //                16-byte aligned via _mm_malloc.
    // SIMD == false: elements are single floats, storage comes from new[].
    template <bool SIMD>
    class cStreamF32
    {
    public:
        // Value type read/written at the cursor (float or __m128).
        typedef typename cStreamF32Lane<SIMD>::type lane_t;

    private:
        void*       m_data;     // start of the owned buffer
        void*       m_dataEnd;  // one past the last element of the buffer
        lane_t*     m_current;  // cursor; valid while Next() has not returned false

        // The stream owns a raw buffer, so copying it would free the buffer
        // twice. Declared private and left undefined to forbid copies.
        cStreamF32(const cStreamF32&);
        cStreamF32& operator=(const cStreamF32&);

        // Overloads picked by element type; the packed (_ps) intrinsics are
        // used so that all four floats of a register are processed.
        static __m128 Add(const __m128 a, const __m128 b) { return _mm_add_ps(a, b); }
        static float  Add(const float a, const float b)   { return a + b; }
        static __m128 Mul(const __m128 a, const __m128 b) { return _mm_mul_ps(a, b); }
        static float  Mul(const float a, const float b)   { return a * b; }

    public:
        // Allocates zero-filled storage for `size` floats. In SIMD mode the
        // capacity is rounded up to a whole number of 4-float registers. A
        // non-positive size still allocates one element, because callers
        // touch the first element before ever calling Next().
        // Throws std::bad_alloc if the allocation fails.
        explicit cStreamF32(int size)
            : m_data(0), m_dataEnd(0), m_current(0)
        {
            const std::size_t floatsPerLane = sizeof(lane_t) / sizeof(float);
            const std::size_t floats = size > 0 ? static_cast<std::size_t>(size) : 1;
            const std::size_t lanes = (floats + floatsPerLane - 1) / floatsPerLane;
            const std::size_t bytes = lanes * sizeof(lane_t);

            if (SIMD)
            {
                m_data = _mm_malloc(bytes, 16);
                if (m_data == 0)
                    throw std::bad_alloc();
            }
            else
            {
                m_data = new float[bytes / sizeof(float)];
            }

            lane_t* const first = static_cast<lane_t*>(m_data);
            lane_t* const last = first + lanes;
            for (lane_t* p = first; p != last; ++p)
                *p = lane_t();  // value-initialisation: all floats become 0

            m_dataEnd = last;
            m_current = first;
        }

        // Releases the buffer with the deallocator matching its allocator.
        ~cStreamF32()
        {
            if (SIMD)
                _mm_free(m_data);
            else
                delete[] static_cast<float*>(m_data);
        }

        // Moves the cursor back to the first element.
        inline void Begin()
        {
            m_current = static_cast<lane_t*>(m_data);
        }

        // Advances the cursor by one element. Returns false once the cursor
        // has moved past the last element; the cursor must not be
        // dereferenced after that until Begin() is called again.
        inline bool Next()
        {
            ++m_current;
            return m_current < static_cast<lane_t*>(m_dataEnd);
        }

        // Stores x into the element under the cursor.
        inline void operator=(const lane_t x)
        {
            *m_current = x;
        }

        // Current element of this stream plus current element of x.
        inline lane_t operator+(const cStreamF32& x) const
        {
            return Add(*m_current, *x.m_current);
        }

        // Current element of this stream plus the value x.
        inline lane_t operator+(const lane_t x) const
        {
            return Add(*m_current, x);
        }

        // Current element of this stream times current element of x.
        inline lane_t operator*(const cStreamF32& x) const
        {
            return Mul(*m_current, *x.m_current);
        }

        // Current element of this stream times the value x.
        inline lane_t operator*(const lane_t x) const
        {
            return Mul(*m_current, x);
        }
    };
    
    // Runs a single functor over its whole range: Begin() once, then Exec()
    // followed by Next() until Next() returns false. Exec() is always called
    // at least once, before the first Next().
    template <typename T>
    void ExecuteOne(T& functor)
    {
        functor.Begin();
        do
        {
            functor.Exec();
        }
        while (functor.Next());
    }

    // Executes both functors, one after the other: functor1 runs to
    // completion before functor2 starts. Each functor type must provide
    // Begin(), Exec() and a Next() whose result converts to bool.
    template <typename T1, typename T2>
    void Execute(T1& functor1, T2& functor2)
    {
        ExecuteOne(functor1);
        ExecuteOne(functor2);
    }
    
    // This is the implementation of the problem
    template 
    class cTestFunctor
    {
    private:
        cStreamF32 a;
        cStreamF32 b;
        cStreamF32 c;
    
    public:
        cTestFunctor() : a(1024), b(1024), c(1024) { }
    
        inline void Exec()
        {
            c = a + b * a;
        }
    
        inline void Begin()
        {
            a.Begin();
            b.Begin();
            c.Begin();
        }
    
        inline bool Next()
        {
            a.Next();
            b.Next();
            return c.Next();
        }
    };
    
    
    // Builds two test functors and runs them back to back through Execute().
    int main (int argc, char * const argv[])
    {
        // Command-line arguments are not used.
        (void)argc;
        (void)argv;

        // NOTE(review): the template arguments were lost from the posted
        // text; one SIMD and one scalar functor is the presumed intent —
        // confirm against the original post.
        cTestFunctor<true>  functor1;
        cTestFunctor<false> functor2;

        Execute(functor1, functor2);

        return 0;
    }
    

提交回复
热议问题