SIMD Array of structs -> Struct of arrays and vice versa

// 4 128bit registers in SOA to 4 vec4 in AOS
static void soa_to_aos_vec4(    
    __m128 * block,
    glm::vec4 * test_vec
    ){
    float *pvec = (float*) &block[0];
    test_vec[0] = glm::vec4(pvec[3], pvec[7], pvec[11], pvec[15]);
    test_vec[1] = glm::vec4(pvec[2], pvec[6], pvec[10], pvec[14]);
    test_vec[2] = glm::vec4(pvec[1], pvec[5], pvec[9],  pvec[13]);
    test_vec[3] = glm::vec4(pvec[0], pvec[4], pvec[8],  pvec[12]);
}

// in -> 4 vec4 in AOS out-> 4 128bit registers in SOA
static void aos_to_soa_vec4(    
    __m128 * block,
    glm::vec4 * test_vec
    ){

    __m128 xy10, xy32, zw10, zw32;

    xy10 = zw10 = _mm_setzero_ps();
    xy32 = zw32 = _mm_setzero_ps();
    xy10 = _mm_loadl_pi(xy10, (__m64*)&(test_vec[0]).x);
    zw10 = _mm_loadl_pi(zw10, (__m64*)&(test_vec[0]).z);
    xy32 = _mm_loadl_pi(xy32, (__m64*)&(test_vec[2]).x);
    zw32 = _mm_loadl_pi(zw32, (__m64*)&(test_vec[2]).z);
    xy10 = _mm_loadh_pi(xy10, (__m64*)&(test_vec[1]).x);
    zw10 = _mm_loadh_pi(zw10, (__m64*)&(test_vec[1]).z);
    xy32 = _mm_loadh_pi(xy32, (__m64*)&(test_vec[3]).x);
    zw32 = _mm_loadh_pi(zw32, (__m64*)&(test_vec[3]).z);
    block[0] = _mm_shuffle_ps(xy10, xy32, _MM_SHUFFLE(2,0,2,0));
    block[1] = _mm_shuffle_ps(xy10, xy32, _MM_SHUFFLE(3,1,3,1));
    
    block[2] = _mm_shuffle_ps(zw10, zw32, _MM_SHUFFLE(2,0,2,0));
    block[3] = _mm_shuffle_ps(zw10, zw32, _MM_SHUFFLE(3,1,3,1));
}

Leave a comment