// Converts four SSE registers holding a structure-of-arrays layout into four
// array-of-structures vec4 values.
//
// block    : four consecutive __m128 values, read as 16 contiguous floats.
//            block[0] supplies the first component of every output vector,
//            block[1] the second, block[2] the third and block[3] the fourth.
// test_vec : output array of four glm::vec4; every element is overwritten.
//
// Lane order: output i is assembled from lane (3 - i) of each register, i.e.
// test_vec[0] comes from the highest lane and test_vec[3] from lane 0.
// NOTE(review): aos_to_soa_vec4 places test_vec[0] in lane 0, so a round trip
// through both helpers reverses the order of the four vectors - confirm that
// callers expect this.
static void soa_to_aos_vec4(
    __m128 * block,
    glm::vec4 * test_vec
){
    // View the four registers as one flat array of 16 floats.
    const float *lanes = reinterpret_cast<const float*>(block);

    for (int out = 0; out < 4; ++out) {
        const int lane = 3 - out; // walk the lanes from highest to lowest
        test_vec[out] = glm::vec4(
            lanes[lane],        // from block[0]
            lanes[4 + lane],    // from block[1]
            lanes[8 + lane],    // from block[2]
            lanes[12 + lane]);  // from block[3]
    }
}
// Converts four array-of-structures vec4 values into four SSE registers in a
// structure-of-arrays layout (a 4x4 transpose).
//
// block    : output array of four __m128; every element is overwritten.
//            block[0] = { v0.x, v1.x, v2.x, v3.x }   (lane 0 first)
//            block[1] = { v0.y, v1.y, v2.y, v3.y }
//            block[2] = { v0.z, v1.z, v2.z, v3.z }
//            block[3] = { v0.w, v1.w, v2.w, v3.w }
//            where vN is test_vec[N].
// test_vec : input array of four glm::vec4. The 64-bit loads below read two
//            floats starting at .x and two starting at .z, so .x/.y and
//            .z/.w must each be adjacent in memory.
static void aos_to_soa_vec4(
    __m128 * block,
    glm::vec4 * test_vec
){
    // Shuffle selectors: pick lanes 0 and 2 (resp. 1 and 3) from each operand.
    enum {
        kEvenLanes = _MM_SHUFFLE(2, 0, 2, 0),
        kOddLanes  = _MM_SHUFFLE(3, 1, 3, 1)
    };

    // Starting value for the half-register loads; both halves get replaced.
    const __m128 zero = _mm_setzero_ps();

    // { v0.x, v0.y, v1.x, v1.y }
    __m128 xy_v0v1 = _mm_loadl_pi(zero, reinterpret_cast<const __m64*>(&test_vec[0].x));
    xy_v0v1 = _mm_loadh_pi(xy_v0v1, reinterpret_cast<const __m64*>(&test_vec[1].x));

    // { v0.z, v0.w, v1.z, v1.w }
    __m128 zw_v0v1 = _mm_loadl_pi(zero, reinterpret_cast<const __m64*>(&test_vec[0].z));
    zw_v0v1 = _mm_loadh_pi(zw_v0v1, reinterpret_cast<const __m64*>(&test_vec[1].z));

    // { v2.x, v2.y, v3.x, v3.y }
    __m128 xy_v2v3 = _mm_loadl_pi(zero, reinterpret_cast<const __m64*>(&test_vec[2].x));
    xy_v2v3 = _mm_loadh_pi(xy_v2v3, reinterpret_cast<const __m64*>(&test_vec[3].x));

    // { v2.z, v2.w, v3.z, v3.w }
    __m128 zw_v2v3 = _mm_loadl_pi(zero, reinterpret_cast<const __m64*>(&test_vec[2].z));
    zw_v2v3 = _mm_loadh_pi(zw_v2v3, reinterpret_cast<const __m64*>(&test_vec[3].z));

    // Even lanes of the xy/zw pairs hold x/z, odd lanes hold y/w.
    block[0] = _mm_shuffle_ps(xy_v0v1, xy_v2v3, kEvenLanes); // all x
    block[1] = _mm_shuffle_ps(xy_v0v1, xy_v2v3, kOddLanes);  // all y
    block[2] = _mm_shuffle_ps(zw_v0v1, zw_v2v3, kEvenLanes); // all z
    block[3] = _mm_shuffle_ps(zw_v0v1, zw_v2v3, kOddLanes);  // all w
}