SIMD: Array of Structs -> Struct of Arrays and vice versa

#include <xmmintrin.h>
#include <glm/glm.hpp>

// in  -> 4 128-bit registers in SoA (x, y, z and w planes)
// out -> 4 vec4 in AoS
static void soa_to_aos_vec4(
    __m128 * block,
    glm::vec4 * test_vec
    ){
    // block[0] holds x0..x3, block[1] y0..y3, block[2] z0..z3 and
    // block[3] w0..w3, so vector i is gathered from lane i of each plane.
    float *pvec = (float*) &block[0];
    test_vec[0] = glm::vec4(pvec[0], pvec[4], pvec[8],  pvec[12]);
    test_vec[1] = glm::vec4(pvec[1], pvec[5], pvec[9],  pvec[13]);
    test_vec[2] = glm::vec4(pvec[2], pvec[6], pvec[10], pvec[14]);
    test_vec[3] = glm::vec4(pvec[3], pvec[7], pvec[11], pvec[15]);
}
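The scalar gather above bounces through memory one float at a time. As a sketch of an alternative (my addition, not from the original post), the same conversion can stay in registers with the _MM_TRANSPOSE4_PS macro from xmmintrin.h, which transposes a 4x4 float block using shuffles:

// Hypothetical in-register variant: transpose the x/y/z/w planes so each
// register becomes one (x, y, z, w) vector, then store four floats at once.
static void soa_to_aos_vec4_transpose(
    __m128 * block,
    glm::vec4 * test_vec
    ){
    __m128 r0 = block[0], r1 = block[1], r2 = block[2], r3 = block[3];
    _MM_TRANSPOSE4_PS(r0, r1, r2, r3);  // rows (planes) become columns (vectors)
    _mm_storeu_ps(&test_vec[0].x, r0);  // (x0, y0, z0, w0)
    _mm_storeu_ps(&test_vec[1].x, r1);
    _mm_storeu_ps(&test_vec[2].x, r2);
    _mm_storeu_ps(&test_vec[3].x, r3);
}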

// in  -> 4 vec4 in AoS
// out -> 4 128-bit registers in SoA
static void aos_to_soa_vec4(
    __m128 * block,
    glm::vec4 * test_vec
    ){

    __m128 xy10, xy32, zw10, zw32;

    // Gather the xy and zw halves of the four vectors, two vectors per
    // register: xy10 = (x0,y0,x1,y1), xy32 = (x2,y2,x3,y3), same for zw.
    xy10 = zw10 = _mm_setzero_ps();
    xy32 = zw32 = _mm_setzero_ps();
    xy10 = _mm_loadl_pi(xy10, (__m64*)&(test_vec[0]).x);
    zw10 = _mm_loadl_pi(zw10, (__m64*)&(test_vec[0]).z);
    xy32 = _mm_loadl_pi(xy32, (__m64*)&(test_vec[2]).x);
    zw32 = _mm_loadl_pi(zw32, (__m64*)&(test_vec[2]).z);
    xy10 = _mm_loadh_pi(xy10, (__m64*)&(test_vec[1]).x);
    zw10 = _mm_loadh_pi(zw10, (__m64*)&(test_vec[1]).z);
    xy32 = _mm_loadh_pi(xy32, (__m64*)&(test_vec[3]).x);
    zw32 = _mm_loadh_pi(zw32, (__m64*)&(test_vec[3]).z);

    // De-interleave: even lanes carry x/z, odd lanes carry y/w.
    block[0] = _mm_shuffle_ps(xy10, xy32, _MM_SHUFFLE(2,0,2,0)); // x0 x1 x2 x3
    block[1] = _mm_shuffle_ps(xy10, xy32, _MM_SHUFFLE(3,1,3,1)); // y0 y1 y2 y3
    block[2] = _mm_shuffle_ps(zw10, zw32, _MM_SHUFFLE(2,0,2,0)); // z0 z1 z2 z3
    block[3] = _mm_shuffle_ps(zw10, zw32, _MM_SHUFFLE(3,1,3,1)); // w0 w1 w2 w3
}
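A quick round-trip sanity check (my own sketch): pack four vectors into SoA form and unpack them again; out should equal in.

static void roundtrip_demo() {
    glm::vec4 in[4] = {
        glm::vec4(0.f,  1.f,  2.f,  3.f),  glm::vec4(4.f,  5.f,  6.f,  7.f),
        glm::vec4(8.f,  9.f, 10.f, 11.f),  glm::vec4(12.f, 13.f, 14.f, 15.f)
    };
    __m128 block[4];
    glm::vec4 out[4];
    aos_to_soa_vec4(block, in);   // block[0] is now (0, 4, 8, 12), etc.
    soa_to_aos_vec4(block, out);  // out[i] == in[i]
}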

Morton Codes SIMD

Here’s a SIMD Morton code generator. It is not as fast as the LUT method (http://www.forceflow.be/2013/10/07/morton-encodingdecoding-through-bit-interleaving-implementations/), but it could be extended to produce four codes per call.
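For reference, this is the scalar shift-and-mask sequence that the SIMD version below replicates in each lane (a standard bit-interleave; the helper name part1by1 is mine). Each round ORs in a shifted copy of the value, then masks away everything that landed outside the target bit slots:

#include <cstdint>

// Spread the low 16 bits of v so they occupy the even bit positions.
static uint32_t part1by1(uint32_t v) {
    v &= 0x0000FFFF;
    v = (v | (v << 8)) & 0x00FF00FF;
    v = (v | (v << 4)) & 0x0F0F0F0F;
    v = (v | (v << 2)) & 0x33333333;
    v = (v | (v << 1)) & 0x55555555;
    return v;
}

static uint32_t encode_morton_scalar(uint32_t x, uint32_t y) {
    return part1by1(x) | (part1by1(y) << 1);  // x -> even bits, y -> odd bits
}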
#include <bitset>
#include <climits>
#include <iostream>
#include <emmintrin.h>

// Debug helper: print the raw bytes of a value as binary
// (least-significant byte first on x86).
template<typename T>
void show_binary(const T& a)
{
    const char* beg = reinterpret_cast<const char*>(&a);
    const char* end = beg + sizeof(a);
    while (beg != end)
        std::cout << std::bitset<CHAR_BIT>(*beg++) << ' ';
    std::cout << std::endl;
}

static uint32_t encode_morton(int x, int y) {
    // Masks for the four spread rounds; SB[i] pairs with shift S[i].
    static const __m128i SB[] = {
        _mm_set1_epi32(0x55555555),
        _mm_set1_epi32(0x33333333),
        _mm_set1_epi32(0x0F0F0F0F),
        _mm_set1_epi32(0x00FF00FF)
    };
    static const int S[] = { 1, 2, 4, 8 };

    // Only the low 16 bits of each coordinate fit into a 32-bit code.
    const __m128i x_bit1 = _mm_set1_epi32(x & 0xFFFF);
    const __m128i y_bit1 = _mm_set1_epi32(y & 0xFFFF);

    // Each round computes v = (v | (v << S[i])) & SB[i], spreading the bits
    // further apart; the mask is applied after the OR so that the stale
    // source bits are cleared.
    const __m128i shf_x1 = _mm_slli_epi32(x_bit1, S[3]);
    const __m128i shf_y1 = _mm_slli_epi32(y_bit1, S[3]);
    const __m128i x_bit2 = _mm_and_si128(_mm_or_si128(x_bit1, shf_x1), SB[3]);
    const __m128i y_bit2 = _mm_and_si128(_mm_or_si128(y_bit1, shf_y1), SB[3]);
    const __m128i shf_x2 = _mm_slli_epi32(x_bit2, S[2]);
    const __m128i shf_y2 = _mm_slli_epi32(y_bit2, S[2]);
    const __m128i x_bit3 = _mm_and_si128(_mm_or_si128(x_bit2, shf_x2), SB[2]);
    const __m128i y_bit3 = _mm_and_si128(_mm_or_si128(y_bit2, shf_y2), SB[2]);
    const __m128i shf_x3 = _mm_slli_epi32(x_bit3, S[1]);
    const __m128i shf_y3 = _mm_slli_epi32(y_bit3, S[1]);
    const __m128i x_bit4 = _mm_and_si128(_mm_or_si128(x_bit3, shf_x3), SB[1]);
    const __m128i y_bit4 = _mm_and_si128(_mm_or_si128(y_bit3, shf_y3), SB[1]);
    const __m128i shf_x4 = _mm_slli_epi32(x_bit4, S[0]);
    const __m128i shf_y4 = _mm_slli_epi32(y_bit4, S[0]);
    const __m128i x_bit5 = _mm_and_si128(_mm_or_si128(x_bit4, shf_x4), SB[0]);
    const __m128i y_bit5 = _mm_and_si128(_mm_or_si128(y_bit4, shf_y4), SB[0]);

    // Interleave: x bits take the even positions, y bits the odd ones.
    const __m128i morton = _mm_or_si128(x_bit5, _mm_slli_epi32(y_bit5, 1));

    show_binary(morton);
    return (uint32_t)_mm_cvtsi128_si32(morton);
}
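Since all four lanes of the registers above hold the same value, the "four codes per call" extension mentioned earlier falls out almost for free: load four different (x, y) pairs instead of broadcasting one. A minimal sketch of that idea (names and layout are mine, untested):

// Hypothetical 4-wide variant: encode four (x, y) pairs per call.
// Same rounds as encode_morton, with loads instead of broadcasts.
static void encode_morton_x4(const int32_t xs[4], const int32_t ys[4],
                             uint32_t out[4]) {
    static const uint32_t M[]  = { 0x00FF00FF, 0x0F0F0F0F, 0x33333333, 0x55555555 };
    static const int      Sh[] = { 8, 4, 2, 1 };

    const __m128i mask16 = _mm_set1_epi32(0x0000FFFF);
    __m128i vx = _mm_and_si128(_mm_loadu_si128((const __m128i*)xs), mask16);
    __m128i vy = _mm_and_si128(_mm_loadu_si128((const __m128i*)ys), mask16);

    for (int i = 0; i < 4; ++i) {
        const __m128i m = _mm_set1_epi32(M[i]);
        vx = _mm_and_si128(_mm_or_si128(vx, _mm_slli_epi32(vx, Sh[i])), m);
        vy = _mm_and_si128(_mm_or_si128(vy, _mm_slli_epi32(vy, Sh[i])), m);
    }
    _mm_storeu_si128((__m128i*)out, _mm_or_si128(vx, _mm_slli_epi32(vy, 1)));
}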

Copy on pastebin:

http://pastebin.com/JfbkFmrc