Sep 19, 2012

Fastest memcpy

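An optimized memory-copy routine that streams data through the SSE2 XMM registers using non-temporal stores. It is approximately 30–70% faster than the memcpy shipped with Microsoft Visual Studio 2005.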


void memcpy_sse2(void* dest, const void* src, const unsigned long size_t)
mov esi, src;    //src pointer
mov edi, dest;   //dest pointer

mov ebx, size_t; //ebx is our counter
shr ebx, 7;      //divide by 128 (8 * 128bit registers)

prefetchnta 128[ESI]; //SSE2 prefetch
prefetchnta 160[ESI];
prefetchnta 192[ESI];
prefetchnta 224[ESI];

movdqa xmm0, 0[ESI]; //move data from src to registers
movdqa xmm1, 16[ESI];
movdqa xmm2, 32[ESI];
movdqa xmm3, 48[ESI];
movdqa xmm4, 64[ESI];
movdqa xmm5, 80[ESI];
movdqa xmm6, 96[ESI];
movdqa xmm7, 112[ESI];

movntdq 0[EDI], xmm0; //move data from registers to dest
movntdq 16[EDI], xmm1;
movntdq 32[EDI], xmm2;
movntdq 48[EDI], xmm3;
movntdq 64[EDI], xmm4;
movntdq 80[EDI], xmm5;
movntdq 96[EDI], xmm6;
movntdq 112[EDI], xmm7;

add esi, 128;
add edi, 128;
dec ebx;

jnz loop_copy; //loop please

Courtesy of William Chan