What about this, using SSE2 assembly (32 bits):
#include <stdint.h>   /* uintptr_t */
#include <string.h>   /* size_t, memset (non-SSE2 fallback) */

/* Use SSE2 intrinsics when the compiler targets an SSE2-capable x86/x64. */
#if defined(__SSE2__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2)
#include <emmintrin.h>
#define ZERO_USE_SSE2 1
#else
#define ZERO_USE_SSE2 0
#endif

/*
 * Zero - set Count bytes starting at Buffer to 0.
 *
 * Buffer: start of the region to clear; must be writable for Count bytes.
 * Count:  number of bytes to clear; values <= 0 clear nothing.
 *
 * The region is processed in three phases:
 *   1. head: single bytes until the cursor is 16-byte aligned (or the end
 *      of the region is reached),
 *   2. body: aligned 16-byte stores of an all-zero SSE2 register,
 *   3. tail: the remaining (< 16) bytes, one at a time.
 *
 * Intrinsics replace the former 32-bit-only inline assembly so the same
 * source builds for 32- and 64-bit targets; without SSE2 the body falls
 * back to memset().
 */
void Zero(void* Buffer, int Count)
{
    unsigned char *Cur = (unsigned char *)Buffer;
    unsigned char *End;

    /* A non-positive count would put End at or before Cur; nothing to do. */
    if (Count <= 0)
    {
        return;
    }
    End = Cur + Count;

    /* Head: advance byte-wise to the next 16-byte boundary. The address is
     * inspected through uintptr_t rather than by subtracting a null pointer,
     * which is undefined behavior. */
    while (Cur < End && ((uintptr_t)Cur & 0xf) != 0)
    {
        *Cur++ = 0;
    }

#if ZERO_USE_SSE2
    {
        /* Body: Cur is 16-byte aligned here whenever at least 16 bytes
         * remain, which is what the aligned store requires. */
        const __m128i Zeros = _mm_setzero_si128();

        while ((size_t)(End - Cur) >= 16)
        {
            _mm_store_si128((__m128i *)Cur, Zeros);
            Cur += 16;
        }
    }
#else
    /* No SSE2 available: let the C library clear the rest in one call. */
    memset(Cur, 0, (size_t)(End - Cur));
    Cur = End;
#endif

    /* Tail: fewer than 16 bytes left after the last full block. */
    while (Cur < End)
    {
        *Cur++ = 0;
    }
}