This is the right sequence of opcodes to obtain what you want:
__asm
{
    ; Packed 16-bit butterfly over four quadwords read from B2:
    ;   B3+0  = B2+0 + B2+24
    ;   B3+8  = B2+8 + B2+16
    ;   B3+16 = B2+8 - B2+16
    ;   B3+24 = B2+0 - B2+24
    ; paddw/psubw work on four 16-bit lanes per quadword with wrap-around
    ; (non-saturating) arithmetic.
    ; NOTE(review): assumes B2 and B3 are directly addressable buffers of at
    ; least 32 bytes and that +8/+16/+24 are byte offsets - confirm against
    ; their declarations.
    ; Every load from B2 is issued before the first store to B3, so the
    ; results stay correct even if the two buffers overlap.
    ; Uses mm0-mm3 only; emms at the end releases the MMX/x87 state.

    ; Outer pair: first and last quadword.
    movq mm0, B2
    movq mm3, B2+24
    movq mm1, mm0           ; keep a copy of B2+0 for the difference
    paddw mm0, mm3          ; mm0 = B2+0 + B2+24
    psubw mm1, mm3          ; mm1 = B2+0 - B2+24

    ; Inner pair: mm3 is free again once both outer results are computed.
    movq mm2, B2+8
    movq mm3, B2+16

    movq B3, mm0            ; store the outer sum, which frees mm0
    movq mm0, mm2           ; keep a copy of B2+8 for the sum
    paddw mm0, mm3          ; mm0 = B2+8 + B2+16
    psubw mm2, mm3          ; mm2 = B2+8 - B2+16

    movq B3+8, mm0
    movq B3+16, mm2
    movq B3+24, mm1

    emms                    ; required before any following x87 FPU code
}