View previous topic :: View next topic |
Author |
Message |
reckless
Joined: 24 Jan 2008 Posts: 390 Location: inside tha debugger
|
Posted: Sun Nov 15, 2009 8:34 pm Post subject: asm memcpy + memset for testing |
|
|
The memcpy isn't by me; it's from Berserker Quake2.
The memset is a port I did from id's Unix version for SDL.
Code: | // Access-pattern hint handed to Q_Prefetch.
typedef enum
{
	PRE_READ       = 0,	// buffer is only going to be read
	PRE_WRITE      = 1,	// buffer is only going to be written
	PRE_READ_WRITE = 2	// buffer is going to be both read and written
} e_prefetch;
// Touches one byte every 32 bytes across the start of a buffer; the intent
// (per the function name and the "cache lines" arithmetic) is to pull the
// data into the cache before it is used.
//
//   s     - start of the buffer
//   bytes - buffer size in bytes; anything above 4096 is clamped to 4096
//   type  - intended access pattern: PRE_WRITE does nothing, PRE_READ and
//           PRE_READ_WRITE both run the touch loop
//
// 32-bit x86 only, MSVC-style inline assembly. Uses eax (low byte as a dummy
// operand), ebx and ecx inside the __asm block.
void Q_Prefetch (const void *s, const unsigned int bytes, e_prefetch type)
{
	switch (type)
	{
	case PRE_WRITE:
		// write-only buffers are never touched by this implementation
		break;
	case PRE_READ:
	case PRE_READ_WRITE:
		__asm
		{
			mov		ebx,s
			mov		ecx,bytes
			cmp		ecx,4096		// clamp to 4kB
			jbe		skipClamp		// unsigned branch: bytes is an unsigned int, a signed
									// jle would let sizes >= 2GB bypass the clamp
			mov		ecx,4096
		skipClamp:
			add		ecx,0x1f
			shr		ecx,5			// number of 32-byte lines, rounded up
			jz		skip			// bytes == 0: nothing to touch
			jmp		loopie			// hop over the alignment padding
			align	16
		loopie:
			test	byte ptr [ebx],al	// read-only touch; only the memory access matters
			add		ebx,32
			dec		ecx
			jnz		loopie
		skip:
		}
		break;
	}
}
// Memory copy built from 32-bit moves (32-bit x86, MSVC-style inline asm).
//
//   dest  - destination buffer, at least count bytes
//   src   - source buffer, at least count bytes
//   count - number of bytes to copy; 0 is allowed and copies nothing
//
// The source is first passed to Q_Prefetch (PRE_READ). Whole 32-byte blocks
// are then copied starting with the highest-addressed block and walking down,
// followed by a 16/8/4/2/1 byte tail. No alignment is required of either
// pointer. Overlapping buffers are not handled specially. Unlike the standard
// memcpy, nothing is returned.
//
// Register roles inside the __asm block:
//   ebx = src, edx = dest, ecx = bytes remaining,
//   edi = byte offset of the current 32-byte block, eax/esi = data scratch
void Q_memcpy (void *dest, const void *src, const size_t count) {
	Q_Prefetch (src, count, PRE_READ);
	__asm
	{
		push	edi
		push	esi
		mov		ecx,count
		test	ecx,ecx					// count = 0 check (just to be on the safe side)
		jz		outta
		mov		edx,dest
		mov		ebx,src
		cmp		ecx,32					// padding only?
		jb		padding					// unsigned: count is a size_t
		mov		edi,ecx
		and		edi,~31					// edi = count&~31
		sub		edi,32					// offset of the last whole 32-byte block
		align	16
	loopMisAligned:
		mov		eax,[ebx + edi + 0 + 0*8]
		mov		esi,[ebx + edi + 4 + 0*8]
		mov		[edx+edi+0 + 0*8],eax
		mov		[edx+edi+4 + 0*8],esi
		mov		eax,[ebx + edi + 0 + 1*8]
		mov		esi,[ebx + edi + 4 + 1*8]
		mov		[edx+edi+0 + 1*8],eax
		mov		[edx+edi+4 + 1*8],esi
		mov		eax,[ebx + edi + 0 + 2*8]
		mov		esi,[ebx + edi + 4 + 2*8]
		mov		[edx+edi+0 + 2*8],eax
		mov		[edx+edi+4 + 2*8],esi
		mov		eax,[ebx + edi + 0 + 3*8]
		mov		esi,[ebx + edi + 4 + 3*8]
		mov		[edx+edi+0 + 3*8],eax
		mov		[edx+edi+4 + 3*8],esi
		sub		edi,32
		jae		loopMisAligned			// loop until the offset borrows below zero; a signed
										// jge would quit early once the offset reaches 2GB
		mov		edi,ecx
		and		edi,~31
		add		ebx,edi					// increase src pointer
		add		edx,edi					// increase dst pointer
		and		ecx,31					// new count
		jz		outta					// if count = 0, get outta here
	padding:							// ecx is 1..31 from here on
		cmp		ecx,16
		jb		skip16
		mov		eax,dword ptr [ebx]
		mov		dword ptr [edx],eax
		mov		eax,dword ptr [ebx+4]
		mov		dword ptr [edx+4],eax
		mov		eax,dword ptr [ebx+8]
		mov		dword ptr [edx+8],eax
		mov		eax,dword ptr [ebx+12]
		mov		dword ptr [edx+12],eax
		sub		ecx,16
		add		ebx,16
		add		edx,16
	skip16:
		cmp		ecx,8
		jb		skip8
		mov		eax,dword ptr [ebx]
		mov		dword ptr [edx],eax
		mov		eax,dword ptr [ebx+4]
		sub		ecx,8
		mov		dword ptr [edx+4],eax
		add		ebx,8
		add		edx,8
	skip8:
		cmp		ecx,4
		jb		skip4
		mov		eax,dword ptr [ebx]		// here 4-7 bytes
		add		ebx,4
		sub		ecx,4
		mov		dword ptr [edx],eax
		add		edx,4
	skip4:								// 0-3 remaining bytes
		cmp		ecx,2
		jb		skip2
		mov		ax,word ptr [ebx]		// two bytes
		cmp		ecx,3					// less than 3?
		mov		word ptr [edx],ax		// mov leaves the flags from the cmp intact
		jb		outta
		mov		al,byte ptr [ebx+2]		// last byte
		mov		byte ptr [edx+2],al
		jmp		outta
	skip2:
		cmp		ecx,1
		jb		outta
		mov		al,byte ptr [ebx]
		mov		byte ptr [edx],al
	outta:
		pop		esi
		pop		edi
	}
}
void *Q_memset(void* dest0, int val, size_t count0)
{
union {
byte bytes[8];
unsigned short words[4];
unsigned int dwords[2];
} dat;
byte *dest = (byte *)dest0;
int count = count0;
while( count > 0 && (((int)dest) & 7) ) {
*dest = val;
dest++;
count--;
}
if ( !count ) {
return dest0;
}
dat.bytes[0] = val;
dat.bytes[1] = val;
dat.words[1] = dat.words[0];
dat.dwords[1] = dat.dwords[0];
if ( count >= 64 ) {
__asm {
mov edi, dest
mov ecx, count
shr ecx, 6 // 64 bytes per iteration
movq mm1, dat // Read in source data
movq mm2, mm1
movq mm3, mm1
movq mm4, mm1
movq mm5, mm1
movq mm6, mm1
movq mm7, mm1
movq mm0, mm1
loop1:
movntq 0[edi], mm1 // Non-temporal stores
movntq 8[edi], mm2
movntq 16[edi], mm3
movntq 24[edi], mm4
movntq 32[edi], mm5
movntq 40[edi], mm6
movntq 48[edi], mm7
movntq 56[edi], mm0
add edi, 64
dec ecx
jnz loop1
}
dest += ( count & ~63 );
count &= 63;
}
if ( count >= 8 ) {
__asm {
mov edi, dest
mov ecx, count
shr ecx, 3 // 8 bytes per iteration
movq mm1, dat // Read in source data
loop2:
movntq 0[edi], mm1 // Non-temporal stores
add edi, 8
dec ecx
jnz loop2
}
dest += (count & ~7);
count &= 7;
}
while( count > 0 ) {
*dest = val;
dest++;
count--;
}
__asm emms
return dest0;
} |
They seem to work pretty nicely, but I'm not sure if they're actually faster. |
|
Back to top |
|
 |
Irritant
Joined: 19 May 2008 Posts: 115 Location: Maryland
|
Posted: Mon Nov 23, 2009 7:18 pm Post subject: |
|
|
The lines in Q_memset starting with movntq are generating the following error "inline assembler syntax error in 'opcode'; found 'constant'"
I suspect it's a compiler issue, as I was using VC 6. _________________ http://red.planetarena.org - Alien Arena |
|
Back to top |
|
 |
reckless
Joined: 24 Jan 2008 Posts: 390 Location: inside tha debugger
|
Posted: Tue Nov 24, 2009 9:59 am Post subject: |
|
|
Got the processor pack? The memset function uses SSE instructions, which may be why the compiler complains.
You can try removing the 64 bit block, since that's what's holding the SSE instructions. |
|
Back to top |
|
 |
Irritant
Joined: 19 May 2008 Posts: 115 Location: Maryland
|
Posted: Tue Nov 24, 2009 7:48 pm Post subject: |
|
|
reckless wrote: | got the processor pack ? cause the memset function uses sse instructions maybe why the compiler bitches
can try by removing the 64 bit block since thats whats holding the sse instructions. |
No I don't, but does that work on XP and Vista? I had heard there were problems with it.
Also, are there additional dependencies needed in Linux for this? Changes in makefile perhaps?
I'd like to give it a try, but I'm really scared of breaking stuff... _________________ http://red.planetarena.org - Alien Arena |
|
Back to top |
|
 |
reckless
Joined: 24 Jan 2008 Posts: 390 Location: inside tha debugger
|
Posted: Wed Nov 25, 2009 3:50 am Post subject: |
|
|
Aye, works just fine. You need a hacked SP5 if on Vista (a prerequisite to the processor pack) though, because the one from Microsoft won't install on Vista; on XP it works right out of the box. I've got it safe and sound here on my FTP, because sometimes I need compatibility libs, and the .NET compilers aren't nice in that regard (runtime library hell on ice).
You could also download a trial version of Intel's compiler (still compatible with MSVC6).
In fact, all MS compilers work fine on Vista if you disable UAC, even VC7 or the old VC4; hell, I even have Borland 6 running.
There should be a link to my FTP in my thread about quake2xp; the package is named vs6sp5_vista.EXE.
In regards to Windows 7, things get a bit tougher (seems MS is hell-bent on killing all their old software).  |
|
Back to top |
|
 |
|
|
You cannot post new topics in this forum You cannot reply to topics in this forum You cannot edit your posts in this forum You cannot delete your posts in this forum You cannot vote in polls in this forum
|
Powered by phpBB © 2004 phpBB Group
|