// al/dspSSE.cpp

//=========================================================
//  OOMidi
//  OpenOctave Midi and Audio Editor
//  $Id: dspSSE.cpp,v 1.1.2.3 2009/12/20 00:04:25 spamatica Exp $
//
//  (C) Copyright 2007-2009 Werner Schweer (ws@seh.de)
//      file originally from Ardour DAW project by Paul Davis (c) 2005
//      licensed through GPL
//    	Original author Sampo Savolainen
//
//  This program is free software; you can redistribute it and/or modify
//  it under the terms of the GNU General Public License version 2.
//
//  This program is distributed in the hope that it will be useful,
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//  GNU General Public License for more details.
//
//  You should have received a copy of the GNU General Public License
//  along with this program; if not, write to the Free Software
//  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
//======================================================================

#; void x86_sse_mix_buffers_with_gain (float *dst, float *src, long nframes, float gain);
#;
#; For every frame i in [0, nframes):  dst[i] += src[i] * gain
#;
#; Syntax: GNU as, AT&T operand order, 32-bit x86.
#; Arguments are read from the stack frame:
#;	 8(%ebp) = float *dst		-> %edi
#;	12(%ebp) = float *src		-> %esi
#;	16(%ebp) = long   nframes	-> %ecx
#;	20(%ebp) = float  gain		-> %xmm1
#;
#; %ebx, %edi and %esi are saved and restored.
#; %eax, %ecx, %xmm0, %xmm1 and the flags are overwritten.
#;
#; Three phases are used when dst and src share the same offset inside a
#; 16 byte line: scalar lead-in frames until both pointers are 16 byte
#; aligned, packed groups of four frames, then scalar tail frames.
#; When the offsets differ, every frame is processed by the scalar loop.

.globl x86_sse_mix_buffers_with_gain
.type x86_sse_mix_buffers_with_gain, @function

x86_sse_mix_buffers_with_gain:

	pushl	%ebp
	movl	%esp, %ebp

	#; save the registers
	pushl	%ebx
	pushl	%edi
	pushl	%esi

	#; if nframes == 0, go to end
	movl	16(%ebp), %ecx		#; nframes
	testl	%ecx, %ecx
	jz	.MBWG_END

	#; Check for alignment

	movl	8(%ebp), %edi		#; dst
	movl	12(%ebp), %esi		#; src

	#; All four low address bits are compared.  movaps/addps below need
	#; a 16 byte aligned address, so an offset that is not a multiple of
	#; 4 must never reach the packed loop.
	movl	%edi, %eax
	andl	$15, %eax		#; dst offset inside its 16 byte line

	movl	%esi, %ebx
	andl	$15, %ebx		#; src offset inside its 16 byte line

	cmpl	%eax, %ebx
	jne	.MBWG_NONALIGN		#; offsets differ: scalar loop only

	#; same offset; if it is zero both pointers are already aligned
	testl	%ebx, %ebx
	jz	.MBWG_SSE

	#; Pre-loop: run scalar frames until the shared offset reaches 16.
	#; With a 4 byte aligned buffer that takes 1-3 frames.  An offset
	#; that is not a multiple of 4 never equals 16, so such a buffer is
	#; processed entirely here and the loop ends when nframes hits 0.

	movss	20(%ebp), %xmm1		#; gain => xmm1 (low float)

.MBWG_PRELOOP:

	movss	(%esi), %xmm0		#; xmm0 = *src
	mulss	%xmm1, %xmm0		#; xmm0 *= gain
	addss	(%edi), %xmm0		#; xmm0 += *dst
	movss	%xmm0, (%edi)		#; *dst = xmm0

	addl	$4, %edi		#; dst++
	addl	$4, %esi		#; src++
	decl	%ecx			#; nframes--
	jz	.MBWG_END		#; out of frames

	addl	$4, %ebx		#; offset of the next frame
	cmpl	$16, %ebx		#; reached 16 byte alignment?
	jne	.MBWG_PRELOOP

.MBWG_SSE:

	#; nframes is not zero here, but the packed loop needs at least 4
	cmpl	$4, %ecx
	jl	.MBWG_NONALIGN		#; fewer than 4: scalar loop

	#; copy gain into all four floats of %xmm1
	movss	20(%ebp), %xmm1
	shufps	$0x00, %xmm1, %xmm1

.MBWG_SSELOOP:

	movaps	(%esi), %xmm0		#; source => xmm0
	mulps	%xmm1, %xmm0		#; apply gain to source
	addps	(%edi), %xmm0		#; mix with destination
	movaps	%xmm0, (%edi)		#; copy result to destination

	addl	$16, %edi		#; dst += 4
	addl	$16, %esi		#; src += 4

	subl	$4, %ecx		#; nframes -= 4
	cmpl	$4, %ecx
	jge	.MBWG_SSELOOP

	testl	%ecx, %ecx
	jz	.MBWG_END

	#; 1-3 frames remain; the scalar loop below finishes them

.MBWG_NONALIGN:

	movss	20(%ebp), %xmm1		#; gain => xmm1 (low float)

.MBWG_NONALIGNLOOP:

	movss	(%esi), %xmm0		#; xmm0 = *src
	mulss	%xmm1, %xmm0		#; xmm0 *= gain
	addss	(%edi), %xmm0		#; xmm0 += *dst
	movss	%xmm0, (%edi)		#; *dst = xmm0

	addl	$4, %edi
	addl	$4, %esi

	decl	%ecx
	jnz	.MBWG_NONALIGNLOOP

.MBWG_END:

	#; restore the registers in reverse order of the pushes
	popl	%esi
	popl	%edi
	popl	%ebx

	#; return
	leave
	ret

.size x86_sse_mix_buffers_with_gain, .-x86_sse_mix_buffers_with_gain


#; void x86_sse_mix_buffers_no_gain (float *dst, float *src, long nframes);
#;
#; For every frame i in [0, nframes):  dst[i] += src[i]
#;
#; Syntax: GNU as, AT&T operand order, 32-bit x86.
#; Arguments are read from the stack frame:
#;	 8(%ebp) = float *dst		-> %edi
#;	12(%ebp) = float *src		-> %esi
#;	16(%ebp) = long   nframes	-> %ecx
#;
#; %ebx, %edi and %esi are saved and restored.
#; %eax, %ecx, %xmm0 and the flags are overwritten.
#;
#; When dst and src share the same offset inside a 16 byte line the work
#; is split into scalar lead-in frames, packed groups of four frames and
#; scalar tail frames.  Otherwise every frame goes through the scalar loop.

.globl x86_sse_mix_buffers_no_gain
.type x86_sse_mix_buffers_no_gain, @function

x86_sse_mix_buffers_no_gain:

	pushl	%ebp
	movl	%esp, %ebp

	#; save the registers
	pushl	%ebx
	pushl	%edi
	pushl	%esi

	#; if nframes == 0, go to end
	movl	16(%ebp), %ecx		#; nframes
	testl	%ecx, %ecx
	jz	.MBNG_END

	#; Check for alignment

	movl	8(%ebp), %edi		#; dst
	movl	12(%ebp), %esi		#; src

	#; All four low address bits are compared.  movaps/addps below need
	#; a 16 byte aligned address, so an offset that is not a multiple of
	#; 4 must never reach the packed loop.
	movl	%edi, %eax
	andl	$15, %eax		#; dst offset inside its 16 byte line

	movl	%esi, %ebx
	andl	$15, %ebx		#; src offset inside its 16 byte line

	cmpl	%eax, %ebx
	jne	.MBNG_NONALIGN		#; offsets differ: scalar loop only

	#; same offset; if it is zero both pointers are already aligned
	testl	%ebx, %ebx
	jz	.MBNG_SSE

	#; Pre-loop: run scalar frames until the shared offset reaches 16.
	#; With a 4 byte aligned buffer that takes 1-3 frames.  An offset
	#; that is not a multiple of 4 never equals 16, so such a buffer is
	#; processed entirely here and the loop ends when nframes hits 0.

.MBNG_PRELOOP:

	movss	(%esi), %xmm0		#; xmm0 = *src
	addss	(%edi), %xmm0		#; xmm0 += *dst
	movss	%xmm0, (%edi)		#; *dst = xmm0

	addl	$4, %edi		#; dst++
	addl	$4, %esi		#; src++
	decl	%ecx			#; nframes--
	jz	.MBNG_END		#; out of frames

	addl	$4, %ebx		#; offset of the next frame
	cmpl	$16, %ebx		#; reached 16 byte alignment?
	jne	.MBNG_PRELOOP

.MBNG_SSE:

	#; frames are left, but the packed loop needs at least 4 of them
	cmpl	$4, %ecx
	jl	.MBNG_NONALIGN

.MBNG_SSELOOP:

	movaps	(%esi), %xmm0		#; source => xmm0
	addps	(%edi), %xmm0		#; mix with destination
	movaps	%xmm0, (%edi)		#; copy result to destination

	addl	$16, %edi		#; dst += 4
	addl	$16, %esi		#; src += 4

	subl	$4, %ecx		#; nframes -= 4
	cmpl	$4, %ecx
	jge	.MBNG_SSELOOP

	testl	%ecx, %ecx
	jz	.MBNG_END

	#; 1-3 frames remain; the scalar loop below finishes them

.MBNG_NONALIGN:

	movss	(%esi), %xmm0		#; src => xmm0
	addss	(%edi), %xmm0		#; xmm0 += dst
	movss	%xmm0, (%edi)		#; xmm0 => dst

	addl	$4, %edi
	addl	$4, %esi

	decl	%ecx
	jnz	.MBNG_NONALIGN

.MBNG_END:

	#; restore the registers in reverse order of the pushes
	popl	%esi
	popl	%edi
	popl	%ebx

	#; return
	leave
	ret

.size x86_sse_mix_buffers_no_gain, .-x86_sse_mix_buffers_no_gain


#; void x86_sse_apply_gain_to_buffer (float *buf, long nframes, float gain);
#;
#; For every frame i in [0, nframes):  buf[i] *= gain
#;
#; Syntax: GNU as, AT&T operand order, 32-bit x86.
#; Arguments are read from the stack frame:
#;	 8(%ebp) = float *buf		-> %edi
#;	12(%ebp) = long   nframes	-> %ecx
#;	16(%ebp) = float  gain		-> %xmm1 (copied to all four floats)
#;
#; %edi is saved and restored.
#; %eax, %ecx, %edx, %xmm0, %xmm1 and the flags are overwritten.

.globl x86_sse_apply_gain_to_buffer
.type x86_sse_apply_gain_to_buffer, @function

x86_sse_apply_gain_to_buffer:

	pushl	%ebp
	movl	%esp, %ebp

	#; save %edi
	pushl	%edi

	#; if nframes == 0, go to end
	movl	12(%ebp), %ecx		#; nframes
	testl	%ecx, %ecx
	jz	.AG_END

	#; create the gain buffer in %xmm1
	movss	16(%ebp), %xmm1
	shufps	$0x00, %xmm1, %xmm1

	#; Check for alignment.  All four low address bits are tested because
	#; movaps below needs a 16 byte aligned address.

	movl	8(%ebp), %edi		#; buf
	movl	%edi, %edx		#; buf => %edx
	andl	$15, %edx		#; offset of buf inside its 16 byte line
	jz	.AG_SSE			#; buffer IS aligned

	#; PRE-LOOP
	#; Scale single frames with scalar SSE until the offset reaches 16.
	#; With a 4 byte aligned buffer that takes 1-3 frames.  An offset
	#; that is not a multiple of 4 never equals 16, so such a buffer is
	#; processed entirely here and the loop ends when nframes hits 0.

.AGLP_START:

	movss	(%edi), %xmm0		#; xmm0 = *buf
	mulss	%xmm1, %xmm0		#; xmm0 *= gain
	movss	%xmm0, (%edi)		#; *buf = xmm0

	#; increment buffer, decrement counter
	addl	$4, %edi		#; buf++
	decl	%ecx			#; nframes--
	jz	.AG_END			#; out of frames

	addl	$4, %edx		#; offset of the next frame
	cmpl	$16, %edx
	jne	.AGLP_START		#; not aligned yet

.AG_SSE:

	#; "buf" (%edi) is 16 byte aligned and %ecx is not zero here

	#; %eax = number of packed iterations = remaining nframes / 4
	movl	%ecx, %eax
	shrl	$2, %eax		#; unsigned divide by 4
	jz	.AGPOST_START		#; fewer than 4 frames: scalar tail only

.AGLP_SSE:

	movaps	(%edi), %xmm0
	mulps	%xmm1, %xmm0
	movaps	%xmm0, (%edi)

	addl	$16, %edi		#; buf += 4

	decl	%eax
	jnz	.AGLP_SSE

	#; %ecx still holds the frame count from before the packed loop;
	#; the frames left for the tail are that count modulo 4
	andl	$3, %ecx
	jz	.AG_END

.AGPOST_START:

	movss	(%edi), %xmm0		#; xmm0 = *buf
	mulss	%xmm1, %xmm0		#; xmm0 *= gain
	movss	%xmm0, (%edi)		#; *buf = xmm0

	#; increment buffer, decrement counter
	addl	$4, %edi		#; buf++
	decl	%ecx			#; nframes--
	jnz	.AGPOST_START		#; loop while tail frames remain

.AG_END:

	popl	%edi

	#; return
	leave
	ret

.size x86_sse_apply_gain_to_buffer, .-x86_sse_apply_gain_to_buffer
#; end proc


#; float x86_sse_compute_peak(float *buf, long nframes, float current);
#;
#; Returns the larger of "current" and the largest absolute value found
#; in buf[0 .. nframes-1].  With nframes == 0 "current" is returned.
#;
#; Syntax: GNU as, AT&T operand order, 32-bit x86.
#; Arguments are read from the stack frame:
#;	 8(%ebp) = float *buf		-> %edi
#;	12(%ebp) = long   nframes	-> %ecx
#;	16(%ebp) = float  current	-> %xmm0 (running maximum)
#;
#; The result is returned on the x87 stack in st(0).  The argument slot
#; at 16(%ebp) is reused as scratch to move it there from %xmm0.
#;
#; %edi is saved and restored.
#; %eax, %ecx, %edx, %xmm0, %xmm1, %xmm2 and the flags are overwritten.

.globl x86_sse_compute_peak
.type x86_sse_compute_peak, @function

x86_sse_compute_peak:

	pushl	%ebp
	movl	%esp, %ebp

	#; save %edi
	pushl	%edi

	#; Load "current" in xmm0
	movss	16(%ebp), %xmm0

	#; if nframes == 0, go to end
	movl	12(%ebp), %ecx		#; nframes
	testl	%ecx, %ecx
	jz	.CP_END

	#; create the "abs" mask in %xmm2: every bit set except the sign
	#; bit, copied to all four floats
	pushl	$0x7fffffff
	movss	(%esp), %xmm2
	addl	$4, %esp		#; drop the temporary again
	shufps	$0x00, %xmm2, %xmm2

	#; Check for alignment.  All four low address bits are tested because
	#; movaps below needs a 16 byte aligned address.

	movl	8(%ebp), %edi		#; buf
	movl	%edi, %edx		#; buf => %edx
	andl	$15, %edx		#; offset of buf inside its 16 byte line
	jz	.CP_SSE			#; buffer IS aligned

	#; PRE-LOOP
	#; Fold single frames into the maximum with scalar SSE until the
	#; offset reaches 16.  With a 4 byte aligned buffer that takes 1-3
	#; frames.  An offset that is not a multiple of 4 never equals 16,
	#; so such a buffer is processed entirely here and the loop ends
	#; when nframes hits 0.

.LP_START:

	movss	(%edi), %xmm1		#; next value from the buffer
	andps	%xmm2, %xmm1		#; clear the sign bit
	maxss	%xmm1, %xmm0		#; keep the larger one

	#; increment buffer, decrement counter
	addl	$4, %edi		#; buf++
	decl	%ecx			#; nframes--
	jz	.CP_END			#; out of frames

	addl	$4, %edx		#; offset of the next frame
	cmpl	$16, %edx
	jne	.LP_START		#; not aligned yet

.CP_SSE:

	#; "buf" (%edi) is 16 byte aligned and %ecx is not zero here

	#; %eax = number of packed iterations = remaining nframes / 4
	movl	%ecx, %eax
	shrl	$2, %eax		#; unsigned divide by 4
	jz	.POST_START		#; fewer than 4 frames: scalar tail only

	#; the current maximum is in the low float of %xmm0;
	#; copy it to all four floats
	shufps	$0x00, %xmm0, %xmm0

.LP_SSE:

	movaps	(%edi), %xmm1
	andps	%xmm2, %xmm1		#; clear the sign bits
	maxps	%xmm1, %xmm0		#; four running maxima

	addl	$16, %edi		#; buf += 4

	decl	%eax
	jnz	.LP_SSE

	#; Reduce the four maxima in %xmm0 to a single value
	movaps	%xmm0, %xmm1
	shufps	$0x4e, %xmm1, %xmm1	#; swap left & right pairs (1234 => 3412)
	maxps	%xmm1, %xmm0		#; maximums of the two pairs
	movaps	%xmm0, %xmm1
	shufps	$0xb1, %xmm1, %xmm1	#; swap inside the pairs (1234 => 2143)
	maxps	%xmm1, %xmm0

	#; now every float in %xmm0 holds the current maximum

	#; %ecx still holds the frame count from before the packed loop;
	#; the frames left for the tail are that count modulo 4
	andl	$3, %ecx
	jz	.CP_END

.POST_START:

	movss	(%edi), %xmm1		#; next value from the buffer
	andps	%xmm2, %xmm1		#; clear the sign bit
	maxss	%xmm1, %xmm0		#; keep the larger one

	addl	$4, %edi		#; buf++
	decl	%ecx			#; nframes--
	jnz	.POST_START

.CP_END:

	#; Move the low float of %xmm0 to st(0) through memory for returning
	movss	%xmm0, 16(%ebp)
	flds	16(%ebp)

	popl	%edi

	#; return
	leave
	ret

.size x86_sse_compute_peak, .-x86_sse_compute_peak
#; end proc

#; Emit an empty .note.GNU-stack section in ELF builds.
#ifdef __ELF__
.section .note.GNU-stack,"",%progbits
#endif