/*
* (C) Gražvydas "notaz" Ignotas, 2011
*
* This work is licensed under the terms of any of these licenses
* (at your option):
* - GNU GPL, version 2 or later.
* - GNU LGPL, version 2.1 or later.
* See the COPYING file in the top-level directory.
*/
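@ Mixing helpers: accumulate channel samples from ChanBuf into the interleaved
@ stereo accumulator SSumLR (and, in the _rvb variant, also into the reverb
@ buffer at *sRVBStart), using NEON when available and ARMv5E DSP multiplies otherwise.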
#include "arm_features.h"
#ifdef __MACH__
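@ on Mach-O, globals are reached through this local pointer table,
@ which load_varadr below reads with a pc-relative load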
.data
.align 2
ptr_ChanBuf: .word ESYM(ChanBuf)
ptr_SSumLR: .word ESYM(SSumLR)
ptr_sRVBStart: .word ESYM(sRVBStart)
#endif
.text
.align 2
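@ load_varadr reg, var: put the address of the global `var` into `reg`,
@ via movw/movt, the Mach-O pointer table above, or a literal pool load,
@ depending on the build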
.macro load_varadr reg var
#if defined(__ARM_ARCH_7A__) && !defined(__PIC__)
movw \reg, #:lower16:ESYM(\var)
movt \reg, #:upper16:ESYM(\var)
#elif defined(__ARM_ARCH_7A__) && defined(__MACH__)
movw \reg, #:lower16:(ptr_\var-(1678f+8))
movt \reg, #:upper16:(ptr_\var-(1678f+8))
1678:
ldr \reg, [pc, \reg]
#else
ldr \reg, =ESYM(\var)
#endif
.endm
#ifdef __ARM_NEON__
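@ mix_chan: add `count` samples from ChanBuf[start..] into SSumLR,
@ scaled by the volumes lv/rv with a >>14 fixed-point step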
FUNCTION(mix_chan): @ (int start, int count, int lv, int rv)
vmov.32 d14[0], r2
vmov.32 d14[1], r3 @ multipliers
mov r12, r0
load_varadr r0, ChanBuf
load_varadr r2, SSumLR
add r0, r12, lsl #2
add r2, r12, lsl #3
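@ d14 = {lv, rv}; r0 -> ChanBuf[start] (s32 samples),
@ r2 -> SSumLR pair for `start` (interleaved L/R s32); 4 samples per iteration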
0:
vldmia r0!, {d0-d1}
vldmia r2, {d2-d5}
vmul.s32 d10, d14, d0[0]
vmul.s32 d11, d14, d0[1]
vmul.s32 d12, d14, d1[0]
vmul.s32 d13, d14, d1[1]
vsra.s32 q1, q5, #14
vsra.s32 q2, q6, #14
subs r1, #4
blt mc_finish
vstmia r2!, {d2-d5}
bgt 0b
nop
bxeq lr
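@ 1..3 samples were left (r1 = remaining - 4):
@ store d2 always, d3 if at least 2 remained, d4 if all 3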
mc_finish:
vstmia r2!, {d2}
cmp r1, #-2
vstmiage r2!, {d3}
cmp r1, #-1
vstmiage r2!, {d4}
bx lr
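@ mix_chan_rvb: like mix_chan, but the scaled samples are also
@ accumulated into the reverb buffer pointed to by sRVBStart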
FUNCTION(mix_chan_rvb): @ (int start, int count, int lv, int rv)
vmov.32 d14[0], r2
vmov.32 d14[1], r3 @ multipliers
mov r12, r0
load_varadr r0, ChanBuf
load_varadr r3, sRVBStart
load_varadr r2, SSumLR
ldr r3, [r3]
add r0, r12, lsl #2
add r2, r12, lsl #3
add r3, r12, lsl #3
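@ as in mix_chan, plus r3 -> reverb pair for `start` (same layout as SSumLR)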
0:
vldmia r0!, {d0-d1}
vldmia r2, {d2-d5}
vldmia r3, {d6-d9}
vmul.s32 d10, d14, d0[0]
vmul.s32 d11, d14, d0[1]
vmul.s32 d12, d14, d1[0]
vmul.s32 d13, d14, d1[1]
vsra.s32 q1, q5, #14
vsra.s32 q2, q6, #14
vsra.s32 q3, q5, #14
vsra.s32 q4, q6, #14
subs r1, #4
blt mcr_finish
vstmia r2!, {d2-d5}
vstmia r3!, {d6-d9}
bgt 0b
nop
bxeq lr
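@ 1..3 samples left: same scheme as mc_finish, mirrored into the reverb buffer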
mcr_finish:
vstmia r2!, {d2}
vstmia r3!, {d6}
cmp r1, #-2
vstmiage r2!, {d3}
vstmiage r3!, {d7}
cmp r1, #-1
vstmiage r2!, {d4}
vstmiage r3!, {d8}
bx lr
#elif defined(HAVE_ARMV5)
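@ ARMv5E path: smlawb/smlawt compute acc += (reg32 * signed16) >> 16;
@ with both the sample and the packed volumes pre-shifted left by 1,
@ ((x << 1) * (v << 1)) >> 16 == (x * v) >> 14, matching the NEON shift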
FUNCTION(mix_chan): @ (int start, int count, int lv, int rv)
stmfd sp!, {r4-r8,lr}
orr r3, r2, r3, lsl #16
lsl r3, #1 @ packed multipliers << 1
mov r12, r0
load_varadr r0, ChanBuf
load_varadr r2, SSumLR
add r0, r12, lsl #2
add r2, r12, lsl #3
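@ r3 = (lv | rv << 16) << 1; r0 -> ChanBuf[start], r2 -> SSumLR pair;
@ 2 samples per iteration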
0:
ldmia r0!, {r4,r5}
ldmia r2, {r6-r8,lr}
lsl r4, #1 @ adjust for mul
lsl r5, #1
smlawb r6, r4, r3, r6
smlawt r7, r4, r3, r7
smlawb r8, r5, r3, r8
smlawt lr, r5, r3, lr
subs r1, #2
blt mc_finish
stmia r2!, {r6-r8,lr}
bgt 0b
ldmeqfd sp!, {r4-r8,pc}
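@ one sample was left (r1 == -1): store only the first L/R pair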
mc_finish:
stmia r2!, {r6,r7}
ldmfd sp!, {r4-r8,pc}
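@ reverb variant: one sample per iteration, accumulated into
@ both SSumLR and the reverb buffer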
FUNCTION(mix_chan_rvb): @ (int start, int count, int lv, int rv)
stmfd sp!, {r4-r8,lr}
orr lr, r2, r3, lsl #16
lsl lr, #1
load_varadr r3, sRVBStart
load_varadr r2, SSumLR
load_varadr r4, ChanBuf
ldr r3, [r3]
add r2, r2, r0, lsl #3
add r3, r3, r0, lsl #3
add r0, r4, r0, lsl #2
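@ lr = packed volumes << 1; r0 -> ChanBuf sample, r2 -> SSumLR pair,
@ r3 -> reverb pair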
0:
ldr r4, [r0], #4
ldmia r2, {r6,r7}
ldmia r3, {r8,r12}
lsl r4, #1
smlawb r6, r4, lr, r6 @ supposedly takes single cycle?
smlawt r7, r4, lr, r7
smlawb r8, r4, lr, r8
smlawt r12,r4, lr, r12
subs r1, #1
stmia r2!, {r6,r7}
stmia r3!, {r8,r12}
bgt 0b
ldmfd sp!, {r4-r8,pc}
#endif
@ vim:filetype=armasm