1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
|
/*
* (C) Gražvydas "notaz" Ignotas, 2011
*
* This work is licensed under the terms of GNU GPL version 2 or later.
* See the COPYING file in the top-level directory.
*/
@ 128-byte zero-initialised work area, aligned to 64 bytes. The routines
@ in this file use it to hand intermediate NEON results over to the ARM
@ side for flag generation.
.bss
.p2align 6 @ cacheline
scratch:
.space 8*8*2
.text
.p2align 2
@ XXX: the gteMAC calculation should not saturate, but it does here,
@ so overflow is detected by looking for the saturation limits instead.
@
@ do_mac_flags - approximate gteMAC|123 overflow flags
@ in:    rr1-3 - gteMAC1, gteMAC2, gteMAC3
@        lr    - flag accumulator
@ out:   lr    - flag bits ORed in; a value of 0x80000000 raises the
@                negative flag (with bit 31), 0x7fffffff the positive one
@ trash: nothing else (condition flags are modified)
.macro do_mac_flags rr1 rr2 rr3
    @ gteMAC1
    cmp         \rr1, #1       @ V only if the value is 0x80000000
    orrvs       lr, lr, #(1<<31)|(1<<27)
    cmn         \rr1, #1       @ V only if the value is 0x7fffffff
    orrvs       lr, lr, #(1<<30)
    @ gteMAC2
    cmp         \rr2, #1
    orrvs       lr, lr, #(1<<31)|(1<<26)
    cmn         \rr2, #1
    orrvs       lr, lr, #(1<<29)
    @ gteMAC3
    cmp         \rr3, #1
    orrvs       lr, lr, #(1<<31)|(1<<25)
    cmn         \rr3, #1
    orrvs       lr, lr, #(1<<28)
.endm
@ approximate 3x gteMACn flags
@ in: rr 123 as 3 instances gteMACn, *flags
@ trash: nothing
@ (lr accumulates the flags; the condition flags are modified)
@ nflags is ORed into lr when any of the three values is 0x80000000,
@ pflags when any of them is 0x7fffffff. Each cmp/cmn only sets V when
@ its operand sits exactly on that limit, and the conditional vc forms
@ stop comparing once V has been set, so V survives until the orrvs.
.macro do_mac_flags3x rr1 rr2 rr3 nflags pflags
cmp \rr1, #1
cmpvc \rr2, #1
cmpvc \rr3, #1
orrvs lr, #\nflags
cmn \rr1, #1 @ adds ...
cmnvc \rr2, #1
cmnvc \rr3, #1
orrvs lr, #\pflags
.endm
@ get gteIR|123 flags from gteMAC|123
@ in: rr 123 as gteMAC|123
@ trash: r2,r3
@ (lr accumulates the flags; the condition flags are modified)
@ A value fits in s16 exactly when (value + 0x8000) has no bits set above
@ bit 15, so a non-zero result of the 16-bit right shift means the
@ corresponding gteIR saturated. The add for rr3 is slotted between the
@ second lsrs and its orrne's; add does not touch the condition flags.
@ Bit 31 is raised together with bits 24 and 23 but not with bit 22.
.macro do_irs_flags rr1 rr2 rr3
add r2, \rr1, #0x8000
add r3, \rr2, #0x8000
lsrs r2, #16
orrne lr, #(1<<31)|(1<<24) @ IR1/limB1
lsrs r3, #16
add r2, \rr3, #0x8000
orrne lr, #(1<<31)
orrne lr, #(1<<23) @ IR2/limB2
lsrs r2, #16
orrne lr, #(1<<22) @ IR3/limB3
.endm
/*
* RTPS/RTPT register map:
*
* q | d | c code / phase 1 phase 2 scratch
* 0 0 gteR1* [s16] gteMAC3 = gteMAC3 \ v=0 *
* 1 gteR2* gteIR1-3 = gteIR1-3 / *
* 1 2 gteR3* gteMAC3 = gteMAC3 \ v=1
* 3 * gteIR1-3 = gteIR1-3 /
* 2 4 gteTRX<<12 [s64] gteOFX [s64] gteMAC3 \ v=2
* 5 gteTRY<<12 gteOFY [s64] gteIR1-3 /
* 3 6 gteTRZ<<12 gteDQA [s64] min gteMAC|12 v=012
* 7 0 gteDQB [s64] max gteMAC|12
* 4 8 VXYZ(v) / gteMAC1,2 [s32] min gteIR|123
* 9 * / gteMAC3 max gteIR|123
* 5 10 gteIR1-3 [s16] gteIR1-3 v=2 quotients 12
* 11 0 quotient 3
* 6 12 gteH (adj. for cmp)
* 13 gteH (float for div)
* ... <scratch>
* 15 30 0
* 31 0
*/
@ load gteR*, gteTR* and gteH (see map above), clear q15
@ in: r0 - context
@ trash: r3
@ out: d0,d1,d2 - one gteR* row each in lanes 0-2; lane 3 holds a
@                 neighbouring element (callers zero the matching VXYZ
@                 lane so it does not contribute to the products)
@      d4,d5,d6 - gteTRX, gteTRY, gteTRZ widened to s64 and shifted
@                 left by 12; d7 receives the context word following
@                 gteTRZ treated the same way
@      d12[0]   - low halfword of the gteH word, sign-extended to 32 bit
@      q15      - 0
.macro rtpx_preload
add r3, r0, #4*32
vldmia r3, {d0-d2} @ gteR* [16*9]
vmov.i32 q15, #0
add r3, r0, #4*(32+5)
vldmia r3, {d4-d5} @ gteTR*
@ the nine s16 matrix elements are packed back to back; realign them
@ so that every row starts at lane 0 of its own d register
vext.16 d2, d1, d2, #2 @ xxx3 -> x321
vext.16 d1, d0, d1, #3 @ xx32 -> x321
add r3, r0, #4*(32+26)
vld1.32 d11[0], [r3] @ gteH
vshll.s32 q3, d5, #12 @ gteTRZ
vshll.s32 q2, d4, #12 @ gteTR|XY
vmovl.s16 q6, d11 @ gteH
.endm
@ do RTP* gteMAC* calculation
@ in: gteR*, gteTR* as in map, d8 - VXYZ, r12 - 0
@ out: d8,d9 - gteMAC|123, d10 - gteIR|123
@ trash: d16-d21
@ Each gteMACn is (row n of gteR* dot VXYZ) + (gteTRn << 12), summed in
@ 64 bit and then narrowed with a saturating right shift by 12.
.macro rtpx_mac
@ 4 x s32 products per matrix row
vmull.s16 q8, d0, d8
vmull.s16 q9, d1, d8
vmull.s16 q10, d2, d8
@ horizontal sum of the four products of each row into one s64
vpaddl.s32 q8, q8
vpaddl.s32 q9, q9
vpaddl.s32 q10, q10
vadd.s64 d16, d17 @ d16=d0.16[2]*d8.16[2], as
vadd.s64 d18, d19 @ d8[3]==0, so won't affect
vadd.s64 d20, d21 @ QC
@ add the pre-shifted translation
vadd.s64 d16, d4
vadd.s64 d18, d5
vadd.s64 d20, d6
vqshrn.s64 d8, q8, #12 @ gteMAC1
vqshrn.s64 d18, q9, #12 @ gteMAC2
vqshrn.s64 d9, q10, #12 @ gteMAC3
@ pack gteMAC2 above gteMAC1 in d8 and zero the upper word of d9
vsli.u64 d8, d18, #32 @ gteMAC|12
vmov.32 d9[1], r12
@ saturating narrow of q4 (d8,d9) to s16 lanes
vqmovn.s32 d10, q4 @ gteIR|123; losing 2 cycles?
.endm
@ gteRTPS_neon - transform VXYZ(0) by gteR*/gteTR* and project it
@ in:    r0 - CP2 context (data regs, control regs at +4*32)
@ out:   written back to the context: gteMAC0-3, gteIR0-3, the gteSZ
@        and fS|XY fifos and gteFLAG
@ uses:  'scratch' to pass NEON results to the ARM side for flag checks
@ trash: r1-r3, r12, d0-d2, d4-d7, d16-d22, d26, d28, q15, flags
@ d8-d15 are callee-saved under AAPCS just like r4-r6, and all of them
@ are used below, so they are saved on entry and restored on exit.
.global gteRTPS_neon @ r0=CP2 (d,c),
gteRTPS_neon:
    push        {r4-r6,lr}
    vpush       {d8-d15}       @ callee-saved VFP/NEON regs (q4-q7)

@    fmrx        r4, fpscr      @ vmrs? at least 40 cycle hit
    movw        r1, #:lower16:scratch
    movt        r1, #:upper16:scratch
    mov         r12, #0

    vldmia      r0, {d8}       @ VXYZ(0)
    rtpx_preload

@    rtpx_mac                  @ slower here, faster in RTPT?
    vmov.16     d8[3], r12     @ kill unused upper vector
    vmull.s16   q8, d0, d8
    vmull.s16   q9, d1, d8
    vmull.s16   q10, d2, d8
    vpadd.s32   d16, d16, d17
    vpadd.s32   d17, d18, d19
    vpadd.s32   d18, d20, d21
    vpadal.s32  q2, q8
    vpadal.s32  q3, q9         @ d6, d18 is slow?
    vqshrn.s64  d8, q2, #12    @ gteMAC|12
    vqshrn.s64  d9, q3, #12    @ gteMAC3

    add         r3, r0, #4*25
    vst1.32     d8, [r3]!
    vst1.32     d9[0], [r3]    @ wb gteMAC|123
    vqmovn.s32  d10, q4        @ gteIR|123

    add         r3, r0, #4*17  @ gteSZ*
    vldmia      r3, {q7}       @ d14,d15 gteSZ|123x
    vmov.i32    d28, #0xffff   @ 0xffff[32]
    vmax.s32    d11, d9, d31
    vshr.s32    d16, d12, #1   @ | gteH/2 (adjust for cmp)
    vmov.i32    d26, #1
    vmin.u32    d11, d28       @ saturate to 0..0xffff limD/fSZ3
    vmovl.s16   q9, d10        @ || expand gteIR|123
    vshl.u32    d13, d12, #16  @ | preparing gteH
    add         r3, r0, #4*9
    vst1.32     d18, [r3]!
    vst1.32     d19[0], [r3]

    vsli.u64    d15, d11, #32  @ new gteSZ|0123 in q7
    vclt.u32    d16, d16, d11  @ gteH/2 < fSZ3?

    add         r3, r0, #4*(32+24)
    vld1.32     d4, [r3]       @ || gteOF|XY
    add         r3, r0, #4*(32+27)
    vld1.32     d6, [r3]       @ || gteDQ|AB

    vand        d11, d16
    vmovl.s32   q2, d4         @ || gteOF|XY [64]
    vmax.u32    d11, d26       @ make divisor 1 if not
    vmovl.s32   q3, d6         @ || gteDQ|AB [64]
    add         r3, r0, #4*16  @ | gteSZ*
    vstmia      r3, {q7}       @ | d14,d15 gteSZ|123x

    vcvt.f32.u32 d13, d13      @ gteH (float for div)
    vcvt.f32.u32 d11, d11      @ divisor

    @ divide.. it's not worth messing with reciprocals here
    @ just for 1 value, let's just use VFP divider here
    vdiv.f32    s22, s26, s22

    vcvt.u32.f32 d11, d11      @ quotient

    @ while NEON's busy we calculate some flags on ARM
    add         r3, r0, #4*25
    mov         lr, #0         @ gteFLAG
    ldmia       r3, {r4-r6}    @ gteMAC|123

    vst1.32     d11, [r1, :64] @ wb quotient for flags (pre-limE)
    vqshl.u32   d11, #15

    do_mac_flags r4, r5, r6

    vshr.u32    d11, #15       @ quotient (limE)

    do_irs_flags r4, r5, r6

    vmlal.s32   q2, d18, d11[0]@ gteOF|XY + gteIR|12 * quotient
    add         r3, r0, #4*13
    vld1.32     d16, [r3]      @ || load fS|XY12, new 01
    vqmovn.s64  d18, q2        @ saturate to 32
    vmull.s32   q10, d6, d11[0]@ | d20 = gteDQA * quotient
    vqshl.s32   d19, d18, #5   @ 11bit precision

    ldr         r4, [r1]       @ quotient
    movs        r3, r6, lsr #16
    orrne       lr, #(1<<31)
    orrne       lr, #(1<<18)   @ fSZ (limD)

    vst1.32     d18, [r1, :64] @ writeback fS|XY2 before limG

    vshr.s32    d18, d19, #16+5@ can't vqshrn because of insn
    vadd.s64    d20, d7        @ | gteDQB + gteDQA * quotient
    vmovn.s32   d18, q9        @ fS|XY2 [s16]

    vqmovn.s64  d20, q10       @ | gteMAC0
    add         r3, r0, #4*12
    vst1.32     d16, [r3]!     @ writeback fS|XY01
    vst1.32     d18[0], [r3]   @ ...2
    add         r3, r0, #4*24
    vshr.s32    d21, d20, #12
    vst1.32     d20[0], [r3]   @ gteMAC0

    movs        r4, r4, lsr #17
    orrne       lr, #(1<<31)
    orrne       lr, #(1<<17)   @ limE

    vmax.s32    d21, d31
    vmov.i32    d22, #0x1000
    vmin.s32    d21, d22
    add         r3, r0, #4*8
    vst1.16     d21[0], [r3]   @ gteIR0

    ldmia       r1, {r4,r5}    @ fS|XY2 before limG, after 11bit sat
    add         r2, r4, #0x400<<16
    add         r3, r5, #0x400<<16
    lsrs        r2, #16+11
    orrne       lr, #(1<<14)   @ limG1
    orrne       lr, #(1<<31)
    lsrs        r3, #16+11
    orrne       lr, #(1<<13)   @ limG2
    orrne       lr, #(1<<31)
    adds        r2, r4, #1
    addvcs      r3, r5, #1
    orrvs       lr, #(1<<16)   @ F
    orrvs       lr, #(1<<31)
    subs        r2, r4, #1
    subvcs      r3, r5, #1
    orrvs       lr, #(1<<31)

    ldr         r4, [r0, #4*24] @ gteMAC0
    orrvs       lr, #(1<<15)

    adds        r3, r4, #1
    orrvs       lr, #(1<<16)   @ F
    orrvs       lr, #(1<<31)
    subs        r2, r4, #1
    orrvs       lr, #(1<<15)   @ F
    orrvs       lr, #(1<<31)
    cmp         r4, #0x1000
    orrhi       lr, #(1<<12)   @ limH

    str         lr, [r0, #4*(32+31)] @ gteFLAG

    vpop        {d8-d15}
    pop         {r4-r6,pc}
    .size       gteRTPS_neon, .-gteRTPS_neon
@ gteRTPT_neon - transform and project VXYZ(0), VXYZ(1) and VXYZ(2)
@ in:    r0 - CP2 context (data regs, control regs at +4*32)
@ out:   written back to the context: gteMAC0-3 and gteIR0-3 (from the
@        last vertex), the gteSZ and fS|XY fifos and gteFLAG
@ uses:  'scratch'; phase 1 stores per-vertex gteMAC3/gteIR|123 and the
@        gteMAC|12 min/max there, phase 2 reads them back and also uses
@        it to pass min/max values and quotients to the ARM flag code
@ trash: r1-r3, r12, q0-q3, q8-q15, flags
@ d8-d15 are callee-saved under AAPCS just like r4-r11, and all of them
@ are used below, so they are saved on entry and restored on exit.
.global gteRTPT_neon @ r0=CP2 (d,c),
gteRTPT_neon:
    push        {r4-r11,lr}
    vpush       {d8-d15}       @ callee-saved VFP/NEON regs (q4-q7)

    movw        r1, #:lower16:scratch
    movt        r1, #:upper16:scratch
    mov         r12, #0

    rtpx_preload

    vmov.i32    d22, #0x7fffffff
    vmov.i32    d23, #0x80000000
    mov         r3, #3         @ counter
    mov         r2, r0         @ VXYZ(0)
0:
    vldmia      r2!, {d8}      @ VXYZ(v)
    vmov.16     d8[3], r12     @ kill unused upper vector

    rtpx_mac
    vmin.s32    d22, d8        @ min gteMAC|12
    vmax.s32    d23, d8        @ max gteMAC|12
    subs        r3, #1
    vst1.32     {d9,d10}, [r1, :128]!
    bgt         0b

    vst1.32     {d22,d23}, [r1, :128]! @ min/max gteMAC|12, for flags

    @ - phase2 -
    sub         r1, r1, #8*2*4
    vldmia      r1, {d0-d3}    @ note: d4,d5 is for gteOF|XY

    vmov        d20, d0        @ gteMAC3 v=0
    vmin.s16    d24, d1, d3    @ | find min IR
    vshr.s32    d22, d12, #1   @ || gteH/2 (adjust for cmp)
    vmax.s16    d25, d1, d3    @ | .. also max, for flag gen
    vsli.u64    d20, d2, #32   @ gteMAC3 v=1
    vmov        d21, d9        @ ... v=2

    vmov.i32    q14, #0xffff   @ 0xffff[32]
    vmax.s32    q10, q15
    vmov.i32    q13, #1
    vdup.32     q11, d22[0]    @ gteH/2
    vmin.u32    q10, q14       @ saturate to 0..0xffff limD/fSZ(v)
    vmin.s16    d24, d10       @ | find min/max IR
    vmax.s16    d25, d10       @ |

    add         r3, r0, #4*19  @ ||
    vld1.32     d14[0], [r3]   @ || gteSZ3

    vclt.u32    q11, q11, q10  @ gteH/2 < fSZ(v)?
    add         r3, r0, #4*17
    vst1.32     d20, [r3]!     @ | writeback fSZ(v)
    vand        q11, q10, q11
    vst1.32     d21[0], [r3]   @ |
    vmax.u32    q10, q11, q13  @ make divisor 1 if not
    add         r3, r1, #8*8
    vstmia      r3, {q12}      @ min/max IR for flags
    vcvt.f32.u32 q10, q10
    vshl.u32    d13, d12, #16  @ | preparing gteH

    @ while NEON's busy we calculate some flags on ARM
    add         r2, r1, #8*2*3
    mov         lr, #0         @ gteFLAG
    ldmia       r2, {r4-r7}    @ min/max gteMAC|12
    subs        r2, r4, #1
    orrvs       lr, #(1<<31)|(1<<27)
    subs        r3, r5, #1
    orrvs       lr, #(1<<31)|(1<<26)
    adds        r2, r6, #1
    orrvs       lr, #(1<<30)
    adds        r3, r7, #1
    orrvs       lr, #(1<<29)
    ldr         r4, [r1, #0]   @ gteMAC3 v=0
    ldr         r5, [r1, #8*2] @ ... v=1
    ldr         r6, [r1, #8*4] @ ... v=2

    add         r3, r0, #4*(32+24)
    vld1.32     d4, [r3]       @ || gteOF|XY
    add         r3, r0, #4*(32+27)
    vld1.32     d6, [r3]       @ || gteDQ|AB

    @ divide
.if 1
    vrecpe.f32  q11, q10       @ inv
    vmovl.s32   q2, d4         @ || gteOF|XY [64]
    vmovl.s32   q3, d6         @ || gteDQ|AB [64]
    vrecps.f32  q12, q10, q11  @ step
    vcvt.f32.u32 d13, d13      @ | gteH (float for div)
    vmul.f32    q11, q12, q11  @ better inv
    add         r3, r0, #4*16
    vst1.32     d14[0], [r3]   @ gteSZ0 = gteSZ3
    vdup.32     q13, d13[0]    @ |
@   vrecps.f32  q12, q10, q11  @ step
@   vmul.f32    q11, q12, q11  @ better inv
    vmul.f32    q10, q13, q11  @ result
.else
    vmovl.s32   q2, d4         @ || gteOF|XY [64]
    vmovl.s32   q3, d6         @ || gteDQ|AB [64]
    vcvt.f32.u32 d13, d13      @ | gteH (float for div)
    vdup.32     q13, d13[0]    @ |
    add         r3, r0, #4*16
    vst1.32     d14[0], [r3]   @ gteSZ0 = gteSZ3

    vpush       {q0}
    vmov        q0, q10        @ to test against C code
    vdiv.f32    s0, s26, s0
    vdiv.f32    s1, s26, s1
    vdiv.f32    s2, s26, s2
    vmov        q10, q0
    vpop        {q0}
.endif

    @ gteMAC3 of all three vertices: negative overflow is bit 25 (plus
    @ bit 31), positive overflow is bit 28, matching do_mac_flags
    do_mac_flags3x r4, r5, r6, (1<<31)|(1<<25), (1<<28) @ MAC3
    orr         r7, r4, r5
    add         r4, r1, #8*8
    orr         r3, r7, r6
    ldmia       r4, {r7,r8,r10,r11} @ min/max IR

    movs        r3, r3, lsr #16
    orrne       lr, #(1<<31)
    orrne       lr, #(1<<18)   @ fSZ (limD)

@    vadd.f32     q10, q        @ adjust for vcvt rounding mode
    vcvt.u32.f32 q8, q10
    vmovl.s16   q9, d1         @ expand gteIR|12 v=0
    vmovl.s16   q10, d3        @ expand gteIR|12 v=1
    add         r6, r1, #8*10
    vstmia      r6, {q8}       @ wb quotients for flags (pre-limE)
    vqshl.u32   q8, #15
    vmovl.s16   q11, d10       @ expand gteIR|12 v=2
    vshr.u32    q8, #15        @ quotients (limE)
    vdup.32     d24, d16[0]
    vdup.32     d25, d16[1]
    vdup.32     d26, d17[0]    @ quotient (dup)

    @ flags for minIR012 (r7,r8), maxIR012 (r10,r11)
    mov         r4, #0x10000
    cmp         r7, #1<<16
    cmnvc       r10, #1<<16
    orrvs       lr, #(1<<31)
    orrvs       lr, #(1<<23)   @ IR2/limB2
    rsbs        r2, r4, r7, lsl #16
    cmnvc       r4, r10, lsl #16
    orrvs       lr, #(1<<31)|(1<<24) @ IR1/limB1
    rsbs        r2, r4, r8, lsl #16
    cmnvc       r4, r11, lsl #16
    orrvs       lr, #(1<<22)   @ IR3/limB3

    vmull.s32   q9, d18, d24   @ gteIR|12 * quotient v=0
    vmull.s32   q10, d20, d25  @ ... v=1
    vmull.s32   q11, d22, d26  @ ... v=2
    vadd.s64    q9, q2         @ gteOF|XY + gteIR|12 * quotient
    vadd.s64    q10, q2        @ ... v=1
    vadd.s64    q11, q2        @ ... v=2
    vqmovn.s64  d18, q9        @ saturate to 32 v=0
    vqmovn.s64  d19, q10       @ ... v=1
    vqmovn.s64  d20, q11       @ ... v=2
    vmin.s32    d14, d18, d19  @ || find min/max fS|XY(v) [32]
    vmax.s32    d15, d18, d19  @ || for flags
    vmin.s32    d14, d20
    vmax.s32    d15, d20
    vqshl.s32   q11, q9, #5    @ 11bit precision, v=0,1
    vqshl.s32   d24, d20, #5   @ ... v=2
    vmull.s32   q13, d6, d17   @ | gteDQA * quotient v=2
    vpmin.s32   d16, d14, d31  @ || also find min/max in pair
    vpmax.s32   d17, d15, d31  @ ||
    vshr.s32    q11, #16+5     @ can't vqshrn because of insn
    vshr.s32    d24, #16+5     @ encoding doesn't allow 21 :(
    vsli.u64    d16, d17, #32  @ || pack in-pair min/max
    vadd.s64    d26, d7        @ | gteDQB + gteDQA * quotient
    vmovn.s32   d12, q11       @ fS|XY(v) [s16] v=0,1
    vmovn.s32   d13, q12       @ 3
    vstmia      r1, {d14-d16}  @ || other cacheline than quotients
    add         r3, r0, #4*12
    vst1.32     d12, [r3]!     @ writeback fS|XY v=0,1
    vst1.32     d13[0], [r3]

    vqmovn.s64  d26, q13       @ | gteMAC0
    @ d10 holds signed 16-bit lanes, so widen with sign extension, the
    @ same way gteRTPS_neon and gteMVMVA_neon expand gteIR|123
    vmovl.s16   q5, d10        @ expand gteIR|123 v=2

    vmov.i32    d13, #0x1000
    vshr.s32    d12, d26, #12

    add         r3, r0, #4*24
    vst1.32     d26[0], [r3]!  @ gteMAC0
    vmax.s32    d12, d30
    vst1.32     d8, [r3]!      @ gteMAC123 (last iteration)
    vst1.32     d9[0], [r3]

    vmin.s32    d12, d13       @ | gteIR0

    ldmia       r6, {r4-r6}    @ quotients
    orr         r4, r5
    orr         r4, r6
    add         r3, r0, #4*8
    movs        r4, r4, lsr #17

    vst1.32     d12[0], [r3]!  @ gteIR0
    vst1.32     d10, [r3]!     @ gteIR12
    vst1.32     d11[0], [r3]   @ ..3

    @ ~23 cycles
    orrne       lr, #(1<<31)   @ limE
    orrne       lr, #(1<<17)   @ limE
    ldmia       r1, {r4-r9}
    add         r2, r4, #0x400<<16 @ min fSX
    add         r3, r6, #0x400<<16 @ max fSX
    lsrs        r2, #16+11
    lsreqs      r3, #16+11
    orrne       lr, #(1<<31)   @ limG1
    orrne       lr, #(1<<14)
    add         r2, r5, #0x400<<16 @ min fSY
    add         r3, r7, #0x400<<16 @ max fSY
    lsrs        r2, #16+11
    lsreqs      r3, #16+11
    orrne       lr, #(1<<31)   @ limG2
    orrne       lr, #(1<<13)
    adds        r2, r9, #1
    orrvs       lr, #(1<<16)   @ F (31 already done by above)
    subs        r3, r8, #1

    ldr         r4, [r0, #4*24] @ gteMAC0
    orrvs       lr, #(1<<15)

    adds        r3, r4, #1
    orrvs       lr, #(1<<16)
    orrvs       lr, #(1<<31)   @ F
    subs        r2, r4, #1
    orrvs       lr, #(1<<15)
    orrvs       lr, #(1<<31)   @ F
    cmp         r4, #0x1000
    orrhi       lr, #(1<<12)   @ limH

    str         lr, [r0, #4*(32+31)] @ gteFLAG

    vpop        {d8-d15}
    pop         {r4-r11,pc}
    .size       gteRTPT_neon, .-gteRTPT_neon
@ gteMVMVA_neon - matrix * vector + translation, operands selected by op
@ in:    r0 - CP2 context (data regs, control regs at +4*32)
@        r1 - op:
@               bits 15-16  v  - source vector, 3 selects gteIR1-3
@               bits 17-18  mx - matrix, 3 leaves the matrix all-zero
@               bits 13-14  cv - translation, 3 leaves it all-zero
@               bit 19         - shift the sums right by 12
@               bit 10      lm - clamp gteIR1-3 to >= 0
@ out:   gteMAC1-3, gteIR1-3 and gteFLAG written back to the context
@ trash: r2, r3, r12, d0-d7, d16-d21, q15, flags
@ d8-d10 are callee-saved under AAPCS just like r4-r5, so they are saved
@ on entry and restored on exit.
.global gteMVMVA_neon @ r0=CP2 (d,c), op
gteMVMVA_neon:
    push        {r4-r5,lr}
    vpush       {d8-d10}       @ callee-saved VFP/NEON regs used below

    add         r12, r0, #4*32

    ubfx        r2, r1, #15, #2 @ v

    vmov.i32    q0, #0         @ d0,d1
    vmov.i32    q1, #0         @ d2,d3
    vmov.i32    q2, #0         @ d4,d5

    @ v == 3: pack the low halves of the three words at +4*9 (gteIR1-3)
    @ otherwise: load the two words of VXYZ(v)
    cmp         r2, #3
    addeq       r4, r0, #4*9
    addne       r3, r0, r2, lsl #3
    ldmeqia     r4, {r3-r5}
    ldmneia     r3, {r4,r5}
    pkhbteq     r4, r3, r4, lsl #16
    uxth        r5, r5
    vmov.32     d8[0], r4
    vmov.32     d8[1], r5      @ VXYZ(v)
    ubfx        r3, r1, #17, #2 @ mx
    ubfx        r2, r1, #13, #2 @ cv
    cmp         r3, #3
    beq         0f             @ very rare case
    add         r3, r12, r3, lsl #5
    vldmia      r3, {d0-d2}    @ MXxy/gteR*  [16*9]
0:
    cmp         r2, #3
    add         r3, r12, r2, lsl #5
    beq         0f
    add         r3, #4*5
    vldmia      r3, {d4-d5}    @ CVx/gteTR*

0:
    vmov.i32    q15, #0
    vext.16     d2, d1, d2, #2 @ xxx3 -> x321
    vext.16     d1, d0, d1, #3 @ xx32 -> x321
    vshll.s32   q3, d5, #12    @ gteTRZ/CV3
    vshll.s32   q2, d4, #12    @ gteTR|XY/CV12

    vmull.s16   q8, d0, d8
    vmull.s16   q9, d1, d8
    vmull.s16   q10, d2, d8
    vpadd.s32   d16, d16, d17
    vpadd.s32   d17, d18, d19
    vpadd.s32   d18, d20, d21
    vpadal.s32  q2, q8
    vpadal.s32  q3, q9
    tst         r1, #1<<19
    beq         0f
    vshr.s64    q2, q2, #12
    vshr.s64    q3, q3, #12
0:
    vqmovn.s64  d8, q2         @ gteMAC|12
    vqmovn.s64  d9, q3         @ gteMAC3

    tst         r1, #1<<10
    add         r3, r0, #4*25
    vqmovn.s32  d10, q4        @ gteIR|123
    vst1.32     d8, [r3]!
    vst1.32     d9[0], [r3]    @ wb gteMAC|123

    beq         0f
    vmax.s16    d10, d31
0:
    vmovl.s16   q9, d10        @ expand gteIR|123
    add         r3, r0, #4*9
    vst1.32     d18, [r3]!
    vst1.32     d19[0], [r3]

    @ gteIR range check: with lm set the values must fit 0..0x7fff
    @ (shift by 15), otherwise -0x8000..0x7fff (bias by 0x8000, shift
    @ by 16)
    tst         r1, #1<<10     @ lm
    mov         r2, #0
    mov         lr, #0         @ gteFLAG
    mov         r12, #15
    moveq       r2, #0x8000    @ adj
    moveq       r12, #16       @ shift

    add         r3, r0, #4*25
    ldmia       r3, {r3-r5}    @ gteMAC|123

    do_mac_flags r3, r4, r5

    add         r3, r2
    add         r4, r2
    add         r5, r2
    asrs        r3, r12
    orrne       lr, #(1<<31)|(1<<24) @ IR1/limB1
    asrs        r4, r12
    orrne       lr, #(1<<31)
    orrne       lr, #(1<<23)   @ IR2/limB2
    asrs        r5, r12
    orrne       lr, #(1<<22)   @ IR3/limB3
    str         lr, [r0, #4*(32+31)] @ gteFLAG

    vpop        {d8-d10}
    pop         {r4-r5,pc}
    .size       gteMVMVA_neon, .-gteMVMVA_neon
@ vim:filetype=armasm
|