arm/video_blend.S


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181

.align 2

.global expand_blend
.global expand_normal

@ expand_blend
@
@ Converts a run of 32-bit source pixel pairs into 16-bit destination
@ pixels, alpha blending the two pixels of a pair when the pair asks for it.
@
@ Source word layout, as used by the code below:
@   bits  0-8   palette index of the top pixel
@   bits 16-24  palette index of the bottom pixel
@   bits 9, 26  both must be set (0x04000200) for the pair to be blended;
@               otherwise the top pixel is written unblended
@ Palette entries are halfwords read from palette_ram_converted.  The
@ 0x07E0F81F mask spreads one entry into 5/6/5 bit fields with gaps between
@ them (presumably RGB565 - confirm against the palette conversion code) so
@ that all three channels can be scaled with a single multiply.
@
@ Blend weights come from the halfword at io_registers + 0x52 (bldalpha):
@ blend_a = bits 0-4, blend_b = bits 8-12, each clamped to 16.  The output
@ channel is (top * blend_a + bottom * blend_b) >> 4.  When
@ blend_a + blend_b > 16 a channel can overflow, so a second loop that
@ saturates each channel is used instead.
@
@ Input:
@   r0 = screen_src_ptr   (32-bit elements)
@   r1 = screen_dest_ptr  (16-bit elements)
@   r2 = start
@   r3 = end
@
@ A run with end <= start returns immediately without touching memory.
@ Clobbers r0-r3, r12 and the flags; r4-r6 and r9-r11 are preserved.

6:
  .word io_registers
  .word palette_ram_converted
  .word 0x04000200                @ combine test mask (bits 9 and 26)
  .word 0x07E0F81F                @ clamp mask (dilated channel fields)
  .word 0x000003FE                @ palette index mask (index << 1)
  .word 0x08010020                @ saturation mask (per channel overflow bit)

expand_blend:
  stmdb sp!, { r4, r5, r6, r9, r10, r11, r14 }

  add r0, r0, r2, lsl #2          @ screen_src_ptr += start
  add r1, r1, r2, lsl #1          @ screen_dest_ptr += start
  subs r2, r3, r2                 @ r2 = end - start
  ble 7f                          @ if(count <= 0) return, the loops below
                                  @   test the counter only after a pixel
  ldr r3, 6b                      @ r3 = io_registers
  ldrh r3, [r3, #0x52]            @ r3 = bldalpha
  mov r4, r3, lsr #8              @ r4 = bldalpha >> 8
  and r3, r3, #0x1F               @ r3 = blend_a
  and r4, r4, #0x1F               @ r4 = blend_b
  cmp r3, #16                     @ if(blend_a > 16)
  movgt r3, #16                   @   blend_a = 16
  cmp r4, #16                     @ if(blend_b > 16)
  movgt r4, #16                   @   blend_b = 16

  ldr r14, 6b + 4                 @ r14 = palette_ram_converted
  ldr r12, 6b + 8                 @ r12 = 0x04000200
  ldr r11, 6b + 12                @ r11 = 0x07E0F81F
  ldr r10, 6b + 16                @ r10 = 0x000003FE

  add r5, r3, r4                  @ r5 = blend_a + blend_b
  cmp r5, #16                     @ if((blend_a + blend_b) > 16)
  bgt 3f                          @   goto loop w/saturation


  @ loop w/o saturation
1:
  ldr r5, [r0], #4                @ r5 = pixel_pair, screen_src_ptr++
  and r6, r5, r12                 @ r6 = r5 & 0x04000200
  cmp r6, r12                     @ if(r6 != 0x4000200)
  bne 2f                          @   goto no_blend

  and r6, r10, r5, lsl #1         @ r6 = (pixel_pair & 0x1FF) << 1
  ldrh r6, [r14, r6]              @ r6 = pixel_top
  orr r6, r6, r6, lsl #16         @ r6 = pixel_top | (pixel_top << 16)
  and r6, r6, r11                 @ r6 = pixel_top_dilated

  and r5, r10, r5, lsr #15        @ r5 = ((pixel_pair >> 16) & 0x1FF) << 1
  ldrh r5, [r14, r5]              @ r5 = pixel_bottom
  orr r5, r5, r5, lsl #16         @ r5 = pixel_bottom | (pixel_bottom << 16)
  and r5, r5, r11                 @ r5 = pixel_bottom_dilated

  mul r5, r4, r5                  @ r5 = pixel_bottom * blend_b = bottom_mul
  mla r5, r3, r6, r5              @ r5 = (pixel_top * blend_a) + bottom_mul

  and r5, r11, r5, lsr #4         @ r5 = (color_dilated >> 4) & 0x07E0F81F
  orr r5, r5, r5, lsr #16         @ r5 = color_dilated | (color_dilated >> 16)

  strh r5, [r1], #2               @ *screen_dest_ptr = r5, screen_dest_ptr++
  subs r2, r2, #1                 @ counter--
  bne 1b                          @ go again

  ldmia sp!, { r4, r5, r6, r9, r10, r11, pc }

  @ no_blend (loop w/o saturation): write the top pixel as is
2:
  and r5, r10, r5, lsl #1         @ r5 = (pixel_pair & 0x1FF) << 1
  ldrh r5, [r14, r5]              @ r5 = pixel_top
  strh r5, [r1], #2               @ *screen_dest_ptr = r5, screen_dest_ptr++

  subs r2, r2, #1                 @ counter--
  bne 1b                          @ go again

  ldmia sp!, { r4, r5, r6, r9, r10, r11, pc }

@ loop w/saturation

3:
  ldr r9, 6b + 20                 @ r9 = 0x08010020

4:
  ldr r5, [r0], #4                @ r5 = pixel_pair, screen_src_ptr++
  and r6, r5, r12                 @ r6 = r5 & 0x04000200
  cmp r6, r12                     @ if(r6 != 0x4000200)
  bne 5f                          @   goto no_blend

  and r6, r10, r5, lsl #1         @ r6 = (pixel_pair & 0x1FF) << 1
  ldrh r6, [r14, r6]              @ r6 = pixel_top
  orr r6, r6, r6, lsl #16         @ r6 = pixel_top | (pixel_top << 16)
  and r6, r6, r11                 @ r6 = pixel_top_dilated

  and r5, r10, r5, lsr #15        @ r5 = ((pixel_pair >> 16) & 0x1FF) << 1
  ldrh r5, [r14, r5]              @ r5 = pixel_bottom
  orr r5, r5, r5, lsl #16         @ r5 = pixel_bottom | (pixel_bottom << 16)
  and r5, r5, r11                 @ r5 = pixel_bottom_dilated

  mul r5, r4, r5                  @ r5 = pixel_bottom * blend_b = bottom_mul
  mla r5, r3, r6, r5              @ r5 = (pixel_top * blend_a) + bottom_mul

  and r6, r9, r5, lsr #4          @ r6 = saturation bits (one per channel,
                                  @   the bit just above each channel field)
  orr r6, r6, r6, lsr #1          @ propagate saturation down msb
  orr r6, r6, r6, lsr #2          @ propagate down next two bits
  orr r6, r6, r6, lsr #3          @ propagate down next three bits
  orr r5, r6, r5, lsr #4          @ mask over result w/saturation

  and r5, r11, r5                 @ r5 = (color_dilated >> 4) & 0x07E0F81F
  orr r5, r5, r5, lsr #16         @ r5 = color_dilated | (color_dilated >> 16)
  strh r5, [r1], #2               @ *screen_dest_ptr = r5, screen_dest_ptr++

  subs r2, r2, #1                 @ counter--
  bne 4b                          @ go again

  ldmia sp!, { r4, r5, r6, r9, r10, r11, pc }

  @ no_blend (loop w/saturation): write the top pixel as is
5:
  and r5, r10, r5, lsl #1         @ r5 = (pixel_pair & 0x1FF) << 1
  ldrh r5, [r14, r5]              @ r5 = pixel_top
  strh r5, [r1], #2               @ *screen_dest_ptr = r5, screen_dest_ptr++

  subs r2, r2, #1                 @ counter--
  bne 4b                          @ go again

  @ common exit, also the target of the empty run check at the top
7:
  ldmia sp!, { r4, r5, r6, r9, r10, r11, pc }


@ expand_normal
@
@ Replaces, in place, every 16-bit palette index in a run of screen pixels
@ with the 16-bit color read from palette_ram_converted.
@
@ The run is processed eight pixels (four words) at a time.  Whatever is
@ left over (0-7 pixels) is then converted one halfword at a time, so the
@ run length may be any value, not only a multiple of 8.  A run with
@ end <= start returns without touching memory.
@
@ Note carried over from the original author: there may not be much
@ opportunity to actually use this routine.
@
@ NOTE(review): the eight pixel path uses ldmia/stmia on screen_ptr, so
@ screen_ptr + start * 2 is assumed to be word aligned - confirm callers.

@ expand_pixel_pair(reg, temp)
@   On entry reg holds two palette indices (bits 0-8 and bits 16-24).  On
@   exit it holds the two looked up 16-bit colors in the same halves.
@   temp is a scratch register.  Expects r2 = palette_ram_converted and
@   r3 = 0x3FE.  The high half is looked up first because the low half
@   lookup overwrites reg.
@   Do not place comments inside the macro body: the preprocessor joins it
@   into one line and a comment would swallow the remaining statements.

#define expand_pixel_pair(reg, temp)                                         ;\
  and temp, r3, reg, lsr #15                                                 ;\
  ldrh temp, [r2, temp]                                                      ;\
                                                                             ;\
  and reg, r3, reg, lsl #1                                                   ;\
  ldrh reg, [r2, reg]                                                        ;\
                                                                             ;\
  orr reg, reg, temp, lsl #16                                                ;\


@ Input:
@   r0 = screen_ptr  (16-bit elements)
@   r1 = start
@   r2 = end
@
@ Clobbers r0-r3 and the flags; r4-r7 are preserved.

1:
  .word palette_ram_converted
  .word 0x3FE                     @ palette index mask (index << 1)

expand_normal:
  stmdb sp!, { r4, r5, r6, r7, r14 }

  add r0, r0, r1, lsl #1          @ screen_ptr += start
  subs r1, r2, r1                 @ r1 = end - start
  ble 4f                          @ if(count <= 0) return
  ldr r2, 1b                      @ r2 = palette_ram_converted
  ldr r3, 1b + 4                  @ r3 = 0x3FE

  subs r1, r1, #8                 @ r1 = count - 8
  blt 3f                          @ fewer than 8 pixels, go to the tail

  @ main loop: r1 = pixels remaining after the group being converted
2:
  ldmia r0, { r4, r5, r6, r7 }    @ fetch 8 palette indices

  expand_pixel_pair(r4, r14)
  expand_pixel_pair(r5, r14)
  expand_pixel_pair(r6, r14)
  expand_pixel_pair(r7, r14)

  stmia r0!, { r4, r5, r6, r7 }   @ store 8 colors, screen_ptr += 8 pixels

  subs r1, r1, #8                 @ is another full group of 8 left?
  bge 2b                          @   yes, go again

3:
  adds r1, r1, #8                 @ r1 = leftover pixels (0-7)
  beq 4f                          @ none, done

  @ tail loop: one pixel per iteration
5:
  ldrh r4, [r0]                   @ r4 = palette index
  and r4, r3, r4, lsl #1          @ r4 = (index & 0x1FF) << 1
  ldrh r4, [r2, r4]               @ r4 = color
  strh r4, [r0], #2               @ *screen_ptr = r4, screen_ptr++
  subs r1, r1, #1                 @ counter--
  bne 5b                          @ go again

4:
  ldmia sp!, { r4, r5, r6, r7, pc }