/* plugins/gpulib/gpulib_thread_if.c */

/**************************************************************************
*   Copyright (C) 2020 The RetroArch Team                                 *
*                                                                         *
*   This program is free software; you can redistribute it and/or modify  *
*   it under the terms of the GNU General Public License as published by  *
*   the Free Software Foundation; either version 2 of the License, or     *
*   (at your option) any later version.                                   *
*                                                                         *
*   This program is distributed in the hope that it will be useful,       *
*   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
*   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
*   GNU General Public License for more details.                          *
*                                                                         *
*   You should have received a copy of the GNU General Public License     *
*   along with this program; if not, write to the                         *
*   Free Software Foundation, Inc.,                                       *
*   51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA.           *
***************************************************************************/

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <pthread.h>
#include "../gpulib/gpu.h"
#include "../../frontend/plugin_lib.h"
#include "gpulib_thread_if.h"

/* Minimal boolean vocabulary used by the flags in this file. */
#define FALSE 0
#define TRUE 1
/* Storage type of the boolean flags (not C99 bool). */
#define BOOL unsigned short

/* One queued batch of GPU command words. */
typedef struct {
	/* Heap copy of the command words, allocated in video_thread_queue_cmd()
	 * and freed when the slot is reused or the thread is stopped. */
	uint32_t *cmd_list;
	/* Number of 32-bit words in cmd_list. */
	int count;
	/* last_cmd value supplied by the caller when the batch was queued. */
	int last_cmd;
} video_thread_cmd;

/* Number of command slots in each ring buffer. */
#define QUEUE_SIZE 0x2000

/* Fixed-size ring buffer of command batches. */
typedef struct {
	/* Index of the next slot the render thread will consume. */
	size_t start;
	/* Index of the next slot a producer will fill. */
	size_t end;
	/* Number of slots queued and not yet fully processed. */
	size_t used;
	video_thread_cmd queue[QUEUE_SIZE];
} video_thread_queue;

/* Everything the render thread and its producers share. */
typedef struct {
	pthread_t thread;
	/* Held while the active queue's bookkeeping is read or changed. */
	pthread_mutex_t queue_lock;
	/* Signalled when work is queued, queues are swapped, or on shutdown. */
	pthread_cond_t cond_msg_avail;
	/* Signalled each time the render thread finishes a run of slots. */
	pthread_cond_t cond_msg_done;
	/* Signalled when the processed queue's used count drops to zero. */
	pthread_cond_t cond_queue_empty;
	/* Active queue: the one the render thread consumes. */
	video_thread_queue *queue;
	/* Background queue: receives commands while hold_cmds is set. */
	video_thread_queue *bg_queue;
	/* Non-zero while the render thread is supposed to keep running. */
	BOOL running;
} video_thread_state;

/* Shared render-thread state and the two ring buffers it alternates between. */
static video_thread_state thread;
static video_thread_queue queues[2];
/* Threaded rendering mode, copied from cbs->thread_rendering in
 * renderer_set_config(). */
static int thread_rendering;
/* When set, new commands are appended to the background queue. */
static BOOL hold_cmds;
/* Set when a frame was cut off at vblank and still has to be displayed. */
static BOOL needs_display;
/* Set by renderer_sync() when it flushes a non-empty background queue. */
static BOOL flushed;

/* Per-opcode count of words that follow the command word; defined elsewhere. */
extern const unsigned char cmd_lengths[];

/*
 * Render thread entry point.
 *
 * arg is the video_thread_state to serve. The loop sleeps until the active
 * queue has work (or running is cleared), claims a contiguous run of slots
 * while holding queue_lock, executes them with the lock released, and then
 * reports completion under the lock again. Returns 0 once running is cleared.
 */
static void *video_thread_main(void *arg) {
	/* Note: this local deliberately shadows the file-scope `thread`. */
	video_thread_state *thread = (video_thread_state *)arg;
	video_thread_cmd *cmd;
	int i;

#ifdef _3DS
	static int processed = 0;
#endif /* _3DS */

	while(1) {
		int result, last_cmd, start, end;
		video_thread_queue *queue;
		pthread_mutex_lock(&thread->queue_lock);

		/* Sleep until there is something to do or we are asked to exit. */
		while (!thread->queue->used && thread->running) {
			pthread_cond_wait(&thread->cond_msg_avail, &thread->queue_lock);
		}

		if (!thread->running) {
			pthread_mutex_unlock(&thread->queue_lock);
			break;
		}

		/* Claim slots [start, end). If the ring has wrapped (end <= start),
		 * only the tail up to QUEUE_SIZE is taken now; the remainder is
		 * picked up on the next iteration. */
		queue = thread->queue;
		start = queue->start;
		end = queue->end > queue->start ? queue->end : QUEUE_SIZE;
		queue->start = end % QUEUE_SIZE;
		pthread_mutex_unlock(&thread->queue_lock);

		/* Execute the claimed slots without holding the lock so producers
		 * can keep appending. */
		for (i = start; i < end; i++) {
			cmd = &queue->queue[i];
			result = real_do_cmd_list(cmd->cmd_list, cmd->count, &last_cmd);

			if (result != cmd->count) {
				fprintf(stderr, "Processed wrong cmd count: expected %d, got %d\n", cmd->count, result);
			}

#ifdef _3DS
			/* Periodically yield so as not to starve other threads */
			processed += cmd->count;
			if (processed >= 512) {
				svcSleepThread(1);
				processed %= 512;
			}
#endif /* _3DS */
		}

		/* Only now are the slots released; `used` stays high while they are
		 * being executed so producers cannot overwrite them. */
		pthread_mutex_lock(&thread->queue_lock);
		queue->used -= (end - start);

		if (!queue->used)
			pthread_cond_signal(&thread->cond_queue_empty);

		pthread_cond_signal(&thread->cond_msg_done);
		pthread_mutex_unlock(&thread->queue_lock);
	}

	return 0;
}

/*
 * Promote the background queue to active queue.
 *
 * Nothing happens when the background queue is empty, or when the active
 * queue still has unprocessed slots. On a successful swap the render thread
 * is woken so it starts on the newly active queue.
 */
static void cmd_queue_swap() {
	if (thread.bg_queue->used) {
		pthread_mutex_lock(&thread.queue_lock);

		if (thread.queue->used == 0) {
			video_thread_queue *drained = thread.queue;

			thread.queue = thread.bg_queue;
			thread.bg_queue = drained;
			pthread_cond_signal(&thread.cond_msg_avail);
		}

		pthread_mutex_unlock(&thread.queue_lock);
	}
}

/* Blocks until the render thread has drained the active queue. Returns
 * immediately when the thread is not running. */
void renderer_wait() {
	/* The unlocked peek at `used` is not strictly safe, but the render
	 * thread only ever lowers it and it is re-checked under the lock. */
	if (!thread.running || !thread.queue->used)
		return;

	pthread_mutex_lock(&thread.queue_lock);

	for (;;) {
		if (!thread.queue->used)
			break;
		pthread_cond_wait(&thread.cond_queue_empty, &thread.queue_lock);
	}

	pthread_mutex_unlock(&thread.queue_lock);
}

/* Drains the active queue and then the background queue, so that every GPU
 * command submitted so far has been executed and VRAM is up-to-date. */
void renderer_sync(void) {
	int bg_pending;

	if (!thread.running)
		return;

	/* Unlocked reads: not completely safe, but the render thread only
	 * decreases `used`, and renderer_wait() re-checks under the lock. */
	bg_pending = thread.bg_queue->used != 0;
	if (!bg_pending && !thread.queue->used)
		return;

	/* Flushing the background queue hides from the vblank handler that a
	 * frame was pending, which would delay rendering too long. Leave a
	 * marker so it forces the render. */
	if (bg_pending)
		flushed = TRUE;

	/* Both queues have to be flushed because gpulib may be about to
	 * process a DMA write that a queued command must precede. Xenogears
	 * sprites, for example, draw a black rectangle over the spot that is
	 * about to be DMA'd; if that draw lands after the DMA it wipes the
	 * transfer and the sprites flicker. Being fully up-to-date may cost
	 * a dropped frame. */
	renderer_wait();      /* finish the active queue */
	cmd_queue_swap();     /* promote the background queue, if any */
	hold_cmds = FALSE;
	renderer_wait();      /* finish what was the background queue */
}

static void video_thread_stop() {
	int i;
	renderer_sync();

	if (thread.running) {
		thread.running = FALSE;
		pthread_cond_signal(&thread.cond_msg_avail);
		pthread_join(thread.thread, NULL);
	}

	pthread_mutex_destroy(&thread.queue_lock);
	pthread_cond_destroy(&thread.cond_msg_avail);
	pthread_cond_destroy(&thread.cond_msg_done);
	pthread_cond_destroy(&thread.cond_queue_empty);

	for (i = 0; i < QUEUE_SIZE; i++) {
		video_thread_cmd *cmd = &thread.queue->queue[i];
		free(cmd->cmd_list);
		cmd->cmd_list = NULL;
	}

	for (i = 0; i < QUEUE_SIZE; i++) {
		video_thread_cmd *cmd = &thread.bg_queue->queue[i];
		free(cmd->cmd_list);
		cmd->cmd_list = NULL;
	}
}

static void video_thread_start() {
	fprintf(stdout, "Starting render thread\n");

	if (pthread_cond_init(&thread.cond_msg_avail, NULL) ||
			pthread_cond_init(&thread.cond_msg_done, NULL) ||
			pthread_cond_init(&thread.cond_queue_empty, NULL) ||
			pthread_mutex_init(&thread.queue_lock, NULL) ||
			pthread_create(&thread.thread, NULL, video_thread_main, &thread)) {
		goto error;
	}

	thread.queue = &queues[0];
	thread.bg_queue = &queues[1];

	thread.running = TRUE;
	return;

 error:
	fprintf(stderr,"Failed to start rendering thread\n");
	video_thread_stop();
}

/*
 * Copies `count` command words from `list` and appends them as one batch to
 * the background queue (while hold_cmds is set) or to the active queue.
 *
 * list     - command words to copy; the caller keeps ownership.
 * count    - number of 32-bit words; nothing is queued when it is <= 0.
 * last_cmd - value stored alongside the batch.
 *
 * If the copy cannot be allocated, the render thread is stopped (which
 * drains everything queued so far) and the batch is executed synchronously
 * so that no commands are lost.
 */
static void video_thread_queue_cmd(uint32_t *list, int count, int last_cmd) {
	video_thread_cmd *cmd;
	uint32_t *cmd_list;
	video_thread_queue *queue;
	BOOL lock;

	/* An empty batch carries no work. Returning here also keeps a NULL
	 * result from calloc(0, ...) from being mistaken for out-of-memory. */
	if (count <= 0) {
		return;
	}

	cmd_list = calloc((size_t)count, sizeof(*cmd_list));

	if (!cmd_list) {
		int ignored_last_cmd;

		/* Out of memory, disable the thread and run sync from now on */
		fprintf(stderr,"Failed to allocate render thread command list, stopping thread\n");
		video_thread_stop();

		/* The queues are drained now, so running this batch directly keeps
		 * the original command order. */
		real_do_cmd_list(list, count, &ignored_last_cmd);
		return;
	}

	memcpy(cmd_list, list, (size_t)count * sizeof(*cmd_list));

	if (hold_cmds && thread.bg_queue->used >= QUEUE_SIZE) {
		/* If the bg queue is full, do a full sync to empty both queues
		 * and clear space. This should be very rare, I've only seen it in
		 * Tekken 3 post-battle-replay. Note that renderer_sync() clears
		 * hold_cmds, so the batch then goes to the active queue. */
		renderer_sync();
	}

	if (hold_cmds) {
		/* Only this (producer) side touches the background queue, so no
		 * locking is needed. */
		queue = thread.bg_queue;
		lock = FALSE;
	} else {
		queue = thread.queue;
		lock = TRUE;
	}

	if (lock) {
		pthread_mutex_lock(&thread.queue_lock);

		/* Wait for the render thread to free a slot. */
		while (queue->used >= QUEUE_SIZE) {
			pthread_cond_wait(&thread.cond_msg_done, &thread.queue_lock);
		}
	}

	cmd = &queue->queue[queue->end];
	free(cmd->cmd_list); /* drop the buffer left over from the slot's previous use */
	cmd->cmd_list = cmd_list;
	cmd->count = count;
	cmd->last_cmd = last_cmd;
	queue->end = (queue->end + 1) % QUEUE_SIZE;
	queue->used++;

	if (lock) {
		pthread_cond_signal(&thread.cond_msg_avail);
		pthread_mutex_unlock(&thread.queue_lock);
	}
}

/*
 * Slice off just the part of the list that can be handled async, and
 * update ex_regs.
 *
 * data     - command words to scan.
 * count    - number of valid words in data.
 * last_cmd - receives the opcode the scan stopped on, or -1 when the final
 *            command is incomplete (0 if nothing was scanned).
 *
 * Returns the number of leading words that form complete commands and can
 * be queued. Scanning stops in front of an incomplete command and in front
 * of opcodes 0xa0..0xdf (image i/o).
 */
static int scan_cmd_list(uint32_t *data, int count, int *last_cmd)
{
	int cmd = 0, pos = 0, len, v;

	while (pos < count) {
		uint32_t *list = data + pos;
		cmd = list[0] >> 24;
		len = 1 + cmd_lengths[cmd];

		switch (cmd) {
			case 0x02:
				break;
			case 0x24 ... 0x27:
			case 0x2c ... 0x2f:
			case 0x34 ... 0x37:
			case 0x3c ... 0x3f:
				/* Mirror the low 9 bits of word 4 (or word 5 when bit 4 of
				 * the opcode is set) into ex_regs[1] -- presumably the
				 * texture page attribute. Only do so when the whole command
				 * is present; otherwise that word may lie beyond `count`
				 * and the command is rescanned once it is complete. */
				if (pos + len <= count) {
					gpu.ex_regs[1] &= ~0x1ff;
					gpu.ex_regs[1] |= list[4 + ((cmd >> 4) & 1)] & 0x1ff;
				}
				break;
			case 0x48 ... 0x4F:
				/* Variable length: extend until the terminator word. */
				for (v = 3; pos + v < count; v++)
				{
					if ((list[v] & 0xf000f000) == 0x50005000)
						break;
				}
				len += v - 3;
				break;
			case 0x58 ... 0x5F:
				/* Variable length, two words per step, same terminator. */
				for (v = 4; pos + v < count; v += 2)
				{
					if ((list[v] & 0xf000f000) == 0x50005000)
						break;
				}
				len += v - 4;
				break;
			default:
				/* 0xe0..0xe7 are stored verbatim in the matching ex_regs slot. */
				if ((cmd & 0xf8) == 0xe0)
					gpu.ex_regs[cmd & 7] = list[0];
				break;
		}

		if (pos + len > count) {
			cmd = -1;
			break; /* incomplete cmd */
		}
		if (0xa0 <= cmd && cmd <= 0xdf)
			break; /* image i/o */

		pos += len;
	}

	*last_cmd = cmd;
	return pos;
}

/*
 * Entry point for a list of GPU command words.
 *
 * With the render thread running, the asynchronously executable prefix is
 * determined by scan_cmd_list() and queued. Otherwise the list is executed
 * right away and ex_regs is refreshed from scratch_ex_regs.
 *
 * Returns the number of words consumed; *last_cmd is filled in by whichever
 * path handled the list.
 */
int do_cmd_list(uint32_t *list, int count, int *last_cmd) {
	int consumed;

	if (!thread.running) {
		consumed = real_do_cmd_list(list, count, last_cmd);
		memcpy(gpu.ex_regs, gpu.scratch_ex_regs, sizeof(gpu.ex_regs));
		return consumed;
	}

	consumed = scan_cmd_list(list, count, last_cmd);
	video_thread_queue_cmd(list, consumed, *last_cmd);
	return consumed;
}

/* Starts the render thread when threaded rendering is enabled, then
 * initialises the real renderer and returns its result. */
int renderer_init(void) {
	if (thread_rendering != 0)
		video_thread_start();

	return real_renderer_init();
}

/*
 * Shuts the renderer down.
 *
 * The render thread is drained and stopped first: video_thread_stop() lets
 * it execute everything still queued, which must happen while the real
 * renderer is still alive. Only then is the real renderer torn down.
 */
void renderer_finish(void) {
	if (thread_rendering && thread.running) {
		video_thread_stop();
	}

	real_renderer_finish();
}

/*
 * Pushes the ecmds state to the renderer.
 *
 * With the render thread running, the six words ecmds[1..6] are sent
 * through do_cmd_list() so they are ordered with the queued commands;
 * otherwise the real renderer is called directly.
 */
void renderer_sync_ecmds(uint32_t * ecmds) {
	int unused_last_cmd;

	if (!thread.running) {
		real_renderer_sync_ecmds(ecmds);
		return;
	}

	do_cmd_list(ecmds + 1, 6, &unused_last_cmd);
}

/* Finishes all queued GPU commands, then forwards the (x, y, w, h) region
 * to the real renderer's cache update. */
void renderer_update_caches(int x, int y, int w, int h) {
	renderer_sync();
	real_renderer_update_caches(x, y, w, h);
}

/* Forwards to the real renderer without waiting for the render thread. */
void renderer_flush_queues(void) {
	/* Called during DMA and updateLace. We want to sync if it's DMA,
	 * but not if it's updateLace. Instead of syncing here, there's a
	 * renderer_sync call during DMA. */
	real_renderer_flush_queues();
}

/*
 * Normally all GPU commands are processed before rendering the
 * frame. For games that naturally run < 50/60fps, this is unnecessary
 * -- it forces the game to render as if it was 60fps and leaves the
 * GPU idle half the time on a 30fps game, for example.
 *
 * Allowing the renderer to wait until a frame is done before
 * rendering it would give it double, triple, or quadruple the amount
 * of time to finish before we have to wait for it.
 *
 * We can use a heuristic to figure out when to force a render.
 *
 * - If a frame isn't done when we're asked to render, wait for it and
 *   put future GPU commands in a separate buffer (for the next frame)
 *
 * - If the frame is done, and had no future GPU commands, render it.
 *
 * - If we do have future GPU commands, it meant the frame took too
 *   long to render and there's another frame waiting. Stop until the
 *   first frame finishes, render it, and start processing the next
 *   one.
 *
 * This may possibly add a frame or two of latency that shouldn't be
 * different than the real device. It may skip rendering a frame
 * entirely if a VRAM transfer happens while a frame is waiting, or in
 * games that natively run at 60fps if frames are coming in too
 * quickly to process. Depending on how the game treats "60fps," this
 * may not be noticeable.
 */
/*
 * Vblank hook implementing the frame-pacing heuristic.
 *
 * updated - when non-zero, only the background queue is promoted (see
 *           cmd_queue_swap()); presumably this means the frame has just
 *           been presented -- confirm against the caller.
 *
 * Does nothing when the render thread is not running. In
 * THREAD_RENDERING_SYNC mode it simply waits for all queued commands.
 * Otherwise it updates hold_cmds, needs_display, flushed and
 * gpu.state.fb_dirty according to the state of the two queues.
 */
void renderer_notify_update_lace(int updated) {
	if (!thread.running) return;

	if (thread_rendering == THREAD_RENDERING_SYNC) {
		renderer_sync();
		return;
	}

	if (updated) {
		cmd_queue_swap();
		return;
	}

	pthread_mutex_lock(&thread.queue_lock);
	if (thread.bg_queue->used || flushed) {
		/* We have commands for a future frame to run. Force a wait until
		 * the current frame is finished, and start processing the next
		 * frame after it's drawn (see the `updated` clause above). */
		/* renderer_wait() takes queue_lock itself, so release it first. */
		pthread_mutex_unlock(&thread.queue_lock);
		renderer_wait();
		pthread_mutex_lock(&thread.queue_lock);

		/* We are no longer holding commands back, so the next frame may
		 * get mixed into the following frame. This is usually fine, but can
		 * result in frameskip-like effects for 60fps games. */
		flushed = FALSE;
		hold_cmds = FALSE;
		needs_display = TRUE;
		gpu.state.fb_dirty = TRUE;
	} else if (thread.queue->used) {
		/* We are still drawing during a vblank. Cut off the current frame
		 * by sending new commands to the background queue and skip
		 * drawing our partly rendered frame to the display. */
		hold_cmds = TRUE;
		needs_display = TRUE;
		gpu.state.fb_dirty = FALSE;
	} else if (needs_display && !thread.queue->used) {
		/* We have processed all commands in the queue, render the
		 * buffer. We know we have something to render, because
		 * needs_display is TRUE. */
		hold_cmds = FALSE;
		needs_display = FALSE;
		gpu.state.fb_dirty = TRUE;
	} else {
		/* Everything went normally, so do the normal thing. */
	}

	pthread_mutex_unlock(&thread.queue_lock);
}

/* Passes the interlace settings straight through to the real renderer;
 * no synchronisation with the render thread is performed here. */
void renderer_set_interlace(int enable, int is_odd) {
	real_renderer_set_interlace(enable, is_odd);
}

/*
 * Applies a new frontend configuration.
 *
 * All queued commands are finished first. The render thread is then started
 * or stopped so that it matches cbs->thread_rendering, and finally the
 * configuration is handed to the real renderer.
 */
void renderer_set_config(const struct rearmed_cbs *cbs) {
	renderer_sync();
	thread_rendering = cbs->thread_rendering;

	if (thread_rendering == THREAD_RENDERING_OFF) {
		if (thread.running)
			video_thread_stop();
	} else if (!thread.running) {
		video_thread_start();
	}

	real_renderer_set_config(cbs);
}

/* Finishes all queued GPU commands before telling the real renderer that
 * the resolution changed. */
void renderer_notify_res_change(void) {
	renderer_sync();
	real_renderer_notify_res_change();
}