From c765eb86debdc06fe304511bc2edbb6f3e3d7813 Mon Sep 17 00:00:00 2001 From: Justin Weiss Date: Sun, 13 Sep 2020 20:06:57 -0700 Subject: Add a threaded renderer This change adds a gpulib implementation that accepts GPU commands and runs them through a real gpulib implementation on a thread. Depending on a setting, it can either force a sync every frame, or continue to work until the next frame arrives. --- plugins/dfxvideo/gpulib_if.c | 8 + plugins/gpu-gles/gpulib_if.c | 8 + plugins/gpu_neon/psx_gpu_if.c | 6 + plugins/gpu_unai/gpulib_if.cpp | 25 ++ plugins/gpulib/gpu.c | 11 +- plugins/gpulib/gpu.h | 3 + plugins/gpulib/gpulib_thread_if.c | 481 ++++++++++++++++++++++++++++++++++++++ plugins/gpulib/gpulib_thread_if.h | 41 ++++ 8 files changed, 582 insertions(+), 1 deletion(-) create mode 100644 plugins/gpulib/gpulib_thread_if.c create mode 100644 plugins/gpulib/gpulib_thread_if.h (limited to 'plugins') diff --git a/plugins/dfxvideo/gpulib_if.c b/plugins/dfxvideo/gpulib_if.c index bb3ad56..db0797c 100644 --- a/plugins/dfxvideo/gpulib_if.c +++ b/plugins/dfxvideo/gpulib_if.c @@ -426,6 +426,14 @@ void renderer_set_interlace(int enable, int is_odd) { } +void renderer_sync(void) +{ +} + +void renderer_notify_update_lace(int updated) +{ +} + #include "../../frontend/plugin_lib.h" void renderer_set_config(const struct rearmed_cbs *cbs) diff --git a/plugins/gpu-gles/gpulib_if.c b/plugins/gpu-gles/gpulib_if.c index 1f4a23d..8cc1469 100644 --- a/plugins/gpu-gles/gpulib_if.c +++ b/plugins/gpu-gles/gpulib_if.c @@ -769,3 +769,11 @@ static void fps_update(void) DisplayText(buf, 1); } } + +void renderer_sync(void) +{ +} + +void renderer_notify_update_lace(int updated) +{ +} diff --git a/plugins/gpu_neon/psx_gpu_if.c b/plugins/gpu_neon/psx_gpu_if.c index 3f3805a..81b9bae 100644 --- a/plugins/gpu_neon/psx_gpu_if.c +++ b/plugins/gpu_neon/psx_gpu_if.c @@ -204,3 +204,9 @@ void renderer_set_config(const struct rearmed_cbs *cbs) } } +void renderer_sync(void) +{ +} +void 
renderer_notify_update_lace(int updated) +{ +} diff --git a/plugins/gpu_unai/gpulib_if.cpp b/plugins/gpu_unai/gpulib_if.cpp index e84eff5..588134d 100644 --- a/plugins/gpu_unai/gpulib_if.cpp +++ b/plugins/gpu_unai/gpulib_if.cpp @@ -25,6 +25,23 @@ #include #include #include "../gpulib/gpu.h" + +#ifdef THREAD_RENDERING +#include "../gpulib/gpulib_thread_if.h" +#define do_cmd_list real_do_cmd_list +#define renderer_init real_renderer_init +#define renderer_finish real_renderer_finish +#define renderer_sync_ecmds real_renderer_sync_ecmds +#define renderer_update_caches real_renderer_update_caches +#define renderer_flush_queues real_renderer_flush_queues +#define renderer_set_interlace real_renderer_set_interlace +#define renderer_set_config real_renderer_set_config +#define renderer_notify_res_change real_renderer_notify_res_change +#define renderer_notify_update_lace real_renderer_notify_update_lace +#define renderer_sync real_renderer_sync +#define ex_regs scratch_ex_regs +#endif + //#include "port.h" #include "gpu_unai.h" @@ -802,4 +819,12 @@ void renderer_set_config(const struct rearmed_cbs *cbs) } } +void renderer_sync(void) +{ +} + +void renderer_notify_update_lace(int updated) +{ +} + // vim:shiftwidth=2:expandtab diff --git a/plugins/gpulib/gpu.c b/plugins/gpulib/gpu.c index 007da65..ed37b71 100644 --- a/plugins/gpulib/gpu.c +++ b/plugins/gpulib/gpu.c @@ -40,6 +40,8 @@ static void finish_vram_transfer(int is_read); static noinline void do_cmd_reset(void) { + renderer_sync(); + if (unlikely(gpu.cmd_len > 0)) do_cmd_buffer(gpu.cmd_buffer, gpu.cmd_len); gpu.cmd_len = 0; @@ -52,7 +54,6 @@ static noinline void do_cmd_reset(void) static noinline void do_reset(void) { unsigned int i; - do_cmd_reset(); memset(gpu.regs, 0, sizeof(gpu.regs)); @@ -370,6 +371,8 @@ static int do_vram_io(uint32_t *data, int count, int is_read) int l; count *= 2; // operate in 16bpp pixels + renderer_sync(); + if (gpu.dma.offset) { l = w - gpu.dma.offset; if (count < l) @@ -714,12 +717,15 @@ 
long GPUfreeze(uint32_t type, struct GPUFreeze *freeze) case 1: // save if (gpu.cmd_len > 0) flush_cmd_buffer(); + + renderer_sync(); memcpy(freeze->psxVRam, gpu.vram, 1024 * 512 * 2); memcpy(freeze->ulControl, gpu.regs, sizeof(gpu.regs)); memcpy(freeze->ulControl + 0xe0, gpu.ex_regs, sizeof(gpu.ex_regs)); freeze->ulStatus = gpu.status.reg; break; case 0: // load + renderer_sync(); memcpy(gpu.vram, freeze->psxVRam, 1024 * 512 * 2); memcpy(gpu.regs, freeze->ulControl, sizeof(gpu.regs)); memcpy(gpu.ex_regs, freeze->ulControl + 0xe0, sizeof(gpu.ex_regs)); @@ -752,6 +758,8 @@ void GPUupdateLace(void) return; } + renderer_notify_update_lace(0); + if (!gpu.state.fb_dirty) return; @@ -767,6 +775,7 @@ void GPUupdateLace(void) vout_update(); gpu.state.fb_dirty = 0; gpu.state.blanked = 0; + renderer_notify_update_lace(1); } void GPUvBlank(int is_vblank, int lcf) diff --git a/plugins/gpulib/gpu.h b/plugins/gpulib/gpu.h index d0f3bf8..64d2eec 100644 --- a/plugins/gpulib/gpu.h +++ b/plugins/gpulib/gpu.h @@ -93,6 +93,7 @@ struct psx_gpu { uint32_t last_flip_frame; uint32_t pending_fill[3]; } frameskip; + uint32_t scratch_ex_regs[8]; // for threaded rendering int useDithering:1; /* 0 - off , 1 - on */ uint16_t *(*get_enhancement_bufer) (int *x, int *y, int *w, int *h, int *vram_h); @@ -118,6 +119,8 @@ void renderer_flush_queues(void); void renderer_set_interlace(int enable, int is_odd); void renderer_set_config(const struct rearmed_cbs *config); void renderer_notify_res_change(void); +void renderer_notify_update_lace(int updated); +void renderer_sync(void); int vout_init(void); int vout_finish(void); diff --git a/plugins/gpulib/gpulib_thread_if.c b/plugins/gpulib/gpulib_thread_if.c new file mode 100644 index 0000000..f0f607d --- /dev/null +++ b/plugins/gpulib/gpulib_thread_if.c @@ -0,0 +1,481 @@ +/************************************************************************** +* Copyright (C) 2020 The RetroArch Team * +* * +* This program is free software; you can redistribute it 
and/or modify *
+* it under the terms of the GNU General Public License as published by *
+* the Free Software Foundation; either version 2 of the License, or *
+* (at your option) any later version. *
+* *
+* This program is distributed in the hope that it will be useful, *
+* but WITHOUT ANY WARRANTY; without even the implied warranty of *
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+* GNU General Public License for more details. *
+* *
+* You should have received a copy of the GNU General Public License *
+* along with this program; if not, write to the *
+* Free Software Foundation, Inc., *
+* 51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA. *
+***************************************************************************/
+
+/* NOTE(review): the three system header names below were missing from this
+ * copy of the patch (the angle-bracket text had been stripped). They are
+ * restored from what the code uses (calloc/free, fprintf, pthread_*) --
+ * confirm against the original commit. */
+#include <stdlib.h>
+#include <stdio.h>
+#include <pthread.h>
+#include "../gpulib/gpu.h"
+#include "../../frontend/plugin_lib.h"
+#include "gpulib_thread_if.h"
+
+/* One queued slice of GPU command words. cmd_list is a heap copy that
+ * belongs to the queue slot holding it. */
+typedef struct {
+  uint32_t *cmd_list;
+  int count;
+  int last_cmd;
+} video_thread_cmd;
+
+/* Number of command slots in each queue. */
+#define QUEUE_SIZE 0x2000
+
+/* Ring buffer of commands: `start` is the next slot the render thread
+ * consumes, `end` the next slot the producer fills, and `used` the
+ * number of occupied slots. */
+typedef struct {
+  size_t start;
+  size_t end;
+  size_t used;
+  video_thread_cmd queue[QUEUE_SIZE];
+} video_thread_queue;
+
+/* Render thread handle plus the lock/conditions guarding the queues.
+ * `queue` is the one the render thread drains; `bg_queue` collects
+ * commands while hold_cmds is set. */
+typedef struct {
+  pthread_t thread;
+  pthread_mutex_t queue_lock;
+  pthread_cond_t cond_msg_avail;
+  pthread_cond_t cond_msg_done;
+  pthread_cond_t cond_queue_empty;
+  video_thread_queue *queue;
+  video_thread_queue *bg_queue;
+  bool running;
+} video_thread_state;
+
+static video_thread_state thread;
+static video_thread_queue queues[2];
+static int thread_rendering;
+static bool hold_cmds;
+static bool needs_display;
+
+extern const unsigned char cmd_lengths[];
+
+/* Render thread entry point. Sleeps until the active queue has entries,
+ * claims the contiguous run [start, end) (up to the wrap point) under
+ * the lock, executes it through real_do_cmd_list with the lock
+ * released, then reports completion. Returns once `running` is
+ * cleared. */
+static void *video_thread_main(void *arg) {
+  video_thread_state *thread = (video_thread_state *)arg;
+  video_thread_cmd *cmd;
+  int i;
+  static int processed = 0;
+
+  while(1) {
+    int result, last_cmd, start, end;
+    video_thread_queue *queue;
+    pthread_mutex_lock(&thread->queue_lock);
+
+    while (!thread->queue->used && thread->running) {
+      pthread_cond_wait(&thread->cond_msg_avail, &thread->queue_lock);
+    }
+
+    if (!thread->running) {
+      pthread_mutex_unlock(&thread->queue_lock);
+      break;
+    }
+
+    /* Claim everything up to `end`, or up to the end of the array when
+     * the ring has wrapped; the remainder is picked up next iteration. */
+    queue = thread->queue;
+    start = queue->start;
+    end = queue->end > queue->start ? queue->end : QUEUE_SIZE;
+    queue->start = end % QUEUE_SIZE;
+    pthread_mutex_unlock(&thread->queue_lock);
+
+    for (i = start; i < end; i++) {
+      cmd = &queue->queue[i];
+      result = real_do_cmd_list(cmd->cmd_list, cmd->count, &last_cmd);
+
+      if (result != cmd->count) {
+        fprintf(stderr, "Processed wrong cmd count: expected %d, got %d\n", cmd->count, result);
+      }
+
+#ifdef _3DS
+      /* Periodically yield so as not to starve other threads */
+      processed += cmd->count;
+      if (processed >= 512) {
+        svcSleepThread(1);
+        processed %= 512;
+      }
+#endif
+    }
+
+    /* Release the slots just executed and wake anyone waiting for
+     * space or for the queue to drain. */
+    pthread_mutex_lock(&thread->queue_lock);
+    queue->used -= (end - start);
+
+    if (!queue->used)
+      pthread_cond_signal(&thread->cond_queue_empty);
+
+    pthread_cond_signal(&thread->cond_msg_done);
+    pthread_mutex_unlock(&thread->queue_lock);
+  }
+
+  return 0;
+}
+
+/* If the background queue holds commands and the active queue is empty,
+ * swaps the two so the render thread starts on the held-back commands,
+ * and records that a frame is pending display. */
+static void cmd_queue_swap(void) {
+  video_thread_queue *tmp;
+  if (!thread.bg_queue->used) return;
+
+  pthread_mutex_lock(&thread.queue_lock);
+  if (!thread.queue->used) {
+    tmp = thread.queue;
+    thread.queue = thread.bg_queue;
+    thread.bg_queue = tmp;
+    needs_display = true;
+    pthread_cond_signal(&thread.cond_msg_avail);
+  }
+  pthread_mutex_unlock(&thread.queue_lock);
+}
+
+/* Waits for the main queue to completely finish. */
+void renderer_wait(void) {
+  if (!thread.running) return;
+
+  /* Not completely safe, but should be fine since the render thread
+   * only decreases used, and we check again inside the lock. */
+  if (!thread.queue->used) {
+    return;
+  }
+
+  pthread_mutex_lock(&thread.queue_lock);
+
+  while (thread.queue->used) {
+    pthread_cond_wait(&thread.cond_queue_empty, &thread.queue_lock);
+  }
+
+  pthread_mutex_unlock(&thread.queue_lock);
+}
+
+/* Waits for all GPU commands in both queues to finish, bringing VRAM
+ * completely up-to-date. */
+void renderer_sync(void) {
+  if (!thread.running) return;
+
+  /* Not completely safe, but should be fine since the render thread
+   * only decreases used, and we check again inside the lock. */
+  if (!thread.queue->used && !thread.bg_queue->used) {
+    return;
+  }
+
+  /* Flush both queues. This is necessary because gpulib could be
+   * trying to process a DMA write that a command in the queue should
+   * run beforehand. For example, Xenogears sprites write a black
+   * rectangle over the to-be-DMA'd spot in VRAM -- if this write
+   * happens after the DMA, it will clear the DMA, resulting in
+   * flickering sprites. We need to be totally up-to-date. This may
+   * drop a frame.
*/
+  renderer_wait();
+  cmd_queue_swap();
+  hold_cmds = false;
+  renderer_wait();
+}
+
+/* Drains both queues, shuts the render thread down, and releases the
+ * synchronisation objects and every queued command buffer. Does nothing
+ * when no render thread is running, so it never touches objects that
+ * were not initialised. */
+static void video_thread_stop(void) {
+  int i;
+
+  if (!thread.running) return;
+
+  renderer_sync();
+
+  /* Clear `running` while holding queue_lock. The render thread tests
+   * the flag under that lock right before pthread_cond_wait; an
+   * unlocked store could land between the test and the wait, the
+   * signal would be lost, and pthread_join would never return. */
+  pthread_mutex_lock(&thread.queue_lock);
+  thread.running = false;
+  pthread_cond_signal(&thread.cond_msg_avail);
+  pthread_mutex_unlock(&thread.queue_lock);
+  pthread_join(thread.thread, NULL);
+
+  pthread_mutex_destroy(&thread.queue_lock);
+  pthread_cond_destroy(&thread.cond_msg_avail);
+  pthread_cond_destroy(&thread.cond_msg_done);
+  pthread_cond_destroy(&thread.cond_queue_empty);
+
+  for (i = 0; i < QUEUE_SIZE; i++) {
+    video_thread_cmd *cmd = &thread.queue->queue[i];
+    free(cmd->cmd_list);
+    cmd->cmd_list = NULL;
+  }
+
+  for (i = 0; i < QUEUE_SIZE; i++) {
+    video_thread_cmd *cmd = &thread.bg_queue->queue[i];
+    free(cmd->cmd_list);
+    cmd->cmd_list = NULL;
+  }
+}
+
+/* Initialises the synchronisation objects and launches the render
+ * thread. On any failure, whatever was already initialised is destroyed
+ * again and thread.running stays false, so callers keep rendering
+ * synchronously. */
+static void video_thread_start(void) {
+  fprintf(stdout, "Starting render thread\n");
+
+  if (pthread_cond_init(&thread.cond_msg_avail, NULL))
+    goto error;
+  if (pthread_cond_init(&thread.cond_msg_done, NULL))
+    goto error_msg_avail;
+  if (pthread_cond_init(&thread.cond_queue_empty, NULL))
+    goto error_msg_done;
+  if (pthread_mutex_init(&thread.queue_lock, NULL))
+    goto error_queue_empty;
+
+  /* Publish the queues and the running flag before the thread exists:
+   * video_thread_main reads thread->queue and thread->running as soon
+   * as it is scheduled. */
+  thread.queue = &queues[0];
+  thread.bg_queue = &queues[1];
+  thread.running = true;
+
+  if (pthread_create(&thread.thread, NULL, video_thread_main, &thread)) {
+    thread.running = false;
+    goto error_lock;
+  }
+
+  return;
+
+ error_lock:
+  pthread_mutex_destroy(&thread.queue_lock);
+ error_queue_empty:
+  pthread_cond_destroy(&thread.cond_queue_empty);
+ error_msg_done:
+  pthread_cond_destroy(&thread.cond_msg_done);
+ error_msg_avail:
+  pthread_cond_destroy(&thread.cond_msg_avail);
+ error:
+  fprintf(stderr,"Failed to start rendering thread\n");
+}
+
+/* Copies `count` command words from `list` and appends the copy to the
+ * background queue (while hold_cmds is set) or to the active queue.
+ * Blocks while the active queue is full. If the copy cannot be
+ * allocated, the render thread is stopped and the commands are executed
+ * synchronously instead of being dropped. */
+static void video_thread_queue_cmd(uint32_t *list, int count, int last_cmd) {
+  video_thread_cmd *cmd;
+  uint32_t *cmd_list;
+  video_thread_queue *queue;
+  bool lock;
+
+  /* Nothing to queue. This also keeps a NULL result from calloc(0, ...),
+   * which is permitted, from being mistaken for out-of-memory. */
+  if (count <= 0) return;
+
+  cmd_list = (uint32_t *)calloc(count, sizeof(uint32_t));
+
+  if (!cmd_list) {
+    int dummy;
+
+    /* Out of memory, disable the thread and run sync from now on */
+    fprintf(stderr,"Failed to allocate render thread command list, stopping thread\n");
+    video_thread_stop();
+    /* The thread is gone and the queues are drained, so run this list
+     * directly to keep command order intact. */
+    real_do_cmd_list(list, count, &dummy);
+    return;
+  }
+
+  memcpy(cmd_list, list, count * sizeof(uint32_t));
+
+  if (hold_cmds && thread.bg_queue->used >= QUEUE_SIZE) {
+    /* If the bg queue is full, do a full sync to empty both queues
+     * and clear space. This should be very rare, I've only seen it in
+     * Tekken 3 post-battle-replay. */
+    renderer_sync();
+  }
+
+  /* The background queue is only touched by this (producer) side while
+   * commands are held, so it needs no lock; the active queue is shared
+   * with the render thread. */
+  if (hold_cmds) {
+    queue = thread.bg_queue;
+    lock = false;
+  } else {
+    queue = thread.queue;
+    lock = true;
+  }
+
+  if (lock) {
+    pthread_mutex_lock(&thread.queue_lock);
+
+    while (queue->used >= QUEUE_SIZE) {
+      pthread_cond_wait(&thread.cond_msg_done, &thread.queue_lock);
+    }
+  }
+
+  /* Reuse the slot, releasing the buffer left by its previous use. */
+  cmd = &queue->queue[queue->end];
+  free(cmd->cmd_list);
+  cmd->cmd_list = cmd_list;
+  cmd->count = count;
+  cmd->last_cmd = last_cmd;
+  queue->end = (queue->end + 1) % QUEUE_SIZE;
+  queue->used++;
+
+  if (lock) {
+    pthread_cond_signal(&thread.cond_msg_avail);
+    pthread_mutex_unlock(&thread.queue_lock);
+  }
+}
+
+/* Slice off just the part of the list that can be handled async, and
+ * update ex_regs. */
+static int scan_cmd_list(uint32_t *data, int count, int *last_cmd)
+{
+  int cmd = 0, pos = 0, len, v;
+
+  while (pos < count) {
+    uint32_t *list = data + pos;
+    cmd = list[0] >> 24;
+    len = 1 + cmd_lengths[cmd];
+
+    switch (cmd) {
+      case 0x02:
+        break;
+      case 0x24 ... 0x27:
+      case 0x2c ... 0x2f:
+      case 0x34 ... 0x37:
+      case 0x3c ... 0x3f:
+        gpu.ex_regs[1] &= ~0x1ff;
+        gpu.ex_regs[1] |= list[4 + ((cmd >> 4) & 1)] & 0x1ff;
+        break;
+      case 0x48 ... 0x4F:
+        for (v = 3; pos + v < count; v++)
+        {
+          if ((list[v] & 0xf000f000) == 0x50005000)
+            break;
+        }
+        len += v - 3;
+        break;
+      case 0x58 ...
0x5F:
+        for (v = 4; pos + v < count; v += 2)
+        {
+          if ((list[v] & 0xf000f000) == 0x50005000)
+            break;
+        }
+        len += v - 4;
+        break;
+      default:
+        if ((cmd & 0xf8) == 0xe0)
+          gpu.ex_regs[cmd & 7] = list[0];
+        break;
+    }
+
+    if (pos + len > count) {
+      cmd = -1;
+      break; /* incomplete cmd */
+    }
+    if (0xa0 <= cmd && cmd <= 0xdf)
+      break; /* image i/o */
+
+    pos += len;
+  }
+
+  *last_cmd = cmd;
+  return pos;
+}
+
+/* gpulib entry point for a command list. With the render thread running,
+ * the list is only scanned (which updates gpu.ex_regs) and the part that
+ * can run asynchronously is queued. Otherwise it is executed directly and
+ * gpu.scratch_ex_regs is copied back into gpu.ex_regs. Returns the
+ * number of words consumed. */
+int do_cmd_list(uint32_t *list, int count, int *last_cmd) {
+  int pos = 0;
+
+  if (thread.running) {
+    pos = scan_cmd_list(list, count, last_cmd);
+    video_thread_queue_cmd(list, pos, *last_cmd);
+  } else {
+    pos = real_do_cmd_list(list, count, last_cmd);
+    memcpy(gpu.ex_regs, gpu.scratch_ex_regs, sizeof(gpu.ex_regs));
+  }
+  return pos;
+}
+
+/* Starts the render thread when threaded rendering is enabled, then
+ * initialises the real renderer and returns its result. */
+int renderer_init(void) {
+  if (thread_rendering) {
+    video_thread_start();
+  }
+  return real_renderer_init();
+}
+
+/* Shuts down the render thread (if any) and then the real renderer. */
+void renderer_finish(void) {
+  /* Stop the thread first: stopping drains the queues through
+   * real_do_cmd_list, which must not run once the real renderer has
+   * been torn down. */
+  if (thread_rendering && thread.running) {
+    video_thread_stop();
+  }
+
+  real_renderer_finish();
+}
+
+/* Re-applies the saved e-commands. With the render thread running they
+ * go through do_cmd_list so they stay ordered with queued work. */
+void renderer_sync_ecmds(uint32_t * ecmds) {
+  if (thread.running) {
+    int dummy;
+    do_cmd_list(&ecmds[1], 6, &dummy);
+  } else {
+    real_renderer_sync_ecmds(ecmds);
+  }
+}
+
+/* Waits for all queued commands, then forwards to the real renderer. */
+void renderer_update_caches(int x, int y, int w, int h) {
+  renderer_sync();
+  real_renderer_update_caches(x, y, w, h);
+}
+
+void renderer_flush_queues(void) {
+  /* Called during DMA and updateLace. We want to sync if it's DMA,
+   * but not if it's updateLace. Instead of syncing here, there's a
+   * renderer_sync call during DMA. */
+  real_renderer_flush_queues();
+}
+
+/*
+ * Normally all GPU commands are processed before rendering the
+ * frame. For games that naturally run < 50/60fps, this is unnecessary
+ * -- it forces the game to render as if it was 60fps and leaves the
+ * GPU idle half the time on a 30fps game, for example.
+ *
+ * Allowing the renderer to wait until a frame is done before
+ * rendering it would give it double, triple, or quadruple the amount
+ * of time to finish before we have to wait for it.
+ *
+ * We can use a heuristic to figure out when to force a render.
+ *
+ * - If a frame isn't done when we're asked to render, wait for it and
+ *   put future GPU commands in a separate buffer (for the next frame)
+ *
+ * - If the frame is done, and had no future GPU commands, render it.
+ *
+ * - If we do have future GPU commands, it meant the frame took too
+ *   long to render and there's another frame waiting. Stop until the
+ *   first frame finishes, render it, and start processing the next
+ *   one.
+ *
+ * This may possibly add a frame or two of latency that shouldn't be
+ * different than the real device. It may skip rendering a frame
+ * entirely if a VRAM transfer happens while a frame is waiting, or in
+ * games that natively run at 60fps if frames are coming in too
+ * quickly to process. Depending on how the game treats "60fps," this
+ * may not be noticeable.
+ */
+void renderer_notify_update_lace(int updated) {
+  if (!thread.running) return; /* no render thread: nothing to schedule */
+
+  if (thread_rendering == THREAD_RENDERING_SYNC) {
+    renderer_sync(); /* sync mode: drain both queues on every call */
+    return;
+  }
+
+  if (updated) {
+    cmd_queue_swap(); /* hand any held-back commands to the render thread */
+    return;
+  }
+
+  pthread_mutex_lock(&thread.queue_lock);
+  if (thread.bg_queue->used) {
+    /* We have commands for a future frame to run. Force a wait until
+     * the current frame is finished, and start processing the next
+     * frame after it's drawn (see the `updated` clause above). */
+    pthread_mutex_unlock(&thread.queue_lock); /* renderer_wait takes queue_lock itself */
+    renderer_wait();
+    pthread_mutex_lock(&thread.queue_lock);
+
+    /* We are no longer holding commands back, so the next frame may
+     * get mixed into the following frame. This is usually fine, but can
+     * result in frameskip-like effects for 60fps games. */
+    hold_cmds = false;
+    needs_display = true;
+    gpu.state.fb_dirty = true;
+  } else if (thread.queue->used) {
+    /* We are still drawing during a vblank. Cut off the current frame
+     * by sending new commands to the background queue and skip
+     * drawing our partly rendered frame to the display. */
+    hold_cmds = true;
+    needs_display = true;
+    gpu.state.fb_dirty = false;
+  } else if (needs_display && !thread.queue->used) {
+    /* We have processed all commands in the queue, render the
+     * buffer. We know we have something to render, because
+     * needs_display is true. */
+    hold_cmds = false;
+    needs_display = false;
+    gpu.state.fb_dirty = true;
+  } else {
+    /* Everything went normally, so do the normal thing. */
+  }
+
+  pthread_mutex_unlock(&thread.queue_lock);
+}
+
+void renderer_set_interlace(int enable, int is_odd) { /* plain pass-through to the real renderer */
+  real_renderer_set_interlace(enable, is_odd);
+}
+
+void renderer_set_config(const struct rearmed_cbs *cbs) {
+  renderer_sync(); /* finish queued work before the mode can change */
+  thread_rendering = cbs->thread_rendering;
+  if (!thread.running && thread_rendering != THREAD_RENDERING_OFF) {
+    video_thread_start(); /* threading requested but no thread yet */
+  } else if (thread.running && thread_rendering == THREAD_RENDERING_OFF) {
+    video_thread_stop(); /* threading switched off while a thread exists */
+  }
+  real_renderer_set_config(cbs);
+}
+
+void renderer_notify_res_change(void) {
+  renderer_sync(); /* finish queued work before forwarding the notification */
+  real_renderer_notify_res_change();
+}
diff --git a/plugins/gpulib/gpulib_thread_if.h b/plugins/gpulib/gpulib_thread_if.h
new file mode 100644
index 0000000..b1ea97f
--- /dev/null
+++ b/plugins/gpulib/gpulib_thread_if.h
@@ -0,0 +1,41 @@
+/**************************************************************************
+* Copyright (C) 2020 The RetroArch Team *
+* *
+* This program is free software; you can redistribute it and/or modify *
+* it under the terms of the GNU General Public License as published by *
+* the Free Software Foundation; either version 2 of the License, or *
+* (at your option) any later version.
* +* * +* This program is distributed in the hope that it will be useful, * +* but WITHOUT ANY WARRANTY; without even the implied warranty of * +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * +* GNU General Public License for more details. * +* * +* You should have received a copy of the GNU General Public License * +* along with this program; if not, write to the * +* Free Software Foundation, Inc., * +* 51 Franklin Street, Fifth Floor, Boston, MA 02111-1307 USA. * +***************************************************************************/ + +#ifndef __GPULIB_THREAD_H__ +#define __GPULIB_THREAD_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +int real_do_cmd_list(uint32_t *list, int count, int *last_cmd); +int real_renderer_init(void); +void real_renderer_finish(void); +void real_renderer_sync_ecmds(uint32_t * ecmds); +void real_renderer_update_caches(int x, int y, int w, int h); +void real_renderer_flush_queues(void); +void real_renderer_set_interlace(int enable, int is_odd); +void real_renderer_set_config(const struct rearmed_cbs *config); +void real_renderer_notify_res_change(void); + +#ifdef __cplusplus +} +#endif + +#endif /* __GPULIB_THREAD_H__ */ -- cgit v1.2.3