From b1bffaba8682af2870d4ff019bedbceb72440707 Mon Sep 17 00:00:00 2001
From: Jaromir Wysoglad
Date: Mon, 12 Aug 2019 23:33:54 -0700
Subject: TTS: Implement our own queuing on windows.

As on Linux, there isn't enough control over the native speech
queue to implement INTERRUPT_NO_REPEAT properly. From this commit
on, we keep our own queue and use SAPI to speak one queued speech
at a time. The queue is processed outside the main thread.
---
 .../windows/windows-text-to-speech.cpp             | 172 ++++++++++++++++-----
 .../windows/windows-text-to-speech.h               |  13 ++
 2 files changed, 149 insertions(+), 36 deletions(-)

diff --git a/backends/text-to-speech/windows/windows-text-to-speech.cpp b/backends/text-to-speech/windows/windows-text-to-speech.cpp
index 7c8f879189..ec3961dd6b 100644
--- a/backends/text-to-speech/windows/windows-text-to-speech.cpp
+++ b/backends/text-to-speech/windows/windows-text-to-speech.cpp
@@ -50,6 +50,15 @@ ISpAudio *_audio;
 WindowsTextToSpeechManager::WindowsTextToSpeechManager()
 	: _speechState(BROKEN){
 	init();
+	_threadParams.queue = &_speechQueue; // _threadParams is what the speech thread (startSpeech) receives
+	_threadParams.state = &_speechState;
+	_threadParams.mutex = &_speechMutex;
+	_thread = NULL; // no speech thread has been created yet
+	_speechMutex = CreateMutex(NULL, FALSE, NULL); // unnamed mutex, not owned by the creating thread
+	if (_speechMutex == NULL) {
+		_speechState = BROKEN; // say(), stop(), pause() and resume() all bail out in this state
+		warning("Could not create TTS mutex");
+	}
 }
 
 void WindowsTextToSpeechManager::init() {
@@ -84,19 +93,67 @@ void WindowsTextToSpeechManager::init() {
 
 	_voice->SetOutput(_audio, FALSE);
 
-	if(_ttsState->_availableVoices.size() > 0)
+	if (_ttsState->_availableVoices.size() > 0)
 		_speechState = READY;
 	else
 		_speechState = NO_VOICE;
 	_lastSaid = "";
+	while (!_speechQueue.empty()) {
+		free(_speechQueue.front());
+		_speechQueue.pop_front();
+	}
 }
 
 WindowsTextToSpeechManager::~WindowsTextToSpeechManager() {
+	stop(); // empties the queue and joins the speech thread, unless the state is BROKEN or NO_VOICE
+	if (_thread != NULL) { // only still set if stop() returned early
+		WaitForSingleObject(_thread, INFINITE);
+		CloseHandle(_thread);
+	}
+	if (_speechMutex != NULL) { // NULL if CreateMutex failed in the constructor
+		CloseHandle(_speechMutex);
+	}
 	if (_voice)
 		_voice->Release();
 	::CoUninitialize();
 }
 
+// Speech thread: speaks the queued strings one by one until the queue is empty or speech is paused.
+DWORD WINAPI startSpeech(LPVOID parameters) {
+	WindowsTextToSpeechManager::SpeechParameters *params = (WindowsTextToSpeechManager::SpeechParameters *) parameters;
+	// wait for the previous speech, if the previous thread exited too early
+	_voice->WaitUntilDone(INFINITE);
+
+	while (!params->queue->empty()) {
+		WaitForSingleObject(*params->mutex, INFINITE);
+		// check again, when we have exclusive access to the queue
+		if (params->queue->empty() || *(params->state) == WindowsTextToSpeechManager::PAUSED) {
+			ReleaseMutex(*params->mutex); // do not leave the loop (and the thread) still owning the mutex
+			break;
+		}
+		WCHAR *currentSpeech = params->queue->front();
+		_voice->Speak(currentSpeech, SPF_PURGEBEFORESPEAK | SPF_ASYNC, 0);
+		ReleaseMutex(*params->mutex);
+
+		while (*(params->state) != WindowsTextToSpeechManager::PAUSED) // poll, so a pause is noticed mid-speech
+			if (_voice->WaitUntilDone(10) == S_OK)
+				break;
+
+		WaitForSingleObject(*params->mutex, INFINITE);
+		if (!params->queue->empty() && params->queue->front() == currentSpeech) { // stop() may have replaced it
+			free(currentSpeech); // free(NULL) is a no-op, so the NULL marker pushed by stop() is fine
+			params->queue->pop_front();
+		}
+		ReleaseMutex(*params->mutex);
+	}
+
+	WaitForSingleObject(*params->mutex, INFINITE);
+	if (*(params->state) != WindowsTextToSpeechManager::PAUSED)
+		*(params->state) = WindowsTextToSpeechManager::READY;
+	ReleaseMutex(*params->mutex);
+	return 0;
+}
+
 bool WindowsTextToSpeechManager::say(Common::String str, Action action, Common::String charset) {
 	if (_speechState == BROKEN || _speechState == NO_VOICE) {
 		warning("The tts cannot speak in this state");
@@ -106,12 +163,6 @@ bool WindowsTextToSpeechManager::say(Common::String str, Action action, Common::
 	if (isSpeaking() && action == DROP)
 		return true;
 
-	if (isSpeaking() && action == INTERRUPT_NO_REPEAT && _lastSaid == str)
-		return true;
-
-	if (isSpeaking() && action == QUEUE_NO_REPEAT && _lastSaid == str)
-		return true;
-
 	if (charset.empty()) {
 #ifdef USE_TRANSLATION
 		charset = TransMan.getCurrentCharset();
@@ -119,65 +170,114 @@ bool WindowsTextToSpeechManager::say(Common::String str, Action action, Common::
 		charset = "ASCII";
 #endif
 	}
-	_lastSaid = str;
+
 	// We have to set the pitch by prepending xml code at the start of the said string;
 	Common::String pitch= Common::String::format("<pitch absmiddle=\"%d\">", _ttsState->_pitch / 10);
 	str.replace((uint32)0, 0, pitch);
 	WCHAR *strW = Win32::ansiToUnicode(str.c_str(), Win32::getCodePageId(charset));
 
-	if ((isPaused() || isSpeaking()) && (action == INTERRUPT || action == INTERRUPT_NO_REPEAT))
+	WaitForSingleObject(_speechMutex, INFINITE); // the speech thread modifies the queue concurrently
+	if (isSpeaking() && !_speechQueue.empty() && action == INTERRUPT_NO_REPEAT &&
+			_speechQueue.front() != NULL && !wcscmp(_speechQueue.front(), strW)) {
+		while (_speechQueue.size() != 1) { // keep the entry being spoken, drop everything queued behind it
+			free(_speechQueue.back());
+			_speechQueue.pop_back();
+		}
+		free(strW);
+		ReleaseMutex(_speechMutex);
+		return true;
+	}
+
+	if (isSpeaking() && !_speechQueue.empty() && action == QUEUE_NO_REPEAT &&
+			_speechQueue.back() != NULL && !wcscmp(_speechQueue.back(), strW)) {
+		free(strW); // not queued, so it has to be released here
+		ReleaseMutex(_speechMutex);
+		return true;
+	}
+	ReleaseMutex(_speechMutex);
+	if ((isPaused() || isSpeaking()) && (action == INTERRUPT || action == INTERRUPT_NO_REPEAT)) {
 		stop();
+	}
 
-	bool result = _voice->Speak(strW, SPF_ASYNC, NULL) != S_OK;
-	free(strW);
-	if (!isPaused())
+	WaitForSingleObject(_speechMutex, INFINITE);
+	_speechQueue.push_back(strW);
+	ReleaseMutex(_speechMutex);
+
+	if (!isSpeaking() && !isPaused()) {
+		DWORD threadId;
+		if (_thread != NULL) {
+			WaitForSingleObject(_thread, INFINITE);
+			CloseHandle(_thread);
+		}
 		_speechState = SPEAKING;
-	return result;
+		_thread = CreateThread(NULL, 0, startSpeech, &_threadParams, 0, &threadId);
+		if (_thread == NULL) {
+			warning("Could not create speech thread");
+			_speechState = READY;
+			return true;
+		}
+	}
+	return false;
 }
 
 bool WindowsTextToSpeechManager::stop() {
-	if(_speechState == BROKEN || _speechState == NO_VOICE)
+	if (_speechState == BROKEN || _speechState == NO_VOICE)
 		return true;
 	if (isPaused())
 		resume();
 	_audio->SetState(SPAS_STOP, 0);
+	WaitForSingleObject(_speechMutex, INFINITE);
+	// drop everything that is still queued (free(NULL) is a no-op, so a NULL entry is harmless)
+	while (!_speechQueue.empty()) {
+		free(_speechQueue.front());
+		_speechQueue.pop_front();
+	}
+	_speechQueue.push_back(NULL); // the speech thread speaks this NULL entry with SPF_PURGEBEFORESPEAK
+	ReleaseMutex(_speechMutex);
+	if (_thread != NULL) {
+		WaitForSingleObject(_thread, INFINITE);
+		CloseHandle(_thread);
+		_thread = NULL;
+	}
 	_audio->SetState(SPAS_RUN, 0);
-	_voice->Speak(NULL, SPF_PURGEBEFORESPEAK | SPF_ASYNC | SPF_IS_NOT_XML, 0);
-	_speechState = READY;
 	return false;
 }
 
 bool WindowsTextToSpeechManager::pause() {
-	if(_speechState == BROKEN || _speechState == NO_VOICE)
+	if (_speechState == BROKEN || _speechState == NO_VOICE)
 		return true;
 	if (isPaused())
 		return false;
+	WaitForSingleObject(_speechMutex, INFINITE); // the speech thread checks _speechState while holding this mutex
 	_voice->Pause();
 	_speechState = PAUSED;
+	ReleaseMutex(_speechMutex);
 	return false;
 }
 
 bool WindowsTextToSpeechManager::resume() {
-	if(_speechState == BROKEN || _speechState == NO_VOICE)
+	if (_speechState == BROKEN || _speechState == NO_VOICE)
 		return true;
 	if (!isPaused())
 		return false;
 	_voice->Resume();
-	if (isSpeaking())
-		_speechState = SPEAKING;
-	else
+	DWORD threadId;
+	if (_thread != NULL) { // join the previous speech thread before its handle is overwritten below
+		WaitForSingleObject(_thread, INFINITE);
+		CloseHandle(_thread);
+	}
+	_speechState = SPEAKING; // must no longer be PAUSED, otherwise startSpeech leaves its loop immediately
+	_thread = CreateThread(NULL, 0, startSpeech, &_threadParams, 0, &threadId);
+	if (_thread == NULL) {
+		warning("Could not create speech thread");
 		_speechState = READY;
+		return true; // same return value as the BROKEN/NO_VOICE case above
+	}
 	return false;
 }
 
 bool WindowsTextToSpeechManager::isSpeaking() {
-	if(_speechState == BROKEN || _speechState == NO_VOICE)
-		return false;
-	SPAUDIOSTATUS audioStatus;
-	SPVOICESTATUS voiceStatus;
-	_audio->GetStatus(&audioStatus);
-	_voice->GetStatus(&voiceStatus, NULL);
-	return audioStatus.State != SPAS_CLOSED || voiceStatus.dwRunningState != SPRS_DONE;
+	return _speechState == SPEAKING; // set by say()/resume(); the speech thread resets it to READY when done
 }
 
 bool WindowsTextToSpeechManager::isPaused() {
@@ -185,7 +285,7 @@ bool WindowsTextToSpeechManager::isPaused() {
 }
 
 bool WindowsTextToSpeechManager::isReady() {
-	if(_speechState == BROKEN || _speechState == NO_VOICE)
+	if (_speechState == BROKEN || _speechState == NO_VOICE)
 		return false;
 	if (_speechState != PAUSED && !isSpeaking())
 		return true;
@@ -194,14 +294,14 @@ bool WindowsTextToSpeechManager::isReady() {
 }
 
 void WindowsTextToSpeechManager::setVoice(unsigned index) {
-	if(_speechState == BROKEN || _speechState == NO_VOICE)
+	if (_speechState == BROKEN || _speechState == NO_VOICE)
 		return;
 	_voice->SetVoice((ISpObjectToken *) _ttsState->_availableVoices[index].getData());
 	_ttsState->_activeVoice = index;
 }
 
 void WindowsTextToSpeechManager::setRate(int rate) {
-	if(_speechState == BROKEN || _speechState == NO_VOICE)
+	if (_speechState == BROKEN || _speechState == NO_VOICE)
 		return;
 	assert(rate >= -100 && rate <= 100);
 	_voice->SetRate(rate / 10);
@@ -209,14 +309,14 @@ void WindowsTextToSpeechManager::setRate(int rate) {
 }
 
 void WindowsTextToSpeechManager::setPitch(int pitch) {
-	if(_speechState == BROKEN || _speechState == NO_VOICE)
+	if (_speechState == BROKEN || _speechState == NO_VOICE)
 		return;
 	assert(pitch >= -100 && pitch <= 100);
 	_ttsState->_pitch = pitch;
 }
 
 void WindowsTextToSpeechManager::setVolume(unsigned volume) {
-	if(_speechState == BROKEN || _speechState == NO_VOICE)
+	if (_speechState == BROKEN || _speechState == NO_VOICE)
 		return;
 	assert(volume <= 100);
 	_voice->SetVolume(volume);
@@ -306,7 +406,7 @@ void WindowsTextToSpeechManager::createVoice(void *cpVoiceToken) {
 int strToInt(Common::String str) {
 	str.toUppercase();
 	int result = 0;
-	for(unsigned i = 0; i < str.size(); i++) {
+	for (unsigned i = 0; i < str.size(); i++) {
 		if (str[i] < '0' || (str[i] > '9' && str[i] < 'A') || str[i] > 'F')
 			break;
 		int num = (str[i] <= '9') ? str[i] - '0' : str[i] - 55;
@@ -342,7 +442,7 @@ void WindowsTextToSpeechManager::updateVoices() {
 	while (SUCCEEDED(hr) && ulCount--) {
 		hr = cpEnum->Next(1, &cpVoiceToken, NULL);
 		_voice->SetVoice(cpVoiceToken);
-		if(SUCCEEDED(_voice->Speak(L"hi, this is test", SPF_PURGEBEFORESPEAK | SPF_ASYNC | SPF_IS_NOT_XML, 0)))
+		if (SUCCEEDED(_voice->Speak(L"hi, this is test", SPF_PURGEBEFORESPEAK | SPF_ASYNC | SPF_IS_NOT_XML, 0)))
 			createVoice(cpVoiceToken);
 		else
 			cpVoiceToken->Release();
@@ -355,7 +455,7 @@ void WindowsTextToSpeechManager::updateVoices() {
 	_voice->SetVolume(_ttsState->_volume);
 	cpEnum->Release();
 
-	if(_ttsState->_availableVoices.size() == 0) {
+	if (_ttsState->_availableVoices.size() == 0) {
 		_speechState = NO_VOICE;
 		warning("No voice is available");
 	} else if (_speechState == NO_VOICE)
diff --git a/backends/text-to-speech/windows/windows-text-to-speech.h b/backends/text-to-speech/windows/windows-text-to-speech.h
index d8968243ec..cbf3eb07df 100644
--- a/backends/text-to-speech/windows/windows-text-to-speech.h
+++ b/backends/text-to-speech/windows/windows-text-to-speech.h
@@ -29,6 +29,8 @@
 
 #include "common/text-to-speech.h"
 #include "common/str.h"
+#include "common/list.h"
+
 
 class WindowsTextToSpeechManager : public Common::TextToSpeechManager {
 public:
@@ -40,6 +42,12 @@ public:
 		NO_VOICE
 	};
 
+	struct SpeechParameters { // handed to the speech thread (startSpeech) through CreateThread
+		Common::List<WCHAR *> *queue; // pending speeches; entries are free()d when removed and may be NULL
+		SpeechState *state; // points at the manager's _speechState
+		HANDLE *mutex; // points at the mutex handle taken around queue and state accesses
+	};
+
 	WindowsTextToSpeechManager();
 	virtual ~WindowsTextToSpeechManager();
 
@@ -72,8 +80,13 @@ private:
 	Common::String lcidToLocale(Common::String lcid);
 	SpeechState _speechState;
 	Common::String _lastSaid;
+	HANDLE _thread;
+	Common::List<WCHAR *> _speechQueue;
+	SpeechParameters _threadParams;
+	HANDLE _speechMutex;
 };
 
+
 #endif
 
 #endif // BACKENDS_UPDATES_WINDOWS_H
-- 
cgit v1.2.3