/*
* Copyright (C) 2020 Apple Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS''
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
* THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
 */

#import "config.h"
#import "WebSpeechRecognizerTask.h"

#if HAVE(SPEECHRECOGNIZER)

#import <pal/spi/cocoa/SpeechSPI.h>
#import <wtf/BlockPtr.h>
#import <wtf/WeakObjCPtr.h>

#import <pal/cocoa/SpeechSoftLink.h>

// Cap the recognition duration at one hour; we can adjust this if needed.
static constexpr size_t maximumRecognitionDuration = 60 * 60;

NS_ASSUME_NONNULL_BEGIN

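// Wraps SFSpeechRecognizer and SFSpeechRecognitionTask, translating their delegate
// callbacks into WebCore::SpeechRecognitionUpdate values for the given client.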
@interface WebSpeechRecognizerTaskImpl : NSObject<SFSpeechRecognitionTaskDelegate, SFSpeechRecognizerDelegate> {
@private
    WebCore::SpeechRecognitionConnectionClientIdentifier _identifier;
    BlockPtr<void(const WebCore::SpeechRecognitionUpdate&)> _delegateCallback;
    bool _doMultipleRecognitions;
    uint64_t _maxAlternatives;
    RetainPtr<SFSpeechRecognizer> _recognizer;
    RetainPtr<SFSpeechAudioBufferRecognitionRequest> _request;
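    // Held weakly; the Speech framework manages the task's lifetime.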
    WeakObjCPtr<SFSpeechRecognitionTask> _task;
    bool _hasSentSpeechStart;
    bool _hasSentSpeechEnd;
    bool _hasSentEnd;
}

- (instancetype)initWithIdentifier:(WebCore::SpeechRecognitionConnectionClientIdentifier)identifier locale:(NSString *)localeIdentifier doMultipleRecognitions:(BOOL)continuous reportInterimResults:(BOOL)interimResults maxAlternatives:(unsigned long)alternatives delegateCallback:(void(^)(const WebCore::SpeechRecognitionUpdate&))callback;
- (void)callbackWithTranscriptions:(NSArray<SFTranscription *> *)transcriptions isFinal:(BOOL)isFinal;
- (void)audioSamplesAvailable:(CMSampleBufferRef)sampleBuffer;
- (void)abort;
- (void)stop;
- (void)sendSpeechStartIfNeeded;
- (void)sendSpeechEndIfNeeded;
- (void)sendEndIfNeeded;
@end

@implementation WebSpeechRecognizerTaskImpl

- (instancetype)initWithIdentifier:(WebCore::SpeechRecognitionConnectionClientIdentifier)identifier locale:(NSString *)localeIdentifier doMultipleRecognitions:(BOOL)continuous reportInterimResults:(BOOL)interimResults maxAlternatives:(unsigned long)alternatives delegateCallback:(void(^)(const WebCore::SpeechRecognitionUpdate&))callback
{
    if (!(self = [super init]))
        return nil;

    _identifier = identifier;
    _doMultipleRecognitions = continuous;
    _delegateCallback = callback;
    _hasSentSpeechStart = false;
    _hasSentSpeechEnd = false;
    _hasSentEnd = false;
    _maxAlternatives = alternatives ? alternatives : 1;

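    // Use the system default locale when no locale identifier is provided.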
    if (![localeIdentifier length])
        _recognizer = adoptNS([PAL::allocSFSpeechRecognizerInstance() init]);
    else
        _recognizer = adoptNS([PAL::allocSFSpeechRecognizerInstance() initWithLocale:[NSLocale localeWithLocaleIdentifier:localeIdentifier]]);

    if (!_recognizer || ![_recognizer isAvailable]) {
        [self release];
        return nil;
    }

    [_recognizer setDelegate:self];

    _request = adoptNS([PAL::allocSFSpeechAudioBufferRecognitionRequestInstance() init]);
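
    // Prefer on-device recognition when supported so audio need not leave the device.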
    if ([_recognizer supportsOnDeviceRecognition])
        [_request setRequiresOnDeviceRecognition:YES];

    [_request setShouldReportPartialResults:interimResults];
    [_request setTaskHint:SFSpeechRecognitionTaskHintDictation];

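    // These request options are SPI, available only with the internal SDK.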
#if USE(APPLE_INTERNAL_SDK)
    [_request setDetectMultipleUtterances:YES];
    [_request _setMaximumRecognitionDuration:maximumRecognitionDuration];
#endif

    _task = [_recognizer recognitionTaskWithRequest:_request.get() delegate:self];
    return self;
}

- (void)callbackWithTranscriptions:(NSArray<SFTranscription *> *)transcriptions isFinal:(BOOL)isFinal
{
    Vector<WebCore::SpeechRecognitionAlternativeData> alternatives;
    alternatives.reserveInitialCapacity(_maxAlternatives);
    for (SFTranscription *transcription in transcriptions) {
        // FIXME: <rdar://73629573> get confidence of SFTranscription when possible.
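        // Until then, approximate with the highest per-segment confidence.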
        double maxConfidence = 0.0;
        for (SFTranscriptionSegment *segment in [transcription segments]) {
            double confidence = [segment confidence];
            maxConfidence = maxConfidence < confidence ? confidence : maxConfidence;
        }

        alternatives.uncheckedAppend(WebCore::SpeechRecognitionAlternativeData { [transcription formattedString], maxConfidence });
        if (alternatives.size() == _maxAlternatives)
            break;
    }

    _delegateCallback(WebCore::SpeechRecognitionUpdate::createResult(_identifier, { WebCore::SpeechRecognitionResultData { WTFMove(alternatives), !!isFinal } }));
}

- (void)audioSamplesAvailable:(CMSampleBufferRef)sampleBuffer
{
    ASSERT(isMainThread());
    [_request appendAudioSampleBuffer:sampleBuffer];
}

- (void)abort
{
    if (!_task || [_task state] == SFSpeechRecognitionTaskStateCanceling)
        return;

    if ([_task state] == SFSpeechRecognitionTaskStateCompleted) {
        [self sendSpeechEndIfNeeded];
        [self sendEndIfNeeded];
        return;
    }

    [self sendSpeechEndIfNeeded];
    [_request endAudio];
    [_task cancel];
}

- (void)stop
{
    if (!_task || [_task state] == SFSpeechRecognitionTaskStateCanceling)
        return;

    if ([_task state] == SFSpeechRecognitionTaskStateCompleted) {
        [self sendSpeechEndIfNeeded];
        [self sendEndIfNeeded];
        return;
    }

    [self sendSpeechEndIfNeeded];
    [_request endAudio];
    [_task finish];
}

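// The helpers below ensure each lifecycle update is sent at most once, and that
// SpeechEnd is only ever sent after SpeechStart.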
- (void)sendSpeechStartIfNeeded
{
    if (_hasSentSpeechStart)
        return;

    _hasSentSpeechStart = true;
    _delegateCallback(WebCore::SpeechRecognitionUpdate::create(_identifier, WebCore::SpeechRecognitionUpdateType::SpeechStart));
}

- (void)sendSpeechEndIfNeeded
{
    if (!_hasSentSpeechStart || _hasSentSpeechEnd)
        return;

    _hasSentSpeechEnd = true;
    _delegateCallback(WebCore::SpeechRecognitionUpdate::create(_identifier, WebCore::SpeechRecognitionUpdateType::SpeechEnd));
}

- (void)sendEndIfNeeded
{
    if (_hasSentEnd)
        return;

    _hasSentEnd = true;
    _delegateCallback(WebCore::SpeechRecognitionUpdate::create(_identifier, WebCore::SpeechRecognitionUpdateType::End));
}

#pragma mark SFSpeechRecognizerDelegate

- (void)speechRecognizer:(SFSpeechRecognizer *)speechRecognizer availabilityDidChange:(BOOL)available
{
    ASSERT(isMainThread());
    if (available || !_task)
        return;

    auto error = WebCore::SpeechRecognitionError { WebCore::SpeechRecognitionErrorType::ServiceNotAllowed, "Speech recognition service became unavailable"_s };
    _delegateCallback(WebCore::SpeechRecognitionUpdate::createError(_identifier, WTFMove(error)));
}

#pragma mark SFSpeechRecognitionTaskDelegate

- (void)speechRecognitionTask:(SFSpeechRecognitionTask *)task didHypothesizeTranscription:(SFTranscription *)transcription
{
    ASSERT(isMainThread());
    [self sendSpeechStartIfNeeded];
    [self callbackWithTranscriptions:[NSArray arrayWithObjects:transcription, nil] isFinal:NO];
}

- (void)speechRecognitionTask:(SFSpeechRecognitionTask *)task didFinishRecognition:(SFSpeechRecognitionResult *)recognitionResult
{
    ASSERT(isMainThread());
    [self callbackWithTranscriptions:recognitionResult.transcriptions isFinal:YES];

    if (!_doMultipleRecognitions) {
        [self sendSpeechEndIfNeeded];
        [self sendEndIfNeeded];
    }
}

- (void)speechRecognitionTaskWasCancelled:(SFSpeechRecognitionTask *)task
{
    ASSERT(isMainThread());
    [self sendSpeechEndIfNeeded];
    [self sendEndIfNeeded];
}

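// Called when the task finishes for any reason; report the error, if any, before
// sending the final End update.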
- (void)speechRecognitionTask:(SFSpeechRecognitionTask *)task didFinishSuccessfully:(BOOL)successfully
{
    ASSERT(isMainThread());
    if (!successfully) {
        auto error = WebCore::SpeechRecognitionError { WebCore::SpeechRecognitionErrorType::Aborted, task.error.localizedDescription };
        _delegateCallback(WebCore::SpeechRecognitionUpdate::createError(_identifier, WTFMove(error)));
    }

    [self sendEndIfNeeded];
}

@end

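// Thin public wrapper that forwards everything to WebSpeechRecognizerTaskImpl.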
@implementation WebSpeechRecognizerTask

- (instancetype)initWithIdentifier:(WebCore::SpeechRecognitionConnectionClientIdentifier)identifier locale:(NSString *)localeIdentifier doMultipleRecognitions:(BOOL)continuous reportInterimResults:(BOOL)interimResults maxAlternatives:(unsigned long)alternatives delegateCallback:(void(^)(const WebCore::SpeechRecognitionUpdate&))callback
{
    if (!(self = [super init]))
        return nil;

    _impl = adoptNS([[WebSpeechRecognizerTaskImpl alloc] initWithIdentifier:identifier locale:localeIdentifier doMultipleRecognitions:continuous reportInterimResults:interimResults maxAlternatives:alternatives delegateCallback:callback]);
    if (!_impl) {
        [self release];
        return nil;
    }

    return self;
}

- (void)audioSamplesAvailable:(CMSampleBufferRef)sampleBuffer
{
    [_impl audioSamplesAvailable:sampleBuffer];
}

- (void)abort
{
    [_impl abort];
}

- (void)stop
{
    [_impl stop];
}

@end

NS_ASSUME_NONNULL_END

#endif // HAVE(SPEECHRECOGNIZER)