// blob: 847c16209b8957dcb3f6772f3064e959a410d95c [file] [log] [blame]
// Copyright 2024 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#ifndef COMPONENTS_SPEECH_SPEECH_RECOGNIZER_FSM_H_
#define COMPONENTS_SPEECH_SPEECH_RECOGNIZER_FSM_H_
#include <vector>
#include "base/memory/ref_counted.h"
#include "components/speech/audio_buffer.h"
#include "media/mojo/mojom/audio_data.mojom.h"
#include "media/mojo/mojom/speech_recognition_error.mojom.h"
#include "media/mojo/mojom/speech_recognition_recognition_context.h"
#include "media/mojo/mojom/speech_recognition_result.mojom.h"
namespace speech {
// Interface for the speech recognizer finite-state machine used to power the
// Web Speech API.
// TODO(crbug.com/40286514): Remove the scoped_refptr usage.
class SpeechRecognizerFsm {
 public:
  // The Finite State Machine states of the recognizer in sequential order.
  // Enumerators take consecutive values starting at 0. STATE_MAX_VALUE is an
  // alias of the last state and has to be updated whenever a state is
  // appended.
  enum FSMState {
    STATE_IDLE = 0,
    STATE_PREPARING,
    STATE_STARTING,
    STATE_ESTIMATING_ENVIRONMENT,
    STATE_WAITING_FOR_SPEECH,
    STATE_RECOGNIZING,
    STATE_WAITING_FINAL_RESULT,
    STATE_ENDED,
    STATE_MAX_VALUE = STATE_ENDED
  };

  // The Finite State Machine events used by the recognizer. Event dispatching
  // must be sequential, otherwise it will break the rules and the assumptions
  // of the finite state automata model.
  // Enumerators take consecutive values starting at 0. EVENT_MAX_VALUE is an
  // alias of the last event and has to be updated whenever an event is
  // appended.
  enum FSMEvent {
    EVENT_ABORT = 0,
    EVENT_PREPARE,
    EVENT_START,
    EVENT_STOP_CAPTURE,
    EVENT_AUDIO_DATA,
    EVENT_ENGINE_RESULT,
    EVENT_UPDATE_RECOGNITION_CONTEXT,
    EVENT_ENGINE_ERROR,
    EVENT_AUDIO_ERROR,
    EVENT_MAX_VALUE = EVENT_AUDIO_ERROR
  };

  // Bundles an event with the payload fields that can travel with it.
  // NOTE(review): which fields are meaningful for which event is decided by
  // the code that fills this struct in and is not visible from this header.
  struct FSMEventArgs {
    // The constructors and the destructor are only declared here; their
    // definitions live outside this header.
    // NOTE(review): `event_value` is presumably stored in `event` - confirm
    // in the definition.
    explicit FSMEventArgs(FSMEvent event_value);
    FSMEventArgs(const FSMEventArgs& other);
    ~FSMEventArgs();

    // No in-class initializer: the value has to be set by a constructor.
    FSMEvent event;
    // NOTE(review): `audio_data` and `audio_chunk` presumably accompany
    // EVENT_AUDIO_DATA - confirm against the dispatching code.
    media::mojom::AudioDataS16Ptr audio_data;
    scoped_refptr<AudioChunk> audio_chunk;
    // NOTE(review): presumably accompanies EVENT_ENGINE_RESULT - confirm.
    std::vector<media::mojom::WebSpeechRecognitionResultPtr> engine_results;
    // NOTE(review): presumably accompanies EVENT_UPDATE_RECOGNITION_CONTEXT -
    // confirm.
    media::SpeechRecognitionRecognitionContext recognition_context;
    // NOTE(review): presumably accompanies EVENT_ENGINE_ERROR - confirm.
    media::mojom::SpeechRecognitionError engine_error;
  };

  // Defines the behavior of the recognizer FSM, selecting the appropriate
  // transition according to the current state and event.
  // This is the only non-virtual member function of the interface; it is
  // declared here and defined outside this header.
  FSMState ExecuteTransitionAndGetNextState(const FSMEventArgs& args);

  // Entry point for pushing any new external event into the recognizer FSM.
  virtual void DispatchEvent(const FSMEventArgs& event_args) = 0;

  // The methods below handle transitions of the recognizer FSM. All of them
  // are pure virtual and must be supplied by the concrete recognizer.
  // NOTE(review): the FSMState return value is presumably the state the
  // machine moves to after the handler ran - confirm against
  // ExecuteTransitionAndGetNextState().

  // Unlike the other handlers this one returns nothing.
  virtual void ProcessAudioPipeline(const FSMEventArgs& event_args) = 0;
  virtual FSMState PrepareRecognition(const FSMEventArgs& event_args) = 0;
  virtual FSMState StartRecording(const FSMEventArgs& event_args) = 0;
  virtual FSMState StartRecognitionEngine(const FSMEventArgs& event_args) = 0;
  virtual FSMState WaitEnvironmentEstimationCompletion(
      const FSMEventArgs& event_args) = 0;
  virtual FSMState DetectUserSpeechOrTimeout(
      const FSMEventArgs& event_args) = 0;
  virtual FSMState StopCaptureAndWaitForResult(
      const FSMEventArgs& event_args) = 0;
  virtual FSMState ProcessIntermediateResult(
      const FSMEventArgs& event_args) = 0;
  virtual FSMState ProcessFinalResult(const FSMEventArgs& event_args) = 0;
  virtual FSMState AbortSilently(const FSMEventArgs& event_args) = 0;
  virtual FSMState AbortWithError(const FSMEventArgs& event_args) = 0;
  // Takes the error directly instead of a full FSMEventArgs.
  virtual FSMState Abort(const media::mojom::SpeechRecognitionError& error) = 0;
  virtual FSMState DetectEndOfSpeech(const FSMEventArgs& event_args) = 0;
  virtual FSMState UpdateRecognitionContext(const FSMEventArgs& event_args) = 0;
  // The only handler declared const, i.e. it may not modify the FSM members.
  virtual FSMState DoNothing(const FSMEventArgs& event_args) const = 0;
  virtual FSMState NotFeasible(const FSMEventArgs& event_args) = 0;

 protected:
  // The destructor is protected, so outside code cannot delete an object
  // through a SpeechRecognizerFsm pointer; the friend declaration gives
  // base::RefCountedThreadSafe<SpeechRecognizerFsm> access to it.
  friend class base::RefCountedThreadSafe<SpeechRecognizerFsm>;
  virtual ~SpeechRecognizerFsm() = default;

  // Current state of the machine; a new object starts in STATE_IDLE.
  FSMState state_ = STATE_IDLE;
  // Starts out false.
  // NOTE(review): presumably raised while DispatchEvent() is running so that
  // re-entrant dispatching can be detected - confirm in the implementers.
  bool is_dispatching_event_ = false;
};
} // namespace speech
#endif // COMPONENTS_SPEECH_SPEECH_RECOGNIZER_FSM_H_