// Resolve the speech-recognition constructor, preferring the unprefixed name
// and falling back to the `webkit`-prefixed one. The `typeof window` guard
// keeps this module importable where no `window` global exists (server-side
// rendering, Node-based tests); there the constant is simply `undefined`.
const SpeechRecognition =
  typeof window === "undefined"
    ? undefined
    : window.SpeechRecognition ?? window.webkitSpeechRecognition;

/**
 * Promise-based wrapper around a speech-recognition instance.
 *
 * Usage: call `start()` to begin a session, then `await stop()` to end it and
 * receive the final transcript collected in between. When no recognition
 * constructor is available, no recogniser is created: `start()` resolves the
 * session immediately and `stop()` returns whatever `finalTranscript` holds.
 */
class SpeechToText {
  /** Concatenation of all final (non-interim) results received so far. */
  finalTranscript = "";
  /** Resolves `promise`; invoked when the recogniser reports `end`. */
  private resolver: (() => void) | undefined;
  /** Settles once the current session has ended. */
  private promise: Promise<void> | undefined;
  private speechRecognition: SpeechRecognition | undefined;
  /** True between a successful `start()` and the recogniser's `end` event. */
  private listening = false;

  constructor() {
    if (!SpeechRecognition) {
      return;
    }

    const recognition = new SpeechRecognition();
    recognition.continuous = true;
    recognition.interimResults = false;
    recognition.onerror = (e) => console.error(e);
    recognition.onresult = (e) => this.handleResult(e);
    recognition.onend = () => {
      // The session is over, so a later start() may begin a new one.
      this.listening = false;
      this.handleNoMatch();
    };
    this.speechRecognition = recognition;
  }

  /**
   * Resolves the pending session promise, if any.
   *
   * Despite its name this is wired to the recogniser's `end` event, so it
   * runs whenever a session finishes.
   */
  handleNoMatch() {
    this.resolver?.();
  }

  /**
   * Appends the transcript of every final result in `event`, starting at
   * `event.resultIndex`, to `finalTranscript`. Only the first alternative of
   * each result is used.
   */
  handleResult(event: SpeechRecognitionEvent) {
    for (let i = event.resultIndex; i < event.results.length; ++i) {
      const result = event.results[i];
      if (result.isFinal) {
        this.finalTranscript += result[0].transcript;
      }
    }
  }

  /**
   * Begins a recognition session.
   *
   * Calling this while a session is already active (or still winding down
   * after `stop()`) is a no-op: restarting would replace the promise an
   * in-flight `stop()` is awaiting, and the underlying `start()` is expected
   * to reject a second start anyway.
   *
   * @throws Whatever the underlying recogniser's `start()` throws. In that
   *   case the session state is rolled back, so a following `stop()` reports
   *   the missing session instead of waiting forever.
   */
  start(): void {
    if (this.listening) {
      return;
    }

    this.promise = new Promise<void>((resolve) => {
      this.resolver = resolve;
    });

    if (!this.speechRecognition) {
      // No recogniser available: nothing will ever fire `end`, so finish the
      // session right away to keep stop() from blocking.
      this.resolver?.();
      return;
    }

    try {
      this.speechRecognition.start();
      this.listening = true;
    } catch (error) {
      // No `end` event will follow a failed start; drop the pending promise.
      this.resolver = undefined;
      this.promise = undefined;
      throw error;
    }
  }

  /**
   * Ends the current session and returns the collected final transcript.
   *
   * Waits for the session promise (resolved from the recogniser's `end`
   * handler) so that results delivered while stopping are included, then
   * clears `finalTranscript`.
   *
   * @returns The final transcript accumulated since it was last cleared.
   * @throws Error if no session has been started.
   */
  async stop(): Promise<string> {
    if (!this.resolver || !this.promise) {
      throw new Error("Missing resolver, did start recording?");
    }

    this.speechRecognition?.stop();
    await this.promise;
    const text = this.finalTranscript;
    this.finalTranscript = "";
    return text;
  }
}

/**
 * Module-level `SpeechToText` instance, constructed once when this module is
 * first evaluated and shared by every importer.
 */
export const speechToText: SpeechToText = new SpeechToText();
