mirror of https://github.com/ankitects/anki.git
synced 2025-12-20 10:22:57 -05:00

add rank handling to TTS; parse TTS args in get_av_tags()

parent 66e277e44b
commit c713683f63

7 changed files with 145 additions and 100 deletions
@@ -159,6 +159,8 @@ message AVTag {
 }

 message TTSTag {
-    repeated string args = 1;
-    string text = 2;
+    string field_text = 1;
+    string lang = 2;
+    repeated string voices = 3;
+    repeated string other_args = 4;
 }
@@ -51,7 +51,12 @@ def av_tag_to_native(tag: pb.AVTag) -> AVTag:
     if val == "sound_or_video":
         return SoundOrVideoTag(filename=tag.sound_or_video)
     else:
-        return TTSTag(args=list(tag.tts.args), text=tag.tts.text)
+        return TTSTag(
+            field_text=tag.tts.field_text,
+            lang=tag.tts.lang,
+            voices=list(tag.tts.voices),
+            other_args=list(tag.tts.other_args),
+        )


 @dataclass
@@ -21,8 +21,11 @@ class TTSTag:
     See tts.py for more information.
     """

-    args: List[str]
-    text: str
+    field_text: str
+    lang: str
+    voices: List[str]
+    # each arg should be in the form 'foo=bar'
+    other_args: List[str]


 @dataclass
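For orientation, the test case at the bottom of this diff ([anki:tts][en_US voices=Bob,Jane]foo<br>1>2[/anki:tts]) now reaches Python as roughly the following instance. This is a sketch; the "speed=1.5" mentioned in the comment is a hypothetical 'foo=bar' extra, not something this commit defines:

    from anki.sound import TTSTag

    tag = TTSTag(
        field_text="foo 1>2",  # HTML such as <br> is already stripped on the Rust side
        lang="en_US",
        voices=["Bob", "Jane"],
        other_args=[],  # would hold e.g. a hypothetical "speed=1.5"
    )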
@@ -166,7 +166,6 @@ class AVPlayer:

         ranked.sort()

-        print(ranked)
         if ranked:
             return ranked[-1][1]
         else:
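For context, the ranked list here holds (rank, player) pairs built from each player's rank_for_tag(), so after the ascending sort the last entry is the highest-ranked player. A minimal sketch, with made-up strings standing in for real player objects:

    # one player only matched on language (rank -100), another matched
    # the first requested voice (rank 0)
    ranked = [(-100, "language_only_player"), (0, "exact_voice_player")]
    ranked.sort()
    best = ranked[-1][1]  # ascending sort, last item wins -> "exact_voice_player"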
qt/aqt/tts.py
@@ -9,10 +9,18 @@ or

 {{tts ja_JP voices=Kyoko,Otoya,Another_name:Field}}

-The first argument must be a language code. If provided,
-voices is a comma-separated list of one or more voices that
-the user would prefer. Spaces must not be included.
-Underscores will be converted to spaces.
+The first argument must be a language code.
+
+If provided, voices is a comma-separated list of one or more voices that
+the user would prefer. Spaces must not be included. Underscores will be
+converted to spaces.
+
+AVPlayer decides which TTSPlayer to use based on the returned rank.
+In the default implementation, the TTS player is chosen based on the order
+of voices the user has specified. When adding new TTS players, your code
+can either expose the underlying names the TTS engine provides, or simply
+expose the name of the engine, which would mean the user could write
+{{tts en_AU voices=MyEngine}} to prioritize your engine.
 """

 from __future__ import annotations
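To make the engine-name suggestion in the docstring concrete: a player can advertise the engine itself as a single voice, so {{tts en_AU voices=MyEngine}} matches it by name. A minimal sketch using the TTSVoice/TTSProcessPlayer helpers added further down in this file; MyEngineTTSPlayer and the "myengine" command line are invented for illustration:

    import subprocess
    from typing import List

    from anki.sound import AVTag, TTSTag
    from aqt.tts import TTSProcessPlayer, TTSVoice


    class MyEngineTTSPlayer(TTSProcessPlayer):
        def get_available_voices(self) -> List[TTSVoice]:
            # expose the engine name rather than its individual voices
            return [TTSVoice(name="MyEngine", lang="en_AU")]

        def _play(self, tag: AVTag) -> None:
            assert isinstance(tag, TTSTag)
            # hand the stripped field text to the (invented) engine binary
            self._process = subprocess.Popen(["myengine", "--lang", tag.lang, tag.field_text])
            self._wait_for_termination()

With that in place, voice_for_tag() matches "MyEngine" at rank 0, so this player is preferred over one that only matched on language (rank -100).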
@@ -20,68 +28,82 @@ from __future__ import annotations
 import re
 import subprocess
 from dataclasses import dataclass
-from typing import List, Optional, cast
+from typing import List, Optional

 from anki.sound import AVTag, TTSTag
 from aqt.sound import SimpleProcessPlayer
 from aqt.taskman import TaskManager


 @dataclass
-class TTSArgs:
-    # requested language
+class TTSVoice:
+    name: str
     lang: str
-    # preferred voices, will use first available if possible
-    voices: List[str]

-    @classmethod
-    def from_string(cls, args: List[str]) -> TTSArgs:
-        voices: Optional[List[str]] = None

-        lang = args[0]
+@dataclass
+class TTSVoiceMatch:
+    voice: TTSVoice
+    rank: int

-        for arg in args[1:]:
-            try:
-                key, val = arg.split("=")
-            except ValueError:
-                continue
-            key = key.strip()
-            val = val.strip().replace("_", " ")
-
-            if key == "voices":
-                voices = val.split(",")
+class TTSPlayer:
+    default_rank = 0
+    _available_voices: Optional[List[TTSVoice]] = None

-        return TTSArgs(voices=voices or [], lang=lang)
+    def get_available_voices(self) -> List[TTSVoice]:
+        return []
+
+    def voices(self) -> List[TTSVoice]:
+        if self._available_voices is None:
+            self._available_voices = self.get_available_voices()
+        return self._available_voices
+
+    def voice_for_tag(self, tag: TTSTag) -> Optional[TTSVoiceMatch]:
+        avail_voices = self.voices()
+
+        rank = self.default_rank
+
+        # any requested voices match?
+        for requested_voice in tag.voices:
+            for avail in avail_voices:
+                if avail.name == requested_voice:
+                    return TTSVoiceMatch(voice=avail, rank=rank)
+
+            rank -= 1
+
+        # if no preferred voices match, we fall back on language
+        # with a rank of -100
+        for avail in avail_voices:
+            if avail.lang == tag.lang:
+                return TTSVoiceMatch(voice=avail, rank=-100)
+
+        return None
+
+
+class TTSProcessPlayer(SimpleProcessPlayer, TTSPlayer):
+    def rank_for_tag(self, tag: AVTag) -> Optional[int]:
+        if not isinstance(tag, TTSTag):
+            return None
+
+        match = self.voice_for_tag(tag)
+        if match:
+            return match.rank
+        else:
+            return None


 # Mac support
 ##########################################################################


-@dataclass
-class MacVoice:
-    name: str
-    lang: str
-
-
-VOICE_HELP_LINE_RE = re.compile(r"^(\S+)\s+(\S+)\s+.*$")
-
-
-def parse_voice_line(line: str) -> Optional[MacVoice]:
-    m = VOICE_HELP_LINE_RE.match(line)
-    if not m:
-        return None
-    return MacVoice(name=m.group(1), lang=m.group(2))
-
-
-class MacTTSPlayer(SimpleProcessPlayer):
-    def __init__(self, taskman: TaskManager):
-        super().__init__(taskman)
-        self._available_voices: Optional[List[MacVoice]] = None
+class MacTTSPlayer(TTSProcessPlayer):
+    VOICE_HELP_LINE_RE = re.compile(r"^(\S+)\s+(\S+)\s+.*$")

     def _play(self, tag: AVTag) -> None:
-        ttag = cast(TTSTag, tag)
-        voice = self.voice_for_tag(ttag)
+        assert isinstance(tag, TTSTag)
+        match = self.voice_for_tag(tag)
+        assert match
+        voice = match.voice

         self._process = subprocess.Popen(
             ["say", "-v", voice.name, "-f", "-"],
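To see the ranks above in action, a small sketch (DummyPlayer and its single voice are made up; the rank values follow voice_for_tag() as written):

    from typing import List

    from anki.sound import TTSTag
    from aqt.tts import TTSPlayer, TTSVoice


    class DummyPlayer(TTSPlayer):
        def get_available_voices(self) -> List[TTSVoice]:
            return [TTSVoice(name="Otoya", lang="ja_JP")]


    tag = TTSTag(field_text="...", lang="ja_JP", voices=["Kyoko", "Otoya"], other_args=[])
    match = DummyPlayer().voice_for_tag(tag)
    # "Kyoko" is not available, so the rank drops to -1 before "Otoya" matches
    assert match is not None and match.rank == -1

If no requested voice matches at all, the language fallback returns rank -100, and rank_for_tag() returns None when nothing matches, so AVPlayer can leave that player out of the ranking.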
@@ -90,45 +112,25 @@ class MacTTSPlayer(SimpleProcessPlayer):
             stderr=subprocess.DEVNULL,
         )
         # write the input text to stdin
-        self._process.stdin.write(ttag.text.encode("utf8"))
+        self._process.stdin.write(tag.field_text.encode("utf8"))
         self._process.stdin.close()

         self._wait_for_termination()

-    def rank_for_tag(self, tag: AVTag) -> Optional[int]:
-        if not isinstance(tag, TTSTag):
+    def get_available_voices(self) -> List[TTSVoice]:
+        cmd = subprocess.run(
+            ["say", "-v", "?"], capture_output=True, check=True, encoding="utf8"
+        )
+
+        voices = []
+        for line in cmd.stdout.splitlines():
+            voice = self._parse_voice_line(line)
+            if voice:
+                voices.append(voice)
+        return voices
+
+    def _parse_voice_line(self, line: str) -> Optional[TTSVoice]:
+        m = self.VOICE_HELP_LINE_RE.match(line)
+        if not m:
             return None
-
-        # todo
-        return 0
-
-    def voices(self) -> List[MacVoice]:
-        if not self._available_voices:
-            cmd = subprocess.run(
-                ["say", "-v", "?"], capture_output=True, check=True, encoding="utf8"
-            )
-            self._available_voices = []
-            for line in cmd.stdout.splitlines():
-                voice = parse_voice_line(line)
-                if voice:
-                    self._available_voices.append(voice)
-
-        return self._available_voices
-
-    def voice_for_tag(self, tag: TTSTag) -> MacVoice:
-        args = TTSArgs.from_string(tag.args)
-        voices = self.voices()
-
-        # any requested voices match?
-        for requested_voice in args.voices:
-            avail_voice = next((x for x in voices if x.name == requested_voice), None)
-            if avail_voice:
-                return avail_voice
-
-        # requested language match?
-        avail_voice = next((x for x in voices if x.lang == args.lang), None)
-        if avail_voice:
-            return avail_voice
-
-        # fall back on first voice
-        return voices[0]
+        return TTSVoice(name=m.group(1), lang=m.group(2))
@@ -188,10 +188,17 @@ impl Backend {
             AVTag::SoundOrVideo(file) => pt::AvTag {
                 value: Some(pt::av_tag::Value::SoundOrVideo(file.to_string())),
             },
-            AVTag::TextToSpeech { args, field_text } => pt::AvTag {
+            AVTag::TextToSpeech {
+                field_text,
+                lang,
+                voices,
+                other_args,
+            } => pt::AvTag {
                 value: Some(pt::av_tag::Value::Tts(pt::TtsTag {
-                    args: args.iter().map(|&s| s.to_string()).collect(),
-                    text: field_text.to_string(),
+                    field_text: field_text.to_string(),
+                    lang: lang.to_string(),
+                    voices: voices.into_iter().map(ToOwned::to_owned).collect(),
+                    other_args: other_args.into_iter().map(ToOwned::to_owned).collect(),
                 })),
             },
         })
@@ -12,8 +12,10 @@ use std::ptr;
 pub enum AVTag<'a> {
     SoundOrVideo(Cow<'a, str>),
     TextToSpeech {
-        args: Vec<&'a str>,
         field_text: Cow<'a, str>,
+        lang: &'a str,
+        voices: Vec<&'a str>,
+        other_args: Vec<&'a str>,
     },
 }

@@ -83,14 +85,36 @@ pub fn av_tags_in_string(text: &str) -> impl Iterator<Item = AVTag> {
         } else {
             let args = caps.get(2).unwrap();
             let field_text = caps.get(3).unwrap();
-            AVTag::TextToSpeech {
-                args: args.as_str().split(' ').collect(),
-                field_text: strip_html_for_tts(field_text.as_str()),
-            }
+            tts_tag_from_string(field_text.as_str(), args.as_str())
         }
     })
 }

+fn tts_tag_from_string<'a>(field_text: &'a str, args: &'a str) -> AVTag<'a> {
+    let mut other_args = vec![];
+    let mut split_args = args.split(' ');
+    let lang = split_args.next().unwrap_or("");
+    let mut voices = None;
+
+    for remaining_arg in split_args {
+        if remaining_arg.starts_with("voices=") {
+            voices = remaining_arg
+                .split('=')
+                .nth(1)
+                .map(|voices| voices.split(',').collect());
+        } else {
+            other_args.push(remaining_arg);
+        }
+    }
+
+    AVTag::TextToSpeech {
+        field_text: strip_html_for_tts(field_text),
+        lang,
+        voices: voices.unwrap_or_else(Vec::new),
+        other_args,
+    }
+}
+
 pub fn strip_html_preserving_image_filenames(html: &str) -> Cow<str> {
     let without_fnames = IMG_TAG.replace_all(html, r" $1 ");
     let without_html = HTML.replace_all(&without_fnames, "");
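For readers following along from the Python side, the same splitting rules as tts_tag_from_string() above, restated as a small illustrative helper (parse_tts_args is not part of this commit):

    def parse_tts_args(args: str) -> dict:
        # first space-separated token is the language; "voices=a,b" becomes a
        # list; any other token is passed through untouched
        lang, *rest = args.split(" ")
        voices: list = []
        other_args = []
        for arg in rest:
            if arg.startswith("voices="):
                voices = arg.split("=")[1].split(",")
            else:
                other_args.append(arg)
        return {"lang": lang, "voices": voices, "other_args": other_args}

    # matches the args string used in the test below
    assert parse_tts_args("en_US voices=Bob,Jane") == {
        "lang": "en_US",
        "voices": ["Bob", "Jane"],
        "other_args": [],
    }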
@@ -153,15 +177,18 @@ mod test {

     #[test]
     fn test_audio() {
-        let s = "abc[sound:fo&o.mp3]def[anki:tts][lang=en_US voices=Bob,Jane]foo<br>1>2[/anki:tts]gh";
+        let s =
+            "abc[sound:fo&o.mp3]def[anki:tts][en_US voices=Bob,Jane]foo<br>1>2[/anki:tts]gh";
         assert_eq!(strip_av_tags(s), "abcdefgh");
         assert_eq!(
             av_tags_in_string(s).collect::<Vec<_>>(),
             vec![
                 AVTag::SoundOrVideo("fo&o.mp3".into()),
                 AVTag::TextToSpeech {
-                    args: vec!["lang=en_US", "voices=Bob,Jane"],
-                    field_text: "foo 1>2".into()
+                    field_text: "foo 1>2".into(),
+                    lang: "en_US",
+                    voices: vec!["Bob", "Jane"],
+                    other_args: vec![]
                 },
             ]
         );