add rank handling to TTS; parse TTS args in get_av_tags()

2025-12-20 10:22:57 -05:00 · 2020-01-21 12:41:37 +10:00 · 2020-01-21 12:41:37 +10:00 · c713683f63
commit c713683f63
parent 66e277e44b
7 changed files with 145 additions and 100 deletions
--- a/proto/backend.proto
+++ b/proto/backend.proto
@ -159,6 +159,8 @@ message AVTag {
 }
 message TTSTag {
-    repeated string args = 1;
+    string field_text = 1;
-    string text = 2;
+    string lang = 2;
    repeated string voices = 3;
    repeated string other_args = 4;
 }
--- a/pylib/anki/rsbackend.py
+++ b/pylib/anki/rsbackend.py
@ -51,7 +51,12 @@ def av_tag_to_native(tag: pb.AVTag) -> AVTag:
    if val == "sound_or_video":
        return SoundOrVideoTag(filename=tag.sound_or_video)
    else:
-        return TTSTag(args=list(tag.tts.args), text=tag.tts.text)
+        return TTSTag(
            field_text=tag.tts.field_text,
            lang=tag.tts.lang,
            voices=list(tag.tts.voices),
            other_args=list(tag.tts.other_args),
        )
@dataclass
--- a/pylib/anki/sound.py
+++ b/pylib/anki/sound.py
@ -21,8 +21,11 @@ class TTSTag:
    See tts.py for more information.
    """
-    args: List[str]
+    field_text: str
-    text: str
+    lang: str
    voices: List[str]
    # each arg should be in the form 'foo=bar'
    other_args: List[str]
@dataclass
--- a/qt/aqt/sound.py
+++ b/qt/aqt/sound.py
@ -166,7 +166,6 @@ class AVPlayer:
        ranked.sort()
        print(ranked)
        if ranked:
            return ranked[-1][1]
        else:
--- a/qt/aqt/tts.py
+++ b/qt/aqt/tts.py
@ -9,10 +9,18 @@ or
 {{tts ja_JP voices=Kyoko,Otoya,Another_name:Field}}
-The first argument must be a language code. If provided,
+The first argument must be a language code.
-voices is a comma-separated list of one or more voices that
+
-the user would prefer. Spaces must not be included.
+If provided, voices is a comma-separated list of one or more voices that
-Underscores will be converted to spaces.
+the user would prefer. Spaces must not be included. Underscores will be
 converted to spaces.
 AVPlayer decides which TTSPlayer to use based on the returned rank.
 In the default implementation, the TTS player is chosen based on the order
 of voices the user has specified. When adding new TTS players, your code
 can either expose the underlying names the TTS engine provides, or simply
 expose the name of the engine, which would mean the user could write
 {{tts en_AU voices=MyEngine}} to prioritize your engine.
 """
 from __future__ import annotations
@ -20,68 +28,82 @@ from __future__ import annotations
 import re
 import subprocess
 from dataclasses import dataclass
-from typing import List, Optional, cast
+from typing import List, Optional
 from anki.sound import AVTag, TTSTag
 from aqt.sound import SimpleProcessPlayer
 from aqt.taskman import TaskManager
@dataclass
-class TTSArgs:
+class TTSVoice:
-    # requested language
+    name: str
    lang: str
    # preferred voices, will use first available if possible
    voices: List[str]
    @classmethod
    def from_string(cls, args: List[str]) -> TTSArgs:
        voices: Optional[List[str]] = None
-        lang = args[0]
+@dataclass
 class TTSVoiceMatch:
    voice: TTSVoice
    rank: int
        for arg in args[1:]:
            try:
                key, val = arg.split("=")
            except ValueError:
                continue
            key = key.strip()
            val = val.strip().replace("_", " ")
-            if key == "voices":
+class TTSPlayer:
-                voices = val.split(",")
+    default_rank = 0
    _available_voices: Optional[List[TTSVoice]] = None
-        return TTSArgs(voices=voices or [], lang=lang)
+    def get_available_voices(self) -> List[TTSVoice]:
        return []
    def voices(self) -> List[TTSVoice]:
        """Return the voices this player offers, querying them only once.

        The first call delegates to get_available_voices() and stores the
        result on the instance; later calls return the stored list. An empty
        list counts as a valid cached result and is not re-queried.
        """
        cached = self._available_voices
        if cached is None:
            cached = self.get_available_voices()
            self._available_voices = cached
        return cached
    def voice_for_tag(self, tag: TTSTag) -> Optional[TTSVoiceMatch]:
        """Choose the best available voice for a TTS tag.

        Requested voices are tried in the order the tag lists them. The
        first one that is available wins, ranked at default_rank minus its
        position in the list, so earlier preferences rank higher.

        If none of the requested voices is available, the first available
        voice whose language equals the tag's language is returned with a
        rank of -100. Returns None when nothing matches at all.
        """
        candidates = self.voices()

        # preferred voices, in the user's order
        for position, wanted in enumerate(tag.voices):
            hit = next((v for v in candidates if v.name == wanted), None)
            if hit is not None:
                return TTSVoiceMatch(voice=hit, rank=self.default_rank - position)

        # no preferred voice available: fall back on the language
        same_lang = next((v for v in candidates if v.lang == tag.lang), None)
        if same_lang is None:
            return None
        return TTSVoiceMatch(voice=same_lang, rank=-100)
 class TTSProcessPlayer(SimpleProcessPlayer, TTSPlayer):
    """A TTS player that combines SimpleProcessPlayer with TTSPlayer's
    voice matching."""

    def rank_for_tag(self, tag: AVTag) -> Optional[int]:
        """Return the rank of the voice matched for `tag`.

        Returns None for tags that are not TTS tags, and for TTS tags that
        no available voice matches.
        """
        if isinstance(tag, TTSTag):
            match = self.voice_for_tag(tag)
            return match.rank if match else None
        return None
 # Mac support
 ##########################################################################
-@dataclass
+class MacTTSPlayer(TTSProcessPlayer):
-class MacVoice:
+    VOICE_HELP_LINE_RE = re.compile(r"^(\S+)\s+(\S+)\s+.*$")
    name: str
    lang: str
 VOICE_HELP_LINE_RE = re.compile(r"^(\S+)\s+(\S+)\s+.*$")
 def parse_voice_line(line: str) -> Optional[MacVoice]:
    m = VOICE_HELP_LINE_RE.match(line)
    if not m:
        return None
    return MacVoice(name=m.group(1), lang=m.group(2))
 class MacTTSPlayer(SimpleProcessPlayer):
    def __init__(self, taskman: TaskManager):
        super().__init__(taskman)
        self._available_voices: Optional[List[MacVoice]] = None
    def _play(self, tag: AVTag) -> None:
-        ttag = cast(TTSTag, tag)
+        assert isinstance(tag, TTSTag)
-        voice = self.voice_for_tag(ttag)
+        match = self.voice_for_tag(tag)
        assert match
        voice = match.voice
        self._process = subprocess.Popen(
            ["say", "-v", voice.name, "-f", "-"],
@ -90,45 +112,25 @@ class MacTTSPlayer(SimpleProcessPlayer):
            stderr=subprocess.DEVNULL,
        )
        # write the input text to stdin
-        self._process.stdin.write(ttag.text.encode("utf8"))
+        self._process.stdin.write(tag.field_text.encode("utf8"))
        self._process.stdin.close()
        self._wait_for_termination()
-    def rank_for_tag(self, tag: AVTag) -> Optional[int]:
+    def get_available_voices(self) -> List[TTSVoice]:
-        if not isinstance(tag, TTSTag):
+        cmd = subprocess.run(
            ["say", "-v", "?"], capture_output=True, check=True, encoding="utf8"
        )
        voices = []
        for line in cmd.stdout.splitlines():
            voice = self._parse_voice_line(line)
            if voice:
                voices.append(voice)
        return voices
    def _parse_voice_line(self, line: str) -> Optional[TTSVoice]:
        m = self.VOICE_HELP_LINE_RE.match(line)
        if not m:
            return None
-
+        return TTSVoice(name=m.group(1), lang=m.group(2))
        # todo
        return 0
    def voices(self) -> List[MacVoice]:
        if not self._available_voices:
            cmd = subprocess.run(
                ["say", "-v", "?"], capture_output=True, check=True, encoding="utf8"
            )
            self._available_voices = []
            for line in cmd.stdout.splitlines():
                voice = parse_voice_line(line)
                if voice:
                    self._available_voices.append(voice)
        return self._available_voices
    def voice_for_tag(self, tag: TTSTag) -> MacVoice:
        args = TTSArgs.from_string(tag.args)
        voices = self.voices()
        # any requested voices match?
        for requested_voice in args.voices:
            avail_voice = next((x for x in voices if x.name == requested_voice), None)
            if avail_voice:
                return avail_voice
        # requested language match?
        avail_voice = next((x for x in voices if x.lang == args.lang), None)
        if avail_voice:
            return avail_voice
        # fall back on first voice
        return voices[0]
--- a/rslib/src/backend.rs
+++ b/rslib/src/backend.rs
@ -188,10 +188,17 @@ impl Backend {
                AVTag::SoundOrVideo(file) => pt::AvTag {
                    value: Some(pt::av_tag::Value::SoundOrVideo(file.to_string())),
                },
-                AVTag::TextToSpeech { args, field_text } => pt::AvTag {
+                AVTag::TextToSpeech {
                    field_text,
                    lang,
                    voices,
                    other_args,
                } => pt::AvTag {
                    value: Some(pt::av_tag::Value::Tts(pt::TtsTag {
-                        args: args.iter().map(|&s| s.to_string()).collect(),
+                        field_text: field_text.to_string(),
-                        text: field_text.to_string(),
+                        lang: lang.to_string(),
                        voices: voices.into_iter().map(ToOwned::to_owned).collect(),
                        other_args: other_args.into_iter().map(ToOwned::to_owned).collect(),
                    })),
                },
            })
--- a/rslib/src/text.rs
+++ b/rslib/src/text.rs
@ -12,8 +12,10 @@ use std::ptr;
 /// An audio/video reference found in card text.
 pub enum AVTag<'a> {
    /// The filename of a sound or video file.
    SoundOrVideo(Cow<'a, str>),
    /// A request to speak text aloud.
    ///
    /// The raw argument list is no longer stored: it is split into `lang`,
    /// `voices` and `other_args` when the tag is parsed.
    TextToSpeech {
        /// The text to speak, after HTML has been stripped for TTS.
        field_text: Cow<'a, str>,
        /// The first space-separated argument of the tag, e.g. `en_US`.
        lang: &'a str,
        /// The comma-separated values of the `voices=` argument, in the
        /// order given; empty when no `voices=` argument was present.
        voices: Vec<&'a str>,
        /// Every remaining argument, unparsed.
        other_args: Vec<&'a str>,
    },
 }
@ -83,14 +85,36 @@ pub fn av_tags_in_string(text: &str) -> impl Iterator<Item = AVTag> {
        } else {
            let args = caps.get(2).unwrap();
            let field_text = caps.get(3).unwrap();
-            AVTag::TextToSpeech {
+            tts_tag_from_string(field_text.as_str(), args.as_str())
                args: args.as_str().split(' ').collect(),
                field_text: strip_html_for_tts(field_text.as_str()),
            }
        }
    })
 }
 /// Build a `TextToSpeech` tag from the raw field text and the tag's
 /// space-separated argument string.
 ///
 /// The first argument is taken as the language. An argument starting with
 /// `voices=` supplies a comma-separated list of voices; text after a second
 /// `=` in that argument is ignored, and a later `voices=` argument replaces
 /// an earlier one. All other arguments are passed through untouched in
 /// `other_args`. The field text is run through `strip_html_for_tts`.
 fn tts_tag_from_string<'a>(field_text: &'a str, args: &'a str) -> AVTag<'a> {
    let mut tokens = args.split(' ');
    // the language code is always the first token
    let lang = tokens.next().unwrap_or("");

    let mut voices: Option<Vec<&'a str>> = None;
    let mut other_args = Vec::new();
    for token in tokens {
        if !token.starts_with("voices=") {
            other_args.push(token);
            continue;
        }
        voices = token
            .split('=')
            .nth(1)
            .map(|names| names.split(',').collect());
    }

    AVTag::TextToSpeech {
        field_text: strip_html_for_tts(field_text),
        lang,
        voices: voices.unwrap_or_default(),
        other_args,
    }
 }
 pub fn strip_html_preserving_image_filenames(html: &str) -> Cow<str> {
    let without_fnames = IMG_TAG.replace_all(html, r" $1 ");
    let without_html = HTML.replace_all(&without_fnames, "");
@ -153,15 +177,18 @@ mod test {
    #[test]
    fn test_audio() {
-        let s = "abc[sound:fo&amp;o.mp3]def[anki:tts][lang=en_US voices=Bob,Jane]foo<br>1&gt;2[/anki:tts]gh";
+        let s =
            "abc[sound:fo&amp;o.mp3]def[anki:tts][en_US voices=Bob,Jane]foo<br>1&gt;2[/anki:tts]gh";
        assert_eq!(strip_av_tags(s), "abcdefgh");
        assert_eq!(
            av_tags_in_string(s).collect::<Vec<_>>(),
            vec![
                AVTag::SoundOrVideo("fo&o.mp3".into()),
                AVTag::TextToSpeech {
-                    args: vec!["lang=en_US", "voices=Bob,Jane"],
+                    field_text: "foo 1>2".into(),
-                    field_text: "foo 1>2".into()
+                    lang: "en_US",
                    voices: vec!["Bob", "Jane"],
                    other_args: vec![]
                },
            ]
        );