From c713683f631252c0d5d04fbef36ceb21c6eaaf57 Mon Sep 17 00:00:00 2001
From: Damien Elmes
Date: Tue, 21 Jan 2020 12:41:37 +1000
Subject: [PATCH] add rank handling to TTS; parse TTS args in get_av_tags()

---
 proto/backend.proto     |   6 +-
 pylib/anki/rsbackend.py |   7 +-
 pylib/anki/sound.py     |   7 +-
 qt/aqt/sound.py         |   1 -
 qt/aqt/tts.py           | 168 ++++++++++++++++++++--------------------
 rslib/src/backend.rs    |  13 +++-
 rslib/src/text.rs       |  43 ++++++++--
 7 files changed, 145 insertions(+), 100 deletions(-)

diff --git a/proto/backend.proto b/proto/backend.proto
index 1c8b0a1cb..b5ac530dd 100644
--- a/proto/backend.proto
+++ b/proto/backend.proto
@@ -159,6 +159,8 @@ message AVTag {
 }
 
 message TTSTag {
-    repeated string args = 1;
-    string text = 2;
+    string field_text = 1;
+    string lang = 2;
+    repeated string voices = 3;
+    repeated string other_args = 4;
 }
diff --git a/pylib/anki/rsbackend.py b/pylib/anki/rsbackend.py
index e342e1fc9..1d687a6bb 100644
--- a/pylib/anki/rsbackend.py
+++ b/pylib/anki/rsbackend.py
@@ -51,7 +51,12 @@ def av_tag_to_native(tag: pb.AVTag) -> AVTag:
     if val == "sound_or_video":
         return SoundOrVideoTag(filename=tag.sound_or_video)
     else:
-        return TTSTag(args=list(tag.tts.args), text=tag.tts.text)
+        return TTSTag(
+            field_text=tag.tts.field_text,
+            lang=tag.tts.lang,
+            voices=list(tag.tts.voices),
+            other_args=list(tag.tts.other_args),
+        )
 
 
 @dataclass
diff --git a/pylib/anki/sound.py b/pylib/anki/sound.py
index a5092333d..1671dc6ba 100644
--- a/pylib/anki/sound.py
+++ b/pylib/anki/sound.py
@@ -21,8 +21,11 @@ class TTSTag:
     See tts.py for more information.
     """
 
-    args: List[str]
-    text: str
+    field_text: str
+    lang: str
+    voices: List[str]
+    # each arg should be in the form 'foo=bar'
+    other_args: List[str]
 
 
 @dataclass
diff --git a/qt/aqt/sound.py b/qt/aqt/sound.py
index 20a5dc371..a987efd22 100644
--- a/qt/aqt/sound.py
+++ b/qt/aqt/sound.py
@@ -166,7 +166,6 @@ class AVPlayer:
 
         ranked.sort()
-        print(ranked)
 
         if ranked:
             return ranked[-1][1]
         else:
diff --git a/qt/aqt/tts.py b/qt/aqt/tts.py
index 1ccb8813b..49bc38064 100644
--- a/qt/aqt/tts.py
+++ b/qt/aqt/tts.py
@@ -9,10 +9,18 @@ or
 
 {{tts ja_JP voices=Kyoko,Otoya,Another_name:Field}}
 
-The first argument must be a language code. If provided,
-voices is a comma-separated list of one or more voices that
-the user would prefer. Spaces must not be included.
-Underscores will be converted to spaces.
+The first argument must be a language code.
+
+If provided, voices is a comma-separated list of one or more voices that
+the user would prefer. Spaces must not be included. Underscores will be
+converted to spaces.
+
+AVPlayer decides which TTSPlayer to use based on the returned rank.
+In the default implementation, the TTS player is chosen based on the order
+of voices the user has specified. When adding new TTS players, your code
+can either expose the underlying names the TTS engine provides, or simply
+expose the name of the engine, which would mean the user could write
+{{tts en_AU voices=MyEngine}} to prioritize your engine.
 """
 
 from __future__ import annotations
@@ -20,68 +28,82 @@ from __future__ import annotations
 import re
 import subprocess
 from dataclasses import dataclass
-from typing import List, Optional, cast
+from typing import List, Optional
 
 from anki.sound import AVTag, TTSTag
 from aqt.sound import SimpleProcessPlayer
-from aqt.taskman import TaskManager
 
 
 @dataclass
-class TTSArgs:
-    # requested language
+class TTSVoice:
+    name: str
     lang: str
-    # preferred voices, will use first available if possible
-    voices: List[str]
 
-    @classmethod
-    def from_string(cls, args: List[str]) -> TTSArgs:
-        voices: Optional[List[str]] = None
-        lang = args[0]
+@dataclass
+class TTSVoiceMatch:
+    voice: TTSVoice
+    rank: int
 
-        for arg in args[1:]:
-            try:
-                key, val = arg.split("=")
-            except ValueError:
-                continue
-            key = key.strip()
-            val = val.strip().replace("_", " ")
-            if key == "voices":
-                voices = val.split(",")
+class TTSPlayer:
+    default_rank = 0
+    _available_voices: Optional[List[TTSVoice]] = None
 
-        return TTSArgs(voices=voices or [], lang=lang)
+    def get_available_voices(self) -> List[TTSVoice]:
+        return []
+
+    def voices(self) -> List[TTSVoice]:
+        if self._available_voices is None:
+            self._available_voices = self.get_available_voices()
+        return self._available_voices
+
+    def voice_for_tag(self, tag: TTSTag) -> Optional[TTSVoiceMatch]:
+        avail_voices = self.voices()
+
+        rank = self.default_rank
+
+        # any requested voices match?
+        for requested_voice in tag.voices:
+            for avail in avail_voices:
+                if avail.name == requested_voice:
+                    return TTSVoiceMatch(voice=avail, rank=rank)
+
+            rank -= 1
+
+        # if no preferred voices match, we fall back on language
+        # with a rank of -100
+        for avail in avail_voices:
+            if avail.lang == tag.lang:
+                return TTSVoiceMatch(voice=avail, rank=-100)
+
+        return None
+
+
+class TTSProcessPlayer(SimpleProcessPlayer, TTSPlayer):
+    def rank_for_tag(self, tag: AVTag) -> Optional[int]:
+        if not isinstance(tag, TTSTag):
+            return None
+
+        match = self.voice_for_tag(tag)
+        if match:
+            return match.rank
+        else:
+            return None
 
 
 # Mac support
 ##########################################################################
 
 
-@dataclass
-class MacVoice:
-    name: str
-    lang: str
-
-
-VOICE_HELP_LINE_RE = re.compile(r"^(\S+)\s+(\S+)\s+.*$")
-
-
-def parse_voice_line(line: str) -> Optional[MacVoice]:
-    m = VOICE_HELP_LINE_RE.match(line)
-    if not m:
-        return None
-    return MacVoice(name=m.group(1), lang=m.group(2))
-
-
-class MacTTSPlayer(SimpleProcessPlayer):
-    def __init__(self, taskman: TaskManager):
-        super().__init__(taskman)
-        self._available_voices: Optional[List[MacVoice]] = None
+class MacTTSPlayer(TTSProcessPlayer):
+    VOICE_HELP_LINE_RE = re.compile(r"^(\S+)\s+(\S+)\s+.*$")
 
     def _play(self, tag: AVTag) -> None:
-        ttag = cast(TTSTag, tag)
-        voice = self.voice_for_tag(ttag)
+        assert isinstance(tag, TTSTag)
+        match = self.voice_for_tag(tag)
+        assert match
+        voice = match.voice
 
         self._process = subprocess.Popen(
             ["say", "-v", voice.name, "-f", "-"],
             stdin=subprocess.PIPE,
             stdout=subprocess.DEVNULL,
             stderr=subprocess.DEVNULL,
         )
         # write the input text to stdin
-        self._process.stdin.write(ttag.text.encode("utf8"))
+        self._process.stdin.write(tag.field_text.encode("utf8"))
         self._process.stdin.close()
         self._wait_for_termination()
 
-    def rank_for_tag(self, tag: AVTag) -> Optional[int]:
-        if not isinstance(tag, TTSTag):
+    def get_available_voices(self) -> List[TTSVoice]:
+        cmd = subprocess.run(
+            ["say", "-v", "?"], capture_output=True, check=True, encoding="utf8"
+        )
+
+        voices = []
+        for line in cmd.stdout.splitlines():
+            voice = self._parse_voice_line(line)
+            if voice:
+                voices.append(voice)
+        return voices
+
+    def _parse_voice_line(self, line: str) -> Optional[TTSVoice]:
+        m = self.VOICE_HELP_LINE_RE.match(line)
+        if not m:
             return None
-
-        # todo
-        return 0
-
-    def voices(self) -> List[MacVoice]:
-        if not self._available_voices:
-            cmd = subprocess.run(
-                ["say", "-v", "?"], capture_output=True, check=True, encoding="utf8"
-            )
-            self._available_voices = []
-            for line in cmd.stdout.splitlines():
-                voice = parse_voice_line(line)
-                if voice:
-                    self._available_voices.append(voice)
-
-        return self._available_voices
-
-    def voice_for_tag(self, tag: TTSTag) -> MacVoice:
-        args = TTSArgs.from_string(tag.args)
-        voices = self.voices()
-
-        # any requested voices match?
-        for requested_voice in args.voices:
-            avail_voice = next((x for x in voices if x.name == requested_voice), None)
-            if avail_voice:
-                return avail_voice
-
-        # requested language match?
-        avail_voice = next((x for x in voices if x.lang == args.lang), None)
-        if avail_voice:
-            return avail_voice
-
-        # fall back on first voice
-        return voices[0]
+        return TTSVoice(name=m.group(1), lang=m.group(2))
diff --git a/rslib/src/backend.rs b/rslib/src/backend.rs
index 470ca4316..18b475c48 100644
--- a/rslib/src/backend.rs
+++ b/rslib/src/backend.rs
@@ -188,10 +188,17 @@ impl Backend {
                 AVTag::SoundOrVideo(file) => pt::AvTag {
                     value: Some(pt::av_tag::Value::SoundOrVideo(file.to_string())),
                 },
-                AVTag::TextToSpeech { args, field_text } => pt::AvTag {
+                AVTag::TextToSpeech {
+                    field_text,
+                    lang,
+                    voices,
+                    other_args,
+                } => pt::AvTag {
                     value: Some(pt::av_tag::Value::Tts(pt::TtsTag {
-                        args: args.iter().map(|&s| s.to_string()).collect(),
-                        text: field_text.to_string(),
+                        field_text: field_text.to_string(),
+                        lang: lang.to_string(),
+                        voices: voices.into_iter().map(ToOwned::to_owned).collect(),
+                        other_args: other_args.into_iter().map(ToOwned::to_owned).collect(),
                     })),
                 },
             })
diff --git a/rslib/src/text.rs b/rslib/src/text.rs
index f07fe1f48..b3743d8d2 100644
--- a/rslib/src/text.rs
+++ b/rslib/src/text.rs
@@ -12,8 +12,10 @@ use std::ptr;
 pub enum AVTag<'a> {
     SoundOrVideo(Cow<'a, str>),
     TextToSpeech {
-        args: Vec<&'a str>,
         field_text: Cow<'a, str>,
+        lang: &'a str,
+        voices: Vec<&'a str>,
+        other_args: Vec<&'a str>,
     },
 }
 
@@ -83,14 +85,36 @@ pub fn av_tags_in_string(text: &str) -> impl Iterator<Item = AVTag> {
         } else {
             let args = caps.get(2).unwrap();
             let field_text = caps.get(3).unwrap();
-            AVTag::TextToSpeech {
-                args: args.as_str().split(' ').collect(),
-                field_text: strip_html_for_tts(field_text.as_str()),
-            }
+            tts_tag_from_string(field_text.as_str(), args.as_str())
         }
     })
 }
 
+fn tts_tag_from_string<'a>(field_text: &'a str, args: &'a str) -> AVTag<'a> {
+    let mut other_args = vec![];
+    let mut split_args = args.split(' ');
+    let lang = split_args.next().unwrap_or("");
+    let mut voices = None;
+
+    for remaining_arg in split_args {
+        if remaining_arg.starts_with("voices=") {
+            voices = remaining_arg
+                .split('=')
+                .nth(1)
+                .map(|voices| voices.split(',').collect());
+        } else {
+            other_args.push(remaining_arg);
+        }
+    }
+
+    AVTag::TextToSpeech {
+        field_text: strip_html_for_tts(field_text),
+        lang,
+        voices: voices.unwrap_or_else(Vec::new),
+        other_args,
+    }
+}
+
 pub fn strip_html_preserving_image_filenames(html: &str) -> Cow<str> {
     let without_fnames = IMG_TAG.replace_all(html, r" $1 ");
     let without_html = HTML.replace_all(&without_fnames, "");
@@ -153,15 +177,18 @@ mod test {
 
     #[test]
     fn test_audio() {
-        let s = "abc[sound:fo&o.mp3]def[anki:tts][lang=en_US voices=Bob,Jane]foo<br>1>2[/anki:tts]gh";
+        let s =
+            "abc[sound:fo&o.mp3]def[anki:tts][en_US voices=Bob,Jane]foo<br>1>2[/anki:tts]gh";
         assert_eq!(strip_av_tags(s), "abcdefgh");
         assert_eq!(
            av_tags_in_string(s).collect::<Vec<_>>(),
            vec![
                AVTag::SoundOrVideo("fo&o.mp3".into()),
                AVTag::TextToSpeech {
-                    args: vec!["lang=en_US", "voices=Bob,Jane"],
-                    field_text: "foo 1>2".into()
+                    field_text: "foo 1>2".into(),
+                    lang: "en_US",
+                    voices: vec!["Bob", "Jane"],
+                    other_args: vec![]
                },
            ]
        );
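
For reference, here is how the argument layout handled by tts_tag_from_string() in rslib/src/text.rs maps onto the new TTSTag fields. This is a rough Python rendering for illustration only: parse_tts_args() and the sample "speed=1.5" argument are invented here, the real parsing happens in Rust, and the HTML stripping of the field text is omitted.

from typing import List, Tuple


def parse_tts_args(args: str) -> Tuple[str, List[str], List[str]]:
    """Split 'lang voices=a,b x=y ...' the way tts_tag_from_string() does."""
    parts = args.split(" ")
    lang = parts[0] if parts else ""
    voices: List[str] = []
    other_args: List[str] = []
    for part in parts[1:]:
        if part.startswith("voices="):
            # mirror of Rust's split('=').nth(1) followed by split(',')
            voices = part.split("=")[1].split(",")
        else:
            other_args.append(part)
    return lang, voices, other_args


# {{tts en_US voices=Bob,Jane speed=1.5:Field}} carries the args "en_US voices=Bob,Jane speed=1.5"
assert parse_tts_args("en_US voices=Bob,Jane speed=1.5") == (
    "en_US",
    ["Bob", "Jane"],
    ["speed=1.5"],
)

Anything that is not the leading language code or the voices= list ends up in other_args, which matches the "each arg should be in the form 'foo=bar'" comment added to pylib/anki/sound.py.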
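
The ranking scheme is easiest to check with concrete numbers. The sketch below is not part of the patch: it re-implements the logic of TTSPlayer.voice_for_tag() with stand-in classes so it runs outside Anki (Voice, Tag, rank_for_tag and the mac_voices list are invented for illustration; the real types are TTSVoice in qt/aqt/tts.py and TTSTag in pylib/anki/sound.py).

from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class Voice:  # stand-in for aqt.tts.TTSVoice
    name: str
    lang: str


@dataclass
class Tag:  # stand-in for anki.sound.TTSTag
    field_text: str
    lang: str
    voices: List[str] = field(default_factory=list)
    other_args: List[str] = field(default_factory=list)


def rank_for_tag(tag: Tag, available: List[Voice]) -> Optional[int]:
    """Mirror of TTSPlayer.voice_for_tag(): earlier preferred voices rank higher."""
    rank = 0  # default_rank
    for requested in tag.voices:
        for avail in available:
            if avail.name == requested:
                return rank  # first preference -> 0, second -> -1, ...
        rank -= 1
    # no preferred voice available: fall back on a language match at -100
    for avail in available:
        if avail.lang == tag.lang:
            return -100
    return None  # this player cannot service the tag at all


mac_voices = [Voice("Kyoko", "ja_JP"), Voice("Otoya", "ja_JP"), Voice("Alex", "en_US")]

# {{tts ja_JP voices=Kyoko,Otoya:Field}} -> first preference available, rank 0
assert rank_for_tag(Tag("text", "ja_JP", ["Kyoko", "Otoya"]), mac_voices) == 0
# first preference missing, second available -> rank -1
assert rank_for_tag(Tag("text", "ja_JP", ["Another_name", "Otoya"]), mac_voices) == -1
# no preferred voice, but the language matches -> rank -100
assert rank_for_tag(Tag("text", "en_US", ["MissingVoice"]), mac_voices) == -100
# nothing matches at all -> None, so this player is skipped
assert rank_for_tag(Tag("text", "de_DE", []), mac_voices) is None

The AVPlayer code touched in qt/aqt/sound.py sorts the collected (rank, player) pairs and takes the last one, so the player reporting the highest rank wins: 0 beats -1, which beats the -100 language-only fallback.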
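
The new module docstring suggests that a third-party player can simply expose the name of its engine as a voice. Below is a hypothetical sketch of that idea built only on the TTSProcessPlayer/TTSVoice/TTSTag API introduced by this patch; it assumes an Anki environment where aqt is importable, and MyEngine, MyEngineTTSPlayer and speak_with_my_engine() are invented names, not part of Anki.

from typing import List

from anki.sound import AVTag, TTSTag
from aqt.tts import TTSProcessPlayer, TTSVoice


def speak_with_my_engine(text: str, lang: str) -> None:
    # placeholder for whatever the external engine actually does
    print(f"[{lang}] {text}")


class MyEngineTTSPlayer(TTSProcessPlayer):
    # advertise one pseudo-voice per supported language, named after the engine
    def get_available_voices(self) -> List[TTSVoice]:
        return [
            TTSVoice(name="MyEngine", lang="en_AU"),
            TTSVoice(name="MyEngine", lang="en_US"),
        ]

    def _play(self, tag: AVTag) -> None:
        assert isinstance(tag, TTSTag)
        speak_with_my_engine(tag.field_text, tag.lang)

Because get_available_voices() reports the voice name MyEngine, a template like {{tts en_AU voices=MyEngine}} makes the inherited rank_for_tag() return 0 for this player, while players that only match on language return -100, so AVPlayer prefers it. Registering the player with the AVPlayer instance in aqt.sound is left out here, since that detail is outside this patch.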