From c713683f631252c0d5d04fbef36ceb21c6eaaf57 Mon Sep 17 00:00:00 2001
From: Damien Elmes
Date: Tue, 21 Jan 2020 12:41:37 +1000
Subject: [PATCH] add rank handling to TTS; parse TTS args in get_av_tags()

---
 proto/backend.proto     |   6 +-
 pylib/anki/rsbackend.py |   7 +-
 pylib/anki/sound.py     |   7 +-
 qt/aqt/sound.py         |   1 -
 qt/aqt/tts.py           | 168 ++++++++++++++++++++--------------------
 rslib/src/backend.rs    |  13 +++-
 rslib/src/text.rs       |  43 ++++++++--
 7 files changed, 145 insertions(+), 100 deletions(-)

diff --git a/proto/backend.proto b/proto/backend.proto
index 1c8b0a1cb..b5ac530dd 100644
--- a/proto/backend.proto
+++ b/proto/backend.proto
@@ -159,6 +159,8 @@ message AVTag {
 }
 
 message TTSTag {
-    repeated string args = 1;
-    string text = 2;
+    string field_text = 1;
+    string lang = 2;
+    repeated string voices = 3;
+    repeated string other_args = 4;
 }
diff --git a/pylib/anki/rsbackend.py b/pylib/anki/rsbackend.py
index e342e1fc9..1d687a6bb 100644
--- a/pylib/anki/rsbackend.py
+++ b/pylib/anki/rsbackend.py
@@ -51,7 +51,12 @@ def av_tag_to_native(tag: pb.AVTag) -> AVTag:
     if val == "sound_or_video":
         return SoundOrVideoTag(filename=tag.sound_or_video)
     else:
-        return TTSTag(args=list(tag.tts.args), text=tag.tts.text)
+        return TTSTag(
+            field_text=tag.tts.field_text,
+            lang=tag.tts.lang,
+            voices=list(tag.tts.voices),
+            other_args=list(tag.tts.other_args),
+        )
 
 
 @dataclass
diff --git a/pylib/anki/sound.py b/pylib/anki/sound.py
index a5092333d..1671dc6ba 100644
--- a/pylib/anki/sound.py
+++ b/pylib/anki/sound.py
@@ -21,8 +21,11 @@ class TTSTag:
     See tts.py for more information.
     """
 
-    args: List[str]
-    text: str
+    field_text: str
+    lang: str
+    voices: List[str]
+    # each arg should be in the form 'foo=bar'
+    other_args: List[str]
 
 
 @dataclass
diff --git a/qt/aqt/sound.py b/qt/aqt/sound.py
index 20a5dc371..a987efd22 100644
--- a/qt/aqt/sound.py
+++ b/qt/aqt/sound.py
@@ -166,7 +166,6 @@ class AVPlayer:
 
         ranked.sort()
-        print(ranked)
 
         if ranked:
             return ranked[-1][1]
         else:
diff --git a/qt/aqt/tts.py b/qt/aqt/tts.py
index 1ccb8813b..49bc38064 100644
--- a/qt/aqt/tts.py
+++ b/qt/aqt/tts.py
@@ -9,10 +9,18 @@ or
 
 {{tts ja_JP voices=Kyoko,Otoya,Another_name:Field}}
 
-The first argument must be a language code. If provided,
-voices is a comma-separated list of one or more voices that
-the user would prefer. Spaces must not be included.
-Underscores will be converted to spaces.
+The first argument must be a language code.
+
+If provided, voices is a comma-separated list of one or more voices that
+the user would prefer. Spaces must not be included. Underscores will be
+converted to spaces.
+
+AVPlayer decides which TTSPlayer to use based on the returned rank.
+In the default implementation, the TTS player is chosen based on the order
+of voices the user has specified. When adding new TTS players, your code
+can either expose the underlying names the TTS engine provides, or simply
+expose the name of the engine, which would mean the user could write
+{{tts en_AU voices=MyEngine}} to prioritize your engine.
 """
 
 from __future__ import annotations
@@ -20,68 +28,82 @@ from __future__ import annotations
 import re
 import subprocess
 from dataclasses import dataclass
-from typing import List, Optional, cast
+from typing import List, Optional
 
 from anki.sound import AVTag, TTSTag
 from aqt.sound import SimpleProcessPlayer
-from aqt.taskman import TaskManager
 
 
 @dataclass
-class TTSArgs:
-    # requested language
+class TTSVoice:
+    name: str
     lang: str
-    # preferred voices, will use first available if possible
-    voices: List[str]
 
-    @classmethod
-    def from_string(cls, args: List[str]) -> TTSArgs:
-        voices: Optional[List[str]] = None
-        lang = args[0]
+@dataclass
+class TTSVoiceMatch:
+    voice: TTSVoice
+    rank: int
 
-        for arg in args[1:]:
-            try:
-                key, val = arg.split("=")
-            except ValueError:
-                continue
-            key = key.strip()
-            val = val.strip().replace("_", " ")
-            if key == "voices":
-                voices = val.split(",")
+class TTSPlayer:
+    default_rank = 0
+    _available_voices: Optional[List[TTSVoice]] = None
 
-        return TTSArgs(voices=voices or [], lang=lang)
+    def get_available_voices(self) -> List[TTSVoice]:
+        return []
+
+    def voices(self) -> List[TTSVoice]:
+        if self._available_voices is None:
+            self._available_voices = self.get_available_voices()
+        return self._available_voices
+
+    def voice_for_tag(self, tag: TTSTag) -> Optional[TTSVoiceMatch]:
+        avail_voices = self.voices()
+
+        rank = self.default_rank
+
+        # any requested voices match?
+        for requested_voice in tag.voices:
+            for avail in avail_voices:
+                if avail.name == requested_voice:
+                    return TTSVoiceMatch(voice=avail, rank=rank)
+
+            rank -= 1
+
+        # if no preferred voices match, we fall back on language
+        # with a rank of -100
+        for avail in avail_voices:
+            if avail.lang == tag.lang:
+                return TTSVoiceMatch(voice=avail, rank=-100)
+
+        return None
+
+
+class TTSProcessPlayer(SimpleProcessPlayer, TTSPlayer):
+    def rank_for_tag(self, tag: AVTag) -> Optional[int]:
+        if not isinstance(tag, TTSTag):
+            return None
+
+        match = self.voice_for_tag(tag)
+        if match:
+            return match.rank
+        else:
+            return None
 
 
 # Mac support
 ##########################################################################
 
 
-@dataclass
-class MacVoice:
-    name: str
-    lang: str
-
-
-VOICE_HELP_LINE_RE = re.compile(r"^(\S+)\s+(\S+)\s+.*$")
-
-
-def parse_voice_line(line: str) -> Optional[MacVoice]:
-    m = VOICE_HELP_LINE_RE.match(line)
-    if not m:
-        return None
-    return MacVoice(name=m.group(1), lang=m.group(2))
-
-
-class MacTTSPlayer(SimpleProcessPlayer):
-    def __init__(self, taskman: TaskManager):
-        super().__init__(taskman)
-        self._available_voices: Optional[List[MacVoice]] = None
+class MacTTSPlayer(TTSProcessPlayer):
+    VOICE_HELP_LINE_RE = re.compile(r"^(\S+)\s+(\S+)\s+.*$")
 
     def _play(self, tag: AVTag) -> None:
-        ttag = cast(TTSTag, tag)
-        voice = self.voice_for_tag(ttag)
+        assert isinstance(tag, TTSTag)
+        match = self.voice_for_tag(tag)
+        assert match
+        voice = match.voice
 
         self._process = subprocess.Popen(
             ["say", "-v", voice.name, "-f", "-"],
             stdin=subprocess.PIPE,
             stdout=subprocess.DEVNULL,
             stderr=subprocess.DEVNULL,
         )
         # write the input text to stdin
-        self._process.stdin.write(ttag.text.encode("utf8"))
+        self._process.stdin.write(tag.field_text.encode("utf8"))
         self._process.stdin.close()
         self._wait_for_termination()
 
-    def rank_for_tag(self, tag: AVTag) -> Optional[int]:
-        if not isinstance(tag, TTSTag):
+    def get_available_voices(self) -> List[TTSVoice]:
+        cmd = subprocess.run(
+            ["say", "-v", "?"], capture_output=True, check=True, encoding="utf8"
+        )
+
+        voices = []
+        for line in cmd.stdout.splitlines():
+            voice = self._parse_voice_line(line)
+            if voice:
+                voices.append(voice)
+        return voices
+
+    def _parse_voice_line(self, line: str) -> Optional[TTSVoice]:
+        m = self.VOICE_HELP_LINE_RE.match(line)
+        if not m:
             return None
-
-        # todo
-        return 0
-
-    def voices(self) -> List[MacVoice]:
-        if not self._available_voices:
-            cmd = subprocess.run(
-                ["say", "-v", "?"], capture_output=True, check=True, encoding="utf8"
-            )
-            self._available_voices = []
-            for line in cmd.stdout.splitlines():
-                voice = parse_voice_line(line)
-                if voice:
-                    self._available_voices.append(voice)
-
-        return self._available_voices
-
-    def voice_for_tag(self, tag: TTSTag) -> MacVoice:
-        args = TTSArgs.from_string(tag.args)
-        voices = self.voices()
-
-        # any requested voices match?
-        for requested_voice in args.voices:
-            avail_voice = next((x for x in voices if x.name == requested_voice), None)
-            if avail_voice:
-                return avail_voice
-
-        # requested language match?
-        avail_voice = next((x for x in voices if x.lang == args.lang), None)
-        if avail_voice:
-            return avail_voice
-
-        # fall back on first voice
-        return voices[0]
+        return TTSVoice(name=m.group(1), lang=m.group(2))
diff --git a/rslib/src/backend.rs b/rslib/src/backend.rs
index 470ca4316..18b475c48 100644
--- a/rslib/src/backend.rs
+++ b/rslib/src/backend.rs
@@ -188,10 +188,17 @@ impl Backend {
                 AVTag::SoundOrVideo(file) => pt::AvTag {
                     value: Some(pt::av_tag::Value::SoundOrVideo(file.to_string())),
                 },
-                AVTag::TextToSpeech { args, field_text } => pt::AvTag {
+                AVTag::TextToSpeech {
+                    field_text,
+                    lang,
+                    voices,
+                    other_args,
+                } => pt::AvTag {
                     value: Some(pt::av_tag::Value::Tts(pt::TtsTag {
-                        args: args.iter().map(|&s| s.to_string()).collect(),
-                        text: field_text.to_string(),
+                        field_text: field_text.to_string(),
+                        lang: lang.to_string(),
+                        voices: voices.into_iter().map(ToOwned::to_owned).collect(),
+                        other_args: other_args.into_iter().map(ToOwned::to_owned).collect(),
                     })),
                 },
             })
diff --git a/rslib/src/text.rs b/rslib/src/text.rs
index f07fe1f48..b3743d8d2 100644
--- a/rslib/src/text.rs
+++ b/rslib/src/text.rs
@@ -12,8 +12,10 @@ use std::ptr;
 pub enum AVTag<'a> {
     SoundOrVideo(Cow<'a, str>),
     TextToSpeech {
-        args: Vec<&'a str>,
         field_text: Cow<'a, str>,
+        lang: &'a str,
+        voices: Vec<&'a str>,
+        other_args: Vec<&'a str>,
     },
 }
 
@@ -83,14 +85,36 @@ pub fn av_tags_in_string(text: &str) -> impl Iterator<Item = AVTag> {
         } else {
             let args = caps.get(2).unwrap();
             let field_text = caps.get(3).unwrap();
-            AVTag::TextToSpeech {
-                args: args.as_str().split(' ').collect(),
-                field_text: strip_html_for_tts(field_text.as_str()),
-            }
+            tts_tag_from_string(field_text.as_str(), args.as_str())
         }
     })
 }
 
+fn tts_tag_from_string<'a>(field_text: &'a str, args: &'a str) -> AVTag<'a> {
+    let mut other_args = vec![];
+    let mut split_args = args.split(' ');
+    let lang = split_args.next().unwrap_or("");
+    let mut voices = None;
+
+    for remaining_arg in split_args {
+        if remaining_arg.starts_with("voices=") {
+            voices = remaining_arg
+                .split('=')
+                .nth(1)
+                .map(|voices| voices.split(',').collect());
+        } else {
+            other_args.push(remaining_arg);
+        }
+    }
+
+    AVTag::TextToSpeech {
+        field_text: strip_html_for_tts(field_text),
+        lang,
+        voices: voices.unwrap_or_else(Vec::new),
+        other_args,
+    }
+}
+
 pub fn strip_html_preserving_image_filenames(html: &str) -> Cow<str> {
     let without_fnames = IMG_TAG.replace_all(html, r" $1 ");
     let without_html = HTML.replace_all(&without_fnames, "");
@@ -153,15 +177,18 @@ mod test {
 
     #[test]
     fn test_audio() {
-        let s = "abc[sound:fo&o.mp3]def[anki:tts][lang=en_US voices=Bob,Jane]foo<br>1>2[/anki:tts]gh";
+        let s =
+            "abc[sound:fo&o.mp3]def[anki:tts][en_US voices=Bob,Jane]foo<br>1>2[/anki:tts]gh";
         assert_eq!(strip_av_tags(s), "abcdefgh");
         assert_eq!(
            av_tags_in_string(s).collect::<Vec<_>>(),
            vec![
                AVTag::SoundOrVideo("fo&o.mp3".into()),
                AVTag::TextToSpeech {
-                    args: vec!["lang=en_US", "voices=Bob,Jane"],
-                    field_text: "foo 1>2".into()
+                    field_text: "foo 1>2".into(),
+                    lang: "en_US",
+                    voices: vec!["Bob", "Jane"],
+                    other_args: vec![]
                },
            ]
        );
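
For reference, here is how the argument layout handled by tts_tag_from_string() in rslib/src/text.rs maps onto the new TTSTag fields. This is a rough Python rendering for illustration only: parse_tts_args() and the sample "speed=1.5" argument are invented here, the real parsing happens in Rust, and the HTML stripping of the field text is omitted.

from typing import List, Tuple


def parse_tts_args(args: str) -> Tuple[str, List[str], List[str]]:
    """Split 'lang voices=a,b x=y ...' the way tts_tag_from_string() does."""
    parts = args.split(" ")
    lang = parts[0] if parts else ""
    voices: List[str] = []
    other_args: List[str] = []
    for part in parts[1:]:
        if part.startswith("voices="):
            # mirror of Rust's split('=').nth(1) followed by split(',')
            voices = part.split("=")[1].split(",")
        else:
            other_args.append(part)
    return lang, voices, other_args


# {{tts en_US voices=Bob,Jane speed=1.5:Field}} carries the args "en_US voices=Bob,Jane speed=1.5"
assert parse_tts_args("en_US voices=Bob,Jane speed=1.5") == (
    "en_US",
    ["Bob", "Jane"],
    ["speed=1.5"],
)

Anything that is not the leading language code or the voices= list ends up in other_args, which matches the "each arg should be in the form 'foo=bar'" comment added to pylib/anki/sound.py.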
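
The ranking scheme is easiest to check with concrete numbers. The sketch below is not part of the patch: it re-implements the logic of TTSPlayer.voice_for_tag() with stand-in classes so it runs outside Anki (Voice, Tag, rank_for_tag and the mac_voices list are invented for illustration; the real types are TTSVoice in qt/aqt/tts.py and TTSTag in pylib/anki/sound.py).

from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class Voice:  # stand-in for aqt.tts.TTSVoice
    name: str
    lang: str


@dataclass
class Tag:  # stand-in for anki.sound.TTSTag
    field_text: str
    lang: str
    voices: List[str] = field(default_factory=list)
    other_args: List[str] = field(default_factory=list)


def rank_for_tag(tag: Tag, available: List[Voice]) -> Optional[int]:
    """Mirror of TTSPlayer.voice_for_tag(): earlier preferred voices rank higher."""
    rank = 0  # default_rank
    for requested in tag.voices:
        for avail in available:
            if avail.name == requested:
                return rank  # first preference -> 0, second -> -1, ...
        rank -= 1
    # no preferred voice available: fall back on a language match at -100
    for avail in available:
        if avail.lang == tag.lang:
            return -100
    return None  # this player cannot service the tag at all


mac_voices = [Voice("Kyoko", "ja_JP"), Voice("Otoya", "ja_JP"), Voice("Alex", "en_US")]

# {{tts ja_JP voices=Kyoko,Otoya:Field}} -> first preference available, rank 0
assert rank_for_tag(Tag("text", "ja_JP", ["Kyoko", "Otoya"]), mac_voices) == 0
# first preference missing, second available -> rank -1
assert rank_for_tag(Tag("text", "ja_JP", ["Another_name", "Otoya"]), mac_voices) == -1
# no preferred voice, but the language matches -> rank -100
assert rank_for_tag(Tag("text", "en_US", ["MissingVoice"]), mac_voices) == -100
# nothing matches at all -> None, so this player is skipped
assert rank_for_tag(Tag("text", "de_DE", []), mac_voices) is None

The AVPlayer code touched in qt/aqt/sound.py sorts the collected (rank, player) pairs and takes the last one, so the player reporting the highest rank wins: 0 beats -1, which beats the -100 language-only fallback.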
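
The new module docstring suggests that a third-party player can simply expose the name of its engine as a voice. Below is a hypothetical sketch of that idea built only on the TTSProcessPlayer/TTSVoice/TTSTag API introduced by this patch; it assumes an Anki environment where aqt is importable, and MyEngine, MyEngineTTSPlayer and speak_with_my_engine() are invented names, not part of Anki.

from typing import List

from anki.sound import AVTag, TTSTag
from aqt.tts import TTSProcessPlayer, TTSVoice


def speak_with_my_engine(text: str, lang: str) -> None:
    # placeholder for whatever the external engine actually does
    print(f"[{lang}] {text}")


class MyEngineTTSPlayer(TTSProcessPlayer):
    # advertise one pseudo-voice per supported language, named after the engine
    def get_available_voices(self) -> List[TTSVoice]:
        return [
            TTSVoice(name="MyEngine", lang="en_AU"),
            TTSVoice(name="MyEngine", lang="en_US"),
        ]

    def _play(self, tag: AVTag) -> None:
        assert isinstance(tag, TTSTag)
        speak_with_my_engine(tag.field_text, tag.lang)

Because get_available_voices() reports the voice name MyEngine, a template like {{tts en_AU voices=MyEngine}} makes the inherited rank_for_tag() return 0 for this player, while players that only match on language return -100, so AVPlayer prefers it. Registering the player with the AVPlayer instance in aqt.sound is left out here, since that detail is outside this patch.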