add rank handling to TTS; parse TTS args in get_av_tags()

This commit is contained in:
Damien Elmes 2020-01-21 12:41:37 +10:00
parent 66e277e44b
commit c713683f63
7 changed files with 145 additions and 100 deletions

View file

@ -159,6 +159,8 @@ message AVTag {
} }
message TTSTag { message TTSTag {
repeated string args = 1; string field_text = 1;
string text = 2; string lang = 2;
repeated string voices = 3;
repeated string other_args = 4;
} }

View file

@ -51,7 +51,12 @@ def av_tag_to_native(tag: pb.AVTag) -> AVTag:
if val == "sound_or_video": if val == "sound_or_video":
return SoundOrVideoTag(filename=tag.sound_or_video) return SoundOrVideoTag(filename=tag.sound_or_video)
else: else:
return TTSTag(args=list(tag.tts.args), text=tag.tts.text) return TTSTag(
field_text=tag.tts.field_text,
lang=tag.tts.lang,
voices=list(tag.tts.voices),
other_args=list(tag.tts.other_args),
)
@dataclass @dataclass

View file

@ -21,8 +21,11 @@ class TTSTag:
See tts.py for more information. See tts.py for more information.
""" """
args: List[str] field_text: str
text: str lang: str
voices: List[str]
# each arg should be in the form 'foo=bar'
other_args: List[str]
@dataclass @dataclass

View file

@ -166,7 +166,6 @@ class AVPlayer:
ranked.sort() ranked.sort()
print(ranked)
if ranked: if ranked:
return ranked[-1][1] return ranked[-1][1]
else: else:

View file

@ -9,10 +9,18 @@ or
{{tts ja_JP voices=Kyoko,Otoya,Another_name:Field}} {{tts ja_JP voices=Kyoko,Otoya,Another_name:Field}}
The first argument must be a language code. If provided, The first argument must be a language code.
voices is a comma-separated list of one or more voices that
the user would prefer. Spaces must not be included. If provided, voices is a comma-separated list of one or more voices that
Underscores will be converted to spaces. the user would prefer. Spaces must not be included. Underscores will be
converted to spaces.
AVPlayer decides which TTSPlayer to use based on the returned rank.
In the default implementation, the TTS player is chosen based on the order
of voices the user has specified. When adding new TTS players, your code
can either expose the underlying names the TTS engine provides, or simply
expose the name of the engine, which would mean the user could write
{{tts en_AU voices=MyEngine}} to prioritize your engine.
""" """
from __future__ import annotations from __future__ import annotations
@ -20,68 +28,82 @@ from __future__ import annotations
import re import re
import subprocess import subprocess
from dataclasses import dataclass from dataclasses import dataclass
from typing import List, Optional, cast from typing import List, Optional
from anki.sound import AVTag, TTSTag from anki.sound import AVTag, TTSTag
from aqt.sound import SimpleProcessPlayer from aqt.sound import SimpleProcessPlayer
from aqt.taskman import TaskManager
@dataclass @dataclass
class TTSArgs: class TTSVoice:
# requested language name: str
lang: str lang: str
# preferred voices, will use first available if possible
voices: List[str]
@classmethod
def from_string(cls, args: List[str]) -> TTSArgs:
voices: Optional[List[str]] = None
lang = args[0] @dataclass
class TTSVoiceMatch:
voice: TTSVoice
rank: int
for arg in args[1:]:
try:
key, val = arg.split("=")
except ValueError:
continue
key = key.strip()
val = val.strip().replace("_", " ")
if key == "voices": class TTSPlayer:
voices = val.split(",") default_rank = 0
_available_voices: Optional[List[TTSVoice]] = None
return TTSArgs(voices=voices or [], lang=lang) def get_available_voices(self) -> List[TTSVoice]:
return []
def voices(self) -> List[TTSVoice]:
    """Return the voices this player offers, looking them up on first use.

    The list produced by get_available_voices() is stored on the instance
    and handed back on later calls, so the lookup runs at most once for as
    long as the stored value is not None.
    """
    cached = self._available_voices
    if cached is None:
        cached = self.get_available_voices()
        self._available_voices = cached
    return cached
def voice_for_tag(self, tag: TTSTag) -> Optional[TTSVoiceMatch]:
    """Pick the voice this player would use for `tag`, with its rank.

    The voices requested in the tag are tried in order. The first requested
    voice matches with rank `default_rank`, and each later one ranks one
    lower than the one before it. If none of the requested names is
    available, the first available voice whose language equals the tag's
    language is returned with a rank of -100. Returns None when nothing
    matches.
    """
    candidates = self.voices()
    base_rank = self.default_rank

    # explicitly requested voices take priority, in the order given
    for position, wanted in enumerate(tag.voices):
        named = next((v for v in candidates if v.name == wanted), None)
        if named is not None:
            return TTSVoiceMatch(voice=named, rank=base_rank - position)

    # otherwise settle for any voice in the requested language
    same_lang = next((v for v in candidates if v.lang == tag.lang), None)
    if same_lang is not None:
        return TTSVoiceMatch(voice=same_lang, rank=-100)

    return None
class TTSProcessPlayer(SimpleProcessPlayer, TTSPlayer):
    """A TTS player built on SimpleProcessPlayer that ranks tags by voice match."""

    def rank_for_tag(self, tag: AVTag) -> Optional[int]:
        """Return the rank of the best voice for `tag`.

        Returns None when `tag` is not a TTSTag, or when voice_for_tag()
        finds no usable voice for it.
        """
        if isinstance(tag, TTSTag):
            found = self.voice_for_tag(tag)
            if found:
                return found.rank
        return None
# Mac support # Mac support
########################################################################## ##########################################################################
@dataclass class MacTTSPlayer(TTSProcessPlayer):
class MacVoice: VOICE_HELP_LINE_RE = re.compile(r"^(\S+)\s+(\S+)\s+.*$")
name: str
lang: str
VOICE_HELP_LINE_RE = re.compile(r"^(\S+)\s+(\S+)\s+.*$")
def parse_voice_line(line: str) -> Optional[MacVoice]:
m = VOICE_HELP_LINE_RE.match(line)
if not m:
return None
return MacVoice(name=m.group(1), lang=m.group(2))
class MacTTSPlayer(SimpleProcessPlayer):
def __init__(self, taskman: TaskManager):
super().__init__(taskman)
self._available_voices: Optional[List[MacVoice]] = None
def _play(self, tag: AVTag) -> None: def _play(self, tag: AVTag) -> None:
ttag = cast(TTSTag, tag) assert isinstance(tag, TTSTag)
voice = self.voice_for_tag(ttag) match = self.voice_for_tag(tag)
assert match
voice = match.voice
self._process = subprocess.Popen( self._process = subprocess.Popen(
["say", "-v", voice.name, "-f", "-"], ["say", "-v", voice.name, "-f", "-"],
@ -90,45 +112,25 @@ class MacTTSPlayer(SimpleProcessPlayer):
stderr=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
) )
# write the input text to stdin # write the input text to stdin
self._process.stdin.write(ttag.text.encode("utf8")) self._process.stdin.write(tag.field_text.encode("utf8"))
self._process.stdin.close() self._process.stdin.close()
self._wait_for_termination() self._wait_for_termination()
def rank_for_tag(self, tag: AVTag) -> Optional[int]: def get_available_voices(self) -> List[TTSVoice]:
if not isinstance(tag, TTSTag): cmd = subprocess.run(
["say", "-v", "?"], capture_output=True, check=True, encoding="utf8"
)
voices = []
for line in cmd.stdout.splitlines():
voice = self._parse_voice_line(line)
if voice:
voices.append(voice)
return voices
def _parse_voice_line(self, line: str) -> Optional[TTSVoice]:
    """Turn one line of voice-listing output into a TTSVoice.

    The line is matched against VOICE_HELP_LINE_RE; the first captured
    group becomes the voice name and the second its language. Lines that
    do not match yield None.
    """
    matched = self.VOICE_HELP_LINE_RE.match(line)
    if matched is None:
        return None
    voice_name, voice_lang = matched.group(1, 2)
    return TTSVoice(name=voice_name, lang=voice_lang)
# todo
return 0
def voices(self) -> List[MacVoice]:
if not self._available_voices:
cmd = subprocess.run(
["say", "-v", "?"], capture_output=True, check=True, encoding="utf8"
)
self._available_voices = []
for line in cmd.stdout.splitlines():
voice = parse_voice_line(line)
if voice:
self._available_voices.append(voice)
return self._available_voices
def voice_for_tag(self, tag: TTSTag) -> MacVoice:
args = TTSArgs.from_string(tag.args)
voices = self.voices()
# any requested voices match?
for requested_voice in args.voices:
avail_voice = next((x for x in voices if x.name == requested_voice), None)
if avail_voice:
return avail_voice
# requested language match?
avail_voice = next((x for x in voices if x.lang == args.lang), None)
if avail_voice:
return avail_voice
# fall back on first voice
return voices[0]

View file

@ -188,10 +188,17 @@ impl Backend {
AVTag::SoundOrVideo(file) => pt::AvTag { AVTag::SoundOrVideo(file) => pt::AvTag {
value: Some(pt::av_tag::Value::SoundOrVideo(file.to_string())), value: Some(pt::av_tag::Value::SoundOrVideo(file.to_string())),
}, },
AVTag::TextToSpeech { args, field_text } => pt::AvTag { AVTag::TextToSpeech {
field_text,
lang,
voices,
other_args,
} => pt::AvTag {
value: Some(pt::av_tag::Value::Tts(pt::TtsTag { value: Some(pt::av_tag::Value::Tts(pt::TtsTag {
args: args.iter().map(|&s| s.to_string()).collect(), field_text: field_text.to_string(),
text: field_text.to_string(), lang: lang.to_string(),
voices: voices.into_iter().map(ToOwned::to_owned).collect(),
other_args: other_args.into_iter().map(ToOwned::to_owned).collect(),
})), })),
}, },
}) })

View file

@ -12,8 +12,10 @@ use std::ptr;
pub enum AVTag<'a> { pub enum AVTag<'a> {
SoundOrVideo(Cow<'a, str>), SoundOrVideo(Cow<'a, str>),
TextToSpeech { TextToSpeech {
args: Vec<&'a str>,
field_text: Cow<'a, str>, field_text: Cow<'a, str>,
lang: &'a str,
voices: Vec<&'a str>,
other_args: Vec<&'a str>,
}, },
} }
@ -83,14 +85,36 @@ pub fn av_tags_in_string(text: &str) -> impl Iterator<Item = AVTag> {
} else { } else {
let args = caps.get(2).unwrap(); let args = caps.get(2).unwrap();
let field_text = caps.get(3).unwrap(); let field_text = caps.get(3).unwrap();
AVTag::TextToSpeech { tts_tag_from_string(field_text.as_str(), args.as_str())
args: args.as_str().split(' ').collect(),
field_text: strip_html_for_tts(field_text.as_str()),
}
} }
}) })
} }
/// Build an [`AVTag::TextToSpeech`] from the argument string and field text
/// of a TTS tag.
///
/// `args` is a whitespace-separated argument list such as
/// `en_US voices=Bob,Jane`:
///
/// - the first argument is taken as the language; if there are no arguments
///   at all, the language is the empty string
/// - a `voices=` argument is split on commas into the list of preferred
///   voices, skipping empty entries; if it is given more than once, the last
///   occurrence wins
/// - every other argument is passed through unchanged in `other_args`
///
/// `field_text` is passed through `strip_html_for_tts` before being stored.
fn tts_tag_from_string<'a>(field_text: &'a str, args: &'a str) -> AVTag<'a> {
    let mut other_args = vec![];
    // Split on runs of whitespace rather than on single spaces, so that
    // leading/trailing/repeated spaces (or tabs and newlines) between
    // arguments can't produce an empty language or empty extra arguments.
    let mut split_args = args.split_whitespace();
    let lang = split_args.next().unwrap_or("");
    let mut voices: Option<Vec<&'a str>> = None;

    for remaining_arg in split_args {
        if remaining_arg.starts_with("voices=") {
            voices = remaining_arg.split('=').nth(1).map(|names| {
                names
                    .split(',')
                    // "voices=" or a stray comma would otherwise yield ""
                    // entries, which can never match a real voice
                    .filter(|name| !name.is_empty())
                    .collect()
            });
        } else {
            other_args.push(remaining_arg);
        }
    }

    AVTag::TextToSpeech {
        field_text: strip_html_for_tts(field_text),
        lang,
        voices: voices.unwrap_or_default(),
        other_args,
    }
}
pub fn strip_html_preserving_image_filenames(html: &str) -> Cow<str> { pub fn strip_html_preserving_image_filenames(html: &str) -> Cow<str> {
let without_fnames = IMG_TAG.replace_all(html, r" $1 "); let without_fnames = IMG_TAG.replace_all(html, r" $1 ");
let without_html = HTML.replace_all(&without_fnames, ""); let without_html = HTML.replace_all(&without_fnames, "");
@ -153,15 +177,18 @@ mod test {
#[test] #[test]
fn test_audio() { fn test_audio() {
let s = "abc[sound:fo&amp;o.mp3]def[anki:tts][lang=en_US voices=Bob,Jane]foo<br>1&gt;2[/anki:tts]gh"; let s =
"abc[sound:fo&amp;o.mp3]def[anki:tts][en_US voices=Bob,Jane]foo<br>1&gt;2[/anki:tts]gh";
assert_eq!(strip_av_tags(s), "abcdefgh"); assert_eq!(strip_av_tags(s), "abcdefgh");
assert_eq!( assert_eq!(
av_tags_in_string(s).collect::<Vec<_>>(), av_tags_in_string(s).collect::<Vec<_>>(),
vec![ vec![
AVTag::SoundOrVideo("fo&o.mp3".into()), AVTag::SoundOrVideo("fo&o.mp3".into()),
AVTag::TextToSpeech { AVTag::TextToSpeech {
args: vec!["lang=en_US", "voices=Bob,Jane"], field_text: "foo 1>2".into(),
field_text: "foo 1>2".into() lang: "en_US",
voices: vec!["Bob", "Jane"],
other_args: vec![]
}, },
] ]
); );