add rank handling to TTS; parse TTS args in get_av_tags()

This commit is contained in:
Damien Elmes 2020-01-21 12:41:37 +10:00
parent 66e277e44b
commit c713683f63
7 changed files with 145 additions and 100 deletions

View file

@ -159,6 +159,8 @@ message AVTag {
}
message TTSTag {
repeated string args = 1;
string text = 2;
string field_text = 1;
string lang = 2;
repeated string voices = 3;
repeated string other_args = 4;
}

View file

@ -51,7 +51,12 @@ def av_tag_to_native(tag: pb.AVTag) -> AVTag:
if val == "sound_or_video":
return SoundOrVideoTag(filename=tag.sound_or_video)
else:
return TTSTag(args=list(tag.tts.args), text=tag.tts.text)
return TTSTag(
field_text=tag.tts.field_text,
lang=tag.tts.lang,
voices=list(tag.tts.voices),
other_args=list(tag.tts.other_args),
)
@dataclass

View file

@ -21,8 +21,11 @@ class TTSTag:
See tts.py for more information.
"""
args: List[str]
text: str
field_text: str
lang: str
voices: List[str]
# each arg should be in the form 'foo=bar'
other_args: List[str]
@dataclass

View file

@ -166,7 +166,6 @@ class AVPlayer:
ranked.sort()
print(ranked)
if ranked:
return ranked[-1][1]
else:

View file

@ -9,10 +9,18 @@ or
{{tts ja_JP voices=Kyoko,Otoya,Another_name:Field}}
The first argument must be a language code. If provided,
voices is a comma-separated list of one or more voices that
the user would prefer. Spaces must not be included.
Underscores will be converted to spaces.
The first argument must be a language code.
If provided, voices is a comma-separated list of one or more voices that
the user would prefer. Spaces must not be included. Underscores will be
converted to spaces.
AVPlayer decides which TTSPlayer to use based on the returned rank.
In the default implementation, the TTS player is chosen based on the order
of voices the user has specified. When adding new TTS players, your code
can either expose the underlying names the TTS engine provides, or simply
expose the name of the engine, which would mean the user could write
{{tts en_AU voices=MyEngine}} to prioritize your engine.
"""
from __future__ import annotations
@ -20,68 +28,82 @@ from __future__ import annotations
import re
import subprocess
from dataclasses import dataclass
from typing import List, Optional, cast
from typing import List, Optional
from anki.sound import AVTag, TTSTag
from aqt.sound import SimpleProcessPlayer
from aqt.taskman import TaskManager
@dataclass
class TTSArgs:
# requested language
class TTSVoice:
name: str
lang: str
# preferred voices, will use first available if possible
voices: List[str]
@classmethod
def from_string(cls, args: List[str]) -> TTSArgs:
voices: Optional[List[str]] = None
lang = args[0]
# Result of matching a TTS tag against a player's voices: the chosen voice
# and the rank assigned to that match (see TTSPlayer.voice_for_tag()).
@dataclass
class TTSVoiceMatch:
voice: TTSVoice
rank: int
for arg in args[1:]:
try:
key, val = arg.split("=")
except ValueError:
continue
key = key.strip()
val = val.strip().replace("_", " ")
if key == "voices":
voices = val.split(",")
class TTSPlayer:
default_rank = 0
_available_voices: Optional[List[TTSVoice]] = None
return TTSArgs(voices=voices or [], lang=lang)
# Return the voices this player can use. The base implementation reports
# none; voices() calls this once and caches the result.
def get_available_voices(self) -> List[TTSVoice]:
return []
def voices(self) -> List[TTSVoice]:
    """Return the available voices, querying them on first use only.

    The list produced by get_available_voices() is stored in
    _available_voices and reused on later calls.
    """
    cached = self._available_voices
    if cached is not None:
        return cached
    self._available_voices = self.get_available_voices()
    return self._available_voices
def voice_for_tag(self, tag: TTSTag) -> Optional[TTSVoiceMatch]:
    """Pick the best available voice for the given TTS tag.

    Requested voices are tried in the order listed on the tag. The first
    one that is available wins, ranked at default_rank minus its position
    in the list. If none is available, the first voice whose language
    equals the tag's language is returned with a rank of -100. Returns
    None when nothing matches.
    """
    candidates = self.voices()
    base_rank = self.default_rank

    # any requested voices match? earlier entries in the list rank higher
    for position, wanted in enumerate(tag.voices):
        by_name = next((v for v in candidates if v.name == wanted), None)
        if by_name is not None:
            return TTSVoiceMatch(voice=by_name, rank=base_rank - position)

    # no preferred voice is available: fall back on the language alone,
    # with a fixed rank of -100
    by_lang = next((v for v in candidates if v.lang == tag.lang), None)
    if by_lang is None:
        return None
    return TTSVoiceMatch(voice=by_lang, rank=-100)
class TTSProcessPlayer(SimpleProcessPlayer, TTSPlayer):
    """Combines SimpleProcessPlayer with TTSPlayer's voice matching."""

    def rank_for_tag(self, tag: AVTag) -> Optional[int]:
        """Return the rank of the voice matched for a TTS tag.

        Yields None for tags that are not TTSTag instances, and for TTS
        tags where voice_for_tag() finds no voice.
        """
        if isinstance(tag, TTSTag):
            found = self.voice_for_tag(tag)
            if found:
                return found.rank
        return None
# Mac support
##########################################################################
@dataclass
class MacVoice:
name: str
lang: str
VOICE_HELP_LINE_RE = re.compile(r"^(\S+)\s+(\S+)\s+.*$")
def parse_voice_line(line: str) -> Optional[MacVoice]:
m = VOICE_HELP_LINE_RE.match(line)
if not m:
return None
return MacVoice(name=m.group(1), lang=m.group(2))
class MacTTSPlayer(SimpleProcessPlayer):
def __init__(self, taskman: TaskManager):
super().__init__(taskman)
self._available_voices: Optional[List[MacVoice]] = None
class MacTTSPlayer(TTSProcessPlayer):
VOICE_HELP_LINE_RE = re.compile(r"^(\S+)\s+(\S+)\s+.*$")
def _play(self, tag: AVTag) -> None:
ttag = cast(TTSTag, tag)
voice = self.voice_for_tag(ttag)
assert isinstance(tag, TTSTag)
match = self.voice_for_tag(tag)
assert match
voice = match.voice
self._process = subprocess.Popen(
["say", "-v", voice.name, "-f", "-"],
@ -90,45 +112,25 @@ class MacTTSPlayer(SimpleProcessPlayer):
stderr=subprocess.DEVNULL,
)
# write the input text to stdin
self._process.stdin.write(ttag.text.encode("utf8"))
self._process.stdin.write(tag.field_text.encode("utf8"))
self._process.stdin.close()
self._wait_for_termination()
def rank_for_tag(self, tag: AVTag) -> Optional[int]:
if not isinstance(tag, TTSTag):
return None
# todo
return 0
def voices(self) -> List[MacVoice]:
if not self._available_voices:
def get_available_voices(self) -> List[TTSVoice]:
cmd = subprocess.run(
["say", "-v", "?"], capture_output=True, check=True, encoding="utf8"
)
self._available_voices = []
voices = []
for line in cmd.stdout.splitlines():
voice = parse_voice_line(line)
voice = self._parse_voice_line(line)
if voice:
self._available_voices.append(voice)
voices.append(voice)
return voices
return self._available_voices
def voice_for_tag(self, tag: TTSTag) -> MacVoice:
args = TTSArgs.from_string(tag.args)
voices = self.voices()
# any requested voices match?
for requested_voice in args.voices:
avail_voice = next((x for x in voices if x.name == requested_voice), None)
if avail_voice:
return avail_voice
# requested language match?
avail_voice = next((x for x in voices if x.lang == args.lang), None)
if avail_voice:
return avail_voice
# fall back on first voice
return voices[0]
def _parse_voice_line(self, line: str) -> Optional[TTSVoice]:
    """Turn one line of the voice listing into a TTSVoice.

    The line is matched against VOICE_HELP_LINE_RE; the first captured
    group becomes the voice name and the second its language. Lines that
    do not match produce None.

    NOTE(review): both groups capture runs of non-space characters, so a
    voice name containing a space is not captured whole.
    """
    parsed = self.VOICE_HELP_LINE_RE.match(line)
    if parsed is None:
        return None
    voice_name, voice_lang = parsed.group(1), parsed.group(2)
    return TTSVoice(name=voice_name, lang=voice_lang)

View file

@ -188,10 +188,17 @@ impl Backend {
AVTag::SoundOrVideo(file) => pt::AvTag {
value: Some(pt::av_tag::Value::SoundOrVideo(file.to_string())),
},
AVTag::TextToSpeech { args, field_text } => pt::AvTag {
AVTag::TextToSpeech {
field_text,
lang,
voices,
other_args,
} => pt::AvTag {
value: Some(pt::av_tag::Value::Tts(pt::TtsTag {
args: args.iter().map(|&s| s.to_string()).collect(),
text: field_text.to_string(),
field_text: field_text.to_string(),
lang: lang.to_string(),
voices: voices.into_iter().map(ToOwned::to_owned).collect(),
other_args: other_args.into_iter().map(ToOwned::to_owned).collect(),
})),
},
})

View file

@ -12,8 +12,10 @@ use std::ptr;
pub enum AVTag<'a> {
SoundOrVideo(Cow<'a, str>),
TextToSpeech {
args: Vec<&'a str>,
field_text: Cow<'a, str>,
lang: &'a str,
voices: Vec<&'a str>,
other_args: Vec<&'a str>,
},
}
@ -83,14 +85,36 @@ pub fn av_tags_in_string(text: &str) -> impl Iterator<Item = AVTag> {
} else {
let args = caps.get(2).unwrap();
let field_text = caps.get(3).unwrap();
AVTag::TextToSpeech {
args: args.as_str().split(' ').collect(),
field_text: strip_html_for_tts(field_text.as_str()),
}
tts_tag_from_string(field_text.as_str(), args.as_str())
}
})
}
/// Build a text-to-speech tag from the field text and the raw argument
/// string found inside a TTS tag.
///
/// The first whitespace-separated token of `args` is the language code (empty
/// if `args` holds no tokens). A later `voices=a,b,c` token supplies the
/// preferred voices in order; if it appears more than once the last one wins.
/// Every other token is passed through untouched in `other_args`.
///
/// The field text is run through `strip_html_for_tts` before being stored.
fn tts_tag_from_string<'a>(field_text: &'a str, args: &'a str) -> AVTag<'a> {
    const VOICES_PREFIX: &str = "voices=";

    // Splitting on runs of whitespace (rather than on each single space)
    // avoids empty tokens when the user types leading or repeated spaces,
    // which would otherwise end up as an empty lang or junk in other_args.
    let mut split_args = args.split_ascii_whitespace();
    let lang = split_args.next().unwrap_or("");

    let mut voices = Vec::new();
    let mut other_args = Vec::new();
    for remaining_arg in split_args {
        if remaining_arg.starts_with(VOICES_PREFIX) {
            // Slice past the (ASCII) prefix instead of re-splitting on '=',
            // so a value that itself contains '=' is not truncated, and drop
            // empty names produced by `voices=` or stray commas.
            voices = remaining_arg[VOICES_PREFIX.len()..]
                .split(',')
                .filter(|voice| !voice.is_empty())
                .collect();
        } else {
            other_args.push(remaining_arg);
        }
    }

    AVTag::TextToSpeech {
        field_text: strip_html_for_tts(field_text),
        lang,
        voices,
        other_args,
    }
}
pub fn strip_html_preserving_image_filenames(html: &str) -> Cow<str> {
let without_fnames = IMG_TAG.replace_all(html, r" $1 ");
let without_html = HTML.replace_all(&without_fnames, "");
@ -153,15 +177,18 @@ mod test {
#[test]
fn test_audio() {
let s = "abc[sound:fo&amp;o.mp3]def[anki:tts][lang=en_US voices=Bob,Jane]foo<br>1&gt;2[/anki:tts]gh";
let s =
"abc[sound:fo&amp;o.mp3]def[anki:tts][en_US voices=Bob,Jane]foo<br>1&gt;2[/anki:tts]gh";
assert_eq!(strip_av_tags(s), "abcdefgh");
assert_eq!(
av_tags_in_string(s).collect::<Vec<_>>(),
vec![
AVTag::SoundOrVideo("fo&o.mp3".into()),
AVTag::TextToSpeech {
args: vec!["lang=en_US", "voices=Bob,Jane"],
field_text: "foo 1>2".into()
field_text: "foo 1>2".into(),
lang: "en_US",
voices: vec!["Bob", "Jane"],
other_args: vec![]
},
]
);