mirror of https://github.com/ankitects/anki.git
synced 2025-12-20 10:22:57 -05:00

add rank handling to TTS; parse TTS args in get_av_tags()

parent 66e277e44b
commit c713683f63

7 changed files with 145 additions and 100 deletions
@@ -159,6 +159,8 @@ message AVTag {
 }

 message TTSTag {
-    repeated string args = 1;
-    string text = 2;
+    string field_text = 1;
+    string lang = 2;
+    repeated string voices = 3;
+    repeated string other_args = 4;
 }
@@ -51,7 +51,12 @@ def av_tag_to_native(tag: pb.AVTag) -> AVTag:
     if val == "sound_or_video":
         return SoundOrVideoTag(filename=tag.sound_or_video)
     else:
-        return TTSTag(args=list(tag.tts.args), text=tag.tts.text)
+        return TTSTag(
+            field_text=tag.tts.field_text,
+            lang=tag.tts.lang,
+            voices=list(tag.tts.voices),
+            other_args=list(tag.tts.other_args),
+        )


 @dataclass
@@ -21,8 +21,11 @@ class TTSTag:
     See tts.py for more information.
     """

-    args: List[str]
-    text: str
+    field_text: str
+    lang: str
+    voices: List[str]
+    # each arg should be in the form 'foo=bar'
+    other_args: List[str]


 @dataclass
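For orientation, the test case at the bottom of this diff ([anki:tts][en_US voices=Bob,Jane]foo<br>1>2[/anki:tts]) now reaches Python as roughly the following instance. This is a sketch; the "speed=1.5" mentioned in the comment is a hypothetical 'foo=bar' extra, not something this commit defines:

    from anki.sound import TTSTag

    tag = TTSTag(
        field_text="foo 1>2",  # HTML such as <br> is already stripped on the Rust side
        lang="en_US",
        voices=["Bob", "Jane"],
        other_args=[],  # would hold e.g. a hypothetical "speed=1.5"
    )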
@@ -166,7 +166,6 @@ class AVPlayer:

         ranked.sort()

-        print(ranked)
         if ranked:
             return ranked[-1][1]
         else:
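For context, the ranked list here holds (rank, player) pairs built from each player's rank_for_tag(), so after the ascending sort the last entry is the highest-ranked player. A minimal sketch, with made-up strings standing in for real player objects:

    # one player only matched on language (rank -100), another matched
    # the first requested voice (rank 0)
    ranked = [(-100, "language_only_player"), (0, "exact_voice_player")]
    ranked.sort()
    best = ranked[-1][1]  # ascending sort, last item wins -> "exact_voice_player"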
qt/aqt/tts.py
@@ -9,10 +9,18 @@ or

 {{tts ja_JP voices=Kyoko,Otoya,Another_name:Field}}

-The first argument must be a language code. If provided,
-voices is a comma-separated list of one or more voices that
-the user would prefer. Spaces must not be included.
-Underscores will be converted to spaces.
+The first argument must be a language code.
+
+If provided, voices is a comma-separated list of one or more voices that
+the user would prefer. Spaces must not be included. Underscores will be
+converted to spaces.
+
+AVPlayer decides which TTSPlayer to use based on the returned rank.
+In the default implementation, the TTS player is chosen based on the order
+of voices the user has specified. When adding new TTS players, your code
+can either expose the underlying names the TTS engine provides, or simply
+expose the name of the engine, which would mean the user could write
+{{tts en_AU voices=MyEngine}} to prioritize your engine.
 """

 from __future__ import annotations
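To make the engine-name suggestion in the docstring concrete: a player can advertise the engine itself as a single voice, so {{tts en_AU voices=MyEngine}} matches it by name. A minimal sketch using the TTSVoice/TTSProcessPlayer helpers added further down in this file; MyEngineTTSPlayer and the "myengine" command line are invented for illustration:

    import subprocess
    from typing import List

    from anki.sound import AVTag, TTSTag
    from aqt.tts import TTSProcessPlayer, TTSVoice


    class MyEngineTTSPlayer(TTSProcessPlayer):
        def get_available_voices(self) -> List[TTSVoice]:
            # expose the engine name rather than its individual voices
            return [TTSVoice(name="MyEngine", lang="en_AU")]

        def _play(self, tag: AVTag) -> None:
            assert isinstance(tag, TTSTag)
            # hand the stripped field text to the (invented) engine binary
            self._process = subprocess.Popen(["myengine", "--lang", tag.lang, tag.field_text])
            self._wait_for_termination()

With that in place, voice_for_tag() matches "MyEngine" at rank 0, so this player is preferred over one that only matched on language (rank -100).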
@@ -20,68 +28,82 @@ from __future__ import annotations
 import re
 import subprocess
 from dataclasses import dataclass
-from typing import List, Optional, cast
+from typing import List, Optional

 from anki.sound import AVTag, TTSTag
 from aqt.sound import SimpleProcessPlayer
 from aqt.taskman import TaskManager


 @dataclass
-class TTSArgs:
-    # requested language
+class TTSVoice:
+    name: str
     lang: str
-    # preferred voices, will use first available if possible
-    voices: List[str]

-    @classmethod
-    def from_string(cls, args: List[str]) -> TTSArgs:
-        voices: Optional[List[str]] = None

-        lang = args[0]
+@dataclass
+class TTSVoiceMatch:
+    voice: TTSVoice
+    rank: int

-        for arg in args[1:]:
-            try:
-                key, val = arg.split("=")
-            except ValueError:
-                continue
-            key = key.strip()
-            val = val.strip().replace("_", " ")
-
-            if key == "voices":
-                voices = val.split(",")
+class TTSPlayer:
+    default_rank = 0
+    _available_voices: Optional[List[TTSVoice]] = None

-        return TTSArgs(voices=voices or [], lang=lang)
+    def get_available_voices(self) -> List[TTSVoice]:
+        return []
+
+    def voices(self) -> List[TTSVoice]:
+        if self._available_voices is None:
+            self._available_voices = self.get_available_voices()
+        return self._available_voices
+
+    def voice_for_tag(self, tag: TTSTag) -> Optional[TTSVoiceMatch]:
+        avail_voices = self.voices()
+
+        rank = self.default_rank
+
+        # any requested voices match?
+        for requested_voice in tag.voices:
+            for avail in avail_voices:
+                if avail.name == requested_voice:
+                    return TTSVoiceMatch(voice=avail, rank=rank)
+
+            rank -= 1
+
+        # if no preferred voices match, we fall back on language
+        # with a rank of -100
+        for avail in avail_voices:
+            if avail.lang == tag.lang:
+                return TTSVoiceMatch(voice=avail, rank=-100)
+
+        return None
+
+
+class TTSProcessPlayer(SimpleProcessPlayer, TTSPlayer):
+    def rank_for_tag(self, tag: AVTag) -> Optional[int]:
+        if not isinstance(tag, TTSTag):
+            return None
+
+        match = self.voice_for_tag(tag)
+        if match:
+            return match.rank
+        else:
+            return None


 # Mac support
 ##########################################################################


-@dataclass
-class MacVoice:
-    name: str
-    lang: str
-
-
-VOICE_HELP_LINE_RE = re.compile(r"^(\S+)\s+(\S+)\s+.*$")
-
-
-def parse_voice_line(line: str) -> Optional[MacVoice]:
-    m = VOICE_HELP_LINE_RE.match(line)
-    if not m:
-        return None
-    return MacVoice(name=m.group(1), lang=m.group(2))
-
-
-class MacTTSPlayer(SimpleProcessPlayer):
-    def __init__(self, taskman: TaskManager):
-        super().__init__(taskman)
-        self._available_voices: Optional[List[MacVoice]] = None
+class MacTTSPlayer(TTSProcessPlayer):
+    VOICE_HELP_LINE_RE = re.compile(r"^(\S+)\s+(\S+)\s+.*$")

     def _play(self, tag: AVTag) -> None:
-        ttag = cast(TTSTag, tag)
-        voice = self.voice_for_tag(ttag)
+        assert isinstance(tag, TTSTag)
+        match = self.voice_for_tag(tag)
+        assert match
+        voice = match.voice

         self._process = subprocess.Popen(
             ["say", "-v", voice.name, "-f", "-"],
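To see the ranks above in action, a small sketch (DummyPlayer and its single voice are made up; the rank values follow voice_for_tag() as written):

    from typing import List

    from anki.sound import TTSTag
    from aqt.tts import TTSPlayer, TTSVoice


    class DummyPlayer(TTSPlayer):
        def get_available_voices(self) -> List[TTSVoice]:
            return [TTSVoice(name="Otoya", lang="ja_JP")]


    tag = TTSTag(field_text="...", lang="ja_JP", voices=["Kyoko", "Otoya"], other_args=[])
    match = DummyPlayer().voice_for_tag(tag)
    # "Kyoko" is not available, so the rank drops to -1 before "Otoya" matches
    assert match is not None and match.rank == -1

If no requested voice matches at all, the language fallback returns rank -100, and rank_for_tag() returns None when nothing matches, so AVPlayer can leave that player out of the ranking.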
@@ -90,45 +112,25 @@ class MacTTSPlayer(SimpleProcessPlayer):
             stderr=subprocess.DEVNULL,
         )
         # write the input text to stdin
-        self._process.stdin.write(ttag.text.encode("utf8"))
+        self._process.stdin.write(tag.field_text.encode("utf8"))
         self._process.stdin.close()

         self._wait_for_termination()

-    def rank_for_tag(self, tag: AVTag) -> Optional[int]:
-        if not isinstance(tag, TTSTag):
+    def get_available_voices(self) -> List[TTSVoice]:
+        cmd = subprocess.run(
+            ["say", "-v", "?"], capture_output=True, check=True, encoding="utf8"
+        )
+
+        voices = []
+        for line in cmd.stdout.splitlines():
+            voice = self._parse_voice_line(line)
+            if voice:
+                voices.append(voice)
+        return voices
+
+    def _parse_voice_line(self, line: str) -> Optional[TTSVoice]:
+        m = self.VOICE_HELP_LINE_RE.match(line)
+        if not m:
             return None
-
-        # todo
-        return 0
-
-    def voices(self) -> List[MacVoice]:
-        if not self._available_voices:
-            cmd = subprocess.run(
-                ["say", "-v", "?"], capture_output=True, check=True, encoding="utf8"
-            )
-            self._available_voices = []
-            for line in cmd.stdout.splitlines():
-                voice = parse_voice_line(line)
-                if voice:
-                    self._available_voices.append(voice)
-
-        return self._available_voices
-
-    def voice_for_tag(self, tag: TTSTag) -> MacVoice:
-        args = TTSArgs.from_string(tag.args)
-        voices = self.voices()
-
-        # any requested voices match?
-        for requested_voice in args.voices:
-            avail_voice = next((x for x in voices if x.name == requested_voice), None)
-            if avail_voice:
-                return avail_voice
-
-        # requested language match?
-        avail_voice = next((x for x in voices if x.lang == args.lang), None)
-        if avail_voice:
-            return avail_voice
-
-        # fall back on first voice
-        return voices[0]
+        return TTSVoice(name=m.group(1), lang=m.group(2))
@@ -188,10 +188,17 @@ impl Backend {
             AVTag::SoundOrVideo(file) => pt::AvTag {
                 value: Some(pt::av_tag::Value::SoundOrVideo(file.to_string())),
             },
-            AVTag::TextToSpeech { args, field_text } => pt::AvTag {
+            AVTag::TextToSpeech {
+                field_text,
+                lang,
+                voices,
+                other_args,
+            } => pt::AvTag {
                 value: Some(pt::av_tag::Value::Tts(pt::TtsTag {
-                    args: args.iter().map(|&s| s.to_string()).collect(),
-                    text: field_text.to_string(),
+                    field_text: field_text.to_string(),
+                    lang: lang.to_string(),
+                    voices: voices.into_iter().map(ToOwned::to_owned).collect(),
+                    other_args: other_args.into_iter().map(ToOwned::to_owned).collect(),
                 })),
             },
         })
@@ -12,8 +12,10 @@ use std::ptr;
 pub enum AVTag<'a> {
     SoundOrVideo(Cow<'a, str>),
     TextToSpeech {
-        args: Vec<&'a str>,
         field_text: Cow<'a, str>,
+        lang: &'a str,
+        voices: Vec<&'a str>,
+        other_args: Vec<&'a str>,
     },
 }

@@ -83,14 +85,36 @@ pub fn av_tags_in_string(text: &str) -> impl Iterator<Item = AVTag> {
         } else {
             let args = caps.get(2).unwrap();
             let field_text = caps.get(3).unwrap();
-            AVTag::TextToSpeech {
-                args: args.as_str().split(' ').collect(),
-                field_text: strip_html_for_tts(field_text.as_str()),
-            }
+            tts_tag_from_string(field_text.as_str(), args.as_str())
         }
     })
 }

+fn tts_tag_from_string<'a>(field_text: &'a str, args: &'a str) -> AVTag<'a> {
+    let mut other_args = vec![];
+    let mut split_args = args.split(' ');
+    let lang = split_args.next().unwrap_or("");
+    let mut voices = None;
+
+    for remaining_arg in split_args {
+        if remaining_arg.starts_with("voices=") {
+            voices = remaining_arg
+                .split('=')
+                .nth(1)
+                .map(|voices| voices.split(',').collect());
+        } else {
+            other_args.push(remaining_arg);
+        }
+    }
+
+    AVTag::TextToSpeech {
+        field_text: strip_html_for_tts(field_text),
+        lang,
+        voices: voices.unwrap_or_else(Vec::new),
+        other_args,
+    }
+}
+
 pub fn strip_html_preserving_image_filenames(html: &str) -> Cow<str> {
     let without_fnames = IMG_TAG.replace_all(html, r" $1 ");
     let without_html = HTML.replace_all(&without_fnames, "");
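For readers following along from the Python side, the same splitting rules as tts_tag_from_string() above, restated as a small illustrative helper (parse_tts_args is not part of this commit):

    def parse_tts_args(args: str) -> dict:
        # first space-separated token is the language; "voices=a,b" becomes a
        # list; any other token is passed through untouched
        lang, *rest = args.split(" ")
        voices: list = []
        other_args = []
        for arg in rest:
            if arg.startswith("voices="):
                voices = arg.split("=")[1].split(",")
            else:
                other_args.append(arg)
        return {"lang": lang, "voices": voices, "other_args": other_args}

    # matches the args string used in the test below
    assert parse_tts_args("en_US voices=Bob,Jane") == {
        "lang": "en_US",
        "voices": ["Bob", "Jane"],
        "other_args": [],
    }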
@@ -153,15 +177,18 @@ mod test {

     #[test]
     fn test_audio() {
-        let s = "abc[sound:fo&o.mp3]def[anki:tts][lang=en_US voices=Bob,Jane]foo<br>1>2[/anki:tts]gh";
+        let s =
+            "abc[sound:fo&o.mp3]def[anki:tts][en_US voices=Bob,Jane]foo<br>1>2[/anki:tts]gh";
         assert_eq!(strip_av_tags(s), "abcdefgh");
         assert_eq!(
             av_tags_in_string(s).collect::<Vec<_>>(),
             vec![
                 AVTag::SoundOrVideo("fo&o.mp3".into()),
                 AVTag::TextToSpeech {
-                    args: vec!["lang=en_US", "voices=Bob,Jane"],
-                    field_text: "foo 1>2".into()
+                    field_text: "foo 1>2".into(),
+                    lang: "en_US",
+                    voices: vec!["Bob", "Jane"],
+                    other_args: vec![]
                 },
             ]
         );