mirror of
https://github.com/ankitects/anki.git
synced 2025-12-20 10:22:57 -05:00
add rank handling to TTS; parse TTS args in get_av_tags()
This commit is contained in:
parent
66e277e44b
commit
c713683f63
7 changed files with 145 additions and 100 deletions
|
|
@ -159,6 +159,8 @@ message AVTag {
|
||||||
}
|
}
|
||||||
|
|
||||||
message TTSTag {
|
message TTSTag {
|
||||||
repeated string args = 1;
|
string field_text = 1;
|
||||||
string text = 2;
|
string lang = 2;
|
||||||
|
repeated string voices = 3;
|
||||||
|
repeated string other_args = 4;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -51,7 +51,12 @@ def av_tag_to_native(tag: pb.AVTag) -> AVTag:
|
||||||
if val == "sound_or_video":
|
if val == "sound_or_video":
|
||||||
return SoundOrVideoTag(filename=tag.sound_or_video)
|
return SoundOrVideoTag(filename=tag.sound_or_video)
|
||||||
else:
|
else:
|
||||||
return TTSTag(args=list(tag.tts.args), text=tag.tts.text)
|
return TTSTag(
|
||||||
|
field_text=tag.tts.field_text,
|
||||||
|
lang=tag.tts.lang,
|
||||||
|
voices=list(tag.tts.voices),
|
||||||
|
other_args=list(tag.tts.other_args),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
|
|
||||||
|
|
@ -21,8 +21,11 @@ class TTSTag:
|
||||||
See tts.py for more information.
|
See tts.py for more information.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
args: List[str]
|
field_text: str
|
||||||
text: str
|
lang: str
|
||||||
|
voices: List[str]
|
||||||
|
# each arg should be in the form 'foo=bar'
|
||||||
|
other_args: List[str]
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
|
|
||||||
|
|
@ -166,7 +166,6 @@ class AVPlayer:
|
||||||
|
|
||||||
ranked.sort()
|
ranked.sort()
|
||||||
|
|
||||||
print(ranked)
|
|
||||||
if ranked:
|
if ranked:
|
||||||
return ranked[-1][1]
|
return ranked[-1][1]
|
||||||
else:
|
else:
|
||||||
|
|
|
||||||
158
qt/aqt/tts.py
158
qt/aqt/tts.py
|
|
@ -9,10 +9,18 @@ or
|
||||||
|
|
||||||
{{tts ja_JP voices=Kyoko,Otoya,Another_name:Field}}
|
{{tts ja_JP voices=Kyoko,Otoya,Another_name:Field}}
|
||||||
|
|
||||||
The first argument must be a language code. If provided,
|
The first argument must be a language code.
|
||||||
voices is a comma-separated list of one or more voices that
|
|
||||||
the user would prefer. Spaces must not be included.
|
If provided, voices is a comma-separated list of one or more voices that
|
||||||
Underscores will be converted to spaces.
|
the user would prefer. Spaces must not be included. Underscores will be
|
||||||
|
converted to spaces.
|
||||||
|
|
||||||
|
AVPlayer decides which TTSPlayer to use based on the returned rank.
|
||||||
|
In the default implementation, the TTS player is chosen based on the order
|
||||||
|
of voices the user has specified. When adding new TTS players, your code
|
||||||
|
can either expose the underlying names the TTS engine provides, or simply
|
||||||
|
expose the name of the engine, which would mean the user could write
|
||||||
|
{{tts en_AU voices=MyEngine}} to prioritize your engine.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
@ -20,68 +28,82 @@ from __future__ import annotations
|
||||||
import re
|
import re
|
||||||
import subprocess
|
import subprocess
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import List, Optional, cast
|
from typing import List, Optional
|
||||||
|
|
||||||
from anki.sound import AVTag, TTSTag
|
from anki.sound import AVTag, TTSTag
|
||||||
from aqt.sound import SimpleProcessPlayer
|
from aqt.sound import SimpleProcessPlayer
|
||||||
from aqt.taskman import TaskManager
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class TTSArgs:
|
class TTSVoice:
|
||||||
# requested language
|
name: str
|
||||||
lang: str
|
lang: str
|
||||||
# preferred voices, will use first available if possible
|
|
||||||
voices: List[str]
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_string(cls, args: List[str]) -> TTSArgs:
|
|
||||||
voices: Optional[List[str]] = None
|
|
||||||
|
|
||||||
lang = args[0]
|
@dataclass
|
||||||
|
class TTSVoiceMatch:
|
||||||
|
voice: TTSVoice
|
||||||
|
rank: int
|
||||||
|
|
||||||
for arg in args[1:]:
|
|
||||||
try:
|
|
||||||
key, val = arg.split("=")
|
|
||||||
except ValueError:
|
|
||||||
continue
|
|
||||||
key = key.strip()
|
|
||||||
val = val.strip().replace("_", " ")
|
|
||||||
|
|
||||||
if key == "voices":
|
class TTSPlayer:
|
||||||
voices = val.split(",")
|
default_rank = 0
|
||||||
|
_available_voices: Optional[List[TTSVoice]] = None
|
||||||
|
|
||||||
return TTSArgs(voices=voices or [], lang=lang)
|
def get_available_voices(self) -> List[TTSVoice]:
|
||||||
|
return []
|
||||||
|
|
||||||
|
def voices(self) -> List[TTSVoice]:
|
||||||
|
if self._available_voices is None:
|
||||||
|
self._available_voices = self.get_available_voices()
|
||||||
|
return self._available_voices
|
||||||
|
|
||||||
|
def voice_for_tag(self, tag: TTSTag) -> Optional[TTSVoiceMatch]:
|
||||||
|
avail_voices = self.voices()
|
||||||
|
|
||||||
|
rank = self.default_rank
|
||||||
|
|
||||||
|
# any requested voices match?
|
||||||
|
for requested_voice in tag.voices:
|
||||||
|
for avail in avail_voices:
|
||||||
|
if avail.name == requested_voice:
|
||||||
|
return TTSVoiceMatch(voice=avail, rank=rank)
|
||||||
|
|
||||||
|
rank -= 1
|
||||||
|
|
||||||
|
# if no preferred voices match, we fall back on language
|
||||||
|
# with a rank of -100
|
||||||
|
for avail in avail_voices:
|
||||||
|
if avail.lang == tag.lang:
|
||||||
|
return TTSVoiceMatch(voice=avail, rank=-100)
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
class TTSProcessPlayer(SimpleProcessPlayer, TTSPlayer):
|
||||||
|
def rank_for_tag(self, tag: AVTag) -> Optional[int]:
|
||||||
|
if not isinstance(tag, TTSTag):
|
||||||
|
return None
|
||||||
|
|
||||||
|
match = self.voice_for_tag(tag)
|
||||||
|
if match:
|
||||||
|
return match.rank
|
||||||
|
else:
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
# Mac support
|
# Mac support
|
||||||
##########################################################################
|
##########################################################################
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
class MacTTSPlayer(TTSProcessPlayer):
|
||||||
class MacVoice:
|
VOICE_HELP_LINE_RE = re.compile(r"^(\S+)\s+(\S+)\s+.*$")
|
||||||
name: str
|
|
||||||
lang: str
|
|
||||||
|
|
||||||
|
|
||||||
VOICE_HELP_LINE_RE = re.compile(r"^(\S+)\s+(\S+)\s+.*$")
|
|
||||||
|
|
||||||
|
|
||||||
def parse_voice_line(line: str) -> Optional[MacVoice]:
|
|
||||||
m = VOICE_HELP_LINE_RE.match(line)
|
|
||||||
if not m:
|
|
||||||
return None
|
|
||||||
return MacVoice(name=m.group(1), lang=m.group(2))
|
|
||||||
|
|
||||||
|
|
||||||
class MacTTSPlayer(SimpleProcessPlayer):
|
|
||||||
def __init__(self, taskman: TaskManager):
|
|
||||||
super().__init__(taskman)
|
|
||||||
self._available_voices: Optional[List[MacVoice]] = None
|
|
||||||
|
|
||||||
def _play(self, tag: AVTag) -> None:
|
def _play(self, tag: AVTag) -> None:
|
||||||
ttag = cast(TTSTag, tag)
|
assert isinstance(tag, TTSTag)
|
||||||
voice = self.voice_for_tag(ttag)
|
match = self.voice_for_tag(tag)
|
||||||
|
assert match
|
||||||
|
voice = match.voice
|
||||||
|
|
||||||
self._process = subprocess.Popen(
|
self._process = subprocess.Popen(
|
||||||
["say", "-v", voice.name, "-f", "-"],
|
["say", "-v", voice.name, "-f", "-"],
|
||||||
|
|
@ -90,45 +112,25 @@ class MacTTSPlayer(SimpleProcessPlayer):
|
||||||
stderr=subprocess.DEVNULL,
|
stderr=subprocess.DEVNULL,
|
||||||
)
|
)
|
||||||
# write the input text to stdin
|
# write the input text to stdin
|
||||||
self._process.stdin.write(ttag.text.encode("utf8"))
|
self._process.stdin.write(tag.field_text.encode("utf8"))
|
||||||
self._process.stdin.close()
|
self._process.stdin.close()
|
||||||
|
|
||||||
self._wait_for_termination()
|
self._wait_for_termination()
|
||||||
|
|
||||||
def rank_for_tag(self, tag: AVTag) -> Optional[int]:
|
def get_available_voices(self) -> List[TTSVoice]:
|
||||||
if not isinstance(tag, TTSTag):
|
|
||||||
return None
|
|
||||||
|
|
||||||
# todo
|
|
||||||
return 0
|
|
||||||
|
|
||||||
def voices(self) -> List[MacVoice]:
|
|
||||||
if not self._available_voices:
|
|
||||||
cmd = subprocess.run(
|
cmd = subprocess.run(
|
||||||
["say", "-v", "?"], capture_output=True, check=True, encoding="utf8"
|
["say", "-v", "?"], capture_output=True, check=True, encoding="utf8"
|
||||||
)
|
)
|
||||||
self._available_voices = []
|
|
||||||
|
voices = []
|
||||||
for line in cmd.stdout.splitlines():
|
for line in cmd.stdout.splitlines():
|
||||||
voice = parse_voice_line(line)
|
voice = self._parse_voice_line(line)
|
||||||
if voice:
|
if voice:
|
||||||
self._available_voices.append(voice)
|
voices.append(voice)
|
||||||
|
return voices
|
||||||
|
|
||||||
return self._available_voices
|
def _parse_voice_line(self, line: str) -> Optional[TTSVoice]:
|
||||||
|
m = self.VOICE_HELP_LINE_RE.match(line)
|
||||||
def voice_for_tag(self, tag: TTSTag) -> MacVoice:
|
if not m:
|
||||||
args = TTSArgs.from_string(tag.args)
|
return None
|
||||||
voices = self.voices()
|
return TTSVoice(name=m.group(1), lang=m.group(2))
|
||||||
|
|
||||||
# any requested voices match?
|
|
||||||
for requested_voice in args.voices:
|
|
||||||
avail_voice = next((x for x in voices if x.name == requested_voice), None)
|
|
||||||
if avail_voice:
|
|
||||||
return avail_voice
|
|
||||||
|
|
||||||
# requested language match?
|
|
||||||
avail_voice = next((x for x in voices if x.lang == args.lang), None)
|
|
||||||
if avail_voice:
|
|
||||||
return avail_voice
|
|
||||||
|
|
||||||
# fall back on first voice
|
|
||||||
return voices[0]
|
|
||||||
|
|
|
||||||
|
|
@ -188,10 +188,17 @@ impl Backend {
|
||||||
AVTag::SoundOrVideo(file) => pt::AvTag {
|
AVTag::SoundOrVideo(file) => pt::AvTag {
|
||||||
value: Some(pt::av_tag::Value::SoundOrVideo(file.to_string())),
|
value: Some(pt::av_tag::Value::SoundOrVideo(file.to_string())),
|
||||||
},
|
},
|
||||||
AVTag::TextToSpeech { args, field_text } => pt::AvTag {
|
AVTag::TextToSpeech {
|
||||||
|
field_text,
|
||||||
|
lang,
|
||||||
|
voices,
|
||||||
|
other_args,
|
||||||
|
} => pt::AvTag {
|
||||||
value: Some(pt::av_tag::Value::Tts(pt::TtsTag {
|
value: Some(pt::av_tag::Value::Tts(pt::TtsTag {
|
||||||
args: args.iter().map(|&s| s.to_string()).collect(),
|
field_text: field_text.to_string(),
|
||||||
text: field_text.to_string(),
|
lang: lang.to_string(),
|
||||||
|
voices: voices.into_iter().map(ToOwned::to_owned).collect(),
|
||||||
|
other_args: other_args.into_iter().map(ToOwned::to_owned).collect(),
|
||||||
})),
|
})),
|
||||||
},
|
},
|
||||||
})
|
})
|
||||||
|
|
|
||||||
|
|
@ -12,8 +12,10 @@ use std::ptr;
|
||||||
pub enum AVTag<'a> {
|
pub enum AVTag<'a> {
|
||||||
SoundOrVideo(Cow<'a, str>),
|
SoundOrVideo(Cow<'a, str>),
|
||||||
TextToSpeech {
|
TextToSpeech {
|
||||||
args: Vec<&'a str>,
|
|
||||||
field_text: Cow<'a, str>,
|
field_text: Cow<'a, str>,
|
||||||
|
lang: &'a str,
|
||||||
|
voices: Vec<&'a str>,
|
||||||
|
other_args: Vec<&'a str>,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -83,14 +85,36 @@ pub fn av_tags_in_string(text: &str) -> impl Iterator<Item = AVTag> {
|
||||||
} else {
|
} else {
|
||||||
let args = caps.get(2).unwrap();
|
let args = caps.get(2).unwrap();
|
||||||
let field_text = caps.get(3).unwrap();
|
let field_text = caps.get(3).unwrap();
|
||||||
AVTag::TextToSpeech {
|
tts_tag_from_string(field_text.as_str(), args.as_str())
|
||||||
args: args.as_str().split(' ').collect(),
|
|
||||||
field_text: strip_html_for_tts(field_text.as_str()),
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn tts_tag_from_string<'a>(field_text: &'a str, args: &'a str) -> AVTag<'a> {
|
||||||
|
let mut other_args = vec![];
|
||||||
|
let mut split_args = args.split(' ');
|
||||||
|
let lang = split_args.next().unwrap_or("");
|
||||||
|
let mut voices = None;
|
||||||
|
|
||||||
|
for remaining_arg in split_args {
|
||||||
|
if remaining_arg.starts_with("voices=") {
|
||||||
|
voices = remaining_arg
|
||||||
|
.split('=')
|
||||||
|
.nth(1)
|
||||||
|
.map(|voices| voices.split(',').collect());
|
||||||
|
} else {
|
||||||
|
other_args.push(remaining_arg);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
AVTag::TextToSpeech {
|
||||||
|
field_text: strip_html_for_tts(field_text),
|
||||||
|
lang,
|
||||||
|
voices: voices.unwrap_or_else(Vec::new),
|
||||||
|
other_args,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub fn strip_html_preserving_image_filenames(html: &str) -> Cow<str> {
|
pub fn strip_html_preserving_image_filenames(html: &str) -> Cow<str> {
|
||||||
let without_fnames = IMG_TAG.replace_all(html, r" $1 ");
|
let without_fnames = IMG_TAG.replace_all(html, r" $1 ");
|
||||||
let without_html = HTML.replace_all(&without_fnames, "");
|
let without_html = HTML.replace_all(&without_fnames, "");
|
||||||
|
|
@ -153,15 +177,18 @@ mod test {
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_audio() {
|
fn test_audio() {
|
||||||
let s = "abc[sound:fo&o.mp3]def[anki:tts][lang=en_US voices=Bob,Jane]foo<br>1>2[/anki:tts]gh";
|
let s =
|
||||||
|
"abc[sound:fo&o.mp3]def[anki:tts][en_US voices=Bob,Jane]foo<br>1>2[/anki:tts]gh";
|
||||||
assert_eq!(strip_av_tags(s), "abcdefgh");
|
assert_eq!(strip_av_tags(s), "abcdefgh");
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
av_tags_in_string(s).collect::<Vec<_>>(),
|
av_tags_in_string(s).collect::<Vec<_>>(),
|
||||||
vec![
|
vec![
|
||||||
AVTag::SoundOrVideo("fo&o.mp3".into()),
|
AVTag::SoundOrVideo("fo&o.mp3".into()),
|
||||||
AVTag::TextToSpeech {
|
AVTag::TextToSpeech {
|
||||||
args: vec!["lang=en_US", "voices=Bob,Jane"],
|
field_text: "foo 1>2".into(),
|
||||||
field_text: "foo 1>2".into()
|
lang: "en_US",
|
||||||
|
voices: vec!["Bob", "Jane"],
|
||||||
|
other_args: vec![]
|
||||||
},
|
},
|
||||||
]
|
]
|
||||||
);
|
);
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue