diff --git a/proto/backend.proto b/proto/backend.proto index 2522639eb..33010736c 100644 --- a/proto/backend.proto +++ b/proto/backend.proto @@ -168,5 +168,6 @@ message TTSTag { string field_text = 1; string lang = 2; repeated string voices = 3; - repeated string other_args = 4; + float speed = 4; + repeated string other_args = 5; } diff --git a/pylib/anki/rsbackend.py b/pylib/anki/rsbackend.py index 00738588b..65fd89494 100644 --- a/pylib/anki/rsbackend.py +++ b/pylib/anki/rsbackend.py @@ -56,6 +56,7 @@ def av_tag_to_native(tag: pb.AVTag) -> AVTag: lang=tag.tts.lang, voices=list(tag.tts.voices), other_args=list(tag.tts.other_args), + speed=tag.tts.speed, ) diff --git a/pylib/anki/sound.py b/pylib/anki/sound.py index ca05a6c21..a3bcfe99d 100644 --- a/pylib/anki/sound.py +++ b/pylib/anki/sound.py @@ -24,6 +24,7 @@ class TTSTag: field_text: str lang: str voices: List[str] + speed: float # each arg should be in the form 'foo=bar' other_args: List[str] diff --git a/qt/aqt/tts.py b/qt/aqt/tts.py index 62adee8a3..e06e67a81 100644 --- a/qt/aqt/tts.py +++ b/qt/aqt/tts.py @@ -126,8 +126,11 @@ class MacTTSPlayer(TTSProcessPlayer): voice = match.voice assert isinstance(voice, MacVoice) + default_wpm = 170 + words_per_min = str(int(default_wpm * tag.speed)) + self._process = subprocess.Popen( - ["say", "-v", voice.original_name, "-f", "-"], + ["say", "-v", voice.original_name, "-r", words_per_min, "-f", "-"], stdin=subprocess.PIPE, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, @@ -171,8 +174,21 @@ class MacTTSFilePlayer(MacTTSPlayer): voice = match.voice assert isinstance(voice, MacVoice) + default_wpm = 170 + words_per_min = str(int(default_wpm * tag.speed)) + self._process = subprocess.Popen( - ["say", "-v", voice.original_name, "-f", "-", "-o", self.tmppath], + [ + "say", + "-v", + voice.original_name, + "-r", + words_per_min, + "-f", + "-", + "-o", + self.tmppath, + ], stdin=subprocess.PIPE, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, @@ -441,6 +457,7 @@ 
if isWin: try: native_voice = voice.handle self.speaker.Voice = native_voice + self.speaker.Rate = self._rate_for_speed(tag.speed) self.speaker.Speak(tag.field_text, 1) gui_hooks.av_player_did_begin_playing(self, tag) @@ -454,7 +471,12 @@ if isWin: self._terminate_flag = False def _tidy_name(self, name: str) -> str: - "eg. Microsoft Haruka Desktop -> MS-Haruka." + "eg. Microsoft Haruka Desktop -> Microsoft_Haruka." return re.sub(r"^Microsoft (.+) Desktop$", "Microsoft_\\1", name).replace( " ", "_" ) + + def _rate_for_speed(self, speed: float) -> int: + "eg. 1.5 -> 5, 0.5 -> -5" + speed = (speed * 10) - 10 + return int(max(-10, min(10, speed))) diff --git a/rslib/src/backend.rs b/rslib/src/backend.rs index 2df96bd9f..a14ef4d14 100644 --- a/rslib/src/backend.rs +++ b/rslib/src/backend.rs @@ -195,12 +195,14 @@ impl Backend { lang, voices, other_args, + speed, } => pt::AvTag { value: Some(pt::av_tag::Value::Tts(pt::TtsTag { field_text, lang, voices, other_args, + speed, })), }, }) diff --git a/rslib/src/text.rs b/rslib/src/text.rs index 154638c25..f0a97a192 100644 --- a/rslib/src/text.rs +++ b/rslib/src/text.rs @@ -15,6 +15,7 @@ pub enum AVTag { field_text: String, lang: String, voices: Vec, + speed: f32, other_args: Vec, }, } @@ -102,9 +103,10 @@ pub fn extract_av_tags<'a>(text: &'a str, question_side: bool) -> (Cow<'a, str>, fn tts_tag_from_string<'a>(field_text: &'a str, args: &'a str) -> AVTag { let mut other_args = vec![]; - let mut split_args = args.split(' '); + let mut split_args = args.split_ascii_whitespace(); let lang = split_args.next().unwrap_or(""); let mut voices = None; + let mut speed = 1.0; for remaining_arg in split_args { if remaining_arg.starts_with("voices=") { @@ -112,6 +114,13 @@ fn tts_tag_from_string<'a>(field_text: &'a str, args: &'a str) -> AVTag { .split('=') .nth(1) .map(|voices| voices.split(',').map(ToOwned::to_owned).collect()); + } else if remaining_arg.starts_with("speed=") { + speed = remaining_arg + .split('=') + .nth(1) + 
.unwrap() + .parse() + .unwrap_or(1.0); } else { other_args.push(remaining_arg.to_owned()); } @@ -121,6 +130,7 @@ fn tts_tag_from_string<'a>(field_text: &'a str, args: &'a str) -> AVTag { field_text: strip_html_for_tts(field_text).into(), lang: lang.into(), voices: voices.unwrap_or_else(Vec::new), + speed, other_args, } } @@ -188,7 +198,7 @@ mod test { #[test] fn test_audio() { let s = - "abc[sound:fo&o.mp3]def[anki:tts][en_US voices=Bob,Jane]foo
1>2[/anki:tts]gh"; + "abc[sound:fo&o.mp3]def[anki:tts][en_US voices=Bob,Jane speed=1.2]foo
1>2[/anki:tts]gh"; assert_eq!(strip_av_tags(s), "abcdefgh"); let (text, tags) = extract_av_tags(s, true); @@ -202,7 +212,8 @@ mod test { field_text: "foo 1>2".into(), lang: "en_US".into(), voices: vec!["Bob".into(), "Jane".into()], - other_args: vec![] + other_args: vec![], + speed: 1.2 }, ] );