support speed control in tts tags

2025-12-24 04:12:57 -05:00 · 2020-01-26 14:28:17 +10:00 · 2020-01-26 14:28:17 +10:00 · 21cbb5a766
commit 21cbb5a766
parent 0480879c11
6 changed files with 45 additions and 7 deletions
--- a/proto/backend.proto
+++ b/proto/backend.proto
@ -168,5 +168,6 @@ message TTSTag {
    string field_text = 1;
    string lang = 2;
    repeated string voices = 3;
-    repeated string other_args = 4;
+    float speed = 4;
+    repeated string other_args = 5;
 }
--- a/pylib/anki/rsbackend.py
+++ b/pylib/anki/rsbackend.py
@ -56,6 +56,7 @@ def av_tag_to_native(tag: pb.AVTag) -> AVTag:
            lang=tag.tts.lang,
            voices=list(tag.tts.voices),
            other_args=list(tag.tts.other_args),
+            speed=tag.tts.speed,
        )


--- a/pylib/anki/sound.py
+++ b/pylib/anki/sound.py
@ -24,6 +24,7 @@ class TTSTag:
    field_text: str
    lang: str
    voices: List[str]
+    speed: float
    # each arg should be in the form 'foo=bar'
    other_args: List[str]

--- a/qt/aqt/tts.py
+++ b/qt/aqt/tts.py
@ -126,8 +126,11 @@ class MacTTSPlayer(TTSProcessPlayer):
        voice = match.voice
        assert isinstance(voice, MacVoice)

+        default_wpm = 170
+        words_per_min = str(int(default_wpm * tag.speed))
+
        self._process = subprocess.Popen(
-            ["say", "-v", voice.original_name, "-f", "-"],
+            ["say", "-v", voice.original_name, "-r", words_per_min, "-f", "-"],
            stdin=subprocess.PIPE,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
@ -171,8 +174,21 @@ class MacTTSFilePlayer(MacTTSPlayer):
        voice = match.voice
        assert isinstance(voice, MacVoice)

+        default_wpm = 170
+        words_per_min = str(int(default_wpm * tag.speed))
+
        self._process = subprocess.Popen(
-            ["say", "-v", voice.original_name, "-f", "-", "-o", self.tmppath],
+            [
+                "say",
+                "-v",
+                voice.original_name,
+                "-r",
+                words_per_min,
+                "-f",
+                "-",
+                "-o",
+                self.tmppath,
+            ],
            stdin=subprocess.PIPE,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
@ -441,6 +457,7 @@ if isWin:
            try:
                native_voice = voice.handle
                self.speaker.Voice = native_voice
+                self.speaker.Rate = self._rate_for_speed(tag.speed)
                self.speaker.Speak(tag.field_text, 1)
                gui_hooks.av_player_did_begin_playing(self, tag)

@ -454,7 +471,12 @@ if isWin:
                self._terminate_flag = False

        def _tidy_name(self, name: str) -> str:
-            "eg. Microsoft Haruka Desktop -> MS-Haruka."
+            "eg. Microsoft Haruka Desktop -> Microsoft_Haruka."
            return re.sub(r"^Microsoft (.+) Desktop$", "Microsoft_\\1", name).replace(
                " ", "_"
            )
+
+        def _rate_for_speed(self, speed: float) -> int:
+            "eg. 1.5 -> 5, 0.5 -> -5"
+            speed = (speed * 10) - 10
+            return int(max(-10, min(10, speed)))
--- a/rslib/src/backend.rs
+++ b/rslib/src/backend.rs
@ -195,12 +195,14 @@ impl Backend {
                    lang,
                    voices,
                    other_args,
+                    speed,
                } => pt::AvTag {
                    value: Some(pt::av_tag::Value::Tts(pt::TtsTag {
                        field_text,
                        lang,
                        voices,
                        other_args,
+                        speed,
                    })),
                },
            })
--- a/rslib/src/text.rs
+++ b/rslib/src/text.rs
@ -15,6 +15,7 @@ pub enum AVTag {
        field_text: String,
        lang: String,
        voices: Vec<String>,
+        speed: f32,
        other_args: Vec<String>,
    },
 }
@ -102,9 +103,10 @@ pub fn extract_av_tags<'a>(text: &'a str, question_side: bool) -> (Cow<'a, str>,

 fn tts_tag_from_string<'a>(field_text: &'a str, args: &'a str) -> AVTag {
    let mut other_args = vec![];
-    let mut split_args = args.split(' ');
+    let mut split_args = args.split_ascii_whitespace();
    let lang = split_args.next().unwrap_or("");
    let mut voices = None;
+    let mut speed = 1.0;

    for remaining_arg in split_args {
        if remaining_arg.starts_with("voices=") {
@ -112,6 +114,13 @@ fn tts_tag_from_string<'a>(field_text: &'a str, args: &'a str) -> AVTag {
                .split('=')
                .nth(1)
                .map(|voices| voices.split(',').map(ToOwned::to_owned).collect());
+        } else if remaining_arg.starts_with("speed=") {
+            speed = remaining_arg
+                .split('=')
+                .nth(1)
+                .unwrap()
+                .parse()
+                .unwrap_or(1.0);
        } else {
            other_args.push(remaining_arg.to_owned());
        }
@ -121,6 +130,7 @@ fn tts_tag_from_string<'a>(field_text: &'a str, args: &'a str) -> AVTag {
        field_text: strip_html_for_tts(field_text).into(),
        lang: lang.into(),
        voices: voices.unwrap_or_else(Vec::new),
+        speed,
        other_args,
    }
 }
@ -188,7 +198,7 @@ mod test {
    #[test]
    fn test_audio() {
        let s =
-            "abc[sound:fo&amp;o.mp3]def[anki:tts][en_US voices=Bob,Jane]foo<br>1&gt;2[/anki:tts]gh";
+            "abc[sound:fo&amp;o.mp3]def[anki:tts][en_US voices=Bob,Jane speed=1.2]foo<br>1&gt;2[/anki:tts]gh";
        assert_eq!(strip_av_tags(s), "abcdefgh");

        let (text, tags) = extract_av_tags(s, true);
@ -202,7 +212,8 @@ mod test {
                    field_text: "foo 1>2".into(),
                    lang: "en_US".into(),
                    voices: vec!["Bob".into(), "Jane".into()],
-                    other_args: vec![]
+                    other_args: vec![],
+                    speed: 1.2
                },
            ]
        );