diff --git a/proto/backend.proto b/proto/backend.proto
index 38e070ceb..1c8b0a1cb 100644
--- a/proto/backend.proto
+++ b/proto/backend.proto
@@ -16,6 +16,8 @@ message BackendInput {
         BrowserRowsIn browser_rows = 20;
         RenderCardIn render_card = 21;
         int64 local_minutes_west = 22;
+        string strip_av_tags = 23;
+        string get_av_tags = 24;
     }
 }
 
@@ -28,6 +30,8 @@ message BackendOutput {
         BrowserRowsOut browser_rows = 20;
         RenderCardOut render_card = 21;
         sint32 local_minutes_west = 22;
+        string strip_av_tags = 23;
+        GetAVTagsOut get_av_tags = 24;
 
         BackendError error = 2047;
     }
 
@@ -44,14 +48,6 @@ message InvalidInputError {
     string info = 1;
 }
 
-message PlusOneIn {
-    int32 num = 1;
-}
-
-message PlusOneOut {
-    int32 num = 1;
-}
-
 message TemplateParseError {
     string info = 1;
 }
@@ -150,3 +146,19 @@ message RenderedTemplateReplacement {
     string current_text = 2;
     repeated string filters = 3;
 }
+
+message GetAVTagsOut {
+    repeated AVTag av_tags = 1;
+}
+
+message AVTag {
+    oneof value {
+        string sound_or_video = 1;
+        TTSTag tts = 2;
+    }
+}
+
+message TTSTag {
+    repeated string args = 1;
+    string text = 2;
+}
diff --git a/pylib/anki/rsbackend.py b/pylib/anki/rsbackend.py
index 8cf741a3b..e342e1fc9 100644
--- a/pylib/anki/rsbackend.py
+++ b/pylib/anki/rsbackend.py
@@ -9,6 +9,7 @@ import ankirspy  # pytype: disable=import-error
 import anki.backend_pb2 as pb
 import anki.buildinfo
 from anki.models import AllTemplateReqs
+from anki.sound import AVTag, SoundOrVideoTag, TTSTag
 
 assert ankirspy.buildhash() == anki.buildinfo.buildhash
 
@@ -45,6 +46,14 @@ def proto_template_reqs_to_legacy(
     return legacy_reqs
 
 
+def av_tag_to_native(tag: pb.AVTag) -> AVTag:
+    val = tag.WhichOneof("value")
+    if val == "sound_or_video":
+        return SoundOrVideoTag(filename=tag.sound_or_video)
+    else:
+        return TTSTag(args=list(tag.tts.args), text=tag.tts.text)
+
+
 @dataclass
 class TemplateReplacement:
     field_name: str
@@ -143,3 +152,16 @@ class RustBackend:
         return self._run_command(
             pb.BackendInput(local_minutes_west=stamp)
         ).local_minutes_west
+
+    def strip_av_tags(self, text: str) -> str:
+        return self._run_command(pb.BackendInput(strip_av_tags=text)).strip_av_tags
+
+    def get_av_tags(self, text: str) -> List[AVTag]:
+        return list(
+            map(
+                av_tag_to_native,
+                self._run_command(
+                    pb.BackendInput(get_av_tags=text)
+                ).get_av_tags.av_tags,
+            )
+        )
diff --git a/pylib/anki/sound.py b/pylib/anki/sound.py
index 74de2e911..a5092333d 100644
--- a/pylib/anki/sound.py
+++ b/pylib/anki/sound.py
@@ -1,18 +1,61 @@
 # Copyright: Ankitects Pty Ltd and contributors
 # License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
 
-import re
-from typing import List
+"""
+Sound/TTS references extracted from card text.
 
-# Shared utils
+Use collection.backend.strip_av_tags(string) to remove all tags,
+and collection.backend.get_av_tags(string) to get a list of AVTags.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+from typing import List, Union
+
+
+@dataclass
+class TTSTag:
+    """Records information about a text to speech tag.
+
+    See tts.py for more information.
+    """
+
+    args: List[str]
+    text: str
+
+
+@dataclass
+class SoundOrVideoTag:
+    """Contains the filename inside a [sound:...] tag.
+
+    Video files also use [sound:...].
+    """
+
+    filename: str
+
+
+# note this does not include image tags, which are handled with HTML.
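+# AVTag is what collection.backend.get_av_tags() returns: each entry is either
+# a SoundOrVideoTag or a TTSTag, so callers check the concrete type with
+# isinstance() before acting on it.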
+AVTag = Union[SoundOrVideoTag, TTSTag]
+
+# Legacy utils
 ##########################################################################
+# these will be removed in the future
 
 _soundReg = r"\[sound:(.*?)\]"
 
 
 def allSounds(text) -> List:
-    return re.findall(_soundReg, text)
+    from aqt import mw
+
+    return [
+        x.filename
+        for x in mw.col.backend.get_av_tags(text)
+        if isinstance(x, SoundOrVideoTag)
+    ]
 
 
 def stripSounds(text) -> str:
-    return re.sub(_soundReg, "", text)
+    from aqt import mw
+
+    return mw.col.backend.strip_av_tags(text)
diff --git a/rslib/Cargo.toml b/rslib/Cargo.toml
index 04c7a766d..222b677d1 100644
--- a/rslib/Cargo.toml
+++ b/rslib/Cargo.toml
@@ -15,6 +15,7 @@ lazy_static = "1.4.0"
 regex = "1.3.3"
 hex = "0.4.0"
 blake3 = "0.1.0"
+htmlescape = "0.3.1"
 
 [build-dependencies]
 prost-build = "0.5.0"
diff --git a/rslib/src/backend.rs b/rslib/src/backend.rs
index 337e9c8ee..470ca4316 100644
--- a/rslib/src/backend.rs
+++ b/rslib/src/backend.rs
@@ -10,6 +10,7 @@ use crate::template::{
     render_card, without_legacy_template_directives, FieldMap, FieldRequirements, ParsedTemplate,
     RenderedNode,
 };
+use crate::text::{av_tags_in_string, strip_av_tags, AVTag};
 use prost::Message;
 use std::collections::{HashMap, HashSet};
 use std::path::PathBuf;
@@ -98,6 +99,8 @@ impl Backend {
             Value::LocalMinutesWest(stamp) => {
                 OValue::LocalMinutesWest(local_minutes_west_for_stamp(stamp))
             }
+            Value::StripAvTags(text) => OValue::StripAvTags(strip_av_tags(&text).into()),
+            Value::GetAvTags(text) => OValue::GetAvTags(self.get_av_tags(&text)),
         })
     }
 
@@ -178,6 +181,24 @@ impl Backend {
             answer_nodes: rendered_nodes_to_proto(anodes),
         })
     }
+
+    fn get_av_tags(&self, text: &str) -> pt::GetAvTagsOut {
+        let tags = av_tags_in_string(text)
+            .map(|avtag| match avtag {
+                AVTag::SoundOrVideo(file) => pt::AvTag {
+                    value: Some(pt::av_tag::Value::SoundOrVideo(file.to_string())),
+                },
+                AVTag::TextToSpeech { args, field_text } => pt::AvTag {
+                    value: Some(pt::av_tag::Value::Tts(pt::TtsTag {
+                        args: args.iter().map(|&s| s.to_string()).collect(),
+                        text: field_text.to_string(),
+                    })),
+                },
+            })
+            .collect();
+
+        pt::GetAvTagsOut { av_tags: tags }
+    }
 }
 
 fn ords_hash_to_set(ords: HashSet<u16>) -> Vec<u32> {
diff --git a/rslib/src/template.rs b/rslib/src/template.rs
index 0858c29a1..d58b7d733 100644
--- a/rslib/src/template.rs
+++ b/rslib/src/template.rs
@@ -3,7 +3,7 @@
 
 use crate::err::{Result, TemplateError};
 use crate::template_filters::apply_filters;
-use crate::text::strip_sounds;
+use crate::text::strip_av_tags;
 use lazy_static::lazy_static;
 use nom;
 use nom::branch::alt;
@@ -443,7 +443,7 @@ pub fn render_card(
     // if the question side didn't have any unknown filters, we can pass
     // FrontSide in now
     if let [RenderedNode::Text { ref text }] = *qnodes.as_slice() {
-        context.front_text = Some(strip_sounds(text));
+        context.front_text = Some(strip_av_tags(text));
     }
 
     // answer side
diff --git a/rslib/src/template_filters.rs b/rslib/src/template_filters.rs
index 5d7207030..e224d1ce3 100644
--- a/rslib/src/template_filters.rs
+++ b/rslib/src/template_filters.rs
@@ -75,8 +75,12 @@ fn apply_filter<'a>(
         // an empty filter name (caused by using two colons) is ignored
         "" => text.into(),
         _ => {
-            // unrecognized filter
-            return (false, None);
+            if filter_name.starts_with("tts ") {
+                tts_filter(filter_name, text)
+            } else {
+                // unrecognized filter
+                return (false, None);
+            }
         }
     };
 
@@ -285,6 +289,11 @@ return false;">
     .into()
 }
 
+fn tts_filter(filter_name: &str, text: &str) -> Cow<'static, str> {
+    let args = filter_name.splitn(2, ' ').nth(1).unwrap_or("");
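+    // splitn(2, ' ') drops the leading "tts" token and keeps the rest of the
+    // filter name as a single argument string, e.g. "tts lang=en_US voices=Bob,Jane"
+    // yields "lang=en_US voices=Bob,Jane"; the AV tag extractor splits it further.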
+
+    format!("[anki:tts][{}]{}[/anki:tts]", args, text).into()
+}
+
 // Tests
 //----------------------------------------
 
@@ -293,7 +302,7 @@ mod test {
     use crate::template::RenderContext;
     use crate::template_filters::{
         apply_filters, cloze_filter, furigana_filter, hint_filter, kana_filter, kanji_filter,
-        type_cloze_filter, type_filter,
+        tts_filter, type_cloze_filter, type_filter,
     };
     use crate::text::strip_html;
 
@@ -368,4 +377,12 @@ field
         ctx.card_ord = 2;
         assert_eq!(cloze_filter(text, &ctx).as_ref(), "");
     }
+
+    #[test]
+    fn test_tts() {
+        assert_eq!(
+            tts_filter("tts lang=en_US", "foo"),
+            "[anki:tts][lang=en_US]foo[/anki:tts]"
+        );
+    }
 }
diff --git a/rslib/src/text.rs b/rslib/src/text.rs
index ef81be90c..f07fe1f48 100644
--- a/rslib/src/text.rs
+++ b/rslib/src/text.rs
@@ -1,12 +1,22 @@
 // Copyright: Ankitects Pty Ltd and contributors
 // License: GNU AGPL, version 3 or later; http://www.gnu.org/licenses/agpl.html
 
+use htmlescape;
 use lazy_static::lazy_static;
 use regex::Regex;
 use std::borrow::Cow;
 use std::collections::HashSet;
 use std::ptr;
 
+#[derive(Debug, PartialEq)]
+pub enum AVTag<'a> {
+    SoundOrVideo(Cow<'a, str>),
+    TextToSpeech {
+        args: Vec<&'a str>,
+        field_text: Cow<'a, str>,
+    },
+}
+
 lazy_static! {
     static ref HTML: Regex = Regex::new(concat!(
         "(?si)",
@@ -22,9 +32,16 @@ lazy_static! {
         r#"(?i)<img[^>]+src=["']?([^"'>]+)["']?[^>]*>"#
     ).unwrap();
 
-    static ref SOUND_TAG: Regex = Regex::new(
-        r"\[sound:(.*?)\]"
-    ).unwrap();
+    // videos are also in sound tags
+    static ref AV_TAGS: Regex = Regex::new(
+        r#"(?xs)
+            \[sound:(.*?)\]     # 1 - the filename in a sound tag
+            |
+            \[anki:tts\]
+                \[(.*?)\]       # 2 - arguments to tts call
+                (.*?)           # 3 - field text
+            \[/anki:tts\]
+            "#).unwrap();
 
     static ref CLOZED_TEXT: Regex = Regex::new(
         r"(?s)\{\{c(\d+)::.+?\}\}"
     ).unwrap();
@@ -35,8 +52,43 @@ pub fn strip_html(html: &str) -> Cow<str> {
     HTML.replace_all(html, "")
 }
 
-pub fn strip_sounds(html: &str) -> Cow<str> {
-    SOUND_TAG.replace_all(html, "")
+pub fn decode_entities(html: &str) -> Cow<str> {
+    if html.contains('&') {
+        match htmlescape::decode_html(html) {
+            Ok(text) => text,
+            Err(e) => format!("{:?}", e),
+        }
+        .into()
+    } else {
+        // nothing to do
+        html.into()
+    }
+}
+
+pub fn strip_html_for_tts(html: &str) -> Cow<str> {
+    match HTML.replace_all(html, " ") {
+        Cow::Borrowed(_) => decode_entities(html),
+        Cow::Owned(s) => decode_entities(&s).to_string().into(),
+    }
+}
+
+pub fn strip_av_tags(text: &str) -> Cow<str> {
+    AV_TAGS.replace_all(text, "")
+}
+
+pub fn av_tags_in_string(text: &str) -> impl Iterator<Item = AVTag> {
+    AV_TAGS.captures_iter(text).map(|caps| {
+        if let Some(av_file) = caps.get(1) {
+            AVTag::SoundOrVideo(decode_entities(av_file.as_str()))
+        } else {
+            let args = caps.get(2).unwrap();
+            let field_text = caps.get(3).unwrap();
+            AVTag::TextToSpeech {
+                args: args.as_str().split(' ').collect(),
+                field_text: strip_html_for_tts(field_text.as_str()),
+            }
+        }
+    })
 }
 
 pub fn strip_html_preserving_image_filenames(html: &str) -> Cow<str> {
@@ -64,7 +116,10 @@ pub fn cloze_numbers_in_string(html: &str) -> HashSet<u16> {
 
 #[cfg(test)]
 mod test {
-    use crate::text::{cloze_numbers_in_string, strip_html, strip_html_preserving_image_filenames};
+    use crate::text::{
+        av_tags_in_string, cloze_numbers_in_string, strip_av_tags, strip_html,
+        strip_html_preserving_image_filenames, AVTag,
+    };
     use std::collections::HashSet;
 
     #[test]
@@ -95,4 +150,20 @@ mod test {
             vec![1, 2].into_iter().collect::<HashSet<u16>>()
         );
     }
+
+    #[test]
+    fn test_audio() {
+        let s = "abc[sound:fo&amp;o.mp3]def[anki:tts][lang=en_US voices=Bob,Jane]foo<br>1&gt;2[/anki:tts]gh";
+        assert_eq!(strip_av_tags(s), "abcdefgh");
+        assert_eq!(
+            av_tags_in_string(s).collect::<Vec<_>>(),
+            vec![
+                AVTag::SoundOrVideo("fo&o.mp3".into()),
+                AVTag::TextToSpeech {
+                    args: vec!["lang=en_US", "voices=Bob,Jane"],
+                    field_text: "foo 1>2".into()
+                },
+            ]
+        );
+    }
 }
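
For reviewers, a rough sketch of how the new calls are meant to be driven from the
Python side once this lands. The card text below is made up, and mw.col.backend
assumes the usual GUI context where a collection is already open; the example itself
is not part of the change.

    from aqt import mw
    from anki.sound import SoundOrVideoTag, TTSTag

    # Hypothetical rendered card text: a filter such as {{tts lang=en_US:Front}}
    # has already been expanded by the Rust template code into an [anki:tts] marker.
    text = "Hi[sound:hi.mp3][anki:tts][lang=en_US]Hi there[/anki:tts]"

    # strip_av_tags() removes both [sound:...] and [anki:tts]...[/anki:tts]
    # markers, leaving only the text that should be displayed.
    assert mw.col.backend.strip_av_tags(text) == "Hi"

    # get_av_tags() returns the anki.sound dataclasses in the order the tags
    # appear in the text, so the caller can queue playback accordingly.
    for tag in mw.col.backend.get_av_tags(text):
        if isinstance(tag, SoundOrVideoTag):
            print("play file:", tag.filename)
        elif isinstance(tag, TTSTag):
            print("speak:", tag.text, "args:", tag.args)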