Skip to content

Commit d25c31d

Browse files
committed
Escape XML reserved characters when writing JATS-formatted text to database
1 parent 28c2740 commit d25c31d

3 files changed

Lines changed: 197 additions & 71 deletions

File tree

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
55
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
66

77
## [Unreleased]
8+
### Fixed
9+
- [751](https://github.com/thoth-pub/thoth/pull/751) - Escape XML reserved characters when writing JATS-formatted text to database
810

911
## [[1.3.1]](https://github.com/thoth-pub/thoth/releases/tag/v1.3.1) - 2026-05-06
1012
### Security

thoth-api/src/markup/ast.rs

Lines changed: 88 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,19 @@ fn inline_text_to_plain_text(nodes: &[Node]) -> String {
3030
nodes.iter().map(ast_to_plain_text).collect()
3131
}
3232

33+
/// Escapes the characters that are reserved in XML text content.
///
/// `&` becomes `&amp;`, `<` becomes `&lt;` and `>` becomes `&gt;`; every other
/// character is copied through unchanged.
fn escape_xml_text(input: &str) -> String {
    // A single pass means the `&` that starts an emitted entity can never be
    // escaped a second time, regardless of the order of the match arms.
    let mut escaped = String::with_capacity(input.len());
    for ch in input.chars() {
        match ch {
            '&' => escaped.push_str("&amp;"),
            '<' => escaped.push_str("&lt;"),
            '>' => escaped.push_str("&gt;"),
            other => escaped.push(other),
        }
    }
    escaped
}
39+
40+
fn escape_xml_attr(input: &str) -> String {
41+
escape_xml_text(input)
42+
.replace('"', "&quot;")
43+
.replace('\'', "&apos;")
44+
}
45+
3346
fn looks_like_email(text: &str) -> bool {
3447
regex::Regex::new(r"^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$")
3548
.unwrap()
@@ -680,16 +693,16 @@ pub fn plain_text_to_ast(text: &str) -> Node {
680693
pub fn plain_text_ast_to_jats(node: &Node) -> String {
681694
fn render_plain_text_inline(node: &Node) -> String {
682695
match node {
683-
Node::Text(text) => text.clone(),
696+
Node::Text(text) => escape_xml_text(text),
684697
Node::Break => "<break/>".to_string(),
685698
Node::InlineFormula(tex) => {
686699
format!(
687700
"<inline-formula><tex-math>{}</tex-math></inline-formula>",
688-
tex
701+
escape_xml_text(tex)
689702
)
690703
}
691-
Node::Email(email) => format!("<email>{}</email>", email),
692-
Node::Uri(uri) => format!("<uri>{}</uri>", uri),
704+
Node::Email(email) => format!("<email>{}</email>", escape_xml_text(email)),
705+
Node::Uri(uri) => format!("<uri>{}</uri>", escape_xml_text(uri)),
693706
other => ast_to_jats(other),
694707
}
695708
}
@@ -712,16 +725,16 @@ pub fn plain_text_ast_to_jats(node: &Node) -> String {
712725
let inner: String = children.iter().map(render_plain_text_inline).collect();
713726
format!("<p>{}</p>", inner)
714727
}
715-
Node::Text(text) => format!("<p>{}</p>", text),
728+
Node::Text(text) => format!("<p>{}</p>", escape_xml_text(text)),
716729
Node::Break => "<p><break/></p>".to_string(),
717730
Node::InlineFormula(tex) => {
718731
format!(
719732
"<p><inline-formula><tex-math>{}</tex-math></inline-formula></p>",
720-
tex
733+
escape_xml_text(tex)
721734
)
722735
}
723-
Node::Email(email) => format!("<p><email>{}</email></p>", email),
724-
Node::Uri(uri) => format!("<p><uri>{}</uri></p>", uri),
736+
Node::Email(email) => format!("<p><email>{}</email></p>", escape_xml_text(email)),
737+
Node::Uri(uri) => format!("<p><uri>{}</uri></p>", escape_xml_text(uri)),
725738
_ => {
726739
// For other nodes, use regular ast_to_jats
727740
ast_to_jats(node)
@@ -780,17 +793,21 @@ pub fn ast_to_jats(node: &Node) -> String {
780793
}
781794
Node::Link { url, text } => {
782795
let inner: String = text.iter().map(ast_to_jats).collect();
783-
format!(r#"<ext-link xlink:href="{}">{}</ext-link>"#, url, inner)
796+
format!(
797+
r#"<ext-link xlink:href="{}">{}</ext-link>"#,
798+
escape_xml_attr(url),
799+
inner
800+
)
784801
}
785802
Node::InlineFormula(tex) => {
786803
format!(
787804
"<inline-formula><tex-math>{}</tex-math></inline-formula>",
788-
tex
805+
escape_xml_text(tex)
789806
)
790807
}
791-
Node::Email(email) => format!("<email>{}</email>", email),
792-
Node::Uri(uri) => format!("<uri>{}</uri>", uri),
793-
Node::Text(text) => text.clone(),
808+
Node::Email(email) => format!("<email>{}</email>", escape_xml_text(email)),
809+
Node::Uri(uri) => format!("<uri>{}</uri>", escape_xml_text(uri)),
810+
Node::Text(text) => escape_xml_text(text),
794811
}
795812
}
796813

@@ -1997,6 +2014,64 @@ mod tests {
19972014
);
19982015
}
19992016

2017+
// Reserved characters inside text, formula, email and URI nodes must come out
// entity-escaped in the rendered paragraph.
#[test]
fn test_ast_to_jats_escapes_reserved_xml_chars() {
    let children = vec![
        Node::Text("x < y & z > w ".to_string()),
        Node::InlineFormula("a < b & c".to_string()),
        Node::Text(" ".to_string()),
        Node::Email("user@example.org".to_string()),
        Node::Text(" ".to_string()),
        Node::Uri("https://example.org?a=1&b=2".to_string()),
    ];
    let paragraph = Node::Paragraph(children);

    let rendered = ast_to_jats(&paragraph);

    let expected = "<p>x &lt; y &amp; z &gt; w <inline-formula><tex-math>a &lt; b &amp; c</tex-math></inline-formula> <email>user@example.org</email> <uri>https://example.org?a=1&amp;b=2</uri></p>";
    assert_eq!(rendered, expected);
}
2034+
2035+
// The `&` in a link URL must be escaped when written into the `xlink:href`
// attribute.
#[test]
fn test_ast_to_jats_escapes_link_url_attribute() {
    let label = vec![Node::Text("Link text".to_string())];
    let link = Node::Link {
        url: "https://example.com?a=1&b=2".to_string(),
        text: label,
    };

    let rendered = ast_to_jats(&link);

    let expected =
        r#"<ext-link xlink:href="https://example.com?a=1&amp;b=2">Link text</ext-link>"#;
    assert_eq!(rendered, expected);
}
2048+
2049+
// Tags produced by the renderer itself must stay as markup rather than being
// escaped along with the text they wrap.
#[test]
fn test_ast_to_jats_preserves_generated_tags() {
    let bold = Node::Bold(vec![Node::Text("Bold".to_string())]);
    let joiner = Node::Text(" and ".to_string());
    let italic = Node::Italic(vec![Node::Text("italic".to_string())]);
    let paragraph = Node::Paragraph(vec![bold, joiner, italic]);

    let rendered = ast_to_jats(&paragraph);

    assert_eq!(rendered, "<p><bold>Bold</bold> and <italic>italic</italic></p>");
    assert!(!rendered.contains("&lt;bold&gt;"));
}
2061+
2062+
// Generated tags stay intact while reserved characters inside their text
// content are escaped.
#[test]
fn test_ast_to_jats_preserves_generated_tags_and_escapes() {
    let bold = Node::Bold(vec![Node::Text("Bo<ld".to_string())]);
    let joiner = Node::Text(" & ".to_string());
    let italic = Node::Italic(vec![Node::Text("ita>lic".to_string())]);
    let paragraph = Node::Paragraph(vec![bold, joiner, italic]);

    let rendered = ast_to_jats(&paragraph);

    assert_eq!(
        rendered,
        "<p><bold>Bo&lt;ld</bold> &amp; <italic>ita&gt;lic</italic></p>"
    );
    assert!(!rendered.contains("&lt;bold&gt;"));
}
2074+
20002075
#[test]
20012076
fn test_ast_to_jats_break_formula_email_and_uri() {
20022077
let ast = Node::Paragraph(vec![

thoth-api/src/markup/mod.rs

Lines changed: 107 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -315,7 +315,7 @@ pub fn convert_to_jats(
315315
format: MarkupFormat,
316316
conversion_limit: ConversionLimit,
317317
) -> ThothResult<String> {
318-
if format == MarkupFormat::JatsXml {
318+
let output = if format == MarkupFormat::JatsXml {
319319
let content_looks_like_jats = looks_like_markup(&content);
320320
let ast = if content_looks_like_jats {
321321
validate_jats_subset(&content, conversion_limit)?;
@@ -332,70 +332,61 @@ pub fn convert_to_jats(
332332

333333
validate_ast_content(&processed_ast, conversion_limit)?;
334334

335-
return Ok(
336-
if content_looks_like_jats || conversion_limit == ConversionLimit::Title {
337-
ast_to_jats(&processed_ast)
338-
} else {
339-
plain_text_ast_to_jats(&processed_ast)
340-
},
341-
);
342-
}
335+
if content_looks_like_jats || conversion_limit == ConversionLimit::Title {
336+
ast_to_jats(&processed_ast)
337+
} else {
338+
plain_text_ast_to_jats(&processed_ast)
339+
}
340+
} else {
341+
validate_format(&content, &format)?;
343342

344-
validate_format(&content, &format)?;
343+
match format {
344+
MarkupFormat::Html => {
345+
let ast = html_to_ast(&content);
346+
let processed_ast = if conversion_limit == ConversionLimit::Title {
347+
strip_structural_elements_from_ast_for_conversion(&ast)
348+
} else {
349+
ast
350+
};
345351

346-
match format {
347-
MarkupFormat::Html => {
348-
// Use ast library to parse HTML and convert to JATS
349-
let ast = html_to_ast(&content);
350-
351-
// For title conversion, strip structural elements before validation
352-
let processed_ast = if conversion_limit == ConversionLimit::Title {
353-
strip_structural_elements_from_ast_for_conversion(&ast)
354-
} else {
355-
ast
356-
};
357-
358-
validate_ast_content(&processed_ast, conversion_limit)?;
359-
Ok(ast_to_jats(&processed_ast))
360-
}
352+
validate_ast_content(&processed_ast, conversion_limit)?;
353+
ast_to_jats(&processed_ast)
354+
}
361355

362-
MarkupFormat::Markdown => {
363-
// Use ast library to parse Markdown and convert to JATS
364-
let ast = markdown_to_ast(&content);
365-
366-
// For title conversion, strip structural elements before validation
367-
let processed_ast = if conversion_limit == ConversionLimit::Title {
368-
strip_structural_elements_from_ast_for_conversion(&ast)
369-
} else {
370-
ast
371-
};
372-
373-
validate_ast_content(&processed_ast, conversion_limit)?;
374-
Ok(ast_to_jats(&processed_ast))
375-
}
356+
MarkupFormat::Markdown => {
357+
let ast = markdown_to_ast(&content);
358+
let processed_ast = if conversion_limit == ConversionLimit::Title {
359+
strip_structural_elements_from_ast_for_conversion(&ast)
360+
} else {
361+
ast
362+
};
376363

377-
MarkupFormat::PlainText => {
378-
// Use ast library to parse plain text and convert to JATS
379-
let ast = plain_text_to_ast(&content);
380-
381-
// For title conversion, strip structural elements before validation
382-
let processed_ast = if conversion_limit == ConversionLimit::Title {
383-
strip_structural_elements_from_ast_for_conversion(&ast)
384-
} else {
385-
ast
386-
};
387-
388-
validate_ast_content(&processed_ast, conversion_limit)?;
389-
Ok(if conversion_limit == ConversionLimit::Title {
390-
// Title JATS should remain inline (no paragraph wrapper)
364+
validate_ast_content(&processed_ast, conversion_limit)?;
391365
ast_to_jats(&processed_ast)
392-
} else {
393-
plain_text_ast_to_jats(&processed_ast)
394-
})
366+
}
367+
368+
MarkupFormat::PlainText => {
369+
let ast = plain_text_to_ast(&content);
370+
let processed_ast = if conversion_limit == ConversionLimit::Title {
371+
strip_structural_elements_from_ast_for_conversion(&ast)
372+
} else {
373+
ast
374+
};
375+
376+
validate_ast_content(&processed_ast, conversion_limit)?;
377+
if conversion_limit == ConversionLimit::Title {
378+
ast_to_jats(&processed_ast)
379+
} else {
380+
plain_text_ast_to_jats(&processed_ast)
381+
}
382+
}
383+
384+
MarkupFormat::JatsXml => unreachable!("handled above"),
395385
}
386+
};
396387

397-
MarkupFormat::JatsXml => unreachable!("handled above"),
398-
}
388+
validate_jats_subset(&output, conversion_limit)?;
389+
Ok(output)
399390
}
400391

401392
/// Normalise stored abstract-like markup into the subset we safely emit to Crossref.
@@ -607,6 +598,64 @@ mod tests {
607598
assert_eq!(output, "<p>Hello <uri>https://example.com</uri> world</p>");
608599
}
609600

601+
// Plain-text abstracts are wrapped in a paragraph with reserved characters
// escaped.
#[test]
fn test_plain_text_abstract_escapes_reserved_xml_chars() {
    let raw = String::from("x < y & z > w");

    let converted =
        convert_to_jats(raw, MarkupFormat::PlainText, ConversionLimit::Abstract).unwrap();

    assert_eq!(converted, "<p>x &lt; y &amp; z &gt; w</p>");
}
613+
614+
// Plain text submitted under the JATS XML format is escaped the same way as
// plain-text input.
#[test]
fn test_plain_text_jatsxml_abstract_escapes_reserved_xml_chars() {
    let raw = String::from("x < y & z > w");

    let converted =
        convert_to_jats(raw, MarkupFormat::JatsXml, ConversionLimit::Abstract).unwrap();

    assert_eq!(converted, "<p>x &lt; y &amp; z &gt; w</p>");
}
626+
627+
// Plain text containing an email, a URI and an inline formula: each detected
// element is tagged and its content escaped.
#[test]
fn test_plain_text_rich_content_escapes_xml_reserved_chars() {
    let raw =
        String::from("x < y & user@example.org https://example.org?a=1&b=2 $a < b & c$ > w");

    let converted =
        convert_to_jats(raw, MarkupFormat::PlainText, ConversionLimit::Abstract).unwrap();

    let expected = "<p>x &lt; y &amp; <email>user@example.org</email> <uri>https://example.org?a=1&amp;b=2</uri> <inline-formula><tex-math>a &lt; b &amp; c</tex-math></inline-formula> &gt; w</p>";
    assert_eq!(converted, expected);
}
642+
643+
// An HTML anchor becomes an `ext-link` whose `xlink:href` keeps the `&` in
// escaped form.
#[test]
fn test_html_link_url_escapes_reserved_xml_chars() {
    let raw = String::from(r#"<a href="https://example.com?a=1&amp;b=2">Link</a>"#);

    let converted = convert_to_jats(raw, MarkupFormat::Html, ConversionLimit::Abstract).unwrap();

    let expected =
        r#"<p><ext-link xlink:href="https://example.com?a=1&amp;b=2">Link</ext-link></p>"#;
    assert_eq!(converted, expected);
}
658+
610659
#[test]
611660
fn test_plain_text_no_url() {
612661
let input = "Just plain text.";

0 commit comments

Comments
 (0)