Skip to content

Commit f8f548f

Browse files
committed
Sanitize proto comments for rustdoc in generated code
Proto-comment passthrough (#7) emits proto source comments as rustdoc `///` lines. Upstream WKT/descriptor proto comments contain markdown that rustdoc misparses: - `[google.protobuf.Duration][]` is treated as a broken intra-doc link and fails under `deny(rustdoc::broken_intra_doc_links)` (4 errors in buffa-types/src/generated/google.protobuf.any.rs). - Bare URLs trigger `rustdoc::bare_urls` (1 warning in buffa-descriptor). - `Option<T>` etc. trigger `rustdoc::invalid_html_tags` (48 warnings in buffa-test). This broke `cargo doc --workspace --no-deps` and would break the docs.rs build on publish. Adds `sanitize_line` in buffa-codegen/src/comments.rs which is applied to prose lines (code blocks left untouched): - Escapes `[`/`]` to `\[`/`\]` unless part of a single-line inline link `[text](url)`. - Wraps bare `http(s)://` URLs in `<...>`. - Escapes `<`/`>` to `\<`/`\>` unless part of an existing `<http(s)://...>` autolink. - Preserves backtick code spans (CommonMark run-length matching). - Passes through already-escaped `\[` etc. Regenerated checked-in WKT and bootstrap descriptor types. Includes drive-by `cargo fmt` for two spots in buffa-codegen that had drifted on main.
1 parent cd002f6 commit f8f548f

File tree

7 files changed

+319
-44
lines changed

7 files changed

+319
-44
lines changed

buffa-codegen/src/comments.rs

Lines changed: 283 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -275,10 +275,13 @@ fn doc_lines_to_tokens(text: &str) -> TokenStream {
275275
}
276276
} else if line.is_empty() {
277277
lines.push(String::new());
278-
} else if line.starts_with(' ') {
279-
lines.push(line.to_string());
280278
} else {
281-
lines.push(format!(" {line}"));
279+
let sanitized = sanitize_line(line);
280+
if sanitized.starts_with(' ') {
281+
lines.push(sanitized);
282+
} else {
283+
lines.push(format!(" {sanitized}"));
284+
}
282285
}
283286
}
284287

@@ -291,6 +294,195 @@ fn doc_lines_to_tokens(text: &str) -> TokenStream {
291294
}
292295
}
293296

297+
/// Escape one prose line of proto comment text for rustdoc.
298+
///
299+
/// Proto comments are written for a cross-language audience and frequently
300+
/// contain constructs that rustdoc misparses:
301+
///
302+
/// - `[foo]` / `[foo][]` — treated as intra-doc links; an error under
303+
/// `deny(rustdoc::broken_intra_doc_links)` when `foo` resembles a Rust
304+
/// path. Escaped to `\[foo\]`.
305+
/// - Bare `http(s)://…` — triggers `rustdoc::bare_urls`. Wrapped in `<…>`.
306+
/// - `Option<T>` — treated as raw HTML; triggers
307+
/// `rustdoc::invalid_html_tags`. Escaped to `Option\<T\>`.
308+
///
309+
/// Left intact:
310+
/// - Single-line inline links `[text](url)`.
311+
/// - Existing autolinks `<http(s)://…>`.
312+
/// - Content inside `` `…` `` backtick code spans.
313+
/// - Already-escaped `\[`, `\]`, `\<`, `\>`.
314+
///
315+
/// This is a per-line pass invoked from [`doc_lines_to_tokens`] on prose
316+
/// lines only — code blocks are left untouched. Multi-line markdown links
317+
/// are conservatively escaped; the link degrades to literal text plus a
318+
/// clickable autolink, which is preferable to a docs.rs build failure.
319+
fn sanitize_line(line: &str) -> String {
320+
let bytes = line.as_bytes();
321+
let mut out = String::with_capacity(line.len());
322+
let mut i = 0;
323+
324+
while i < bytes.len() {
325+
let b = bytes[i];
326+
327+
if b == b'`' {
328+
// CommonMark code span: a run of N backticks opens, the next run
329+
// of exactly N closes. Emit the whole span verbatim. If no
330+
// closer exists on this line, the run is literal (not a span).
331+
let run_start = i;
332+
while i < bytes.len() && bytes[i] == b'`' {
333+
i += 1;
334+
}
335+
let run_len = i - run_start;
336+
if let Some(close_end) = find_backtick_closer(bytes, i, run_len) {
337+
out.push_str(&line[run_start..close_end]);
338+
i = close_end;
339+
} else {
340+
out.push_str(&line[run_start..i]);
341+
}
342+
continue;
343+
}
344+
345+
match b {
346+
b'\\' => {
347+
out.push('\\');
348+
i += 1;
349+
if i < bytes.len() {
350+
i += push_char_at(&mut out, line, i);
351+
}
352+
}
353+
b'[' => {
354+
if let Some(end) = find_inline_link_end(bytes, i) {
355+
out.push_str(&line[i..=end]);
356+
i = end + 1;
357+
} else {
358+
out.push_str("\\[");
359+
i += 1;
360+
}
361+
}
362+
b']' => {
363+
out.push_str("\\]");
364+
i += 1;
365+
}
366+
b'<' => {
367+
if let Some(end) = find_autolink_end(bytes, i) {
368+
out.push_str(&line[i..=end]);
369+
i = end + 1;
370+
} else {
371+
out.push_str("\\<");
372+
i += 1;
373+
}
374+
}
375+
b'>' => {
376+
out.push_str("\\>");
377+
i += 1;
378+
}
379+
b'h' => {
380+
if let Some(end) = find_bare_url_end(bytes, i) {
381+
out.push('<');
382+
out.push_str(&line[i..end]);
383+
out.push('>');
384+
i = end;
385+
} else {
386+
out.push('h');
387+
i += 1;
388+
}
389+
}
390+
_ => {
391+
i += push_char_at(&mut out, line, i);
392+
}
393+
}
394+
}
395+
out
396+
}
397+
398+
/// Push the UTF-8 char at byte index `i` of `s` into `out`, returning its
399+
/// byte length. `i` must be a char boundary and `< s.len()`.
400+
fn push_char_at(out: &mut String, s: &str, i: usize) -> usize {
401+
let ch = s[i..]
402+
.chars()
403+
.next()
404+
.expect("i is in bounds and on a char boundary");
405+
out.push(ch);
406+
ch.len_utf8()
407+
}
408+
409+
/// Starting at `from` (just past an opening run of `run_len` backticks),
410+
/// return the past-the-end index of the matching closing run, or `None`.
411+
fn find_backtick_closer(bytes: &[u8], from: usize, run_len: usize) -> Option<usize> {
412+
let mut i = from;
413+
while i < bytes.len() {
414+
if bytes[i] == b'`' {
415+
let start = i;
416+
while i < bytes.len() && bytes[i] == b'`' {
417+
i += 1;
418+
}
419+
if i - start == run_len {
420+
return Some(i);
421+
}
422+
} else {
423+
i += 1;
424+
}
425+
}
426+
None
427+
}
428+
429+
/// If `bytes[start..]` is a complete `[text](url)`, return the index of the
430+
/// closing `)`. Nested `(`/`)` inside the URL are balanced one level deep so
431+
/// fragments like `…#method()` survive.
432+
fn find_inline_link_end(bytes: &[u8], start: usize) -> Option<usize> {
433+
debug_assert_eq!(bytes[start], b'[');
434+
let mut j = start + 1;
435+
while j < bytes.len() && bytes[j] != b']' {
436+
if bytes[j] == b'[' {
437+
return None;
438+
}
439+
j += 1;
440+
}
441+
if j + 1 >= bytes.len() || bytes[j + 1] != b'(' {
442+
return None;
443+
}
444+
let mut depth = 1i32;
445+
let mut k = j + 2;
446+
while k < bytes.len() {
447+
match bytes[k] {
448+
b'(' => depth += 1,
449+
b')' => {
450+
depth -= 1;
451+
if depth == 0 {
452+
return Some(k);
453+
}
454+
}
455+
_ => {}
456+
}
457+
k += 1;
458+
}
459+
None
460+
}
461+
462+
/// If `bytes[start..]` is `<http(s)://…>`, return the index of the `>`.
463+
fn find_autolink_end(bytes: &[u8], start: usize) -> Option<usize> {
464+
debug_assert_eq!(bytes[start], b'<');
465+
let rest = &bytes[start + 1..];
466+
if !(rest.starts_with(b"http://") || rest.starts_with(b"https://")) {
467+
return None;
468+
}
469+
rest.iter().position(|&b| b == b'>').map(|p| start + 1 + p)
470+
}
471+
472+
/// If `bytes[start..]` begins a bare `http(s)://` URL, return the
473+
/// past-the-end byte index. The URL ends at whitespace or `)`.
474+
fn find_bare_url_end(bytes: &[u8], start: usize) -> Option<usize> {
475+
let rest = &bytes[start..];
476+
if !(rest.starts_with(b"http://") || rest.starts_with(b"https://")) {
477+
return None;
478+
}
479+
let mut j = start;
480+
while j < bytes.len() && !bytes[j].is_ascii_whitespace() && bytes[j] != b')' {
481+
j += 1;
482+
}
483+
Some(j)
484+
}
485+
294486
/// Format a `SourceCodeInfo.Location` into a doc-comment string.
295487
///
296488
/// Combines leading detached comments, leading comments, and trailing
@@ -744,4 +936,92 @@ mod tests {
744936
};
745937
assert_eq!(format_comment(&loc).as_deref(), Some(" hello"));
746938
}
939+
940+
// --- sanitize_line ------------------------------------------------------
941+
942+
#[test]
943+
fn test_sanitize_line() {
944+
let cases: &[(&str, &str, &str)] = &[
945+
("plain text", "plain text", "plain"),
946+
("hello world", "hello world", "h_not_url"),
947+
// Brackets
948+
(
949+
"see [google.protobuf.Duration][]",
950+
r"see \[google.protobuf.Duration\]\[\]",
951+
"collapsed_ref_link",
952+
),
953+
("a [foo] b", r"a \[foo\] b", "shortcut_link"),
954+
("[foo][bar]", r"\[foo\]\[bar\]", "full_ref_link"),
955+
("[.{frac_sec}]Z", r"\[.{frac_sec}\]Z", "format_string"),
956+
// Inline links preserved
957+
(
958+
"[RFC 3339](https://ietf.org/rfc/rfc3339.txt)",
959+
"[RFC 3339](https://ietf.org/rfc/rfc3339.txt)",
960+
"inline_link",
961+
),
962+
(
963+
"[m()](https://e.com/#m())",
964+
"[m()](https://e.com/#m())",
965+
"inline_link_nested_parens",
966+
),
967+
// Already escaped
968+
(r"\[foo\]", r"\[foo\]", "pre_escaped"),
969+
// Backtick spans untouched
970+
("`[foo]` bar", "`[foo]` bar", "backtick_brackets"),
971+
("`Option<T>` bar", "`Option<T>` bar", "backtick_generics"),
972+
("``[foo]``", "``[foo]``", "double_backtick"),
973+
("`` `<T>` ``", "`` `<T>` ``", "double_backtick_inner"),
974+
("``` x ` y ```", "``` x ` y ```", "triple_backtick"),
975+
("`` no closer", "`` no closer", "unclosed_backticks"),
976+
(
977+
"[résumé](http://e.com)",
978+
"[résumé](http://e.com)",
979+
"utf8_link_text",
980+
),
981+
// Bare URLs wrapped
982+
(
983+
"see https://example.com/x for details",
984+
"see <https://example.com/x> for details",
985+
"bare_url",
986+
),
987+
(
988+
"(https://example.com)",
989+
"(<https://example.com>)",
990+
"bare_url_in_parens",
991+
),
992+
// Existing autolinks preserved
993+
("<https://example.com>", "<https://example.com>", "autolink"),
994+
// Angle brackets escaped
995+
("Option<T>", r"Option\<T\>", "generics"),
996+
("HashMap<K, V>", r"HashMap\<K, V\>", "generics_multi"),
997+
// UTF-8 passthrough
998+
("café — ok", "café — ok", "utf8"),
999+
("`café` [x]", r"`café` \[x\]", "utf8_backtick"),
1000+
];
1001+
for (input, want, name) in cases {
1002+
assert_eq!(sanitize_line(input), *want, "case: {name}");
1003+
}
1004+
}
1005+
1006+
#[test]
1007+
fn test_sanitize_line_unbalanced() {
1008+
// Unmatched delimiters are escaped, not crashed on.
1009+
assert_eq!(sanitize_line("[foo"), r"\[foo");
1010+
assert_eq!(sanitize_line("foo]"), r"foo\]");
1011+
assert_eq!(sanitize_line("[foo]("), r"\[foo\](");
1012+
assert_eq!(sanitize_line("<http://x"), r"\<<http://x>");
1013+
assert_eq!(sanitize_line("a > b"), r"a \> b");
1014+
assert_eq!(sanitize_line("trailing \\"), "trailing \\");
1015+
}
1016+
1017+
#[test]
1018+
fn test_doc_tokens_sanitizes_prose_not_code() {
1019+
// Indented code block content must NOT be sanitized.
1020+
let out = doc_tokens("Prose [foo].\n code [bar]\nMore.");
1021+
assert!(out.contains(r"\\[foo\\]"), "prose escaped: {out}");
1022+
assert!(out.contains("code [bar]"), "code untouched: {out}");
1023+
// User-written fence content must NOT be sanitized.
1024+
let out = doc_tokens("```\n[x](y)\nOption<T>\n```");
1025+
assert!(out.contains("Option<T>"), "fence untouched: {out}");
1026+
}
7471027
}

buffa-codegen/src/lib.rs

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -428,13 +428,9 @@ fn check_nested_type_oneof_conflicts(file: &FileDescriptorProto) -> Result<(), C
428428
// type names. Synthetic oneofs (created by proto3 `optional` fields)
429429
// never produce a Rust enum, so they cannot collide.
430430
for (idx, oneof) in msg.oneof_decl.iter().enumerate() {
431-
let has_real_fields = msg
432-
.field
433-
.iter()
434-
.any(|f| {
435-
crate::impl_message::is_real_oneof_member(f)
436-
&& f.oneof_index == Some(idx as i32)
437-
});
431+
let has_real_fields = msg.field.iter().any(|f| {
432+
crate::impl_message::is_real_oneof_member(f) && f.oneof_index == Some(idx as i32)
433+
});
438434
if !has_real_fields {
439435
continue;
440436
}

buffa-codegen/src/tests/naming.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -280,8 +280,7 @@ fn test_proto3_optional_field_name_matches_nested_enum_no_conflict() {
280280
}],
281281
field: vec![{
282282
let mut f = make_field("match_operator", 4, Label::LABEL_OPTIONAL, Type::TYPE_ENUM);
283-
f.type_name =
284-
Some(".minimal.StringFieldMatcher.MatchOperator".to_string());
283+
f.type_name = Some(".minimal.StringFieldMatcher.MatchOperator".to_string());
285284
f.oneof_index = Some(0);
286285
f.proto3_optional = Some(true);
287286
f

0 commit comments

Comments
 (0)