Skip to content

Commit 543002c

Browse files
authored
Extract aether_url crate and canonicalise all paths (#1220)
Branched from #1219 Progress towards #1212 Extract our canonical `Urld` struct used in Ark to get the legacy LSP and the DAP to agree on file URL identity. Now lives in `aether_url`, a dependency of both Oak and Ark. `File` now stores a `UrlId` so that all URLs get canonicalised at the LSP / Salsa DB boundary. As agreed in our last call @DavisVaughan this will be renamed to `CanonicalUrl` once the current batch of PRs has been merged.
2 parents dfb3738 + 33c4267 commit 543002c

22 files changed

Lines changed: 419 additions & 320 deletions

Cargo.lock

Lines changed: 14 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ aether_factory = { git = "https://github.com/posit-dev/air", package = "air_r_fa
2929
aether_lsp_utils = { git = "https://github.com/posit-dev/air", rev = "d2659d5b158374bf486b594625ca50abbd0ac879" }
3030
aether_parser = { git = "https://github.com/posit-dev/air", package = "air_r_parser", rev = "d2659d5b158374bf486b594625ca50abbd0ac879" }
3131
aether_syntax = { git = "https://github.com/posit-dev/air", package = "air_r_syntax", rev = "d2659d5b158374bf486b594625ca50abbd0ac879" }
32+
aether_url = { path = "crates/aether_url" }
3233
amalthea = { path = "crates/amalthea" }
3334
anyhow = "1.0.102"
3435
ark = { path = "crates/ark" }

crates/aether_url/Cargo.toml

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
[package]
2+
name = "aether_url"
3+
version = "0.1.0"
4+
description = "Canonicalised file URL identity (`UrlId`) for matching the same file across heterogeneous URI sources."
5+
authors.workspace = true
6+
edition.workspace = true
7+
rust-version.workspace = true
8+
license.workspace = true
9+
10+
[dependencies]
11+
anyhow.workspace = true
12+
log.workspace = true
13+
stdext.workspace = true
14+
url.workspace = true
15+
16+
[dev-dependencies]
17+
tempfile.workspace = true
18+
19+
[lints]
20+
workspace = true

crates/aether_url/src/lib.rs

Lines changed: 307 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,307 @@
1+
//
2+
// lib.rs
3+
//
4+
// Copyright (C) 2026 Posit Software, PBC. All rights reserved.
5+
//
6+
//
7+
8+
use std::fmt;
9+
10+
use stdext::result::ResultExt;
11+
use url::Url;
12+
13+
/// Canonicalised file URL identity.
14+
///
15+
/// # The multi-source URI reconciliation problem
16+
///
17+
/// File URIs for the same file can arrive from independent sources, each
18+
/// with its own representation:
19+
///
20+
/// - **DAP (SetBreakpoints)**: Receives raw file paths from the frontend,
21+
/// converted to URIs via `UrlId::from_file_path`. These are stored as
22+
/// HashMap keys for breakpoint lookup.
23+
///
24+
/// - **LSP (didChange, etc.)**: Receives URIs directly from the editor
25+
/// client, which may use non-canonical forms (e.g. percent-encoded
26+
/// colons on Windows, or symlinked paths on macOS).
27+
///
28+
/// - **Execute requests**: A frontend may attach a `code_location` URI
29+
/// that comes straight from the editor's document model, again
30+
/// potentially non-canonical.
31+
///
32+
/// - **R runtime**: When R evaluates `source()` or annotates code, it
33+
/// passes URIs that went through R's `normalizePath()`, which resolves
34+
/// symlinks to their canonical target (e.g. `/tmp` resolves to `/private/tmp`
35+
/// on macOS), producing a path the editor never sent. More generally,
36+
/// arbitrary R code can create source references that we may end up
37+
/// consuming for breakpoint or debug purposes, and the paths in those
38+
/// references may or may not be canonical.
39+
///
40+
/// All four sources must agree on file identity. For instance breakpoints set
41+
/// via DAP are looked up in a HashMap keyed by URI when code is executed or
42+
/// sourced, and invalidated when documents change via LSP.
43+
///
44+
/// # Design decision
45+
///
46+
/// We solve this by canonicalizing URIs into [`UrlId`] at every entry
47+
/// point, rather than interning paths into opaque IDs (as rust-analyzer
48+
/// does with its VFS `FileId` approach). Interning would be a larger
49+
/// architectural change and is not warranted here since we only need
50+
/// canonical keys at a handful of call sites.
51+
///
52+
/// Canonicalization uses `std::fs::canonicalize()` to resolve symlinks
53+
/// (e.g. `/tmp` to `/private/tmp` on macOS), round-trips through the
54+
/// filesystem path to normalize encoding variants (e.g. `%3A` to `:` on
55+
/// Windows), and uppercases drive letters on Windows. When the file does
56+
/// not exist on disk, we fall back to the original URI.
57+
///
58+
/// # Important: canonical URIs must not leak
59+
///
60+
/// [`UrlId`] is strictly for internal identity. When a URI flows back
61+
/// to R (e.g. in `#line` directives or injected breakpoint calls) or to
62+
/// the frontend (e.g. in DAP stack frames), always use the original raw
63+
/// URI. The frontend (and possibly R code) expects their own URI
64+
/// representation, and a canonical URI (e.g. `/private/tmp/...` instead of
65+
/// `/tmp/...`) could be treated as a different file (e.g. open a new editor in
66+
/// the frontend instead of an existing one).
67+
///
68+
/// A canonicalized file URI for use as a stable identity key.
69+
///
70+
/// Wraps a [`Url`] that has been canonicalized to resolve symlinks,
71+
/// normalize encoding variants, and uppercase drive letters on Windows.
72+
/// Use this type in HashMaps and anywhere file identity matters.
73+
///
74+
/// Construct via [`UrlId::from_url`], [`UrlId::from_file_path`], or
75+
/// [`UrlId::parse`].
76+
///
77+
/// On Windows, `std::fs::canonicalize()` returns extended-length paths
78+
/// prefixed with `\\?\` (e.g. `\\?\C:\Users\...`). Projects like Ruff
79+
/// use the `dunce` crate to strip this prefix, but we don't need it
80+
/// because `Url::from_file_path` already handles
81+
/// `Prefix::VerbatimDisk` and produces a clean `file:///C:/...` URI.
82+
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
83+
pub struct UrlId(Url);
84+
85+
impl UrlId {
86+
/// Canonicalize a [`Url`] into a [`UrlId`].
87+
///
88+
/// Resolves symlinks via `std::fs::canonicalize()` and normalizes
89+
/// encoding variants (e.g. `%3A` to `:` on Windows). On Windows, also
90+
/// uppercases the drive letter. Falls back to the original URI for
91+
/// non-file schemes or when the path can't be resolved.
92+
pub fn from_url(uri: Url) -> Self {
93+
if uri.scheme() != "file" {
94+
return Self(uri);
95+
}
96+
97+
let Some(path) = uri.to_file_path().warn_on_err() else {
98+
return Self(uri);
99+
};
100+
101+
let path = std::fs::canonicalize(&path).trace_on_err().unwrap_or(path);
102+
let uri = Url::from_file_path(&path)
103+
.map_err(|()| anyhow::anyhow!("Failed to convert path to URI: {path:?}"))
104+
.warn_on_err()
105+
.unwrap_or(uri);
106+
107+
#[cfg(windows)]
108+
let uri = uppercase_windows_drive_in_uri(uri);
109+
110+
Self(uri)
111+
}
112+
113+
/// Wrap a [`Url`] that the caller asserts is already canonical.
114+
pub fn from_canonical(uri: Url) -> Self {
115+
Self(uri)
116+
}
117+
118+
/// Convert a file path to a canonical [`UrlId`].
119+
///
120+
/// Canonicalizes the path to resolve symlinks (e.g. `/var/folders` to
121+
/// `/private/var/folders` on macOS) so the URI matches what R's
122+
/// `normalizePath()` produces. Falls back to the original path if
123+
/// canonicalization fails.
124+
pub fn from_file_path(path: impl AsRef<std::path::Path>) -> anyhow::Result<Self> {
125+
let path = path.as_ref();
126+
let url = Url::from_file_path(path)
127+
.map_err(|()| anyhow::anyhow!("Failed to convert path to URL: {}", path.display()))?;
128+
Ok(Self::from_url(url))
129+
}
130+
131+
/// Parse a URI string into a canonical [`UrlId`].
132+
pub fn parse(s: &str) -> Result<Self, url::ParseError> {
133+
let url = Url::parse(s)?;
134+
Ok(Self::from_url(url))
135+
}
136+
137+
/// Access the inner [`Url`].
138+
pub fn as_url(&self) -> &Url {
139+
&self.0
140+
}
141+
}
142+
143+
impl fmt::Display for UrlId {
144+
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
145+
self.0.fmt(f)
146+
}
147+
}
148+
149+
/// Uppercase the drive letter in a Windows file URI for consistent hashing.
150+
#[cfg(windows)]
151+
fn uppercase_windows_drive_in_uri(mut uri: Url) -> Url {
152+
let path = uri.path();
153+
let mut chars = path.chars();
154+
155+
// Match pattern: "/" + drive letter + ":"
156+
let drive = match (chars.next(), chars.next(), chars.next()) {
157+
(Some('/'), Some(drive), Some(':')) if drive.is_ascii_alphabetic() => drive,
158+
_ => return uri,
159+
};
160+
161+
let upper = drive.to_ascii_uppercase();
162+
163+
if drive != upper {
164+
let new_path = format!("/{upper}:{}", &path[3..]);
165+
uri.set_path(&new_path);
166+
}
167+
168+
uri
169+
}
170+
171+
#[cfg(test)]
172+
mod tests {
173+
use super::*;
174+
175+
#[test]
176+
fn test_non_file_unchanged() {
177+
let uri = Url::parse("ark://namespace/test.R").unwrap();
178+
let id = UrlId::from_url(uri.clone());
179+
assert_eq!(*id.as_url(), uri);
180+
}
181+
182+
#[test]
183+
fn test_parse_non_file() {
184+
let id = UrlId::parse("ark://namespace/test.R").unwrap();
185+
assert_eq!(id.as_url().as_str(), "ark://namespace/test.R");
186+
}
187+
188+
#[test]
189+
fn test_equality() {
190+
let id1 = UrlId::parse("file:///home/user/test.R").unwrap();
191+
let id2 = UrlId::parse("file:///home/user/test.R").unwrap();
192+
assert_eq!(id1, id2);
193+
194+
let id3 = UrlId::parse("file:///home/user/other.R").unwrap();
195+
assert_ne!(id1, id3);
196+
}
197+
198+
#[test]
199+
fn test_display() {
200+
let id = UrlId::parse("file:///home/user/test.R").unwrap();
201+
assert_eq!(format!("{id}"), "file:///home/user/test.R");
202+
}
203+
204+
#[test]
205+
#[cfg(not(windows))]
206+
fn test_fallback_for_nonexistent_path() {
207+
// For paths that don't exist, canonicalization falls back to the
208+
// original path so the URI is unchanged.
209+
let uri = Url::parse("file:///nonexistent/path/test.R").unwrap();
210+
let id = UrlId::from_url(uri.clone());
211+
assert_eq!(*id.as_url(), uri);
212+
}
213+
214+
#[test]
215+
#[cfg(target_os = "macos")]
216+
fn test_resolves_tmp_symlink() {
217+
// On macOS, `/tmp` is a symlink to `/private/tmp`. `UrlId` should
218+
// resolve it so that URIs from different sources match.
219+
let dir = tempfile::tempdir_in("/tmp").unwrap();
220+
let file = dir.path().join("test.R");
221+
std::fs::write(&file, "").unwrap();
222+
223+
let non_canonical = Url::from_file_path(&file).unwrap();
224+
assert!(non_canonical.path().starts_with("/tmp/"));
225+
226+
let id = UrlId::from_url(non_canonical);
227+
assert!(id.as_url().path().starts_with("/private/tmp/"));
228+
}
229+
230+
#[test]
231+
#[cfg(not(windows))]
232+
fn test_from_file_path_unix() {
233+
let id = UrlId::from_file_path("/home/user/test.R").unwrap();
234+
assert_eq!(id.as_url().as_str(), "file:///home/user/test.R");
235+
}
236+
237+
// Windows-specific tests
238+
239+
#[test]
240+
#[cfg(windows)]
241+
fn test_decodes_percent_encoded_colon() {
242+
// Positron sends URIs with encoded colon
243+
let uri = Url::parse("file:///c%3A/Users/test/file.R").unwrap();
244+
let id = UrlId::from_url(uri);
245+
assert_eq!(id.as_url().as_str(), "file:///C:/Users/test/file.R");
246+
}
247+
248+
#[test]
249+
#[cfg(windows)]
250+
fn test_decodes_percent_encoded_colon_lowercase_hex() {
251+
// %3a (lowercase hex) variant
252+
let uri = Url::parse("file:///c%3a/Users/test/file.R").unwrap();
253+
let id = UrlId::from_url(uri);
254+
assert_eq!(id.as_url().as_str(), "file:///C:/Users/test/file.R");
255+
}
256+
257+
#[test]
258+
#[cfg(windows)]
259+
fn test_uppercases_drive_letter() {
260+
let uri = Url::parse("file:///c:/Users/test/file.R").unwrap();
261+
let id = UrlId::from_url(uri);
262+
assert_eq!(id.as_url().as_str(), "file:///C:/Users/test/file.R");
263+
}
264+
265+
#[test]
266+
#[cfg(windows)]
267+
fn test_preserves_uppercase_drive() {
268+
let uri = Url::parse("file:///C:/Users/test/file.R").unwrap();
269+
let id = UrlId::from_url(uri.clone());
270+
assert_eq!(*id.as_url(), uri);
271+
}
272+
273+
#[test]
274+
#[cfg(windows)]
275+
fn test_preserves_spaces_encoding() {
276+
// Spaces should remain percent-encoded after round-trip
277+
let uri = Url::parse("file:///C:/Users/test%20user/my%20file.R").unwrap();
278+
let id = UrlId::from_url(uri);
279+
assert_eq!(
280+
id.as_url().as_str(),
281+
"file:///C:/Users/test%20user/my%20file.R"
282+
);
283+
}
284+
285+
#[test]
286+
#[cfg(windows)]
287+
fn test_decodes_colon_preserves_spaces() {
288+
// Both encoded colon and spaces
289+
let uri = Url::parse("file:///c%3A/Users/test%20user/file.R").unwrap();
290+
let id = UrlId::from_url(uri);
291+
assert_eq!(id.as_url().as_str(), "file:///C:/Users/test%20user/file.R");
292+
}
293+
294+
#[test]
295+
#[cfg(windows)]
296+
fn test_parse_windows() {
297+
let id = UrlId::parse("file:///c%3A/Users/test/file.R").unwrap();
298+
assert_eq!(id.as_url().as_str(), "file:///C:/Users/test/file.R");
299+
}
300+
301+
#[test]
302+
#[cfg(windows)]
303+
fn test_from_file_path_windows() {
304+
let id = UrlId::from_file_path("C:\\Users\\test\\file.R").unwrap();
305+
assert_eq!(id.as_url().as_str(), "file:///C:/Users/test/file.R");
306+
}
307+
}

crates/ark/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ ark_macros.workspace = true
2323
aether_lsp_utils.workspace = true
2424
aether_parser.workspace = true
2525
aether_syntax.workspace = true
26+
aether_url.workspace = true
2627
amalthea.workspace = true
2728
anyhow.workspace = true
2829
async-trait.workspace = true

0 commit comments

Comments
 (0)