Skip to content
Open
Show file tree
Hide file tree
Changes from 52 commits
Commits
Show all changes
56 commits
Select commit Hold shift + click to select a range
8fbb1f1
Introduce AbstractCodePointTrie, Latin1 getter, and UTF-8 getters
hsivonen Dec 4, 2025
73356ab
Decouple UAX 15 and UTS 46 trie types
hsivonen Dec 5, 2025
9a4983a
Add iterators that also do trie lookups
hsivonen Dec 12, 2025
ed51573
Use trie-aware iterators in the normalizer
hsivonen Dec 16, 2025
da07c72
Use fused trie lookup and UTF decoding in the collator
hsivonen Dec 18, 2025
da4183c
Add functions for normalizing Latin1 to UTF-16
hsivonen Dec 4, 2025
9db4973
Prepare for Gecko
hsivonen Dec 30, 2025
db21369
Implement new data layout for canonical composition data
hsivonen Jan 7, 2026
7e7b618
Optimize NFD
hsivonen Jan 22, 2026
67c7783
Optimize Latin1
hsivonen Jan 22, 2026
2e2611c
Stay on NFD fast track for single combining mark
hsivonen Jan 23, 2026
a481c01
Rework Latin1 norm
hsivonen Jan 26, 2026
5063f39
Likely/unlikely now used in non-UTF-16
hsivonen Jan 23, 2026
e291315
Merge remote-tracking branch 'origin/latin1chunk' into nfdsinglemark
hsivonen Jan 26, 2026
9b6b8ce
Tweak passthrough bounds
hsivonen Jan 28, 2026
6110491
Prepare for Gecko landing
hsivonen Jan 30, 2026
3480e1e
if outside loop
hsivonen Jan 30, 2026
e6ecb7c
Fix clippy lints
hsivonen Jan 30, 2026
be60b32
Removed stale conditional compilation
hsivonen Jan 30, 2026
2e9c82c
Merge branch 'main' into normreview
hsivonen Jan 30, 2026
338a42c
Merge branch 'main' into normreview
hsivonen Feb 5, 2026
212299e
Collator perf notes
hsivonen Feb 5, 2026
6f76a5a
Move Hangul syllable handling
hsivonen Feb 5, 2026
34524c4
Make init faster
hsivonen Feb 5, 2026
2937a74
Add remark about invariant relaxation
hsivonen Feb 6, 2026
3680b2d
Fast primary check after identical prefix
hsivonen Feb 6, 2026
7f8cbc7
Add a quick primary check when there is no identical prefix
hsivonen Feb 6, 2026
bd8b5bb
Optimize the first difference being a comparison between Hangul sylla…
hsivonen Feb 6, 2026
e04a69d
Revise comments
hsivonen Feb 6, 2026
2ca0dac
Restore the less useful collator benches
hsivonen Feb 6, 2026
3bd9f21
Use next_back instead of rev and next for head_chars
hsivonen Feb 9, 2026
84e599b
Turn off useless benches again
hsivonen Feb 9, 2026
4943ad6
Tweak the identical prefix fast path
hsivonen Feb 10, 2026
dc9775c
Merge branch 'main' into collatoropt
hsivonen Feb 10, 2026
4cc1d79
Merge branch 'main' into normreview
hsivonen Feb 10, 2026
57e605d
Merge branch 'normreview' into collatoropt
hsivonen Feb 10, 2026
5d16ee8
Prepare for trie customization
hsivonen Feb 10, 2026
6b25873
cargo fmt
hsivonen Feb 10, 2026
19ae6f4
Merge branch 'normreview' into collatoropt
hsivonen Feb 10, 2026
bf4f38f
Tailoring-specific trie type
hsivonen Feb 10, 2026
afb446b
Make more languages fast
hsivonen Feb 10, 2026
045abf8
Prepare to hoist CE32s
hsivonen Feb 11, 2026
00a1d2b
Back to small tries for now
hsivonen Feb 11, 2026
629fefd
Copy from root to tailoring
hsivonen Feb 11, 2026
4198d6a
Replace missing tailoring with root earlier
hsivonen Feb 11, 2026
932983f
Hoist Hiragana
hsivonen Feb 12, 2026
5c0ab1d
No fallback to root in the quick primary check at the very start
hsivonen Feb 12, 2026
dabee88
Serialization support for typed tries
hsivonen Feb 12, 2026
86f85a9
Statically assume small trie in the collator
hsivonen Feb 12, 2026
db74341
Hoist performance-sensitive ranges
hsivonen Feb 12, 2026
7f63971
Rearrange fast primary check branches again
hsivonen Feb 13, 2026
3f9da98
Optimize kana, bench Cyrillic tailorings
hsivonen Feb 13, 2026
c433ab7
Use correct data after identical prefix
hsivonen Feb 13, 2026
a78ff8a
Do not optimize two-jamo Hangul syllables over everything else
hsivonen Feb 13, 2026
0848ea6
Add enumeration of collator locales behind unstable flag
hsivonen Feb 19, 2026
3579f23
Use write16 from crates.io
hsivonen Mar 2, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -262,9 +262,9 @@ smallvec = { version = "1.10.0", default-features = false }
stable_deref_trait = { version = "1.2.0", default-features = false }
twox-hash = { version = "2.0.0", default-features = false, features = ["xxhash64"] }
unicode-bidi = { version = "0.3.11", default-features = false }
utf16_iter = { version = "1.0.2", default-features = false }
utf8_iter = { version = "1.0.2", default-features = false }
write16 = { version = "1.0.0", default-features = false }
utf16_iter = { path = "../utf16_iter", default-features = false }
utf8_iter = { path = "../utf8_iter", default-features = false }
write16 = { path = "../write16", default-features = false }

## External Deps Group 2: Heavy Dev and Datagen deps. No default features.
zip = { version = "2", default-features = false }
Expand Down
4 changes: 2 additions & 2 deletions components/collator/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ icu_normalizer = { workspace = true, features = ["utf8_iter", "utf16_iter"] }
icu_locale_core = { workspace = true, features = ["alloc"] }
icu_properties = { workspace = true }
icu_provider = { workspace = true }
utf8_iter = { workspace = true }
utf16_iter = { workspace = true }
utf16_iter = { path = "../../../utf16_iter", features = ["icu_collections"] }
utf8_iter = { path = "../../../utf8_iter", features = ["icu_collections"] }
smallvec = { workspace = true, features = ["union", "const_generics", "const_new"] } # alloc
zerovec = { workspace = true }

Expand Down
33 changes: 20 additions & 13 deletions components/collator/benches/bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -159,11 +159,11 @@ pub fn collator_with_locale(criterion: &mut Criterion) {
// Furthermore, CLDR used to default to quaternary for Japanese but now defaults to tertiary
// as for every other language for performance reasons.
let all_strength = [
Strength::Primary,
Strength::Secondary,
// Strength::Primary,
// Strength::Secondary,
Strength::Tertiary,
Strength::Quaternary,
Strength::Identical,
// Strength::Quaternary,
// Strength::Identical,
];
let performance_parameters = [
(
Expand All @@ -172,11 +172,16 @@ pub fn collator_with_locale(criterion: &mut Criterion) {
&all_strength,
),
(
locale!("da-DK"),
vec![&content_latin, &content_photos],
locale!("sv"),
vec![&content_latin, &content_swedish, &content_photos],
&all_strength,
),
(locale!("fr-CA"), vec![&content_latin], &all_strength),
// (
// locale!("da-DK"),
// vec![&content_latin, &content_photos],
// &all_strength,
// ),
// (locale!("fr-CA"), vec![&content_latin], &all_strength),
(
locale!("ja-JP"),
vec![&content_latin, &content_jp_h, &content_jp_k, &content_asian],
Expand All @@ -197,11 +202,13 @@ pub fn collator_with_locale(criterion: &mut Criterion) {
vec![&content_latin, &content_russian],
&all_strength,
),
(
locale!("sv"),
vec![&content_latin, &content_swedish],
&all_strength,
),
// Deliberately using Russian data with Cyrillic tailorings
// to test the perf of the base Cyrillic range.
(locale!("uk"), vec![&content_russian], &all_strength),
(locale!("be"), vec![&content_russian], &all_strength),
(locale!("kk"), vec![&content_russian], &all_strength),
(locale!("ky"), vec![&content_russian], &all_strength),
(locale!("mk"), vec![&content_russian], &all_strength),
(
locale!("th"),
vec![&content_latin, &content_thai],
Expand All @@ -225,7 +232,7 @@ pub fn collator_with_locale(criterion: &mut Criterion) {

for content_under_bench in files_under_bench {
let (file_name, elements) = black_box(content_under_bench);
baseline_bench(&mut group, file_name, elements);
// baseline_bench(&mut group, file_name, elements);

// index to keep order of strength in the html report
for (index, strength) in benched_strength.iter().enumerate() {
Expand Down
Loading