Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions ext/js/language/ja/japanese-text-preprocessors.js
Original file line number Diff line number Diff line change
Expand Up @@ -95,3 +95,29 @@ export const standardizeKanji = {
description: '萬 → 万',
process: (str) => [str, convertVariants(str)],
};

/**
 * Upper bound on the number of strings returned by `insertWildcard.process`.
 * The unmodified input counts towards this limit, so at most 50 wildcard
 * variants are ever generated for a single input.
 */
const WILDCARD_MAX_VARIANTS = 51;

/**
 * Placeholder that stands in for the elided middle of a grammar pattern:
 * U+FF5E FULLWIDTH TILDE. It is spelled as an escape sequence on purpose so
 * that it cannot be silently confused with (or normalized to) the look-alike
 * ASCII tilde U+007E or U+301C WAVE DASH.
 */
const WILDCARD_CHAR = '\uFF5E';

/**
 * Text preprocessor that generates "prefix + wildcard + suffix" variants of
 * the input by replacing a non-empty middle section with `WILDCARD_CHAR`.
 *
 * - The input is split by Unicode code point, so surrogate pairs are never
 *   cut in half.
 * - Inputs shorter than 3 code points have no non-empty middle and are
 *   returned unchanged as a single-element array.
 * - The original string is always the first element of the result.
 * - Variants are ordered by ascending prefix length, then ascending suffix
 *   length, and generation stops once `WILDCARD_MAX_VARIANTS` strings
 *   (including the original) have been collected.
 *
 * @type {import('language').TextProcessor}
 */
export const insertWildcard = {
    name: 'Insert wildcard for grammar patterns',
    description: `いくら騒いでも → いくら${WILDCARD_CHAR}でも`,
    process: (str) => {
        const chars = [...str];
        const n = chars.length;
        if (n < 3) { return [str]; }

        /** @type {string[]} */
        const results = [str];
        // A variant needs at least one prefix, one elided and one suffix
        // code point, so the longest usable prefix is n - 2.
        for (let prefixLen = 1; prefixLen <= n - 2; prefixLen++) {
            // The prefix does not depend on the suffix length; build it once.
            const head = chars.slice(0, prefixLen).join('') + WILDCARD_CHAR;
            // suffixLen < n - prefixLen keeps at least one code point elided.
            for (let suffixLen = 1; suffixLen < n - prefixLen; suffixLen++) {
                results.push(head + chars.slice(n - suffixLen).join(''));
                if (results.length >= WILDCARD_MAX_VARIANTS) { return results; }
            }
        }
        return results;
    },
};
2 changes: 2 additions & 0 deletions ext/js/language/language-descriptors.js
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ import {
collapseEmphaticSequences,
convertHalfWidthCharacters,
convertHiraganaToKatakana,
insertWildcard,
normalizeCJKCompatibilityCharacters,
normalizeCombiningCharacters,
standardizeKanji,
Expand Down Expand Up @@ -355,6 +356,7 @@ const languageDescriptors = [
convertHiraganaToKatakana,
collapseEmphaticSequences,
standardizeKanji,
insertWildcard,
},
languageTransforms: japaneseTransforms,
},
Expand Down
91 changes: 91 additions & 0 deletions test/language/japanese-text-preprocessors.test.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
/*
* Copyright (C) 2024-2026 Yomitan Authors
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/

import {describe, expect, test} from 'vitest';
import {insertWildcard} from '../../ext/js/language/ja/japanese-text-preprocessors.js';

const {process} = insertWildcard;

// Unit tests for the insertWildcard text preprocessor. Every expectation
// builds its wildcard from WILDCARD below, so the suite cannot disagree with
// itself about which tilde code point is expected.
describe('insertWildcard', () => {
    // U+FF5E FULLWIDTH TILDE, written as an escape so it cannot be confused
    // with the look-alike ASCII tilde (U+007E).
    const WILDCARD = '\uFF5E';

    test('short input (1 char) returns only original', () => {
        expect(process('あ')).toStrictEqual(['あ']);
    });

    test('short input (2 chars) returns only original', () => {
        expect(process('あい')).toStrictEqual(['あい']);
    });

    test('3-char input produces 1 wildcard variant', () => {
        // Only one split leaves a non-empty prefix, middle and suffix.
        expect(process('あいう')).toStrictEqual(['あいう', `あ${WILDCARD}う`]);
    });

    test('いくら騒いでも produces いくら~でも', () => {
        const variants = process('いくら騒いでも');
        expect(variants).toContain('いくら騒いでも');
        expect(variants).toContain(`いくら${WILDCARD}でも`);
    });

    test('single-char prefix works for ば~ほど pattern', () => {
        const variants = process('ば食べるほど');
        expect(variants).toContain('ば食べるほど');
        expect(variants).toContain(`ば${WILDCARD}ほど`);
    });

    test('しか~ない pattern works', () => {
        const variants = process('しか言わない');
        expect(variants).toContain('しか言わない');
        expect(variants).toContain(`しか${WILDCARD}ない`);
    });

    test('wildcard character is fullwidth tilde U+FF5E', () => {
        const variants = process('あいう');
        const wildcardVariant = variants.find((v) => v !== 'あいう');
        expect(wildcardVariant).toBe('あ\uFF5Eう');
    });

    test('variant count for 5-char input', () => {
        // n=5: prefixLen 1..3, and for each prefixLen, suffixLen 1..(n-prefixLen-1)
        // p=1: s=1,2,3 (3); p=2: s=1,2 (2); p=3: s=1 (1) = 6 variants + original
        const variants = process('あいうえお');
        expect(variants).toHaveLength(7);
    });

    test('variant count for 7-char input', () => {
        // (n-1)(n-2)/2 = 6*5/2 = 15 variants + original
        const variants = process('あいうえおかき');
        expect(variants).toHaveLength(16);
    });

    test('variants are capped at 51 for long inputs', () => {
        // 15 chars would allow 14*13/2 = 91 variants; the cap stops at
        // 51 entries including the original.
        const longStr = 'あいうえおかきくけこさしすせそ'; // 15 chars
        const variants = process(longStr);
        expect(variants).toHaveLength(51);
    });

    test('empty string returns only original', () => {
        expect(process('')).toStrictEqual(['']);
    });

    test('original string is always first', () => {
        const input = 'いくら騒いでも';
        const variants = process(input);
        expect(variants[0]).toBe(input);
    });
});
1 change: 1 addition & 0 deletions types/ext/language-descriptors.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@ type AllTextProcessors = {
convertHiraganaToKatakana: TextProcessor;
collapseEmphaticSequences: TextProcessor;
standardizeKanji: TextProcessor;
insertWildcard: TextProcessor;
};
};
ka: Record<string, never>;
Expand Down
Loading