Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*.ttf filter=lfs diff=lfs merge=lfs -text
3 changes: 3 additions & 0 deletions src/assets/fonts/NotoSansCJK-Regular.ttf
Git LFS file not shown
63 changes: 41 additions & 22 deletions src/content/extraction.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ import {
extractContent,
type ExtractedContent
} from '../shared/extraction/content-extractor';
import { generatePDF } from '../shared/extraction/pdf-generator';
import { generatePDF, sanitizeFilename } from '../shared/extraction/pdf-generator';
import { registerCJKFont, needsCJKFont, CJK_FONT_FAMILY } from '../shared/extraction/cjk-font';
import { jsPDF } from 'jspdf';

const log = loggers.content;
Expand Down Expand Up @@ -935,6 +936,17 @@ async function generateFallbackPDF(article: { title: string; content: string; te
unit: 'mm',
format: 'a4',
});

let fontFamily = 'helvetica';
if (needsCJKFont((article.title || '') + (article.textContent || '') + (article.content || ''))) {
const hasCJK = await registerCJKFont(pdf);
if (hasCJK) fontFamily = CJK_FONT_FAMILY;
}

const isCJK = fontFamily === CJK_FONT_FAMILY;
const setFont = (style: string) => {
pdf.setFont(fontFamily, isCJK ? 'normal' : style);
};

const pageWidth = pdf.internal.pageSize.getWidth();
const pageHeight = pdf.internal.pageSize.getHeight();
Expand Down Expand Up @@ -976,7 +988,7 @@ async function generateFallbackPDF(article: { title: string; content: string; te
if (alt && alt.length > 5) {
pdf.setFontSize(9);
pdf.setTextColor(100, 100, 100);
pdf.setFont('helvetica', 'italic');
setFont('italic');
const captionLines = pdf.splitTextToSize(alt, contentWidth);
pdf.text(captionLines, margin, yPosition);
yPosition += captionLines.length * 4 + 2;
Expand All @@ -993,20 +1005,20 @@ async function generateFallbackPDF(article: { title: string; content: string; te

pdf.setFontSize(10);
pdf.setTextColor(0, 27, 218);
pdf.setFont('helvetica', 'bold');
setFont('bold');
pdf.text('Filigran XTM Browser Extension', margin, yPosition);
yPosition += 5;

pdf.setFontSize(9);
pdf.setTextColor(100, 100, 100);
pdf.setFont('helvetica', 'normal');
setFont('normal');
pdf.text(`Captured on ${new Date().toLocaleDateString()}`, margin, yPosition);
yPosition += 10;

// Title
pdf.setFontSize(18);
pdf.setTextColor(0, 0, 0);
pdf.setFont('helvetica', 'bold');
setFont('bold');
const titleLines = pdf.splitTextToSize(article.title, contentWidth);
checkPageBreak(titleLines.length * 7);
pdf.text(titleLines, margin, yPosition);
Expand All @@ -1015,7 +1027,7 @@ async function generateFallbackPDF(article: { title: string; content: string; te
// Source URL
pdf.setFontSize(9);
pdf.setTextColor(0, 100, 200);
pdf.setFont('helvetica', 'normal');
setFont('normal');
const truncatedUrl = window.location.href.length > 80
? window.location.href.substring(0, 77) + '...'
: window.location.href;
Expand Down Expand Up @@ -1046,7 +1058,7 @@ async function generateFallbackPDF(article: { title: string; content: string; te
else if (isBold) fontStyle = 'bold';
else if (isItalic) fontStyle = 'italic';

pdf.setFont('helvetica', fontStyle);
setFont(fontStyle);
pdf.setFontSize(fontSize);
pdf.setTextColor(30, 30, 30);

Expand Down Expand Up @@ -1106,7 +1118,7 @@ async function generateFallbackPDF(article: { title: string; content: string; te
const li = listItems[idx];
checkPageBreak(lineHeight);
const bullet = tagName === 'ul' ? '•' : `${idx + 1}.`;
pdf.setFont('helvetica', 'normal');
setFont('normal');
pdf.setFontSize(11);
pdf.setTextColor(30, 30, 30);
pdf.text(bullet, margin, yPosition);
Expand All @@ -1133,7 +1145,7 @@ async function generateFallbackPDF(article: { title: string; content: string; te
const linkText = el.textContent?.trim() || '';
if (linkText && href) {
pdf.setTextColor(0, 100, 200);
pdf.setFont('helvetica', isBold ? 'bold' : 'normal');
setFont(isBold ? 'bold' : 'normal');
pdf.setFontSize(fontSize);
const lines = pdf.splitTextToSize(linkText, contentWidth);
checkPageBreak(lines.length * lineHeight);
Expand All @@ -1154,7 +1166,7 @@ async function generateFallbackPDF(article: { title: string; content: string; te
if (figCaption) {
pdf.setFontSize(9);
pdf.setTextColor(100, 100, 100);
pdf.setFont('helvetica', 'italic');
setFont('italic');
const captionText = figCaption.textContent?.trim() || '';
const captionLines = pdf.splitTextToSize(captionText, contentWidth);
checkPageBreak(captionLines.length * 4);
Expand All @@ -1165,14 +1177,14 @@ async function generateFallbackPDF(article: { title: string; content: string; te
case 'pre':
case 'code':
checkPageBreak(lineHeight);
pdf.setFont('courier', 'normal');
pdf.setFont(isCJK ? fontFamily : 'courier', 'normal');
pdf.setFontSize(9);
pdf.setTextColor(50, 50, 50);
const codeText = el.textContent?.trim() || '';
const codeLines = pdf.splitTextToSize(codeText, contentWidth);
pdf.text(codeLines, margin, yPosition);
yPosition += codeLines.length * 4 + 2;
pdf.setFont('helvetica', 'normal');
setFont('normal');
break;
case 'hr':
checkPageBreak(6);
Expand Down Expand Up @@ -1252,7 +1264,18 @@ async function generateSimpleTextPDF(): Promise<{ data: string; filename: string
unit: 'mm',
format: 'a4',
});


let fontFamily = 'helvetica';
if (needsCJKFont(article.title + textContent)) {
const hasCJK = await registerCJKFont(pdf);
if (hasCJK) fontFamily = CJK_FONT_FAMILY;
}

const isCJK = fontFamily === CJK_FONT_FAMILY;
const setFont = (style: string) => {
pdf.setFont(fontFamily, isCJK ? 'normal' : style);
};

const pageWidth = pdf.internal.pageSize.getWidth();
const pageHeight = pdf.internal.pageSize.getHeight();
const margin = 20;
Expand All @@ -1266,24 +1289,28 @@ async function generateSimpleTextPDF(): Promise<{ data: string; filename: string

pdf.setFontSize(10);
pdf.setTextColor(0, 27, 218);
setFont('bold');
pdf.text('XTM Browser Extension', margin, yPosition);
yPosition += 5;

pdf.setFontSize(9);
pdf.setTextColor(100, 100, 100);
setFont('normal');
pdf.text(`Captured on ${new Date().toLocaleDateString()}`, margin, yPosition);
yPosition += 10;

// Title
pdf.setFontSize(16);
pdf.setTextColor(0, 0, 0);
setFont('bold');
const titleLines = pdf.splitTextToSize(article.title, contentWidth);
pdf.text(titleLines, margin, yPosition);
yPosition += (titleLines.length * 7) + 5;

// Source
pdf.setFontSize(9);
pdf.setTextColor(100, 100, 100);
setFont('normal');
const sourceUrl = window.location.href;
const truncatedUrl = sourceUrl.length > 80 ? sourceUrl.substring(0, 77) + '...' : sourceUrl;
pdf.text(`Source: ${truncatedUrl}`, margin, yPosition);
Expand All @@ -1298,6 +1325,7 @@ async function generateSimpleTextPDF(): Promise<{ data: string; filename: string
// Content
pdf.setFontSize(11);
pdf.setTextColor(30, 30, 30);
setFont('normal');

const paragraphs = textContent.split(/\n\n+/).filter(p => p.trim().length > 0);

Expand Down Expand Up @@ -1338,13 +1366,4 @@ async function generateSimpleTextPDF(): Promise<{ data: string; filename: string
}
}

/**
 * Turn an arbitrary title into a string usable as a PDF file name.
 *
 * Steps, in order:
 *  1. drop the characters `< > : " / \ | ? *`;
 *  2. collapse every run of whitespace into a single underscore;
 *  3. keep at most the first 100 characters.
 *
 * @param name - Raw name, typically a page or article title.
 * @returns The cleaned name; may be empty when `name` holds only dropped characters.
 */
export function sanitizeFilename(name: string): string {
  const maxLength = 100;
  const withoutReservedChars = name.replace(/[<>:"/\\|?*]/g, '');
  const underscored = withoutReservedChars.replace(/\s+/g, '_');
  return underscored.slice(0, maxLength);
}

111 changes: 111 additions & 0 deletions src/shared/extraction/cjk-font.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
/**
* CJK Font loader for jsPDF
*
* Loads the bundled Noto Sans CJK font on demand and registers it with jsPDF
* so that CJK characters (Japanese, Chinese, Korean) render correctly.
* The font is only loaded when CJK characters are detected in the content.
*/

import { jsPDF } from 'jspdf';
import { loggers } from '../utils/logger';
Comment on lines +9 to +10

const log = loggers.extraction;

const CJK_FONT_FILENAME = 'NotoSansCJK-Regular.ttf';
const CJK_FONT_FAMILY = 'NotoSansCJK';

// Code points that the standard Latin PDF fonts cannot draw and that should
// therefore trigger loading of the bundled CJK font.
//
// BMP ranges:
//   U+1100-11FF  Hangul Jamo
//   U+2E80-2FDF  CJK Radicals Supplement, Kangxi Radicals
//   U+3000-33FF  CJK Symbols/Punctuation, Hiragana, Katakana, Bopomofo,
//                Hangul Compatibility Jamo, Kanbun, Bopomofo Extended,
//                CJK Strokes, Katakana Phonetic Extensions,
//                Enclosed CJK Letters and Months, CJK Compatibility
//   U+3400-4DBF  CJK Extension A
//   U+4E00-9FFF  CJK Unified Ideographs
//   U+A960-A97F  Hangul Jamo Extended-A
//   U+AC00-D7FF  Hangul Syllables, Hangul Jamo Extended-B
//   U+F900-FAFF  CJK Compatibility Ideographs
//   U+FE30-FE4F  CJK Compatibility Forms (vertical punctuation)
//   U+FF00-FFEF  Halfwidth and Fullwidth Forms
// Supplementary:
//   U+20000-2FA1F  CJK Extension B-F, CJK Compatibility Ideographs Supplement
//
// NOTE(review): the added blocks are assumed to have glyphs in the bundled
// Noto Sans CJK file — confirm if the shipped font is a subset.
const CJK_RE =
  /[\u1100-\u11FF\u2E80-\u2FDF\u3000-\u33FF\u3400-\u4DBF\u4E00-\u9FFF\uA960-\uA97F\uAC00-\uD7FF\uF900-\uFAFF\uFE30-\uFE4F\uFF00-\uFFEF\u{20000}-\u{2FA1F}]/u;

/**
 * Check whether text contains CJK characters that require
 * the bundled font to render.
 *
 * A single matching code point anywhere in the string is enough.
 *
 * @param text - Text that is about to be written into the PDF.
 * @returns `true` if at least one character falls in the ranges of `CJK_RE`,
 *          `false` otherwise (including for the empty string).
 */
export function needsCJKFont(text: string): boolean {
  // CJK_RE has no `g`/`y` flag, so `test` keeps no state between calls.
  return CJK_RE.test(text);
}

// In-flight guard: non-null only while a font fetch is running. It is cleared
// as soon as that fetch settles, so this module keeps no reference to the
// encoded font between PDF generations.
let fontLoadPromise: Promise<string | null> | null = null;

/**
 * Fetch the bundled CJK font from the extension's assets and encode it as base64.
 *
 * Calls made while a fetch is already running receive the same pending
 * promise instead of starting a second download. Nothing is cached once the
 * promise settles, so a later call downloads the file again.
 *
 * @returns The base64-encoded font, or `null` when the response is not OK or
 *          any step throws. Failures are logged as warnings; the returned
 *          promise does not reject.
 */
function loadCJKFontBase64(): Promise<string | null> {
  if (fontLoadPromise !== null) {
    return fontLoadPromise;
  }

  const fetchAndEncode = async (): Promise<string | null> => {
    try {
      const response = await fetch(chrome.runtime.getURL(`assets/fonts/${CJK_FONT_FILENAME}`));
      if (!response.ok) {
        log.warn('[CJKFont] Failed to fetch font:', response.status);
        return null;
      }

      const bytes = new Uint8Array(await response.arrayBuffer());

      // btoa() wants a "binary string" (one char per byte). Build it in
      // fixed-size pieces, one character at a time, rather than passing the
      // bytes as call arguments, which can exceed engine argument limits.
      const pieceLength = 8192;
      const pieces: string[] = [];
      for (let start = 0; start < bytes.length; start += pieceLength) {
        let piece = '';
        for (const byte of bytes.subarray(start, start + pieceLength)) {
          piece += String.fromCharCode(byte);
        }
        pieces.push(piece);
      }
      const base64 = btoa(pieces.join(''));

      log.debug('[CJKFont] Font loaded, size:', bytes.length);
      return base64;
    } catch (error) {
      log.warn('[CJKFont] Error loading font:', error);
      return null;
    }
  };

  fontLoadPromise = fetchAndEncode().finally(() => {
    // Drop the guard so the settled promise (and the string it holds)
    // is no longer referenced from module state.
    fontLoadPromise = null;
  });

  return fontLoadPromise;
}

/**
 * Make the bundled CJK font available on a jsPDF document.
 *
 * Loads the font as base64, adds it to the document's virtual file system,
 * registers it under `CJK_FONT_FAMILY` for the 'normal' style only, and then
 * selects it. On success the document's active font is therefore the CJK
 * font. No bold or italic variant is registered here.
 *
 * @param pdf - The jsPDF document to register the font on.
 * @returns `true` when every step completed; `false` when the font could not
 *          be loaded or any jsPDF call threw (the error is logged as a warning).
 */
export async function registerCJKFont(pdf: jsPDF): Promise<boolean> {
  let registered = false;

  try {
    const encodedFont = await loadCJKFontBase64();
    if (encodedFont) {
      pdf.addFileToVFS(CJK_FONT_FILENAME, encodedFont);
      pdf.addFont(CJK_FONT_FILENAME, CJK_FONT_FAMILY, 'normal');

      // Select the font immediately as a usability check
      pdf.setFont(CJK_FONT_FAMILY, 'normal');

      registered = true;
    }
  } catch (error) {
    log.warn('[CJKFont] Failed to register font (file may be corrupt):', error);
  }

  return registered;
}

/**
* The font family name to use after registration.
*/
export { CJK_FONT_FAMILY };
Loading
Loading