Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 36 additions & 1 deletion src/processor.js
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,37 @@ export async function fetchXArticleContent(articleUrl, config, sourceTweetId = n
}
}

/**
 * Heuristic for spotting a filename (e.g. "README.md") that Twitter turned
 * into a link to a Moldova (.md) ccTLD domain, as opposed to a URL the
 * author actually meant to share.
 *
 * Returns true only when ALL of the following hold:
 *   - the input parses as a URL,
 *   - its hostname ends in ".md",
 *   - the hostname is not obsidian.md or one of its subdomains,
 *   - the hostname has exactly two labels (no subdomain),
 *   - the path is empty or a single "/".
 * Query strings and hash fragments are not considered.
 *
 * @param {string} expandedUrl - Fully expanded URL (after t.co redirect)
 * @returns {boolean} true when the URL looks like an auto-linked filename
 */
export function isLikelyMdFilename(expandedUrl) {
  let parsed;
  try {
    parsed = new URL(expandedUrl);
  } catch {
    // Anything that is not a parseable URL cannot be an auto-linked domain.
    return false;
  }

  const { hostname, pathname } = parsed;

  const isMdDomain = hostname.endsWith('.md');

  // Allowlist: obsidian.md is the only known legitimate .md domain in tech Twitter
  const isObsidian = hostname === 'obsidian.md' || hostname.endsWith('.obsidian.md');

  // Subdomained .md domains (e.g., www.plan.md) are more likely real websites
  const hasSubdomain = hostname.split('.').length > 2;

  // A path beyond the root (ignoring one trailing slash) suggests an intentional link
  const hasMeaningfulPath = pathname.replace(/\/$/, '') !== '';

  // What remains is a bare .md root domain, treated as an auto-linked filename.
  // TODO: ALL_CAPS detection needs bird CLI's entities.urls[].display_url
  return isMdDomain && !isObsidian && !hasSubdomain && !hasMeaningfulPath;
}

// Sites that typically require paywall bypass
const PAYWALL_DOMAINS = [
'nytimes.com',
Expand Down Expand Up @@ -684,10 +715,14 @@ export async function fetchAndPrepareBookmarks(options = {}) {
for (const { original: link, expanded } of expandedResults) {
console.log(` Expanded: ${link} -> ${expanded}`);

// Setting type based on URL patterns - this can be expanded with more patterns as needed
let type = 'unknown';
let content = null;

if (expanded.includes('github.com')) {
if (isLikelyMdFilename(expanded)) {
type = 'filename-reference';
console.log(` Skipping .md filename auto-link: ${expanded}`);
} else if (expanded.includes('github.com')) {
type = 'github';
} else if (expanded.includes('youtube.com') || expanded.includes('youtu.be')) {
type = 'video';
Expand Down
64 changes: 63 additions & 1 deletion test/processor.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import fs from 'fs';
import path from 'path';
import { execSync } from 'child_process';
import { fileURLToPath } from 'url';
import { isPaywalled, stripQuerystring, fetchXArticleContent } from '../src/processor.js';
import { isPaywalled, stripQuerystring, fetchXArticleContent, isLikelyMdFilename } from '../src/processor.js';

const __dirname = path.dirname(fileURLToPath(import.meta.url));

Expand Down Expand Up @@ -621,3 +621,65 @@ describe('X article integration tests (requires bird credentials)', { skip: !BIR
assert.strictEqual(result.articleId, '9999999999999999999');
});
});

// Test suite for isLikelyMdFilename: bare .md root domains are expected to be
// flagged, while obsidian.md, subdomained hosts, URLs with a path, non-.md
// hosts, and unparseable input are expected to pass through unflagged.
describe('isLikelyMdFilename', () => {
  // Thin wrappers so each case reads as "this URL is filtered / kept".
  // `message` is optional; when omitted assert uses its default message.
  const expectFiltered = (url, message) => {
    assert.strictEqual(isLikelyMdFilename(url), true, message);
  };
  const expectKept = (url, message) => {
    assert.strictEqual(isLikelyMdFilename(url), false, message);
  };

  describe('filters bare .md root domains', () => {
    test('known dev filenames are detected', () => {
      const filenames = ['claude', 'readme', 'plan', 'todo', 'memory'];
      filenames.forEach((name) => {
        expectFiltered(`https://${name}.md/`, `${name}.md should be filtered`);
      });
    });

    test('unknown bare .md domains are also filtered', () => {
      expectFiltered('https://banana.md/');
      expectFiltered('https://xyz.md');
    });

    test('URL spec lowercases hostnames before function runs', () => {
      expectFiltered('https://CLAUDE.md/');
    });

    test('query params on root path are still filtered', () => {
      expectFiltered('https://plan.md/?utm_source=twitter');
    });

    test('hash fragments on root path are still filtered', () => {
      expectFiltered('https://plan.md/#section');
    });

    test('http:// works same as https://', () => {
      expectFiltered('http://plan.md/');
    });
  });

  describe('does not filter legitimate .md URLs', () => {
    test('allows obsidian.md and subdomains', () => {
      expectKept('https://obsidian.md/');
      expectKept('https://help.obsidian.md/Getting+Started');
      expectKept('https://forum.obsidian.md/t/some-thread');
    });

    test('does not filter subdomained .md domains', () => {
      expectKept('https://www.plan.md/');
      expectKept('https://app.something.md/');
    });

    test('does not filter .md domains with a path', () => {
      expectKept('https://plan.md/about/us');
      expectKept('https://some.md/blog/post-123');
      expectKept('https://plan.md/pricing?ref=123');
    });
  });

  describe('ignores non-.md URLs', () => {
    test('non-.md domains return false', () => {
      const otherUrls = ['https://github.com/user/repo', 'https://example.com/', 'https://x.com/user/status/123'];
      otherUrls.forEach((url) => {
        expectKept(url, `${url} should not be filtered`);
      });
    });

    test('handles invalid URLs gracefully', () => {
      expectKept('not-a-url');
    });
  });
});