Skip to content
4 changes: 4 additions & 0 deletions apps/trustlab/next.config.mjs
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,10 @@ const nextConfig = {
source: "/robots.txt",
destination: "/api/v1/robots",
},
{
source: "/sitemap.xml",
destination: "/api/v1/sitemap",
},
];
},
};
Expand Down
156 changes: 156 additions & 0 deletions apps/trustlab/src/lib/data/common/sitemap.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
import * as Sentry from "@sentry/nextjs";

import { site } from "@/trustlab/utils";

function normalizePathname(pathname) {
if (!pathname || typeof pathname !== "string") {
return null;
}

if (pathname === "/") {
return pathname;
}

const trimmed = pathname.trim();
if (!trimmed) {
return null;
}

const withLeadingSlash = trimmed.startsWith("/") ? trimmed : `/${trimmed}`;
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. I don't think we need path normalisation, since all collection items that appear as pages on the front-end already have a pathname attribute, and it should already be normalised (please check the code that generates them, and let's fix that if it's broken).
  2. We seem to be removing trailing / from site.url but here we're prepending / to pathname. It seems counter-productive.

return withLeadingSlash.replace(/\/+$/, "");
}

/**
 * Builds an absolute URL for a pathname by joining it onto `site.url`.
 *
 * The pathname is first passed through `normalizePathname`; when that yields a
 * falsy value, no URL can be built and `null` is returned. Trailing slashes
 * are stripped from `site.url` before the normalized pathname is appended.
 *
 * @param {string} pathname - Pathname of a document.
 * @returns {string|null} The absolute URL, or `null` when the pathname does
 *   not normalize to a usable value.
 */
function getAbsoluteUrl(pathname) {
  const cleanPath = normalizePathname(pathname);

  if (cleanPath) {
    // Drop any trailing slashes from the base so the join never doubles up.
    const baseUrl = site.url.replace(/\/+$/, "");
    return baseUrl.concat(cleanPath);
  }

  return null;
}

function getLastModified(doc) {
const rawDate = doc?.updatedAt || doc?.createdAt;
if (!rawDate) {
return null;
}

const parsedDate = new Date(rawDate);
if (Number.isNaN(parsedDate.getTime())) {
return null;
}

return parsedDate.toISOString();
Comment on lines +38 to +44
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's also remove these unnecessary empty lines; if they come from the agent, please review and fix them before creating PRs.

}

/**
 * Converts a document and its pathname into a sitemap entry.
 *
 * @param {object} doc - Document passed to `getLastModified` to derive the
 *   entry's `lastModified` value.
 * @param {string} pathname - Pathname passed to `getAbsoluteUrl`.
 * @returns {{url: string, lastModified: *}|null} The entry, or `null` when no
 *   absolute URL could be built for the pathname.
 */
function toSitemapEntry(doc, pathname) {
  const absoluteUrl = getAbsoluteUrl(pathname);

  // Without a URL there is nothing to list; the date is only looked up for
  // entries that will actually be emitted.
  return absoluteUrl
    ? { url: absoluteUrl, lastModified: getLastModified(doc) }
    : null;
}

/**
 * Logs a Sentry warning for a collection item that has no `pathname`.
 *
 * @param {string} collection - Name of the collection the item belongs to.
 * @param {object} [doc] - The offending document; only its `slug` is logged.
 */
function warnOnMissingPathname(collection, doc) {
  const context = { collection, slug: doc?.slug };

  Sentry.logger.warn(
    `Collection item without \`pathname\` in sitemap`,
    context,
  );
}

/**
 * Collects sitemap entries for the "pages" collection.
 *
 * Queries only published pages, excludes the "404" and "500" slugs, and
 * requests just the fields listed in `select`. Pages without a `pathname` are
 * reported through `warnOnMissingPathname` and left out of the result.
 *
 * @param {object} api - Client exposing `getCollection(name, options)`.
 * @returns {Promise<Array>} Entries produced by `toSitemapEntry`, with falsy
 *   results dropped.
 */
async function getPagesEntries(api) {
  const query = {
    pagination: false,
    select: {
      pathname: true,
      slug: true,
      parent: true,
      breadcrumbs: true,
      updatedAt: true,
      createdAt: true,
    },
    where: {
      and: [
        { _status: { equals: "published" } },
        { slug: { not_in: ["404", "500"] } },
      ],
    },
  };
  const { docs: pages } = await api.getCollection("pages", query);

  const entries = [];
  pages.forEach((page) => {
    if (!page?.pathname) {
      warnOnMissingPathname("pages", page);
      return;
    }

    const entry = toSitemapEntry(page, page.pathname);
    if (entry) {
      entries.push(entry);
    }
  });

  return entries;
}

/**
 * Collects sitemap entries for the "opportunities" collection.
 *
 * Fetches every item (pagination disabled) with only the fields listed in
 * `select`. Items without a `pathname` are reported through
 * `warnOnMissingPathname` and left out of the result.
 *
 * @param {object} api - Client exposing `getCollection(name, options)`.
 * @returns {Promise<Array>} Entries produced by `toSitemapEntry`, with falsy
 *   results dropped.
 */
async function getOpportunitiesEntries(api) {
  const query = {
    pagination: false,
    select: {
      pathname: true,
      slug: true,
      type: true,
      updatedAt: true,
      createdAt: true,
      date: true,
    },
  };
  const { docs: opportunities } = await api.getCollection(
    "opportunities",
    query,
  );

  // Each item contributes zero or one entry; flatMap drops the empty ones.
  return opportunities.flatMap((opportunity) => {
    if (!opportunity?.pathname) {
      warnOnMissingPathname("opportunities", opportunity);
      return [];
    }

    const entry = toSitemapEntry(opportunity, opportunity.pathname);
    return entry ? [entry] : [];
  });
}

async function getSitemapEntries(api) {
const [pages, opportunities] = await Promise.all([
getPagesEntries(api),
getOpportunitiesEntries(api),
]);

return [...pages, ...opportunities].sort((left, right) =>
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  1. I don't think we need this expensive sorting.
  2. If we want to sort, maybe sorting by last modified is more meaningful, to see what has changed on the site.

left.url.localeCompare(right.url),
Comment on lines +136 to +137
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Deduplicate merged sitemap entries by URL

Merging pages and opportunities with [...] can emit duplicate <url> nodes when two documents resolve to the same pathname (slug uniqueness is enforced per collection, not across collections). In that case /sitemap.xml contains repeated locations with potentially conflicting lastmod values, which weakens crawl signals and can cause crawlers to ignore one of the entries. Add a URL-based dedup step before sorting/serializing.

Useful? React with 👍 / 👎.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, slugs could be duplicated, but pathname, which is what we're interested in, will always be different, i.e. it's the URL at which the item appears on the frontend, and two items can't have the same URL regardless of collection.

);
}

/**
 * Entity-escapes the characters that must not appear raw inside XML text
 * content (`&`, `<`, `>`, `"`, `'`).
 *
 * @param {*} value - Value to embed in the XML document; coerced to a string.
 * @returns {string} The escaped string.
 */
function escapeXml(value) {
  const entities = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&apos;",
  };

  return String(value).replace(/[&<>"']/g, (char) => entities[char]);
}

/**
 * Builds the sitemap XML document from the entries returned by
 * `getSitemapEntries`.
 *
 * Each entry becomes a `<url>` node with a `<loc>` and, when the entry has a
 * truthy `lastModified`, a `<lastmod>`. Values are entity-escaped so that URLs
 * containing characters such as `&` (e.g. in a query string) still yield
 * well-formed XML.
 *
 * @param {object} api - Client forwarded to `getSitemapEntries`.
 * @returns {Promise<string>} The complete `<urlset>` document.
 */
async function buildSitemapXml(api) {
  const entries = await getSitemapEntries(api);
  const xmlEntries = entries
    .map(({ url, lastModified }) => {
      const lastModifiedNode = lastModified
        ? `\n <lastmod>${escapeXml(lastModified)}</lastmod>`
        : "";

      return ` <url>\n <loc>${escapeXml(url)}</loc>${lastModifiedNode}\n </url>`;
    })
    .join("\n");

  return `<?xml version="1.0" encoding="UTF-8"?>\n<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n${xmlEntries}\n</urlset>\n`;
}

export default buildSitemapXml;
7 changes: 6 additions & 1 deletion apps/trustlab/src/lib/data/index.js
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
export { getPageStaticPaths, getPageStaticProps, getRobotsTxt } from "./local";
export {
getPageStaticPaths,
getPageStaticProps,
getRobotsTxt,
getSitemapXml,
} from "./local";

export default undefined;
5 changes: 5 additions & 0 deletions apps/trustlab/src/lib/data/local/index.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { getPageProps, getPagePaths } from "@/trustlab/lib/data/common";
import buildSitemapXml from "@/trustlab/lib/data/common/sitemap";
import api from "@/trustlab/lib/payload";

export async function getPageStaticPaths() {
Expand All @@ -21,4 +22,8 @@ export async function getRobotsTxt() {
return siteSettings?.robotsTxt;
}

/**
 * Produces the sitemap XML by delegating to `buildSitemapXml` with the
 * imported `api` client.
 *
 * @returns {Promise<*>} Whatever `buildSitemapXml(api)` resolves to.
 */
export async function getSitemapXml() {
  const sitemapXml = await buildSitemapXml(api);

  return sitemapXml;
}

export default undefined;
24 changes: 24 additions & 0 deletions apps/trustlab/src/pages/api/v1/sitemap.page.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import * as Sentry from "@sentry/nextjs";

import { getSitemapXml } from "@/trustlab/lib/data";

/**
 * API route handler that serves the sitemap XML.
 *
 * - Non-GET requests receive a 405 with an `Allow: GET` header.
 * - GET requests receive the XML from `getSitemapXml()` with cache and
 *   content-type headers set.
 * - Any error raised while generating or sending the sitemap is reported to
 *   Sentry and answered with an empty 500.
 *
 * @param {object} req - Incoming request; only `req.method` is read.
 * @param {object} res - Response object used via `setHeader`, `status`,
 *   `end` and `send`.
 * @returns {Promise<void>}
 */
export default async function handler(req, res) {
  const isGet = req.method === "GET";
  if (!isGet) {
    res.setHeader("Allow", "GET");
    res.status(405).end();
    return;
  }

  try {
    const body = await getSitemapXml();
    const headers = [
      ["Cache-Control", "public, max-age=3600, stale-while-revalidate=86400"],
      ["Content-Type", "application/xml; charset=utf-8"],
    ];

    headers.forEach(([name, value]) => {
      res.setHeader(name, value);
    });
    res.send(body);
  } catch (err) {
    Sentry.captureException(err);
    res.status(500).end();
  }
}
Loading