Skip to content

Commit 43883f8

Browse files
committed
perf(core): optimize broken links checker (#9778)
1 parent 47985e7 commit 43883f8

File tree

2 files changed

+174
-66
lines changed

2 files changed

+174
-66
lines changed

packages/docusaurus/src/server/__tests__/brokenLinks.test.ts

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
*/
77

88
import {jest} from '@jest/globals';
9+
import reactRouterConfig from 'react-router-config';
910
import {handleBrokenLinks} from '../brokenLinks';
1011
import type {RouteConfig} from '@docusaurus/types';
1112

@@ -718,4 +719,60 @@ describe('handleBrokenLinks', () => {
718719
"
719720
`);
720721
});
722+
723+
it('is performant and minimize calls to matchRoutes', async () => {
724+
const matchRoutesMock = jest.spyOn(reactRouterConfig, 'matchRoutes');
725+
726+
const scale = 100;
727+
728+
const routes: SimpleRoute[] = [
729+
...Array.from<SimpleRoute>({length: scale}).map((_, i) => ({
730+
path: `/page${i}`,
731+
})),
732+
...Array.from<SimpleRoute>({length: scale}).fill({
733+
path: '/pageDynamic/:subpath1',
734+
}),
735+
];
736+
737+
const collectedLinks: Params['collectedLinks'] = Object.fromEntries(
738+
Array.from<SimpleRoute>({length: scale}).map((_, i) => [
739+
`/page${i}`,
740+
{
741+
links: [
742+
...Array.from<SimpleRoute>({length: scale}).flatMap((_2, j) => [
743+
`/page${j}`,
744+
`/page${j}?age=42`,
745+
`/page${j}#anchor${j}`,
746+
`/page${j}?age=42#anchor${j}`,
747+
`/pageDynamic/subPath${j}`,
748+
`/pageDynamic/subPath${j}?age=42`,
749+
// `/pageDynamic/subPath${j}#anchor${j}`,
750+
// `/pageDynamic/subPath${j}?age=42#anchor${j}`,
751+
]),
752+
],
753+
anchors: Array.from<SimpleRoute>({length: scale}).map(
754+
(_2, j) => `anchor${j}`,
755+
),
756+
},
757+
]),
758+
);
759+
760+
// console.time('testBrokenLinks');
761+
await testBrokenLinks({
762+
routes,
763+
collectedLinks,
764+
});
765+
// console.timeEnd('testBrokenLinks');
766+
767+
// Idiomatic code calling matchRoutes multiple times is not performant
768+
// We try to minimize the calls to this expensive function
769+
// Otherwise large sites will have super long execution times
770+
// See https://github.com/facebook/docusaurus/issues/9754
771+
// See https://twitter.com/sebastienlorber/status/1749392773415858587
772+
// We expect no more matchRoutes calls than number of dynamic route links
773+
expect(matchRoutesMock).toHaveBeenCalledTimes(scale);
774+
// We expect matchRoutes to be called with a reduced number of routes
775+
expect(routes).toHaveLength(scale * 2);
776+
expect(matchRoutesMock.mock.calls[0]![0]).toHaveLength(scale);
777+
});
721778
});

packages/docusaurus/src/server/brokenLinks.ts

Lines changed: 117 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,18 @@
77

88
import _ from 'lodash';
99
import logger from '@docusaurus/logger';
10-
import {matchRoutes} from 'react-router-config';
10+
import {matchRoutes as reactRouterMatchRoutes} from 'react-router-config';
1111
import {parseURLPath, serializeURLPath, type URLPath} from '@docusaurus/utils';
1212
import {getAllFinalRoutes} from './utils';
1313
import type {RouteConfig, ReportingSeverity} from '@docusaurus/types';
1414

15+
function matchRoutes(routeConfig: RouteConfig[], pathname: string) {
16+
// @ts-expect-error: React router types RouteConfig with an actual React
17+
// component, but we load route components with string paths.
18+
// We don't actually access component here, so it's fine.
19+
return reactRouterMatchRoutes(routeConfig, pathname);
20+
}
21+
1522
type BrokenLink = {
1623
link: string;
1724
resolvedLink: string;
@@ -26,88 +33,121 @@ type CollectedLinks = {
2633
[pathname: string]: {links: string[]; anchors: string[]};
2734
};
2835

29-
function getBrokenLinksForPage({
36+
// We use efficient data structures for performance reasons
37+
// See https://github.com/facebook/docusaurus/issues/9754
38+
type CollectedLinksNormalized = Map<
39+
string,
40+
{links: Set<string>; anchors: Set<string>}
41+
>;
42+
43+
type BrokenLinksHelper = {
44+
collectedLinks: CollectedLinksNormalized;
45+
isPathBrokenLink: (linkPath: URLPath) => boolean;
46+
isAnchorBrokenLink: (linkPath: URLPath) => boolean;
47+
};
48+
49+
function createBrokenLinksHelper({
3050
collectedLinks,
31-
pagePath,
32-
pageLinks,
3351
routes,
3452
}: {
35-
collectedLinks: CollectedLinks;
36-
pagePath: string;
37-
pageLinks: string[];
38-
pageAnchors: string[];
53+
collectedLinks: CollectedLinksNormalized;
3954
routes: RouteConfig[];
40-
}): BrokenLink[] {
41-
const allCollectedPaths = new Set(Object.keys(collectedLinks));
55+
}): BrokenLinksHelper {
56+
const validPathnames = new Set(collectedLinks.keys());
57+
58+
// Matching against the route array can be expensive
59+
// If the route is already in the valid pathnames,
60+
// we can avoid matching against it as an optimization
61+
const remainingRoutes = routes.filter(
62+
(route) => !validPathnames.has(route.path),
63+
);
64+
65+
function isPathnameMatchingAnyRoute(pathname: string): boolean {
66+
if (matchRoutes(remainingRoutes, pathname).length > 0) {
67+
// IMPORTANT: this is an optimization here
68+
// See https://github.com/facebook/docusaurus/issues/9754
69+
// Large Docusaurus sites have many routes!
70+
// We try to minimize calls to a possibly expensive matchRoutes function
71+
validPathnames.add(pathname);
72+
return true;
73+
}
74+
75+
return false;
76+
}
4277

4378
function isPathBrokenLink(linkPath: URLPath) {
4479
const pathnames = [linkPath.pathname, decodeURI(linkPath.pathname)];
45-
const matchedRoutes = pathnames
46-
// @ts-expect-error: React router types RouteConfig with an actual React
47-
// component, but we load route components with string paths.
48-
// We don't actually access component here, so it's fine.
49-
.map((l) => matchRoutes(routes, l))
50-
.flat();
51-
// The link path is broken if:
52-
// - it doesn't match any route
53-
// - it doesn't match any collected path
54-
return (
55-
matchedRoutes.length === 0 &&
56-
!pathnames.some((p) => allCollectedPaths.has(p))
57-
);
80+
if (pathnames.some((p) => validPathnames.has(p))) {
81+
return false;
82+
}
83+
if (pathnames.some(isPathnameMatchingAnyRoute)) {
84+
return false;
85+
}
86+
return true;
5887
}
5988

6089
function isAnchorBrokenLink(linkPath: URLPath) {
6190
const {pathname, hash} = linkPath;
62-
6391
// Link has no hash: it can't be a broken anchor link
6492
if (hash === undefined) {
6593
return false;
6694
}
67-
6895
// Link has empty hash ("#", "/page#"...): we do not report it as broken
6996
// Empty hashes are used for various weird reasons, by us and other users...
7097
// See for example: https://github.com/facebook/docusaurus/pull/6003
7198
if (hash === '') {
7299
return false;
73100
}
74-
75101
const targetPage =
76-
collectedLinks[pathname] || collectedLinks[decodeURI(pathname)];
77-
102+
collectedLinks.get(pathname) || collectedLinks.get(decodeURI(pathname));
78103
// link with anchor to a page that does not exist (or did not collect any
79104
// link/anchor) is considered as a broken anchor
80105
if (!targetPage) {
81106
return true;
82107
}
83-
84-
// it's a broken anchor if the target page exists
85-
// but the anchor does not exist on that page
86-
const hashes = [hash, decodeURIComponent(hash)];
87-
return !targetPage.anchors.some((anchor) => hashes.includes(anchor));
108+
// it's a not broken anchor if the anchor exists on the target page
109+
if (
110+
targetPage.anchors.has(hash) ||
111+
targetPage.anchors.has(decodeURIComponent(hash))
112+
) {
113+
return false;
114+
}
115+
return true;
88116
}
89117

90-
const brokenLinks = pageLinks.flatMap((link) => {
118+
return {
119+
collectedLinks,
120+
isPathBrokenLink,
121+
isAnchorBrokenLink,
122+
};
123+
}
124+
125+
function getBrokenLinksForPage({
126+
pagePath,
127+
helper,
128+
}: {
129+
pagePath: string;
130+
helper: BrokenLinksHelper;
131+
}): BrokenLink[] {
132+
const pageData = helper.collectedLinks.get(pagePath)!;
133+
134+
const brokenLinks: BrokenLink[] = [];
135+
136+
pageData.links.forEach((link) => {
91137
const linkPath = parseURLPath(link, pagePath);
92-
if (isPathBrokenLink(linkPath)) {
93-
return [
94-
{
95-
link,
96-
resolvedLink: serializeURLPath(linkPath),
97-
anchor: false,
98-
},
99-
];
100-
}
101-
if (isAnchorBrokenLink(linkPath)) {
102-
return [
103-
{
104-
link,
105-
resolvedLink: serializeURLPath(linkPath),
106-
anchor: true,
107-
},
108-
];
138+
if (helper.isPathBrokenLink(linkPath)) {
139+
brokenLinks.push({
140+
link,
141+
resolvedLink: serializeURLPath(linkPath),
142+
anchor: false,
143+
});
144+
} else if (helper.isAnchorBrokenLink(linkPath)) {
145+
brokenLinks.push({
146+
link,
147+
resolvedLink: serializeURLPath(linkPath),
148+
anchor: true,
149+
});
109150
}
110-
return [];
111151
});
112152

113153
return brokenLinks;
@@ -128,26 +168,30 @@ function getBrokenLinks({
128168
collectedLinks,
129169
routes,
130170
}: {
131-
collectedLinks: CollectedLinks;
171+
collectedLinks: CollectedLinksNormalized;
132172
routes: RouteConfig[];
133173
}): BrokenLinksMap {
134174
const filteredRoutes = filterIntermediateRoutes(routes);
135175

136-
return _.mapValues(collectedLinks, (pageCollectedData, pagePath) => {
176+
const helper = createBrokenLinksHelper({
177+
collectedLinks,
178+
routes: filteredRoutes,
179+
});
180+
181+
const result: BrokenLinksMap = {};
182+
collectedLinks.forEach((_unused, pagePath) => {
137183
try {
138-
return getBrokenLinksForPage({
139-
collectedLinks,
140-
pageLinks: pageCollectedData.links,
141-
pageAnchors: pageCollectedData.anchors,
184+
result[pagePath] = getBrokenLinksForPage({
142185
pagePath,
143-
routes: filteredRoutes,
186+
helper,
144187
});
145188
} catch (e) {
146189
throw new Error(`Unable to get broken links for page ${pagePath}.`, {
147190
cause: e,
148191
});
149192
}
150193
});
194+
return result;
151195
}
152196

153197
function brokenLinkMessage(brokenLink: BrokenLink): string {
@@ -303,15 +347,22 @@ function reportBrokenLinks({
303347
// JS users might call "collectLink(undefined)" for example
304348
// TS users might call "collectAnchor('#hash')" with/without #
305349
// We clean/normalize the collected data to avoid obscure errors being thrown
350+
// We also use optimized data structures for a faster algorithm
306351
function normalizeCollectedLinks(
307352
collectedLinks: CollectedLinks,
308-
): CollectedLinks {
309-
return _.mapValues(collectedLinks, (pageCollectedData) => ({
310-
links: pageCollectedData.links.filter(_.isString),
311-
anchors: pageCollectedData.anchors
312-
.filter(_.isString)
313-
.map((anchor) => (anchor.startsWith('#') ? anchor.slice(1) : anchor)),
314-
}));
353+
): CollectedLinksNormalized {
354+
const result: CollectedLinksNormalized = new Map();
355+
Object.entries(collectedLinks).forEach(([pathname, pageCollectedData]) => {
356+
result.set(pathname, {
357+
links: new Set(pageCollectedData.links.filter(_.isString)),
358+
anchors: new Set(
359+
pageCollectedData.anchors
360+
.filter(_.isString)
361+
.map((anchor) => (anchor.startsWith('#') ? anchor.slice(1) : anchor)),
362+
),
363+
});
364+
});
365+
return result;
315366
}
316367

317368
export async function handleBrokenLinks({

0 commit comments

Comments
 (0)