xref: /aosp_15_r20/external/cldr/tools/scripts/tr-archive/extract-link-targets.js (revision 912701f9769bb47905792267661f0baf2b85bed5)
1const fs = require("fs").promises;
2const jsdom = require("jsdom");
3const { JSDOM } = jsdom;
4const path = require("path");
5
6/**
7 * Run this after outputting html into 'dist'
8 * It will update ../../../docs/ldml/*.anchors.json
9 * Use source control to see if the links have changed.
10 */
11
12// We would ideally run marked and process the output here.
13// But that might introduce duplicate code.
// Status glyphs prefixed to console messages.
// NOTE(review): PACKAGE_ICON, SECTION_ICON, TYPE_ICON and POINT_ICON previously
// held U+FFFD replacement characters (mis-encoded text); the emoji below are a
// best-effort restoration — confirm against the upstream file.
const DONE_ICON = "✅";
const GEAR_ICON = "⚙️";
const NONE_ICON = "∅";
const PACKAGE_ICON = "📦";
const SECTION_ICON = "📍";
const TYPE_ICON = "📂";
const WARN_ICON = "⚠️";
const POINT_ICON = "👉";
const MISSING_ICON = "❌";
23
/**
 * Build the markdown-relative link for a section, optionally with an anchor.
 *
 * @param {string} targetSection section basename, e.g. 'tr35-info'
 * @param {string} [anchor] anchor within the page, e.g. 'Parts'; when falsy
 *   (omitted or empty) only the page name is returned
 * @returns {string} e.g. 'tr35-info.md#Parts', or 'tr35-info.md' without an anchor
 */
function constructLink(targetSection, anchor) {
  return anchor
    ? `${targetSection}.md#${anchor}`
    : `${targetSection}.md`;
}
37
/**
 * Read one rendered .html file, collect the anchors it defines and the
 * internal link targets it references, and write the sorted anchor list to
 * ../../../docs/ldml/<basename>.anchors.json (relative to the working directory).
 *
 * Duplicate anchors and unrecognized hrefs are reported on stderr but do not
 * abort processing.
 *
 * @param {string} infile path to the input .html file
 * @returns {Promise<[string, string[], string[]]>} tuple of
 *   [basename, sorted anchors, sorted internal targets]; targets use the
 *   `page.md#anchor` form produced by constructLink()
 */
async function extractAnchors(infile) {
  const basename = path.basename(infile, ".html");
  const dirname = "../../../docs/ldml";
  console.log(`${SECTION_ICON} Reading ${infile}`);
  let rawHtml = await fs.readFile(infile, "utf-8");

  // oh the irony of removing a BOM before posting to unicode.org
  // Once decoded, a UTF-8 BOM is the single code unit U+FEFF, so drop exactly
  // one character (dropping three would also eat the first two real ones).
  if (rawHtml.charCodeAt(0) === 0xfeff) {
    rawHtml = rawHtml.slice(1);
  }

  // Spin up a JSDOM so we can walk the parsed document
  const dom = new JSDOM(rawHtml);
  const document = dom.window.document;

  const anchors = new Set();
  const targets = new Set();

  const INTRA_PAGE_LINK = /^#(.*)$/; // starts with #  => 1=anchor
  // The dot before "html" is escaped so that only a literal ".html" matches.
  const TR_SECTION_LINK = /^(tr35[^.]*)\.html(?:#(.*))?$/; // => 1=basename, 2=anchor
  const EXTERNAL_LINK = /^(http|https|mailto|ftp):.*$/; // scheme

  /** Record an anchor name, warning if this page already defined it. */
  function addAnchor(n) {
    if (!n) return;
    if (anchors.has(n)) {
      console.error(`${WARN_ICON} ${constructLink(basename)}: Duplicate anchor: #${n}`);
    } else {
      anchors.add(n);
    }
  }

  /** Classify an href and record it when it points inside the tr35 pages. */
  function addTarget(href) {
    const intraPage = INTRA_PAGE_LINK.exec(href);
    if (intraPage) {
      // same page
      targets.add(constructLink(basename, intraPage[1]));
      return;
    }
    const trSection = TR_SECTION_LINK.exec(href);
    if (trSection) {
      // another page; group 2 is undefined when the link has no #anchor
      targets.add(constructLink(trSection[1], trSection[2]));
      return;
    }
    if (EXTERNAL_LINK.test(href)) {
      // external: nothing to do
      // TODO: add to list of external links?
      return;
    }
    // Error on all other links
    console.error(`${WARN_ICON} ${basename}: Unknown anchor: ${href}`);
  }

  // extract anchors: every element's id, plus the legacy name attribute on <a>
  for (const element of document.getElementsByTagName("*")) {
    addAnchor(element.getAttribute("id"));
    if (element.tagName === "A") {
      addAnchor(element.getAttribute("name"));
    }
  }
  // extract targets
  for (const link of document.getElementsByTagName("A")) {
    const href = link.getAttribute("href");
    if (href) {
      addTarget(href);
    }
  }

  // Sort with the root-locale collator so the output is stable for diffing.
  const coll = new Intl.Collator(["und"]);
  const anchorList = Array.from(anchors).sort(coll.compare);
  const anchorFile = path.join(dirname, `${basename}.anchors.json`);
  await fs.writeFile(anchorFile, JSON.stringify(anchorList, null, "  "));
  const targetList = Array.from(targets).sort(coll.compare);
  return [basename, anchorList, targetList];
}
121
/**
 * Run extractAnchors() on every .html file found directly in ./dist.
 *
 * @returns {Promise<Array>} resolves to one extractAnchors() result per file,
 *   in directory-listing order; rejects if any file fails
 */
async function extractAll() {
  // Declared locally: the bare assignment used before created a global.
  const outbox = "./dist";

  const entries = await fs.readdir(outbox);
  const fileList = entries
    .filter((f) => f.endsWith(".html"))
    .map((f) => path.join(outbox, f));
  return Promise.all(fileList.map(extractAnchors));
}
134
/**
 * Extract anchors and link targets from every page, then check that each
 * internal link target matches a known anchor (or a bare page name).
 * Each broken link is reported on stderr together with the page(s) that
 * reference it, and process.exitCode is set to 1 if any are found.
 *
 * @returns {Promise<string[]>} basenames of the sections that were processed
 */
async function checkAll() {
  console.log(`${GEAR_ICON} Reading HTML`);
  const checked = await extractAll();
  console.log(`${GEAR_ICON} Collecting internal links`);

  const allInternalTargets = new Set();
  const allInternalAnchors = new Set();
  const sectionToTargets = {
    // e.g.  "tr35-info" : Set(["tr35-keyboards.md#Element_keyboard", …])
  };
  for (const [sourceSection, anchorList, targetList] of checked) {
    // The page itself is a valid target, example: 'tr35-collation.md'
    allInternalAnchors.add(constructLink(sourceSection));
    for (const target of targetList) {
      allInternalTargets.add(target);
    }
    sectionToTargets[sourceSection] = new Set(targetList); // for error checking
    for (const anchor of anchorList) {
      allInternalAnchors.add(constructLink(sourceSection, anchor)); // tr35-collation.md#Parts
    }
  }

  console.log(`${GEAR_ICON} Checking ${allInternalTargets.size} internal links against ${allInternalAnchors.size} anchors`);

  const missingInternalLinks = new Set();
  for (const expectedAnchor of allInternalTargets) {
    if (!allInternalAnchors.has(expectedAnchor)) {
      missingInternalLinks.add(expectedAnchor);
    }
  }

  if (missingInternalLinks.size > 0) {
    // `const` here matters: the loop variable was previously undeclared.
    for (const expectedAnchor of missingInternalLinks) {
      // coalesce: name every section that links to this missing anchor
      const sourceSections =
        Object.entries(sectionToTargets)
          .filter(([, sectionTargets]) => sectionTargets.has(expectedAnchor)) // Does this section target this anchor?
          .map(([section]) => constructLink(section)) // drop the set
          .join(' & ') || // join section name(s)
        '(unknown section(s))'; // error
      console.error(`${MISSING_ICON} Broken internal link: ${sourceSections}: (${expectedAnchor})`);
    }
    console.error(`${WARN_ICON} ${missingInternalLinks.size} missing links.`);
    process.exitCode = 1;
  }

  console.log(`${POINT_ICON} use: 'lychee --cache docs/ldml' to check external links`);

  // First tuple element is the section basename.
  return checked.map(([section]) => section);
}
// Entry point: run the check, list each processed page on success, and on
// failure print the error and mark the process as failed.
checkAll().then(
  (sections) => {
    for (const section of sections) {
      console.log(`${DONE_ICON} ${constructLink(section)}`);
    }
  },
  (err) => {
    console.error(err);
    process.exitCode = 1;
  }
);
190