wip: improve spec parsing

This commit is contained in:
2026-01-10 21:52:45 +00:00
parent f1fa264ed7
commit 208219ca2c
10 changed files with 324 additions and 255 deletions

View File

@@ -1,8 +1,14 @@
/**
* Parses rendered spec HTML into structured sections for the single-page
* layout.
* Parses spec content using markdown AST for robust section extraction.
*/
import { unified } from "unified";
import remarkParse from "remark-parse";
import remarkRehype from "remark-rehype";
import rehypeStringify from "rehype-stringify";
import type { Root, RootContent, Heading, List, ListItem } from "mdast";
import type { Root as HastRoot } from "hast";
export interface TocItem {
id: string;
title: string;
@@ -26,7 +32,9 @@ export interface ParsedSpec {
introduction: string;
summary: string;
terminology: string;
terminologyTitle: string;
specification: string;
specificationTitle: string;
specSections: SpecSection[];
faq: FAQItem[];
about: string;
@@ -35,7 +43,7 @@ export interface ParsedSpec {
}
/**
* Convert a heading text to a URL-friendly ID
* Convert text to a URL-friendly ID
*/
function slugify(text: string): string {
return text
@@ -45,160 +53,212 @@ function slugify(text: string): string {
.trim();
}
type MdastNode = Root | RootContent;
/**
* Extract content between two headings or to the end of the document
* Extract plain text from an mdast node tree
*/
function extractSection(
html: string,
startHeading: string,
endHeadings: string[] = []
): string {
// Find the heading (h2) - use partial match to handle additional text
// e.g., "Git Common-Flow Specification (Common-Flow)"
const headingPattern = new RegExp(
`<h2[^>]*>[^<]*${escapeRegex(startHeading)}[^<]*</h2>`,
"i"
/**
 * Collect the plain text of an mdast node.
 *
 * A node carrying a string `value` yields that value unchanged. Otherwise a
 * node with a `children` array yields its children's text concatenated with
 * no separator. Any other node yields the empty string.
 *
 * @param node - Node whose text should be gathered.
 * @returns The node's text content, or `""` when it has none.
 */
function extractText(node: MdastNode): string {
  // Literal-like nodes: the value is the text.
  if ("value" in node && typeof node.value === "string") return node.value;

  // Nodes with neither a string value nor a child array contribute nothing.
  if (!("children" in node) || !Array.isArray(node.children)) return "";

  let collected = "";
  for (const descendant of node.children) {
    collected += extractText(descendant);
  }
  return collected;
}
/**
* Find index of heading containing specific text
*
* The match is a case-insensitive substring test against the heading's plain
* text, and only headings whose depth equals `depth` are considered.
*
* @param nodes - Nodes to search, in order.
* @param text - Text the heading must contain.
* @param depth - Required heading depth; defaults to 2.
* @returns Index of the first matching heading in `nodes`, or -1 if none matches.
*/
function findHeadingIndex(
nodes: RootContent[],
text: string,
depth: number = 2
): number {
return nodes.findIndex(
(node) =>
node.type === "heading" &&
(node as Heading).depth === depth &&
extractText(node).toLowerCase().includes(text.toLowerCase())
);
const match = html.match(headingPattern);
if (!match || match.index === undefined) return "";
const startIdx = match.index + match[0].length;
// Find the next section heading
let endIdx = html.length;
for (const endHeading of endHeadings) {
const endPattern = new RegExp(
`<h2[^>]*>\\s*${escapeRegex(endHeading)}\\s*</h2>`,
"i"
);
const endMatch = html.slice(startIdx).match(endPattern);
if (endMatch && endMatch.index !== undefined) {
const possibleEnd = startIdx + endMatch.index;
if (possibleEnd < endIdx) {
endIdx = possibleEnd;
}
}
}
// Also check for any h2 as a fallback
const anyH2 = html.slice(startIdx).match(/<h2[^>]*>/i);
if (anyH2 && anyH2.index !== undefined) {
const possibleEnd = startIdx + anyH2.index;
if (possibleEnd < endIdx) {
endIdx = possibleEnd;
}
}
return html.slice(startIdx, endIdx).trim();
}
/**
 * Backslash-escape every regular-expression metacharacter in `str`, so the
 * result can be embedded in a `RegExp` source and match the input literally.
 *
 * @param str - Text to escape.
 * @returns `str` with each metacharacter preceded by a backslash.
 */
function escapeRegex(str: string): string {
  const metaChars = /[.*+?^${}()|[\]\\]/g;
  // "$&" re-inserts the matched character after the added backslash.
  const escaped = str.replace(metaChars, "\\$&");
  return escaped;
}
// Spec section titles in order (used for both ToC and anchor injection)
// Each title is slugified and prefixed with "spec-" to form its anchor ID.
// Anchor injection looks for the title (case-insensitively) right after a
// rendered <li>, so these strings must match the start of the list-item text.
const SPEC_SECTION_TITLES = [
"TL;DR",
"The Master Branch",
"Change Branches",
"Pull Requests",
"Versioning",
"Releases",
"Short-Term Release Branches",
"Long-term Release Branches",
"Bug Fixes & Rollback",
"Git Best Practices",
];
/**
* Extract the numbered spec sections (1. TL;DR, 2. The Master Branch, etc.)
* Extract nodes between two headings
*/
function extractSpecSections(specContent: string): SpecSection[] {
/**
 * Collect the nodes that make up the section introduced by a heading.
 *
 * The section starts right after the heading located by `findHeadingIndex`
 * and runs up to, but not including, the next heading of the same or a
 * shallower depth. It runs to the end of `nodes` when no such heading follows.
 *
 * @param nodes - Flat list of nodes to search.
 * @param startText - Text identifying the section's heading.
 * @param depth - Depth of the section heading; defaults to 2.
 * @returns A new array of the section's nodes, excluding the heading itself;
 *   empty when the heading is not found.
 */
function extractSectionNodes(
  nodes: RootContent[],
  startText: string,
  depth: number = 2
): RootContent[] {
  const headingPos = findHeadingIndex(nodes, startText, depth);
  if (headingPos === -1) return [];

  // Everything after the heading, then trim at the first closing heading.
  const afterHeading = nodes.slice(headingPos + 1);
  const boundary = afterHeading.findIndex(
    (candidate) =>
      candidate.type === "heading" && (candidate as Heading).depth <= depth
  );
  return boundary === -1 ? afterHeading : afterHeading.slice(0, boundary);
}
/**
 * Look up a heading and return its complete plain text.
 *
 * @param nodes - Nodes to search.
 * @param text - Text used to locate the heading.
 * @param depth - Depth the heading must have; defaults to 2.
 * @returns The full text of the located heading, or `text` itself when no
 *   heading is found.
 */
function getHeadingText(
  nodes: RootContent[],
  text: string,
  depth: number = 2
): string {
  const position = findHeadingIndex(nodes, text, depth);
  // Fall back to the search text so callers always get a usable title.
  return position === -1 ? text : extractText(nodes[position]);
}
/**
 * Convert mdast nodes to an HTML string.
 *
 * The nodes are wrapped in a synthetic root, transformed to hast with
 * `remark-rehype`, and serialized with `rehype-stringify`. Both steps run
 * with `allowDangerousHtml: true`, so raw HTML embedded in the markdown is
 * kept in the output rather than dropped; the input must be trusted.
 *
 * @param nodes - Nodes to render, in document order.
 * @returns The rendered HTML, or `""` when `nodes` is empty.
 */
async function nodesToHtml(nodes: RootContent[]): Promise<string> {
  if (nodes.length === 0) return "";

  // Create a root node with these children
  const root: Root = { type: "root", children: nodes };

  // One processor serves both phases: `run` applies the mdast -> hast
  // transform, and `stringify` uses the compiler that rehype-stringify
  // registered. A second processor is not needed.
  const processor = unified()
    .use(remarkRehype, { allowDangerousHtml: true })
    .use(rehypeStringify, { allowDangerousHtml: true });

  const hast = await processor.run(root);

  // `stringify` is synchronous, so its result is returned directly.
  return processor.stringify(hast as HastRoot);
}
/**
 * Gather the title of each top-level item in a list.
 *
 * An item's title is the text of its first paragraph. If a nested list or the
 * end of the item is reached before any paragraph, the title is the
 * concatenated text of the children seen so far. Only the first line of that
 * text is kept, trimmed; items whose title ends up empty are omitted.
 *
 * @param list - List whose direct items are inspected.
 * @returns Titles in item order.
 */
function extractListItemTitles(list: List): string[] {
  return list.children.flatMap((entry) => {
    if (entry.type !== "listItem") return [];

    let rawTitle = "";
    for (const part of entry.children) {
      // A nested list marks the end of the item's own title text.
      if (part.type === "list") break;

      const partText = extractText(part);
      if (part.type === "paragraph") {
        // The first paragraph is the title, replacing anything gathered so far.
        rawTitle = partText;
        break;
      }
      rawTitle += partText;
    }

    const firstLine = rawTitle.split("\n")[0].trim();
    return firstLine ? [firstLine] : [];
  });
}
/**
* Find the first ordered list in nodes and extract its structure
*
* Each title returned by extractListItemTitles becomes a section whose id is
* `spec-` plus the slugified title and whose content is left empty.
*
* @param nodes - Nodes to scan; only the entries of this array are inspected.
* @returns One section per titled item of the first ordered list, or an empty
* array when `nodes` contains no ordered list.
*/
function findSpecSections(nodes: RootContent[]): SpecSection[] {
const sections: SpecSection[] = [];
// The spec uses an ordered list with nested items
// Each top-level li starts a new section
const olMatch = specContent.match(/<ol[^>]*>([\s\S]*?)<\/ol>/i);
if (!olMatch) return sections;
// Find each section by looking for the title pattern
for (const title of SPEC_SECTION_TITLES) {
const id = slugify(title);
// For the content, we'll just use the title for navigation
// The actual content stays in the main specification block
sections.push({
id: `spec-${id}`,
title,
content: "", // Content handled inline
});
for (const node of nodes) {
if (node.type === "list" && (node as List).ordered) {
const titles = extractListItemTitles(node as List);
for (const title of titles) {
sections.push({
id: `spec-${slugify(title)}`,
title,
content: "",
});
}
break; // Only process first ordered list
}
}
return sections;
}
/**
* Add anchor IDs to spec section list items.
* Finds top-level <li> elements that start with section titles and adds IDs.
* Add anchor IDs to list items in the spec ordered list
*
* The list-based variant mutates the items of `list` in place: an item whose
* title equals a section title gets that section's id set via
* `data.hProperties.id`. Items without a matching section are left untouched.
*/
function addSpecSectionAnchors(specContent: string): string {
let result = specContent;
function addAnchorsToList(list: List, sections: SpecSection[]): void {
// Lookup from section title to anchor id; if a title repeats, the id of its
// last occurrence in `sections` is the one kept.
const titleMap = new Map(sections.map((s) => [s.title, s.id]));
for (const title of SPEC_SECTION_TITLES) {
const id = `spec-${slugify(title)}`;
// Match <li> followed by the section title (possibly with whitespace)
// The title appears right after <li> in the rendered HTML
const pattern = new RegExp(
`(<li>)(\\s*${escapeRegex(title)})`,
"i"
);
result = result.replace(pattern, `<li id="${id}">$2`);
for (const item of list.children) {
if (item.type !== "listItem") continue;
// Get the title of this item
let title = "";
for (const child of item.children) {
if (child.type === "list") break;
if (child.type === "paragraph") {
title = extractText(child).split("\n")[0].trim();
break;
}
title += extractText(child);
}
title = title.split("\n")[0].trim();
// Add ID as data attribute (will be processed by rehype)
const id = titleMap.get(title);
if (id) {
// Add hProperties for rehype to convert to HTML id attribute
// NOTE(review): this assignment replaces any data object already on the item.
(item as ListItem & { data?: { hProperties?: { id?: string } } }).data = {
hProperties: { id },
};
}
}
return result;
}
/**
* Extract FAQ items from the FAQ section HTML
* Extract FAQ items from FAQ section nodes
*/
function extractFAQItems(faqContent: string): FAQItem[] {
function extractFAQFromNodes(nodes: RootContent[]): FAQItem[] {
const items: FAQItem[] = [];
let currentQuestion = "";
let currentId = "";
// Split by h3 headings
const h3Pattern = /<h3[^>]*>([\s\S]*?)<\/h3>/gi;
let lastIndex = 0;
let lastQuestion = "";
let lastId = "";
for (const node of nodes) {
if (node.type === "heading" && (node as Heading).depth === 3) {
// Save previous FAQ item if we had one
if (currentQuestion) {
items.push({
id: currentId,
question: currentQuestion,
answer: "", // Placeholder, will be filled later
});
}
const matches = [...faqContent.matchAll(h3Pattern)];
for (let i = 0; i < matches.length; i++) {
const match = matches[i];
const question = match[1].replace(/<[^>]+>/g, "").trim();
const id = slugify(question).slice(0, 50);
if (i > 0 && match.index !== undefined) {
// Get content between previous h3 and this one
const answer = faqContent.slice(lastIndex, match.index).trim();
items.push({
id: `faq-${lastId}`,
question: lastQuestion,
answer,
});
currentQuestion = extractText(node);
currentId = `faq-${slugify(currentQuestion).slice(0, 50)}`;
}
lastQuestion = question;
lastId = id;
lastIndex = match.index! + match[0].length;
}
// Don't forget the last FAQ item
if (lastQuestion) {
const answer = faqContent.slice(lastIndex).trim();
// Don't forget the last item
if (currentQuestion) {
items.push({
id: `faq-${lastId}`,
question: lastQuestion,
answer,
id: currentId,
question: currentQuestion,
answer: "",
});
}
@@ -206,20 +266,25 @@ function extractFAQItems(faqContent: string): FAQItem[] {
}
/**
* Build table of contents from parsed sections.
* Only includes sections rendered in SpecSection (Terminology + Specification).
* Introduction/Summary are in AboutSection and excluded from this ToC.
* Build table of contents from parsed sections
*/
function buildTocItems(parsed: Partial<ParsedSpec>): TocItem[] {
const items: TocItem[] = [];
if (parsed.terminology) {
items.push({ id: "terminology", title: "Terminology", level: 2 });
items.push({
id: "terminology",
title: parsed.terminologyTitle || "Terminology",
level: 2,
});
}
if (parsed.specification) {
items.push({ id: "specification", title: "Specification", level: 2 });
items.push({
id: "specification",
title: "Specification",
level: 2,
});
// Add spec subsections
if (parsed.specSections) {
for (const section of parsed.specSections) {
items.push({ id: section.id, title: section.title, level: 3 });
@@ -231,70 +296,106 @@ function buildTocItems(parsed: Partial<ParsedSpec>): TocItem[] {
}
/**
* Main parsing function - takes rendered HTML and returns structured content
* Main parsing function - takes markdown content and returns structured content
*/
export function parseSpecContent(html: string, version: string): ParsedSpec {
export async function parseSpecContent(
markdown: string,
version: string
): Promise<ParsedSpec> {
const svgPath = `/spec/${version}.svg`;
// Remove the title (h1) and SVG from the content for parsing
let content = html;
// Parse markdown to AST
const tree = unified().use(remarkParse).parse(markdown) as Root;
// Remove the h1 title
content = content.replace(/<h1[^>]*>[\s\S]*?<\/h1>/i, "");
// Remove title (h1) and SVG image from the tree
const nodes = tree.children.filter((node) => {
if (node.type === "heading" && (node as Heading).depth === 1) return false;
if (node.type === "paragraph") {
const text = extractText(node);
if (text.includes(".svg")) return false;
}
return true;
});
// Remove the SVG img tag
content = content.replace(/<img[^>]*\.svg[^>]*>/i, "");
// Extract each section
const introduction = extractSection(content, "Introduction", [
"Summary",
"Terminology",
"Git Common-Flow",
"FAQ",
"About",
"License",
]);
const summary = extractSection(content, "Summary", [
"Terminology",
"Git Common-Flow",
"FAQ",
"About",
"License",
]);
const terminology = extractSection(content, "Terminology", [
"Git Common-Flow",
"FAQ",
"About",
"License",
]);
const specificationRaw = extractSection(
content,
"Git Common-Flow Specification",
["FAQ", "About", "License"]
// Get heading titles
const terminologyTitle = getHeadingText(nodes, "Terminology");
const specificationTitle = getHeadingText(
nodes,
"Git Common-Flow Specification"
);
// Add anchor IDs to spec section list items for ToC navigation
const specification = addSpecSectionAnchors(specificationRaw);
// Extract section nodes
const introNodes = extractSectionNodes(nodes, "Introduction");
const summaryNodes = extractSectionNodes(nodes, "Summary");
const terminologyNodes = extractSectionNodes(nodes, "Terminology");
const specNodes = extractSectionNodes(nodes, "Git Common-Flow Specification");
const faqNodes = extractSectionNodes(nodes, "FAQ");
const aboutNodes = extractSectionNodes(nodes, "About");
const licenseNodes = extractSectionNodes(nodes, "License");
const faqContent = extractSection(content, "FAQ", ["About", "License"]);
// Extract spec sections from the first ordered list
const specSections = findSpecSections(specNodes);
const about = extractSection(content, "About", ["License"]);
// Add anchor IDs to spec list items
for (const node of specNodes) {
if (node.type === "list" && (node as List).ordered) {
addAnchorsToList(node as List, specSections);
break;
}
}
const license = extractSection(content, "License", []);
// Extract FAQ items structure
const faqItems = extractFAQFromNodes(faqNodes);
// Parse subsections
const specSections = extractSpecSections(specificationRaw);
const faq = extractFAQItems(faqContent);
// Collect FAQ answer nodes for each item
const faqAnswerNodes: RootContent[][] = [];
let currentAnswerNodes: RootContent[] = [];
for (const node of faqNodes) {
if (node.type === "heading" && (node as Heading).depth === 3) {
if (currentAnswerNodes.length > 0) {
faqAnswerNodes.push(currentAnswerNodes);
}
currentAnswerNodes = [];
} else {
currentAnswerNodes.push(node);
}
}
// Don't forget the last answer
if (currentAnswerNodes.length > 0) {
faqAnswerNodes.push(currentAnswerNodes);
}
// Convert sections to HTML
const [introduction, summary, terminology, specification, about, license] =
await Promise.all([
nodesToHtml(introNodes),
nodesToHtml(summaryNodes),
nodesToHtml(terminologyNodes),
nodesToHtml(specNodes),
nodesToHtml(aboutNodes),
nodesToHtml(licenseNodes),
]);
// Convert FAQ answers to HTML
const faqAnswers = await Promise.all(
faqAnswerNodes.map((nodes) => nodesToHtml(nodes))
);
// Assign FAQ answers
const faq = faqItems.map((item, i) => ({
...item,
answer: faqAnswers[i] || "",
}));
const parsed: ParsedSpec = {
svgPath,
introduction,
summary,
terminology,
terminologyTitle,
specification,
specificationTitle,
specSections,
faq,
about,
@@ -302,7 +403,6 @@ export function parseSpecContent(html: string, version: string): ParsedSpec {
tocItems: [],
};
// Build TOC
parsed.tocItems = buildTocItems(parsed);
return parsed;