mirror of
https://github.com/jimeh/commonflow.org.git
synced 2026-02-19 05:46:40 +00:00
wip: improve spec parsing
This commit is contained in:
@@ -1,8 +1,14 @@
|
||||
/**
|
||||
* Parses rendered spec HTML into structured sections for the single-page
|
||||
* layout.
|
||||
* Parses spec content using markdown AST for robust section extraction.
|
||||
*/
|
||||
|
||||
import { unified } from "unified";
|
||||
import remarkParse from "remark-parse";
|
||||
import remarkRehype from "remark-rehype";
|
||||
import rehypeStringify from "rehype-stringify";
|
||||
import type { Root, RootContent, Heading, List, ListItem } from "mdast";
|
||||
import type { Root as HastRoot } from "hast";
|
||||
|
||||
export interface TocItem {
|
||||
id: string;
|
||||
title: string;
|
||||
@@ -26,7 +32,9 @@ export interface ParsedSpec {
|
||||
introduction: string;
|
||||
summary: string;
|
||||
terminology: string;
|
||||
terminologyTitle: string;
|
||||
specification: string;
|
||||
specificationTitle: string;
|
||||
specSections: SpecSection[];
|
||||
faq: FAQItem[];
|
||||
about: string;
|
||||
@@ -35,7 +43,7 @@ export interface ParsedSpec {
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert a heading text to a URL-friendly ID
|
||||
* Convert text to a URL-friendly ID
|
||||
*/
|
||||
function slugify(text: string): string {
|
||||
return text
|
||||
@@ -45,160 +53,212 @@ function slugify(text: string): string {
|
||||
.trim();
|
||||
}
|
||||
|
||||
type MdastNode = Root | RootContent;
|
||||
|
||||
/**
|
||||
* Extract content between two headings or to the end of the document
|
||||
* Extract plain text from an mdast node tree
|
||||
*/
|
||||
function extractSection(
|
||||
html: string,
|
||||
startHeading: string,
|
||||
endHeadings: string[] = []
|
||||
): string {
|
||||
// Find the heading (h2) - use partial match to handle additional text
|
||||
// e.g., "Git Common-Flow Specification (Common-Flow)"
|
||||
const headingPattern = new RegExp(
|
||||
`<h2[^>]*>[^<]*${escapeRegex(startHeading)}[^<]*</h2>`,
|
||||
"i"
|
||||
/**
 * Recursively collect the plain-text content of an mdast node.
 *
 * - A node carrying a string `value` contributes that value verbatim.
 * - A node carrying a `children` array contributes the concatenation of its
 *   children's text, in document order, with no separator.
 * - Any other node contributes the empty string.
 *
 * @param node - The mdast node (root or content node) to read text from.
 * @returns The concatenated text of `node` and its descendants.
 */
function extractText(node: MdastNode): string {
  // Literal nodes: the text lives directly on the node.
  if ("value" in node && typeof node.value === "string") {
    return node.value;
  }

  // Parent nodes: flatten the text of every descendant.
  if ("children" in node && Array.isArray(node.children)) {
    return node.children.map((child) => extractText(child)).join("");
  }

  // Nodes with neither `value` nor `children` carry no text.
  return "";
}
|
||||
|
||||
/**
|
||||
* Find index of heading containing specific text
|
||||
*/
|
||||
function findHeadingIndex(
|
||||
nodes: RootContent[],
|
||||
text: string,
|
||||
depth: number = 2
|
||||
): number {
|
||||
return nodes.findIndex(
|
||||
(node) =>
|
||||
node.type === "heading" &&
|
||||
(node as Heading).depth === depth &&
|
||||
extractText(node).toLowerCase().includes(text.toLowerCase())
|
||||
);
|
||||
const match = html.match(headingPattern);
|
||||
if (!match || match.index === undefined) return "";
|
||||
|
||||
const startIdx = match.index + match[0].length;
|
||||
|
||||
// Find the next section heading
|
||||
let endIdx = html.length;
|
||||
for (const endHeading of endHeadings) {
|
||||
const endPattern = new RegExp(
|
||||
`<h2[^>]*>\\s*${escapeRegex(endHeading)}\\s*</h2>`,
|
||||
"i"
|
||||
);
|
||||
const endMatch = html.slice(startIdx).match(endPattern);
|
||||
if (endMatch && endMatch.index !== undefined) {
|
||||
const possibleEnd = startIdx + endMatch.index;
|
||||
if (possibleEnd < endIdx) {
|
||||
endIdx = possibleEnd;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Also check for any h2 as a fallback
|
||||
const anyH2 = html.slice(startIdx).match(/<h2[^>]*>/i);
|
||||
if (anyH2 && anyH2.index !== undefined) {
|
||||
const possibleEnd = startIdx + anyH2.index;
|
||||
if (possibleEnd < endIdx) {
|
||||
endIdx = possibleEnd;
|
||||
}
|
||||
}
|
||||
|
||||
return html.slice(startIdx, endIdx).trim();
|
||||
}
|
||||
|
||||
function escapeRegex(str: string): string {
|
||||
return str.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
|
||||
}
|
||||
|
||||
// Spec section titles in order (used for both ToC and anchor injection)
|
||||
const SPEC_SECTION_TITLES = [
|
||||
"TL;DR",
|
||||
"The Master Branch",
|
||||
"Change Branches",
|
||||
"Pull Requests",
|
||||
"Versioning",
|
||||
"Releases",
|
||||
"Short-Term Release Branches",
|
||||
"Long-term Release Branches",
|
||||
"Bug Fixes & Rollback",
|
||||
"Git Best Practices",
|
||||
];
|
||||
|
||||
/**
|
||||
* Extract the numbered spec sections (1. TL;DR, 2. The Master Branch, etc.)
|
||||
* Extract nodes between two headings
|
||||
*/
|
||||
function extractSpecSections(specContent: string): SpecSection[] {
|
||||
/**
 * Return the body of the section introduced by a given heading.
 *
 * The section starts right after the first heading located by
 * `findHeadingIndex(nodes, startText, depth)` and ends just before the next
 * heading whose depth is less than or equal to `depth` (or at the end of
 * `nodes` when there is no such heading). The starting heading itself is not
 * included in the result.
 *
 * @param nodes - Flat list of top-level content nodes to search.
 * @param startText - Text used to locate the section's heading.
 * @param depth - Heading depth of the section heading (defaults to 2).
 * @returns The nodes between the two headings, or `[]` when the starting
 *   heading cannot be found.
 */
function extractSectionNodes(
  nodes: RootContent[],
  startText: string,
  depth: number = 2
): RootContent[] {
  const startIdx = findHeadingIndex(nodes, startText, depth);
  if (startIdx === -1) return [];

  // Everything after the section heading is a candidate for the body.
  const rest = nodes.slice(startIdx + 1);

  // The body stops at the next heading of the same or a higher level.
  const endOffset = rest.findIndex(
    (node) => node.type === "heading" && (node as Heading).depth <= depth
  );

  return endOffset === -1 ? rest : rest.slice(0, endOffset);
}
|
||||
|
||||
/**
 * Get the full text of the heading located by a (possibly partial) search
 * string.
 *
 * The heading is looked up with `findHeadingIndex(nodes, text, depth)`; when
 * one is found its complete text is returned, otherwise the search string
 * itself is returned as a fallback.
 *
 * @param nodes - Flat list of top-level content nodes to search.
 * @param text - Text used to locate the heading; also the fallback result.
 * @param depth - Heading depth to look for (defaults to 2).
 * @returns The heading's full text, or `text` when no heading is found.
 */
function getHeadingText(
  nodes: RootContent[],
  text: string,
  depth: number = 2
): string {
  const idx = findHeadingIndex(nodes, text, depth);
  // Guard the indexed access so a missing heading falls back to `text`.
  const heading = idx === -1 ? undefined : nodes[idx];
  return heading === undefined ? text : extractText(heading);
}
|
||||
|
||||
/**
 * Convert a list of mdast content nodes to an HTML string.
 *
 * The nodes are wrapped in a synthetic root, transformed from mdast to hast
 * with `remark-rehype`, and serialized with `rehype-stringify`. A single
 * processor is used for both the transform (`run`) and the serialization
 * (`stringify`) steps.
 *
 * `allowDangerousHtml` is enabled on both plugins, so raw HTML embedded in the
 * markdown is presumably emitted unescaped — the markdown must come from a
 * trusted source.
 *
 * NOTE(review): only `nodes` are handed to the transform, so anything a node
 * refers to outside this slice (e.g. link reference definitions elsewhere in
 * the document) is likely not resolvable here — confirm against the spec
 * content.
 *
 * @param nodes - Content nodes to render; they become the children of a
 *   temporary root node (the array is not copied).
 * @returns The serialized HTML, or `""` when `nodes` is empty.
 */
async function nodesToHtml(nodes: RootContent[]): Promise<string> {
  if (nodes.length === 0) return "";

  // Wrap the nodes in a root so they form a valid tree for the transform.
  const root: Root = { type: "root", children: nodes };

  const processor = unified()
    .use(remarkRehype, { allowDangerousHtml: true })
    .use(rehypeStringify, { allowDangerousHtml: true });

  // `run` applies the mdast -> hast transform; `stringify` is synchronous and
  // uses the compiler registered on the same processor.
  const hast = await processor.run(root);
  return processor.stringify(hast as HastRoot);
}
|
||||
|
||||
/**
 * Extract the titles of the top-level items of a list.
 *
 * For each list item the title is taken from the content that precedes any
 * nested list:
 * - if a paragraph is encountered, the paragraph's text is the title (any
 *   text accumulated from earlier non-paragraph children is discarded);
 * - otherwise the text of the leading non-list children is concatenated.
 *
 * Only the first line of that text is kept, trimmed of surrounding
 * whitespace. Items whose title ends up empty are skipped.
 *
 * @param list - The mdast list whose direct items are inspected.
 * @returns The non-empty item titles, in list order.
 */
function extractListItemTitles(list: List): string[] {
  const titles: string[] = [];

  for (const item of list.children) {
    if (item.type !== "listItem") continue;

    let raw = "";
    for (const child of item.children) {
      // A nested list marks the end of the item's own title content.
      if (child.type === "list") break;

      if (child.type === "paragraph") {
        // The first paragraph is authoritative for the title.
        raw = extractText(child);
        break;
      }

      // Non-paragraph content placed directly in the list item.
      raw += extractText(child);
    }

    // Keep only the first line; later lines belong to the item body.
    const title = (raw.split("\n")[0] ?? "").trim();
    if (title) {
      titles.push(title);
    }
  }

  return titles;
}
|
||||
|
||||
/**
|
||||
* Find the first ordered list in nodes and extract its structure
|
||||
*/
|
||||
function findSpecSections(nodes: RootContent[]): SpecSection[] {
|
||||
const sections: SpecSection[] = [];
|
||||
|
||||
// The spec uses an ordered list with nested items
|
||||
// Each top-level li starts a new section
|
||||
const olMatch = specContent.match(/<ol[^>]*>([\s\S]*?)<\/ol>/i);
|
||||
if (!olMatch) return sections;
|
||||
|
||||
// Find each section by looking for the title pattern
|
||||
for (const title of SPEC_SECTION_TITLES) {
|
||||
const id = slugify(title);
|
||||
|
||||
// For the content, we'll just use the title for navigation
|
||||
// The actual content stays in the main specification block
|
||||
sections.push({
|
||||
id: `spec-${id}`,
|
||||
title,
|
||||
content: "", // Content handled inline
|
||||
});
|
||||
for (const node of nodes) {
|
||||
if (node.type === "list" && (node as List).ordered) {
|
||||
const titles = extractListItemTitles(node as List);
|
||||
for (const title of titles) {
|
||||
sections.push({
|
||||
id: `spec-${slugify(title)}`,
|
||||
title,
|
||||
content: "",
|
||||
});
|
||||
}
|
||||
break; // Only process first ordered list
|
||||
}
|
||||
}
|
||||
|
||||
return sections;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add anchor IDs to spec section list items.
|
||||
* Finds top-level <li> elements that start with section titles and adds IDs.
|
||||
* Add anchor IDs to list items in the spec ordered list
|
||||
*/
|
||||
function addSpecSectionAnchors(specContent: string): string {
|
||||
let result = specContent;
|
||||
function addAnchorsToList(list: List, sections: SpecSection[]): void {
|
||||
const titleMap = new Map(sections.map((s) => [s.title, s.id]));
|
||||
|
||||
for (const title of SPEC_SECTION_TITLES) {
|
||||
const id = `spec-${slugify(title)}`;
|
||||
// Match <li> followed by the section title (possibly with whitespace)
|
||||
// The title appears right after <li> in the rendered HTML
|
||||
const pattern = new RegExp(
|
||||
`(<li>)(\\s*${escapeRegex(title)})`,
|
||||
"i"
|
||||
);
|
||||
result = result.replace(pattern, `<li id="${id}">$2`);
|
||||
for (const item of list.children) {
|
||||
if (item.type !== "listItem") continue;
|
||||
|
||||
// Get the title of this item
|
||||
let title = "";
|
||||
for (const child of item.children) {
|
||||
if (child.type === "list") break;
|
||||
if (child.type === "paragraph") {
|
||||
title = extractText(child).split("\n")[0].trim();
|
||||
break;
|
||||
}
|
||||
title += extractText(child);
|
||||
}
|
||||
title = title.split("\n")[0].trim();
|
||||
|
||||
// Add ID as data attribute (will be processed by rehype)
|
||||
const id = titleMap.get(title);
|
||||
if (id) {
|
||||
// Add hProperties for rehype to convert to HTML id attribute
|
||||
(item as ListItem & { data?: { hProperties?: { id?: string } } }).data = {
|
||||
hProperties: { id },
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Extract FAQ items from the FAQ section HTML
|
||||
* Extract FAQ items from FAQ section nodes
|
||||
*/
|
||||
function extractFAQItems(faqContent: string): FAQItem[] {
|
||||
function extractFAQFromNodes(nodes: RootContent[]): FAQItem[] {
|
||||
const items: FAQItem[] = [];
|
||||
let currentQuestion = "";
|
||||
let currentId = "";
|
||||
|
||||
// Split by h3 headings
|
||||
const h3Pattern = /<h3[^>]*>([\s\S]*?)<\/h3>/gi;
|
||||
let lastIndex = 0;
|
||||
let lastQuestion = "";
|
||||
let lastId = "";
|
||||
for (const node of nodes) {
|
||||
if (node.type === "heading" && (node as Heading).depth === 3) {
|
||||
// Save previous FAQ item if we had one
|
||||
if (currentQuestion) {
|
||||
items.push({
|
||||
id: currentId,
|
||||
question: currentQuestion,
|
||||
answer: "", // Placeholder, will be filled later
|
||||
});
|
||||
}
|
||||
|
||||
const matches = [...faqContent.matchAll(h3Pattern)];
|
||||
|
||||
for (let i = 0; i < matches.length; i++) {
|
||||
const match = matches[i];
|
||||
const question = match[1].replace(/<[^>]+>/g, "").trim();
|
||||
const id = slugify(question).slice(0, 50);
|
||||
|
||||
if (i > 0 && match.index !== undefined) {
|
||||
// Get content between previous h3 and this one
|
||||
const answer = faqContent.slice(lastIndex, match.index).trim();
|
||||
items.push({
|
||||
id: `faq-${lastId}`,
|
||||
question: lastQuestion,
|
||||
answer,
|
||||
});
|
||||
currentQuestion = extractText(node);
|
||||
currentId = `faq-${slugify(currentQuestion).slice(0, 50)}`;
|
||||
}
|
||||
|
||||
lastQuestion = question;
|
||||
lastId = id;
|
||||
lastIndex = match.index! + match[0].length;
|
||||
}
|
||||
|
||||
// Don't forget the last FAQ item
|
||||
if (lastQuestion) {
|
||||
const answer = faqContent.slice(lastIndex).trim();
|
||||
// Don't forget the last item
|
||||
if (currentQuestion) {
|
||||
items.push({
|
||||
id: `faq-${lastId}`,
|
||||
question: lastQuestion,
|
||||
answer,
|
||||
id: currentId,
|
||||
question: currentQuestion,
|
||||
answer: "",
|
||||
});
|
||||
}
|
||||
|
||||
@@ -206,20 +266,25 @@ function extractFAQItems(faqContent: string): FAQItem[] {
|
||||
}
|
||||
|
||||
/**
|
||||
* Build table of contents from parsed sections.
|
||||
* Only includes sections rendered in SpecSection (Terminology + Specification).
|
||||
* Introduction/Summary are in AboutSection and excluded from this ToC.
|
||||
* Build table of contents from parsed sections
|
||||
*/
|
||||
function buildTocItems(parsed: Partial<ParsedSpec>): TocItem[] {
|
||||
const items: TocItem[] = [];
|
||||
|
||||
if (parsed.terminology) {
|
||||
items.push({ id: "terminology", title: "Terminology", level: 2 });
|
||||
items.push({
|
||||
id: "terminology",
|
||||
title: parsed.terminologyTitle || "Terminology",
|
||||
level: 2,
|
||||
});
|
||||
}
|
||||
if (parsed.specification) {
|
||||
items.push({ id: "specification", title: "Specification", level: 2 });
|
||||
items.push({
|
||||
id: "specification",
|
||||
title: "Specification",
|
||||
level: 2,
|
||||
});
|
||||
|
||||
// Add spec subsections
|
||||
if (parsed.specSections) {
|
||||
for (const section of parsed.specSections) {
|
||||
items.push({ id: section.id, title: section.title, level: 3 });
|
||||
@@ -231,70 +296,106 @@ function buildTocItems(parsed: Partial<ParsedSpec>): TocItem[] {
|
||||
}
|
||||
|
||||
/**
|
||||
* Main parsing function - takes rendered HTML and returns structured content
|
||||
* Main parsing function - takes markdown content and returns structured content
|
||||
*/
|
||||
export function parseSpecContent(html: string, version: string): ParsedSpec {
|
||||
export async function parseSpecContent(
|
||||
markdown: string,
|
||||
version: string
|
||||
): Promise<ParsedSpec> {
|
||||
const svgPath = `/spec/${version}.svg`;
|
||||
|
||||
// Remove the title (h1) and SVG from the content for parsing
|
||||
let content = html;
|
||||
// Parse markdown to AST
|
||||
const tree = unified().use(remarkParse).parse(markdown) as Root;
|
||||
|
||||
// Remove the h1 title
|
||||
content = content.replace(/<h1[^>]*>[\s\S]*?<\/h1>/i, "");
|
||||
// Remove title (h1) and SVG image from the tree
|
||||
const nodes = tree.children.filter((node) => {
|
||||
if (node.type === "heading" && (node as Heading).depth === 1) return false;
|
||||
if (node.type === "paragraph") {
|
||||
const text = extractText(node);
|
||||
if (text.includes(".svg")) return false;
|
||||
}
|
||||
return true;
|
||||
});
|
||||
|
||||
// Remove the SVG img tag
|
||||
content = content.replace(/<img[^>]*\.svg[^>]*>/i, "");
|
||||
|
||||
// Extract each section
|
||||
const introduction = extractSection(content, "Introduction", [
|
||||
"Summary",
|
||||
"Terminology",
|
||||
"Git Common-Flow",
|
||||
"FAQ",
|
||||
"About",
|
||||
"License",
|
||||
]);
|
||||
|
||||
const summary = extractSection(content, "Summary", [
|
||||
"Terminology",
|
||||
"Git Common-Flow",
|
||||
"FAQ",
|
||||
"About",
|
||||
"License",
|
||||
]);
|
||||
|
||||
const terminology = extractSection(content, "Terminology", [
|
||||
"Git Common-Flow",
|
||||
"FAQ",
|
||||
"About",
|
||||
"License",
|
||||
]);
|
||||
|
||||
const specificationRaw = extractSection(
|
||||
content,
|
||||
"Git Common-Flow Specification",
|
||||
["FAQ", "About", "License"]
|
||||
// Get heading titles
|
||||
const terminologyTitle = getHeadingText(nodes, "Terminology");
|
||||
const specificationTitle = getHeadingText(
|
||||
nodes,
|
||||
"Git Common-Flow Specification"
|
||||
);
|
||||
|
||||
// Add anchor IDs to spec section list items for ToC navigation
|
||||
const specification = addSpecSectionAnchors(specificationRaw);
|
||||
// Extract section nodes
|
||||
const introNodes = extractSectionNodes(nodes, "Introduction");
|
||||
const summaryNodes = extractSectionNodes(nodes, "Summary");
|
||||
const terminologyNodes = extractSectionNodes(nodes, "Terminology");
|
||||
const specNodes = extractSectionNodes(nodes, "Git Common-Flow Specification");
|
||||
const faqNodes = extractSectionNodes(nodes, "FAQ");
|
||||
const aboutNodes = extractSectionNodes(nodes, "About");
|
||||
const licenseNodes = extractSectionNodes(nodes, "License");
|
||||
|
||||
const faqContent = extractSection(content, "FAQ", ["About", "License"]);
|
||||
// Extract spec sections from the first ordered list
|
||||
const specSections = findSpecSections(specNodes);
|
||||
|
||||
const about = extractSection(content, "About", ["License"]);
|
||||
// Add anchor IDs to spec list items
|
||||
for (const node of specNodes) {
|
||||
if (node.type === "list" && (node as List).ordered) {
|
||||
addAnchorsToList(node as List, specSections);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
const license = extractSection(content, "License", []);
|
||||
// Extract FAQ items structure
|
||||
const faqItems = extractFAQFromNodes(faqNodes);
|
||||
|
||||
// Parse subsections
|
||||
const specSections = extractSpecSections(specificationRaw);
|
||||
const faq = extractFAQItems(faqContent);
|
||||
// Collect FAQ answer nodes for each item
|
||||
const faqAnswerNodes: RootContent[][] = [];
|
||||
let currentAnswerNodes: RootContent[] = [];
|
||||
|
||||
for (const node of faqNodes) {
|
||||
if (node.type === "heading" && (node as Heading).depth === 3) {
|
||||
if (currentAnswerNodes.length > 0) {
|
||||
faqAnswerNodes.push(currentAnswerNodes);
|
||||
}
|
||||
currentAnswerNodes = [];
|
||||
} else {
|
||||
currentAnswerNodes.push(node);
|
||||
}
|
||||
}
|
||||
// Don't forget the last answer
|
||||
if (currentAnswerNodes.length > 0) {
|
||||
faqAnswerNodes.push(currentAnswerNodes);
|
||||
}
|
||||
|
||||
// Convert sections to HTML
|
||||
const [introduction, summary, terminology, specification, about, license] =
|
||||
await Promise.all([
|
||||
nodesToHtml(introNodes),
|
||||
nodesToHtml(summaryNodes),
|
||||
nodesToHtml(terminologyNodes),
|
||||
nodesToHtml(specNodes),
|
||||
nodesToHtml(aboutNodes),
|
||||
nodesToHtml(licenseNodes),
|
||||
]);
|
||||
|
||||
// Convert FAQ answers to HTML
|
||||
const faqAnswers = await Promise.all(
|
||||
faqAnswerNodes.map((nodes) => nodesToHtml(nodes))
|
||||
);
|
||||
|
||||
// Assign FAQ answers
|
||||
const faq = faqItems.map((item, i) => ({
|
||||
...item,
|
||||
answer: faqAnswers[i] || "",
|
||||
}));
|
||||
|
||||
const parsed: ParsedSpec = {
|
||||
svgPath,
|
||||
introduction,
|
||||
summary,
|
||||
terminology,
|
||||
terminologyTitle,
|
||||
specification,
|
||||
specificationTitle,
|
||||
specSections,
|
||||
faq,
|
||||
about,
|
||||
@@ -302,7 +403,6 @@ export function parseSpecContent(html: string, version: string): ParsedSpec {
|
||||
tocItems: [],
|
||||
};
|
||||
|
||||
// Build TOC
|
||||
parsed.tocItems = buildTocItems(parsed);
|
||||
|
||||
return parsed;
|
||||
|
||||
Reference in New Issue
Block a user