wip: improve spec parsing

This commit is contained in:
2026-01-10 21:52:45 +00:00
parent f1fa264ed7
commit 208219ca2c
10 changed files with 324 additions and 255 deletions

View File

@@ -199,7 +199,7 @@ its merge target, allowing others to review, discuss and approve the changes.</l
effectively just a git tag named after the version of the release.</li>
<li><strong>Release Branches</strong> - Used both for short-term preparations of a release, and
also for long-term maintenance of older versions.</li>
</ul> </section> <!-- Main specification --> <section id="specification" data-astro-cid-6lwcykzv> <h2 data-astro-cid-6lwcykzv>Git Common-Flow Specification</h2> <p>The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD",
</ul> </section> <!-- Main specification --> <section id="specification" data-astro-cid-6lwcykzv> <h2 data-astro-cid-6lwcykzv>Specification</h2> <p>The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD",
"SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be
interpreted as described in <a href="https://tools.ietf.org/html/rfc2119">RFC 2119</a>.</p>
<ol>
@@ -387,7 +387,7 @@ in question. Meaning it MUST always be in a non-broken state, MUST NOT be
force pushed to, etc.</li>
</ol>
</li>
<li>Bug Fixes &#x26; Rollback
<li id="spec-bug-fixes-rollback">Bug Fixes &#x26; Rollback
<ol>
<li>You MUST NOT under any circumstances force push to the master branch or
to long-term release branches.</li>

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -188,7 +188,7 @@ its merge target, allowing others to review, discuss and approve the changes.</l
effectively just a git tag named after the version of the release.</li>
<li><strong>Release Branches</strong> - Used both for short-term preparations of a release, and
also for long-term maintenance of older versions.</li>
</ul> </section> <!-- Main specification --> <section id="specification" data-astro-cid-6lwcykzv> <h2 data-astro-cid-6lwcykzv>Git Common-Flow Specification</h2> <p>The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD",
</ul> </section> <!-- Main specification --> <section id="specification" data-astro-cid-6lwcykzv> <h2 data-astro-cid-6lwcykzv>Specification</h2> <p>The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD",
"SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be
interpreted as described in <a href="https://tools.ietf.org/html/rfc2119">RFC 2119</a>.</p>
<ol>
@@ -375,7 +375,7 @@ in question. Meaning it MUST always be in a non-broken state, MUST NOT be
force pushed to, etc.</li>
</ol>
</li>
<li>Bug Fixes &#x26; Rollback
<li id="spec-bug-fixes-rollback">Bug Fixes &#x26; Rollback
<ol>
<li>You MUST NOT under any circumstances force push to the master branch or
to long-term release branches.</li>

View File

@@ -199,7 +199,7 @@ its merge target, allowing others to review, discuss and approve the changes.</l
effectively just a git tag named after the version of the release.</li>
<li><strong>Release Branches</strong> - Used both for short-term preparations of a release, and
also for long-term maintenance of older versions.</li>
</ul> </section> <!-- Main specification --> <section id="specification" data-astro-cid-6lwcykzv> <h2 data-astro-cid-6lwcykzv>Git Common-Flow Specification</h2> <p>The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD",
</ul> </section> <!-- Main specification --> <section id="specification" data-astro-cid-6lwcykzv> <h2 data-astro-cid-6lwcykzv>Specification</h2> <p>The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD",
"SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be
interpreted as described in <a href="https://tools.ietf.org/html/rfc2119">RFC 2119</a>.</p>
<ol>
@@ -387,7 +387,7 @@ in question. Meaning it MUST always be in a non-broken state, MUST NOT be
force pushed to, etc.</li>
</ol>
</li>
<li>Bug Fixes &#x26; Rollback
<li id="spec-bug-fixes-rollback">Bug Fixes &#x26; Rollback
<ol>
<li>You MUST NOT under any circumstances force push to the master branch or
to long-term release branches.</li>

View File

@@ -4,11 +4,12 @@ import type { TocItem } from "../utils/parseSpecContent";
interface Props {
terminology: string;
terminologyTitle: string;
specification: string;
tocItems: TocItem[];
}
const { terminology, specification, tocItems } = Astro.props;
const { terminology, terminologyTitle, specification, tocItems } = Astro.props;
---
<section id="spec" class="py-20 sm:py-28">
@@ -33,13 +34,13 @@ const { terminology, specification, tocItems } = Astro.props;
<article class="prose-spec spec-content">
<!-- Terminology -->
<section id="terminology">
<h2>Terminology</h2>
<h2>{terminologyTitle}</h2>
<Fragment set:html={terminology} />
</section>
<!-- Main specification -->
<section id="specification">
<h2>Git Common-Flow Specification</h2>
<h2>Specification</h2>
<Fragment set:html={specification} />
</section>
</article>

View File

@@ -2,10 +2,6 @@
import { getCollection } from "astro:content";
import * as fs from "node:fs";
import * as path from "node:path";
import { unified } from "unified";
import remarkParse from "remark-parse";
import remarkRehype from "remark-rehype";
import rehypeStringify from "rehype-stringify";
import SinglePage from "../layouts/SinglePage.astro";
import Header from "../components/Header.astro";
@@ -25,28 +21,15 @@ if (!spec) {
throw new Error(`Spec version ${version} not found`);
}
// Read and process the markdown file
const filePath = path.join(
process.cwd(),
"src/content/spec",
`${version}.md`
);
// Read the markdown file
const filePath = path.join(process.cwd(), "src/content/spec", `${version}.md`);
const content = fs.readFileSync(filePath, "utf-8");
// Remove frontmatter
const body = content.replace(/^---[\s\S]*?---\n/, "");
const markdown = content.replace(/^---[\s\S]*?---\n/, "");
// Process markdown to HTML
const result = await unified()
.use(remarkParse)
.use(remarkRehype, { allowDangerousHtml: true })
.use(rehypeStringify, { allowDangerousHtml: true })
.process(body);
const html = String(result);
// Parse the content into sections
const parsed = parseSpecContent(html, version);
// Parse the content into sections (handles markdown -> HTML internally)
const parsed = await parseSpecContent(markdown, version);
---
<SinglePage title={spec.data.title} version={version}>
@@ -64,6 +47,7 @@ const parsed = parseSpecContent(html, version);
<SpecSection
terminology={parsed.terminology}
terminologyTitle={parsed.terminologyTitle}
specification={parsed.specification}
tocItems={parsed.tocItems}
/>

View File

@@ -2,10 +2,6 @@
import { getCollection } from "astro:content";
import * as fs from "node:fs";
import * as path from "node:path";
import { unified } from "unified";
import remarkParse from "remark-parse";
import remarkRehype from "remark-rehype";
import rehypeStringify from "rehype-stringify";
import SinglePage from "../../layouts/SinglePage.astro";
import Header from "../../components/Header.astro";
@@ -27,28 +23,15 @@ export async function getStaticPaths() {
const { spec } = Astro.props;
const version = spec.data.version;
// Read and process the markdown file
const filePath = path.join(
process.cwd(),
"src/content/spec",
`${version}.md`
);
// Read the markdown file
const filePath = path.join(process.cwd(), "src/content/spec", `${version}.md`);
const content = fs.readFileSync(filePath, "utf-8");
// Remove frontmatter
const body = content.replace(/^---[\s\S]*?---\n/, "");
const markdown = content.replace(/^---[\s\S]*?---\n/, "");
// Process markdown to HTML
const result = await unified()
.use(remarkParse)
.use(remarkRehype, { allowDangerousHtml: true })
.use(rehypeStringify, { allowDangerousHtml: true })
.process(body);
const html = String(result);
// Parse the content into sections
const parsed = parseSpecContent(html, version);
// Parse the content into sections (handles markdown -> HTML internally)
const parsed = await parseSpecContent(markdown, version);
---
<SinglePage title={spec.data.title} version={version}>
@@ -66,6 +49,7 @@ const parsed = parseSpecContent(html, version);
<SpecSection
terminology={parsed.terminology}
terminologyTitle={parsed.terminologyTitle}
specification={parsed.specification}
tocItems={parsed.tocItems}
/>

View File

@@ -1,8 +1,14 @@
/**
* Parses rendered spec HTML into structured sections for the single-page
* layout.
* Parses spec content using markdown AST for robust section extraction.
*/
import { unified } from "unified";
import remarkParse from "remark-parse";
import remarkRehype from "remark-rehype";
import rehypeStringify from "rehype-stringify";
import type { Root, RootContent, Heading, List, ListItem } from "mdast";
import type { Root as HastRoot } from "hast";
export interface TocItem {
id: string;
title: string;
@@ -26,7 +32,9 @@ export interface ParsedSpec {
introduction: string;
summary: string;
terminology: string;
terminologyTitle: string;
specification: string;
specificationTitle: string;
specSections: SpecSection[];
faq: FAQItem[];
about: string;
@@ -35,7 +43,7 @@ export interface ParsedSpec {
}
/**
* Convert a heading text to a URL-friendly ID
* Convert text to a URL-friendly ID
*/
function slugify(text: string): string {
return text
@@ -45,160 +53,212 @@ function slugify(text: string): string {
.trim();
}
/**
 * Union of the mdast `Root` and `RootContent` node types; this is the
 * parameter type accepted by `extractText`.
 */
type MdastNode = Root | RootContent;
/**
* Extract content between two headings or to the end of the document
* Extract plain text from an mdast node tree
*/
function extractSection(
html: string,
startHeading: string,
endHeadings: string[] = []
): string {
// Find the heading (h2) - use partial match to handle additional text
// e.g., "Git Common-Flow Specification (Common-Flow)"
const headingPattern = new RegExp(
`<h2[^>]*>[^<]*${escapeRegex(startHeading)}[^<]*</h2>`,
"i"
function extractText(node: MdastNode): string {
if ("value" in node && typeof node.value === "string") {
return node.value;
}
if ("children" in node && Array.isArray(node.children)) {
return node.children.map((child) => extractText(child)).join("");
}
return "";
}
/**
* Find index of heading containing specific text
*/
function findHeadingIndex(
nodes: RootContent[],
text: string,
depth: number = 2
): number {
return nodes.findIndex(
(node) =>
node.type === "heading" &&
(node as Heading).depth === depth &&
extractText(node).toLowerCase().includes(text.toLowerCase())
);
const match = html.match(headingPattern);
if (!match || match.index === undefined) return "";
const startIdx = match.index + match[0].length;
// Find the next section heading
let endIdx = html.length;
for (const endHeading of endHeadings) {
const endPattern = new RegExp(
`<h2[^>]*>\\s*${escapeRegex(endHeading)}\\s*</h2>`,
"i"
);
const endMatch = html.slice(startIdx).match(endPattern);
if (endMatch && endMatch.index !== undefined) {
const possibleEnd = startIdx + endMatch.index;
if (possibleEnd < endIdx) {
endIdx = possibleEnd;
}
}
}
// Also check for any h2 as a fallback
const anyH2 = html.slice(startIdx).match(/<h2[^>]*>/i);
if (anyH2 && anyH2.index !== undefined) {
const possibleEnd = startIdx + anyH2.index;
if (possibleEnd < endIdx) {
endIdx = possibleEnd;
}
}
return html.slice(startIdx, endIdx).trim();
}
function escapeRegex(str: string): string {
return str.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
}
// Spec section titles in order (used for both ToC and anchor injection)
const SPEC_SECTION_TITLES = [
"TL;DR",
"The Master Branch",
"Change Branches",
"Pull Requests",
"Versioning",
"Releases",
"Short-Term Release Branches",
"Long-term Release Branches",
"Bug Fixes & Rollback",
"Git Best Practices",
];
/**
* Extract the numbered spec sections (1. TL;DR, 2. The Master Branch, etc.)
* Extract nodes between two headings
*/
function extractSpecSections(specContent: string): SpecSection[] {
/**
 * Collect the nodes that sit under a given heading.
 *
 * The section starts right after the heading located by `findHeadingIndex`
 * (the heading node itself is excluded) and runs up to, but not including,
 * the next heading whose depth is less than or equal to `depth`. When no such
 * heading follows, the section extends to the end of `nodes`.
 *
 * @param nodes - Flat list of sibling nodes to search.
 * @param startText - Text passed to `findHeadingIndex` to locate the heading.
 * @param depth - Heading depth to look for; also the cut-off depth that ends
 *   the section.
 * @returns A new array with the section's nodes, or an empty array when the
 *   starting heading is not found.
 */
function extractSectionNodes(
  nodes: RootContent[],
  startText: string,
  depth: number = 2
): RootContent[] {
  const headingIdx = findHeadingIndex(nodes, startText, depth);
  if (headingIdx < 0) {
    return [];
  }

  // Everything after the heading; trimmed below if another section begins.
  const following = nodes.slice(headingIdx + 1);
  const boundary = following.findIndex(
    (candidate) =>
      candidate.type === "heading" && (candidate as Heading).depth <= depth
  );

  return boundary === -1 ? following : following.slice(0, boundary);
}
/**
* Get the full heading text
*/
/**
 * Resolve the complete text of the heading located by `findHeadingIndex`.
 *
 * @param nodes - Flat list of sibling nodes to search.
 * @param text - Text passed to `findHeadingIndex`; also the fallback result.
 * @param depth - Heading depth to look for.
 * @returns The heading's text as produced by `extractText`, or `text`
 *   unchanged when no heading is found.
 */
function getHeadingText(
  nodes: RootContent[],
  text: string,
  depth: number = 2
): string {
  const position = findHeadingIndex(nodes, text, depth);
  return position === -1 ? text : extractText(nodes[position]);
}
/**
* Convert mdast nodes to HTML string
*/
/**
 * Render a list of mdast nodes to an HTML string.
 *
 * The nodes are wrapped in a synthetic root, transformed with
 * `remark-rehype`, and serialized with `rehype-stringify`. Both steps run
 * with `allowDangerousHtml: true`.
 *
 * A single processor handles both the transform (`run`) and the
 * serialization (`stringify`), so the plugins are configured once and no
 * second processor is built just to stringify.
 *
 * @param nodes - mdast nodes to render, in document order.
 * @returns The serialized HTML, or an empty string when `nodes` is empty.
 */
async function nodesToHtml(nodes: RootContent[]): Promise<string> {
  if (nodes.length === 0) return "";

  const processor = unified()
    .use(remarkRehype, { allowDangerousHtml: true })
    .use(rehypeStringify, { allowDangerousHtml: true });

  // Wrap the loose nodes in a root so they form a valid tree for the pipeline.
  const mdastRoot: Root = { type: "root", children: nodes };

  // run() applies only the transformers (mdast -> hast); stringify() then
  // invokes the compiler registered by rehype-stringify and is synchronous.
  const hastRoot = await processor.run(mdastRoot);
  return processor.stringify(hastRoot as HastRoot);
}
/**
* Extract top-level list item titles from an ordered list
*/
/**
 * Collect a title for each direct item of a list.
 *
 * For every list item the children are scanned in order: scanning stops at
 * the first nested list; the first paragraph encountered supplies the title
 * (replacing anything gathered so far); any other child contributes its text
 * via `extractText`. Only the first line of the gathered text is kept, with
 * surrounding whitespace trimmed, and items whose title ends up empty are
 * left out.
 *
 * @param list - The mdast list whose direct items are inspected.
 * @returns The non-empty titles, in list order.
 */
function extractListItemTitles(list: List): string[] {
  return list.children.flatMap((entry) => {
    if (entry.type !== "listItem") return [];

    let gathered = "";
    for (const part of entry.children) {
      // A nested list marks the end of the item's own title text.
      if (part.type === "list") break;
      if (part.type === "paragraph") {
        gathered = extractText(part);
        break;
      }
      // Non-paragraph content placed directly in the item.
      gathered += extractText(part);
    }

    const firstLine = gathered.split("\n")[0].trim();
    return firstLine ? [firstLine] : [];
  });
}
/**
* Find the first ordered list in nodes and extract its structure
*/
function findSpecSections(nodes: RootContent[]): SpecSection[] {
const sections: SpecSection[] = [];
// The spec uses an ordered list with nested items
// Each top-level li starts a new section
const olMatch = specContent.match(/<ol[^>]*>([\s\S]*?)<\/ol>/i);
if (!olMatch) return sections;
// Find each section by looking for the title pattern
for (const title of SPEC_SECTION_TITLES) {
const id = slugify(title);
// For the content, we'll just use the title for navigation
// The actual content stays in the main specification block
sections.push({
id: `spec-${id}`,
title,
content: "", // Content handled inline
});
for (const node of nodes) {
if (node.type === "list" && (node as List).ordered) {
const titles = extractListItemTitles(node as List);
for (const title of titles) {
sections.push({
id: `spec-${slugify(title)}`,
title,
content: "",
});
}
break; // Only process first ordered list
}
}
return sections;
}
/**
* Add anchor IDs to spec section list items.
* Finds top-level <li> elements that start with section titles and adds IDs.
* Add anchor IDs to list items in the spec ordered list
*/
function addSpecSectionAnchors(specContent: string): string {
let result = specContent;
function addAnchorsToList(list: List, sections: SpecSection[]): void {
const titleMap = new Map(sections.map((s) => [s.title, s.id]));
for (const title of SPEC_SECTION_TITLES) {
const id = `spec-${slugify(title)}`;
// Match <li> followed by the section title (possibly with whitespace)
// The title appears right after <li> in the rendered HTML
const pattern = new RegExp(
`(<li>)(\\s*${escapeRegex(title)})`,
"i"
);
result = result.replace(pattern, `<li id="${id}">$2`);
for (const item of list.children) {
if (item.type !== "listItem") continue;
// Get the title of this item
let title = "";
for (const child of item.children) {
if (child.type === "list") break;
if (child.type === "paragraph") {
title = extractText(child).split("\n")[0].trim();
break;
}
title += extractText(child);
}
title = title.split("\n")[0].trim();
// Add ID as data attribute (will be processed by rehype)
const id = titleMap.get(title);
if (id) {
// Add hProperties for rehype to convert to HTML id attribute
(item as ListItem & { data?: { hProperties?: { id?: string } } }).data = {
hProperties: { id },
};
}
}
return result;
}
/**
* Extract FAQ items from the FAQ section HTML
* Extract FAQ items from FAQ section nodes
*/
function extractFAQItems(faqContent: string): FAQItem[] {
function extractFAQFromNodes(nodes: RootContent[]): FAQItem[] {
const items: FAQItem[] = [];
let currentQuestion = "";
let currentId = "";
// Split by h3 headings
const h3Pattern = /<h3[^>]*>([\s\S]*?)<\/h3>/gi;
let lastIndex = 0;
let lastQuestion = "";
let lastId = "";
for (const node of nodes) {
if (node.type === "heading" && (node as Heading).depth === 3) {
// Save previous FAQ item if we had one
if (currentQuestion) {
items.push({
id: currentId,
question: currentQuestion,
answer: "", // Placeholder, will be filled later
});
}
const matches = [...faqContent.matchAll(h3Pattern)];
for (let i = 0; i < matches.length; i++) {
const match = matches[i];
const question = match[1].replace(/<[^>]+>/g, "").trim();
const id = slugify(question).slice(0, 50);
if (i > 0 && match.index !== undefined) {
// Get content between previous h3 and this one
const answer = faqContent.slice(lastIndex, match.index).trim();
items.push({
id: `faq-${lastId}`,
question: lastQuestion,
answer,
});
currentQuestion = extractText(node);
currentId = `faq-${slugify(currentQuestion).slice(0, 50)}`;
}
lastQuestion = question;
lastId = id;
lastIndex = match.index! + match[0].length;
}
// Don't forget the last FAQ item
if (lastQuestion) {
const answer = faqContent.slice(lastIndex).trim();
// Don't forget the last item
if (currentQuestion) {
items.push({
id: `faq-${lastId}`,
question: lastQuestion,
answer,
id: currentId,
question: currentQuestion,
answer: "",
});
}
@@ -206,20 +266,25 @@ function extractFAQItems(faqContent: string): FAQItem[] {
}
/**
* Build table of contents from parsed sections.
* Only includes sections rendered in SpecSection (Terminology + Specification).
* Introduction/Summary are in AboutSection and excluded from this ToC.
* Build table of contents from parsed sections
*/
function buildTocItems(parsed: Partial<ParsedSpec>): TocItem[] {
const items: TocItem[] = [];
if (parsed.terminology) {
items.push({ id: "terminology", title: "Terminology", level: 2 });
items.push({
id: "terminology",
title: parsed.terminologyTitle || "Terminology",
level: 2,
});
}
if (parsed.specification) {
items.push({ id: "specification", title: "Specification", level: 2 });
items.push({
id: "specification",
title: "Specification",
level: 2,
});
// Add spec subsections
if (parsed.specSections) {
for (const section of parsed.specSections) {
items.push({ id: section.id, title: section.title, level: 3 });
@@ -231,70 +296,106 @@ function buildTocItems(parsed: Partial<ParsedSpec>): TocItem[] {
}
/**
* Main parsing function - takes rendered HTML and returns structured content
* Main parsing function - takes markdown content and returns structured content
*/
export function parseSpecContent(html: string, version: string): ParsedSpec {
export async function parseSpecContent(
markdown: string,
version: string
): Promise<ParsedSpec> {
const svgPath = `/spec/${version}.svg`;
// Remove the title (h1) and SVG from the content for parsing
let content = html;
// Parse markdown to AST
const tree = unified().use(remarkParse).parse(markdown) as Root;
// Remove the h1 title
content = content.replace(/<h1[^>]*>[\s\S]*?<\/h1>/i, "");
// Remove title (h1) and SVG image from the tree
const nodes = tree.children.filter((node) => {
if (node.type === "heading" && (node as Heading).depth === 1) return false;
if (node.type === "paragraph") {
const text = extractText(node);
if (text.includes(".svg")) return false;
}
return true;
});
// Remove the SVG img tag
content = content.replace(/<img[^>]*\.svg[^>]*>/i, "");
// Extract each section
const introduction = extractSection(content, "Introduction", [
"Summary",
"Terminology",
"Git Common-Flow",
"FAQ",
"About",
"License",
]);
const summary = extractSection(content, "Summary", [
"Terminology",
"Git Common-Flow",
"FAQ",
"About",
"License",
]);
const terminology = extractSection(content, "Terminology", [
"Git Common-Flow",
"FAQ",
"About",
"License",
]);
const specificationRaw = extractSection(
content,
"Git Common-Flow Specification",
["FAQ", "About", "License"]
// Get heading titles
const terminologyTitle = getHeadingText(nodes, "Terminology");
const specificationTitle = getHeadingText(
nodes,
"Git Common-Flow Specification"
);
// Add anchor IDs to spec section list items for ToC navigation
const specification = addSpecSectionAnchors(specificationRaw);
// Extract section nodes
const introNodes = extractSectionNodes(nodes, "Introduction");
const summaryNodes = extractSectionNodes(nodes, "Summary");
const terminologyNodes = extractSectionNodes(nodes, "Terminology");
const specNodes = extractSectionNodes(nodes, "Git Common-Flow Specification");
const faqNodes = extractSectionNodes(nodes, "FAQ");
const aboutNodes = extractSectionNodes(nodes, "About");
const licenseNodes = extractSectionNodes(nodes, "License");
const faqContent = extractSection(content, "FAQ", ["About", "License"]);
// Extract spec sections from the first ordered list
const specSections = findSpecSections(specNodes);
const about = extractSection(content, "About", ["License"]);
// Add anchor IDs to spec list items
for (const node of specNodes) {
if (node.type === "list" && (node as List).ordered) {
addAnchorsToList(node as List, specSections);
break;
}
}
const license = extractSection(content, "License", []);
// Extract FAQ items structure
const faqItems = extractFAQFromNodes(faqNodes);
// Parse subsections
const specSections = extractSpecSections(specificationRaw);
const faq = extractFAQItems(faqContent);
// Collect FAQ answer nodes for each item
const faqAnswerNodes: RootContent[][] = [];
let currentAnswerNodes: RootContent[] = [];
for (const node of faqNodes) {
if (node.type === "heading" && (node as Heading).depth === 3) {
if (currentAnswerNodes.length > 0) {
faqAnswerNodes.push(currentAnswerNodes);
}
currentAnswerNodes = [];
} else {
currentAnswerNodes.push(node);
}
}
// Don't forget the last answer
if (currentAnswerNodes.length > 0) {
faqAnswerNodes.push(currentAnswerNodes);
}
// Convert sections to HTML
const [introduction, summary, terminology, specification, about, license] =
await Promise.all([
nodesToHtml(introNodes),
nodesToHtml(summaryNodes),
nodesToHtml(terminologyNodes),
nodesToHtml(specNodes),
nodesToHtml(aboutNodes),
nodesToHtml(licenseNodes),
]);
// Convert FAQ answers to HTML
const faqAnswers = await Promise.all(
faqAnswerNodes.map((nodes) => nodesToHtml(nodes))
);
// Assign FAQ answers
const faq = faqItems.map((item, i) => ({
...item,
answer: faqAnswers[i] || "",
}));
const parsed: ParsedSpec = {
svgPath,
introduction,
summary,
terminology,
terminologyTitle,
specification,
specificationTitle,
specSections,
faq,
about,
@@ -302,7 +403,6 @@ export function parseSpecContent(html: string, version: string): ParsedSpec {
tocItems: [],
};
// Build TOC
parsed.tocItems = buildTocItems(parsed);
return parsed;