Files
commonflow.org/src/utils/parseSpecContent.ts

438 lines
11 KiB
TypeScript

/**
* Parses spec content using markdown AST for robust section extraction.
*/
import { unified } from "unified";
import remarkParse from "remark-parse";
import remarkRehype from "remark-rehype";
import rehypeStringify from "rehype-stringify";
import { getIconData, iconToSVG, iconToHTML } from "@iconify/utils";
import heroicons from "@iconify-json/heroicons/icons.json";
import type { Root, RootContent, Heading, List, ListItem, Html } from "mdast";
import type { Root as HastRoot } from "hast";
/**
* One entry in the table of contents produced by buildTocItems.
*/
export interface TocItem {
/** Anchor id of the entry: "terminology", "specification", or a spec section id. */
id: string;
/** Title of the entry (a section heading or a clause title). */
title: string;
/** Nesting level: buildTocItems emits 2 for top-level sections and 3 for spec clauses. */
level: number;
/** Clause label such as "1."; only set on the level-3 spec clause entries. */
clause?: string;
}
/**
* A single question/answer pair taken from the FAQ section.
*/
export interface FAQItem {
/** "faq-" followed by the first 50 characters of the slugified question. */
id: string;
/** Plain text of the question's depth-3 heading. */
question: string;
/** Answer as an HTML string; extractFAQFromNodes leaves it "" and parseSpecContent fills it in. */
answer: string;
}
/**
* A top-level clause taken from the first ordered list of the specification section.
*/
export interface SpecSection {
/** Anchor id of the form "clause-N". */
id: string;
/** First line of the list item's leading text. */
title: string;
/** Always "" as produced by findSpecSections; nothing in this module fills it in. */
content: string;
/** Clause label of the form "N." (number followed by a dot). */
clause: string;
}
/**
* Structured result of parseSpecContent.
* The section fields hold HTML strings and are "" when the matching heading is not found.
*/
export interface ParsedSpec {
/** HTML of the section under the "Introduction" heading. */
introduction: string;
/** HTML of the section under the "Summary" heading. */
summary: string;
/** HTML of the section under the "Terminology" heading. */
terminology: string;
/** Full text of the terminology heading, or "Terminology" when no such heading exists. */
terminologyTitle: string;
/** HTML of the "Git Common-Flow Specification" section, including the injected clause anchors. */
specification: string;
/** Full text of the specification heading, or the search text when no such heading exists. */
specificationTitle: string;
/** Top-level clauses from the first ordered list of the specification section. */
specSections: SpecSection[];
/** Questions (depth-3 headings) of the FAQ section with their HTML answers. */
faq: FAQItem[];
/** HTML of the section under the "License" heading. */
license: string;
/** Table of contents built by buildTocItems from the fields above. */
tocItems: TocItem[];
}
/**
 * Convert text to a URL-friendly ID.
 *
 * Lowercases the input, drops every character that is not a word character,
 * whitespace, or hyphen, trims the surrounding whitespace, and then replaces
 * each remaining run of whitespace with a single hyphen.
 *
 * @param text - Arbitrary heading or label text.
 * @returns The slug; "" when nothing sluggable remains.
 */
function slugify(text: string): string {
  return (
    text
      .toLowerCase()
      .replace(/[^\w\s-]/g, "")
      // Trim BEFORE hyphenating: once whitespace has been turned into "-",
      // trim() has nothing left to strip and padding leaks out as stray
      // leading/trailing hyphens (e.g. "why ?" would become "why-").
      .trim()
      .replace(/\s+/g, "-")
  );
}
/**
 * Render the heroicons "link" icon as an inline SVG string.
 *
 * @returns SVG markup carrying the `clause-link-icon` class and a
 *   `currentColor` stroke of width 2, or "" when the icon set has no
 *   "link" icon.
 */
function generateLinkIconSvg(): string {
  const linkIcon = getIconData(heroicons, "link");
  if (linkIcon) {
    const { body, attributes } = iconToSVG(linkIcon);
    return iconToHTML(body, {
      ...attributes,
      class: "clause-link-icon",
      stroke: "currentColor",
      "stroke-width": "2",
    });
  }
  return "";
}
/** Any node of a parsed markdown tree, the root included. */
type MdastNode = Root | RootContent;

/**
 * Collect the plain text held by an mdast node and its descendants.
 *
 * A node with a string `value` yields that value directly; a node with a
 * `children` array yields the concatenation of its children's text; any
 * other node yields "".
 *
 * @param node - Node to read text from.
 * @returns The concatenated text, without any separators added.
 */
function extractText(node: MdastNode): string {
  if ("value" in node && typeof node.value === "string") {
    return node.value;
  }
  if (!("children" in node) || !Array.isArray(node.children)) {
    return "";
  }
  let text = "";
  for (const child of node.children) {
    text += extractText(child);
  }
  return text;
}
/**
 * Locate the first heading of a given depth whose text contains `text`.
 *
 * The comparison is a case-insensitive substring match against the
 * heading's plain text.
 *
 * @param nodes - Nodes to scan, in document order.
 * @param text - Text the heading must contain.
 * @param depth - Required heading depth (defaults to 2).
 * @returns Index of the matching heading in `nodes`, or -1 when none matches.
 */
function findHeadingIndex(
  nodes: RootContent[],
  text: string,
  depth: number = 2,
): number {
  const needle = text.toLowerCase();
  for (const [position, candidate] of nodes.entries()) {
    if (candidate.type !== "heading") continue;
    if (candidate.depth !== depth) continue;
    if (extractText(candidate).toLowerCase().includes(needle)) {
      return position;
    }
  }
  return -1;
}
/**
 * Return the body of the section introduced by a matching heading.
 *
 * The section starts right after the heading found by findHeadingIndex and
 * runs up to (not including) the next heading of the same or a shallower
 * depth, or to the end of `nodes` when there is no such heading.
 *
 * @param nodes - Nodes to scan, in document order.
 * @param startText - Text the opening heading must contain.
 * @param depth - Depth of the opening heading (defaults to 2).
 * @returns The section's nodes without the heading; [] when no heading matches.
 */
function extractSectionNodes(
  nodes: RootContent[],
  startText: string,
  depth: number = 2,
): RootContent[] {
  const headingIdx = findHeadingIndex(nodes, startText, depth);
  if (headingIdx === -1) {
    return [];
  }

  // Everything after the heading; cut it at the first heading that closes the section.
  const afterHeading = nodes.slice(headingIdx + 1);
  const closingIdx = afterHeading.findIndex(
    (candidate) => candidate.type === "heading" && candidate.depth <= depth,
  );
  return closingIdx === -1 ? afterHeading : afterHeading.slice(0, closingIdx);
}
/**
 * Resolve the complete text of the heading that contains `text`.
 *
 * @param nodes - Nodes to scan, in document order.
 * @param text - Text the heading must contain.
 * @param depth - Required heading depth (defaults to 2).
 * @returns The heading's full plain text, or `text` itself when no heading matches.
 */
function getHeadingText(
  nodes: RootContent[],
  text: string,
  depth: number = 2,
): string {
  const headingIdx = findHeadingIndex(nodes, text, depth);
  return headingIdx === -1 ? text : extractText(nodes[headingIdx]);
}
/**
 * Render a list of mdast nodes as an HTML string.
 *
 * The nodes are wrapped in a synthetic root, transformed to hast with
 * remark-rehype and serialized with rehype-stringify. Raw HTML nodes are
 * passed through (`allowDangerousHtml`) in both steps.
 *
 * @param nodes - Nodes to render, in document order.
 * @returns The HTML markup; "" for an empty node list.
 */
async function nodesToHtml(nodes: RootContent[]): Promise<string> {
  if (nodes.length === 0) {
    return "";
  }

  const htmlProcessor = unified()
    .use(remarkRehype, { allowDangerousHtml: true })
    .use(rehypeStringify, { allowDangerousHtml: true });

  // Synthetic root so the transformer receives a complete tree.
  const mdastRoot: Root = { type: "root", children: nodes };
  const hastTree = await htmlProcessor.run(mdastRoot);
  return htmlProcessor.stringify(hastTree as HastRoot);
}
/**
 * Collect the titles of a list's direct items.
 *
 * An item's title is the first line of its first paragraph. When a nested
 * list or the end of the item is reached before any paragraph, the title is
 * the first line of the text gathered from the children seen so far. Items
 * whose title comes out empty are left out of the result.
 *
 * @param list - List whose direct items are inspected.
 * @returns The trimmed, non-empty titles in item order.
 */
function extractListItemTitles(list: List): string[] {
  return list.children
    .filter((item) => item.type === "listItem")
    .map((item) => {
      let leadingText = "";
      for (const block of item.children) {
        if (block.type === "list") {
          // The title ends where the nested list begins.
          break;
        }
        if (block.type === "paragraph") {
          // A paragraph is the title on its own and replaces anything gathered so far.
          leadingText = extractText(block);
          break;
        }
        leadingText += extractText(block);
      }
      const [firstLine = ""] = leadingText.split("\n");
      return firstLine.trim();
    })
    .filter((title) => title !== "");
}
/**
 * Describe the top-level clauses of the first ordered list in `nodes`.
 *
 * Each titled item of that list becomes a SpecSection whose number is the
 * item's 1-based position in the list. Numbering by position (rather than by
 * position among the titled items only) keeps the ids equal to the
 * `clause-N` ids that addClauseAnchors assigns to the list items, even when
 * an item without a title sits in between.
 *
 * @param nodes - Nodes of the specification section.
 * @returns One section per titled item; [] when there is no ordered list.
 */
function findSpecSections(nodes: RootContent[]): SpecSection[] {
  // Only the first ordered list defines the clauses.
  const clauseList = nodes.find(
    (node): node is List => node.type === "list" && Boolean(node.ordered),
  );
  if (!clauseList) {
    return [];
  }

  const sections: SpecSection[] = [];
  clauseList.children.forEach((item, index) => {
    // Title the item on its own (via a one-item list) so that an untitled
    // item is skipped without shifting the numbers of the items after it.
    const [title] = extractListItemTitles({ type: "list", children: [item] });
    if (!title) {
      return;
    }
    const clauseNum = index + 1;
    sections.push({
      id: `clause-${clauseNum}`,
      title,
      content: "",
      clause: `${clauseNum}.`,
    });
  });
  return sections;
}
/**
 * Add anchor IDs and links to ordered list items recursively.
 *
 * Every item gets an `id` (through `data.hProperties`) of the form
 * `clause-1`, `clause-1-2`, ... and its first paragraph gets a raw-HTML
 * anchor link prepended that contains the link icon and the clause number.
 * Nested ordered lists are processed with the parent's number as prefix;
 * nested unordered lists are left untouched. The list is modified in place.
 *
 * @param list - Ordered list to annotate.
 * @param prefix - Dotted clause number of the enclosing item ("" at top level).
 */
function addClauseAnchors(list: List, prefix: string = ""): void {
  // The icon markup is the same for every clause, so build it once per list
  // instead of once per item.
  const linkIcon = generateLinkIconSvg();

  list.children.forEach((item, index) => {
    if (item.type !== "listItem") return;

    // Clause number ("2.1") and the matching element id ("clause-2-1").
    const clauseNum = prefix ? `${prefix}.${index + 1}` : `${index + 1}`;
    const clauseId = `clause-${clauseNum.replace(/\./g, "-")}`;

    // Merge the id into whatever data/hProperties the item already carries
    // rather than replacing them wholesale.
    const target = item as ListItem & {
      data?: { hProperties?: { id?: string } };
    };
    target.data = {
      ...target.data,
      hProperties: { ...target.data?.hProperties, id: clauseId },
    };

    // Prepend the anchor link to the first paragraph of the item, if any.
    for (const child of item.children) {
      if (child.type === "paragraph") {
        const anchorHtml: Html = {
          type: "html",
          value: `<a href="#${clauseId}" class="clause-link" aria-hidden="true">${linkIcon}${clauseNum}.</a>`,
        };
        child.children.unshift(anchorHtml);
        break;
      }
    }

    // Recurse into nested ordered lists, extending the clause number.
    for (const child of item.children) {
      if (child.type === "list" && child.ordered) {
        addClauseAnchors(child, clauseNum);
      }
    }
  });
}
/**
 * Build the FAQ item skeletons from the nodes of the FAQ section.
 *
 * Every depth-3 heading with non-empty text becomes one item. Answers are
 * left as "" here; the caller is expected to fill them in.
 *
 * @param nodes - Nodes of the FAQ section, in document order.
 * @returns One item per question heading, in document order.
 */
function extractFAQFromNodes(nodes: RootContent[]): FAQItem[] {
  return nodes
    .filter((node) => node.type === "heading" && node.depth === 3)
    .map((heading) => extractText(heading))
    .filter((question) => question !== "")
    .map((question) => ({
      // The slug part of the id is capped at 50 characters.
      id: `faq-${slugify(question).slice(0, 50)}`,
      question,
      answer: "",
    }));
}
/**
 * Assemble the table of contents for the parsed sections.
 *
 * A level-2 entry is emitted for the terminology and for the specification,
 * each only when its content is non-empty. The specification entry is
 * followed by one level-3 entry per spec section.
 *
 * @param parsed - Parsed fields gathered so far; missing fields are skipped.
 * @returns The entries in display order.
 */
function buildTocItems(parsed: Partial<ParsedSpec>): TocItem[] {
  const { terminology, terminologyTitle, specification, specSections } = parsed;
  const toc: TocItem[] = [];

  if (terminology) {
    toc.push({
      id: "terminology",
      title: terminologyTitle || "Terminology",
      level: 2,
    });
  }

  // Without specification content there is neither its entry nor any clause entry.
  if (!specification) {
    return toc;
  }

  toc.push({ id: "specification", title: "Specification", level: 2 });
  for (const { id, title, clause } of specSections ?? []) {
    toc.push({ id, title, level: 3, clause });
  }
  return toc;
}
/**
 * Group the nodes of the FAQ section into one bucket of answer nodes per question.
 *
 * A bucket is opened for every depth-3 heading with non-empty text, i.e. for
 * exactly the headings extractFAQFromNodes turns into items, so the result
 * lines up index-for-index with that function's output. A question without
 * any answer still gets an (empty) bucket, and nodes that precede the first
 * question or follow an empty heading belong to no question and are dropped.
 *
 * @param faqNodes - Nodes of the FAQ section, in document order.
 * @returns The answer nodes of each question, in question order.
 */
function collectFaqAnswerNodes(faqNodes: RootContent[]): RootContent[][] {
  const buckets: RootContent[][] = [];
  let current: RootContent[] | null = null;
  for (const node of faqNodes) {
    if (node.type === "heading" && node.depth === 3) {
      current = extractText(node) ? [] : null;
      if (current) {
        buckets.push(current);
      }
    } else if (current) {
      current.push(node);
    }
  }
  return buckets;
}

/**
 * Main parsing function - takes markdown content and returns structured content.
 *
 * The markdown is parsed to an mdast tree, depth-1 headings are removed, and
 * the depth-2 sections "Introduction", "Summary", "Terminology",
 * "Git Common-Flow Specification", "FAQ" and "License" are located by
 * heading text and rendered to HTML. Sections that are not found come back
 * as "".
 *
 * @param markdown - Full markdown source of the spec.
 * @returns The rendered sections, the spec clauses, the FAQ items and the
 *   table of contents.
 */
export async function parseSpecContent(
  markdown: string,
): Promise<ParsedSpec> {
  const specHeading = "Git Common-Flow Specification";

  // Parse markdown to AST
  const tree = unified().use(remarkParse).parse(markdown) as Root;

  // Remove title (h1) from the tree - it's displayed separately in the Hero
  const nodes = tree.children.filter(
    (node) => !(node.type === "heading" && node.depth === 1),
  );

  // Get heading titles
  const terminologyTitle = getHeadingText(nodes, "Terminology");
  const specificationTitle = getHeadingText(nodes, specHeading);

  // Extract section nodes
  const introNodes = extractSectionNodes(nodes, "Introduction");
  const summaryNodes = extractSectionNodes(nodes, "Summary");
  const terminologyNodes = extractSectionNodes(nodes, "Terminology");
  const specNodes = extractSectionNodes(nodes, specHeading);
  const faqNodes = extractSectionNodes(nodes, "FAQ");
  const licenseNodes = extractSectionNodes(nodes, "License");

  // Extract spec sections from the first ordered list. This has to happen
  // before the anchors are injected below: the anchors are html nodes whose
  // markup extractText would otherwise pick up as part of the clause titles.
  const specSections = findSpecSections(specNodes);

  // Add anchor IDs and links to the items of the first ordered list
  for (const node of specNodes) {
    if (node.type === "list" && node.ordered) {
      addClauseAnchors(node);
      break;
    }
  }

  // FAQ questions and, aligned index-for-index, the nodes of their answers
  const faqItems = extractFAQFromNodes(faqNodes);
  const faqAnswerNodes = collectFaqAnswerNodes(faqNodes);

  // Convert sections to HTML
  const [introduction, summary, terminology, specification, license] =
    await Promise.all([
      nodesToHtml(introNodes),
      nodesToHtml(summaryNodes),
      nodesToHtml(terminologyNodes),
      nodesToHtml(specNodes),
      nodesToHtml(licenseNodes),
    ]);

  // Convert FAQ answers to HTML
  const faqAnswers = await Promise.all(
    faqAnswerNodes.map((answerNodes) => nodesToHtml(answerNodes)),
  );

  // Assign FAQ answers
  const faq = faqItems.map((item, i) => ({
    ...item,
    answer: faqAnswers[i] ?? "",
  }));

  const parsed: ParsedSpec = {
    introduction,
    summary,
    terminology,
    terminologyTitle,
    specification,
    specificationTitle,
    specSections,
    faq,
    license,
    tocItems: [],
  };
  // The table of contents is derived from the fields assembled above.
  parsed.tocItems = buildTocItems(parsed);
  return parsed;
}