wip: improve spec parsing

2026-02-19 05:46:40 +00:00 · 2026-01-10 21:52:45 +00:00
parent f1fa264ed7
commit 208219ca2c
10 changed files with 324 additions and 255 deletions
--- a/src/utils/parseSpecContent.ts
+++ b/src/utils/parseSpecContent.ts
@@ -1,8 +1,14 @@
 /**
- * Parses rendered spec HTML into structured sections for the single-page
- * layout.
+ * Parses spec content using markdown AST for robust section extraction.
 */

+import { unified } from "unified";
+import remarkParse from "remark-parse";
+import remarkRehype from "remark-rehype";
+import rehypeStringify from "rehype-stringify";
+import type { Root, RootContent, Heading, List, ListItem } from "mdast";
+import type { Root as HastRoot } from "hast";
+
 export interface TocItem {
  id: string;
  title: string;
@@ -26,7 +32,9 @@ export interface ParsedSpec {
  introduction: string;
  summary: string;
  terminology: string;
+  terminologyTitle: string;
  specification: string;
+  specificationTitle: string;
  specSections: SpecSection[];
  faq: FAQItem[];
  about: string;
@@ -35,7 +43,7 @@ export interface ParsedSpec {
 }

 /**
- * Convert a heading text to a URL-friendly ID
+ * Convert text to a URL-friendly ID
 */
 function slugify(text: string): string {
  return text
@@ -45,160 +53,212 @@ function slugify(text: string): string {
    .trim();
 }

+type MdastNode = Root | RootContent;
+
 /**
- * Extract content between two headings or to the end of the document
+ * Extract plain text from an mdast node tree
 */
-function extractSection(
-  html: string,
-  startHeading: string,
-  endHeadings: string[] = []
-): string {
-  // Find the heading (h2) - use partial match to handle additional text
-  // e.g., "Git Common-Flow Specification (Common-Flow)"
-  const headingPattern = new RegExp(
-    `<h2[^>]*>[^<]*${escapeRegex(startHeading)}[^<]*</h2>`,
-    "i"
+function extractText(node: MdastNode): string {
+  // Literal nodes carry their text directly in a string `value`
+  if ("value" in node && typeof node.value === "string") return node.value;
+
+  // Nodes with neither a string value nor a children array yield no text
+  if (!("children" in node && Array.isArray(node.children))) return "";
+  // Parents: join the text of every child, in order, with no separator
+  return node.children.map((child) => extractText(child)).join("");
+}
+
+/**
+ * Find the index of the first heading at `depth` whose text contains `text` (case-insensitive), or -1 if none
+ */
+function findHeadingIndex(
+  nodes: RootContent[],
+  text: string,
+  depth: number = 2
+): number {
+  return nodes.findIndex(
+    (node) =>
+      node.type === "heading" &&
+      (node as Heading).depth === depth &&
+      extractText(node).toLowerCase().includes(text.toLowerCase())
  );
-  const match = html.match(headingPattern);
-  if (!match || match.index === undefined) return "";
-
-  const startIdx = match.index + match[0].length;
-
-  // Find the next section heading
-  let endIdx = html.length;
-  for (const endHeading of endHeadings) {
-    const endPattern = new RegExp(
-      `<h2[^>]*>\\s*${escapeRegex(endHeading)}\\s*</h2>`,
-      "i"
-    );
-    const endMatch = html.slice(startIdx).match(endPattern);
-    if (endMatch && endMatch.index !== undefined) {
-      const possibleEnd = startIdx + endMatch.index;
-      if (possibleEnd < endIdx) {
-        endIdx = possibleEnd;
-      }
-    }
-  }
-
-  // Also check for any h2 as a fallback
-  const anyH2 = html.slice(startIdx).match(/<h2[^>]*>/i);
-  if (anyH2 && anyH2.index !== undefined) {
-    const possibleEnd = startIdx + anyH2.index;
-    if (possibleEnd < endIdx) {
-      endIdx = possibleEnd;
-    }
-  }
-
-  return html.slice(startIdx, endIdx).trim();
 }

-function escapeRegex(str: string): string {
-  return str.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
-}
-
-// Spec section titles in order (used for both ToC and anchor injection)
-const SPEC_SECTION_TITLES = [
-  "TL;DR",
-  "The Master Branch",
-  "Change Branches",
-  "Pull Requests",
-  "Versioning",
-  "Releases",
-  "Short-Term Release Branches",
-  "Long-term Release Branches",
-  "Bug Fixes & Rollback",
-  "Git Best Practices",
-];
-
 /**
- * Extract the numbered spec sections (1. TL;DR, 2. The Master Branch, etc.)
+ * Extract the nodes that follow a matching heading, up to the next heading of the same or higher level
 */
-function extractSpecSections(specContent: string): SpecSection[] {
+function extractSectionNodes(
+  nodes: RootContent[],
+  startText: string,
+  depth: number = 2
+): RootContent[] {
+  const headingIdx = findHeadingIndex(nodes, startText, depth);
+  if (headingIdx === -1) {
+    return [];
+  }
+
+  // The section body is everything after its heading...
+  const body = nodes.slice(headingIdx + 1);
+
+  // ...up to the next heading at the same or a shallower depth, if any
+  const closingIdx = body.findIndex(
+    (node) => node.type === "heading" && (node as Heading).depth <= depth
+  );
+
+  // No closing heading means the section runs to the end of the document
+  return closingIdx === -1 ? body : body.slice(0, closingIdx);
+}
+
+/**
+ * Get the full text of the first heading containing `text`; returns `text` itself if no heading matches
+ */
+function getHeadingText(
+  nodes: RootContent[],
+  text: string,
+  depth: number = 2
+): string {
+  const headingIdx = findHeadingIndex(nodes, text, depth);
+  // Fall back to the search text itself when no heading matches
+  return headingIdx === -1 ? text : extractText(nodes[headingIdx]);
+}
+
+/**
+ * Convert mdast nodes to HTML string
+ */
+async function nodesToHtml(nodes: RootContent[]): Promise<string> {
+  if (nodes.length === 0) return "";
+
+  // Wrap the nodes in a synthetic root so the pipeline can process them
+  const root: Root = { type: "root", children: nodes };
+
+  // A single processor covers both phases: `run` applies remark-rehype
+  // (mdast -> hast) and `stringify` serializes with rehype-stringify,
+  // so the options are declared once instead of on two pipelines.
+  const processor = unified()
+    .use(remarkRehype, { allowDangerousHtml: true })
+    .use(rehypeStringify, { allowDangerousHtml: true });
+
+  const hast = await processor.run(root);
+
+  return processor.stringify(hast as HastRoot);
+}
+
+/**
+ * Extract top-level list item titles from an ordered list
+ */
+function extractListItemTitles(list: List): string[] {
+  const titles: string[] = [];
+
+  for (const item of list.children) {
+    if (item.type !== "listItem") continue;
+
+    // Only blocks ahead of the first nested list can contribute to the title
+    const nestedAt = item.children.findIndex((c) => c.type === "list");
+    const lead =
+      nestedAt === -1 ? item.children : item.children.slice(0, nestedAt);
+
+    // The first paragraph wins outright; without one, the text of all
+    // leading blocks is concatenated instead
+    const para = lead.find((c) => c.type === "paragraph");
+    const raw = para
+      ? extractText(para)
+      : lead.map((c) => extractText(c)).join("");
+
+    // Keep just the first line, and skip items that yield no text
+    const title = raw.split("\n")[0].trim();
+    if (title) {
+      titles.push(title);
+    }
+  }
+
+  return titles;
+}
+
+/**
+ * Build a SpecSection (id and title, empty content) for each top-level item of the first ordered list in nodes
+ */
+function findSpecSections(nodes: RootContent[]): SpecSection[] {
  const sections: SpecSection[] = [];

-  // The spec uses an ordered list with nested items
-  // Each top-level li starts a new section
-  const olMatch = specContent.match(/<ol[^>]*>([\s\S]*?)<\/ol>/i);
-  if (!olMatch) return sections;
-
-  // Find each section by looking for the title pattern
-  for (const title of SPEC_SECTION_TITLES) {
-    const id = slugify(title);
-
-    // For the content, we'll just use the title for navigation
-    // The actual content stays in the main specification block
-    sections.push({
-      id: `spec-${id}`,
-      title,
-      content: "", // Content handled inline
-    });
+  for (const node of nodes) {
+    if (node.type === "list" && (node as List).ordered) {
+      const titles = extractListItemTitles(node as List);
+      for (const title of titles) {
+        sections.push({
+          id: `spec-${slugify(title)}`,
+          title,
+          content: "",
+        });
+      }
+      break; // Only process first ordered list
+    }
  }

  return sections;
 }

 /**
- * Add anchor IDs to spec section list items.
- * Finds top-level <li> elements that start with section titles and adds IDs.
+ * Add anchor IDs to list items in the spec ordered list
 */
-function addSpecSectionAnchors(specContent: string): string {
-  let result = specContent;
+function addAnchorsToList(list: List, sections: SpecSection[]): void {
+  const titleMap = new Map(sections.map((s) => [s.title, s.id]));

-  for (const title of SPEC_SECTION_TITLES) {
-    const id = `spec-${slugify(title)}`;
-    // Match <li> followed by the section title (possibly with whitespace)
-    // The title appears right after <li> in the rendered HTML
-    const pattern = new RegExp(
-      `(<li>)(\\s*${escapeRegex(title)})`,
-      "i"
-    );
-    result = result.replace(pattern, `<li id="${id}">$2`);
+  for (const item of list.children) {
+    if (item.type !== "listItem") continue;
+
+    // Get the title of this item
+    let title = "";
+    for (const child of item.children) {
+      if (child.type === "list") break;
+      if (child.type === "paragraph") {
+        title = extractText(child).split("\n")[0].trim();
+        break;
+      }
+      title += extractText(child);
+    }
+    title = title.split("\n")[0].trim();
+
+    // Attach the section ID through the node's `data` field (not an HTML data-* attribute)
+    const id = titleMap.get(title);
+    if (id) {
+      // Add hProperties for rehype to convert to HTML id attribute
+      (item as ListItem & { data?: { hProperties?: { id?: string } } }).data = {
+        hProperties: { id },
+      };
+    }
  }
-
-  return result;
 }

 /**
- * Extract FAQ items from the FAQ section HTML
+ * Extract FAQ questions (depth-3 headings) from FAQ section nodes; answers are left empty for the caller to fill
 */
-function extractFAQItems(faqContent: string): FAQItem[] {
+function extractFAQFromNodes(nodes: RootContent[]): FAQItem[] {
  const items: FAQItem[] = [];
+  let currentQuestion = "";
+  let currentId = "";

-  // Split by h3 headings
-  const h3Pattern = /<h3[^>]*>([\s\S]*?)<\/h3>/gi;
-  let lastIndex = 0;
-  let lastQuestion = "";
-  let lastId = "";
+  for (const node of nodes) {
+    if (node.type === "heading" && (node as Heading).depth === 3) {
+      // Save previous FAQ item if we had one
+      if (currentQuestion) {
+        items.push({
+          id: currentId,
+          question: currentQuestion,
+          answer: "", // Placeholder, will be filled later
+        });
+      }

-  const matches = [...faqContent.matchAll(h3Pattern)];
-
-  for (let i = 0; i < matches.length; i++) {
-    const match = matches[i];
-    const question = match[1].replace(/<[^>]+>/g, "").trim();
-    const id = slugify(question).slice(0, 50);
-
-    if (i > 0 && match.index !== undefined) {
-      // Get content between previous h3 and this one
-      const answer = faqContent.slice(lastIndex, match.index).trim();
-      items.push({
-        id: `faq-${lastId}`,
-        question: lastQuestion,
-        answer,
-      });
+      currentQuestion = extractText(node);
+      currentId = `faq-${slugify(currentQuestion).slice(0, 50)}`;
    }
-
-    lastQuestion = question;
-    lastId = id;
-    lastIndex = match.index! + match[0].length;
  }

-  // Don't forget the last FAQ item
-  if (lastQuestion) {
-    const answer = faqContent.slice(lastIndex).trim();
+  // Don't forget the last item
+  if (currentQuestion) {
    items.push({
-      id: `faq-${lastId}`,
-      question: lastQuestion,
-      answer,
+      id: currentId,
+      question: currentQuestion,
+      answer: "",
    });
  }

@@ -206,20 +266,25 @@ function extractFAQItems(faqContent: string): FAQItem[] {
 }

 /**
- * Build table of contents from parsed sections.
- * Only includes sections rendered in SpecSection (Terminology + Specification).
- * Introduction/Summary are in AboutSection and excluded from this ToC.
+ * Build table of contents from parsed sections
 */
 function buildTocItems(parsed: Partial<ParsedSpec>): TocItem[] {
  const items: TocItem[] = [];

  if (parsed.terminology) {
-    items.push({ id: "terminology", title: "Terminology", level: 2 });
+    items.push({
+      id: "terminology",
+      title: parsed.terminologyTitle || "Terminology",
+      level: 2,
+    });
  }
  if (parsed.specification) {
-    items.push({ id: "specification", title: "Specification", level: 2 });
+    items.push({
+      id: "specification",
+      title: "Specification",
+      level: 2,
+    });

-    // Add spec subsections
    if (parsed.specSections) {
      for (const section of parsed.specSections) {
        items.push({ id: section.id, title: section.title, level: 3 });
@@ -231,70 +296,106 @@ function buildTocItems(parsed: Partial<ParsedSpec>): TocItem[] {
 }

 /**
- * Main parsing function - takes rendered HTML and returns structured content
+ * Main parsing function - takes markdown content and returns structured content
 */
-export function parseSpecContent(html: string, version: string): ParsedSpec {
+export async function parseSpecContent(
+  markdown: string,
+  version: string
+): Promise<ParsedSpec> {
  const svgPath = `/spec/${version}.svg`;

-  // Remove the title (h1) and SVG from the content for parsing
-  let content = html;
+  // Parse markdown to AST
+  const tree = unified().use(remarkParse).parse(markdown) as Root;

-  // Remove the h1 title
-  content = content.replace(/<h1[^>]*>[\s\S]*?<\/h1>/i, "");
+  // Remove title (h1) and SVG image from the tree
+  const nodes = tree.children.filter((node) => {
+    if (node.type === "heading" && (node as Heading).depth === 1) return false;
+    if (node.type === "paragraph") {
+      const text = extractText(node);
+      if (text.includes(".svg")) return false;
+    }
+    return true;
+  });

-  // Remove the SVG img tag
-  content = content.replace(/<img[^>]*\.svg[^>]*>/i, "");
-
-  // Extract each section
-  const introduction = extractSection(content, "Introduction", [
-    "Summary",
-    "Terminology",
-    "Git Common-Flow",
-    "FAQ",
-    "About",
-    "License",
-  ]);
-
-  const summary = extractSection(content, "Summary", [
-    "Terminology",
-    "Git Common-Flow",
-    "FAQ",
-    "About",
-    "License",
-  ]);
-
-  const terminology = extractSection(content, "Terminology", [
-    "Git Common-Flow",
-    "FAQ",
-    "About",
-    "License",
-  ]);
-
-  const specificationRaw = extractSection(
-    content,
-    "Git Common-Flow Specification",
-    ["FAQ", "About", "License"]
+  // Get heading titles
+  const terminologyTitle = getHeadingText(nodes, "Terminology");
+  const specificationTitle = getHeadingText(
+    nodes,
+    "Git Common-Flow Specification"
  );

-  // Add anchor IDs to spec section list items for ToC navigation
-  const specification = addSpecSectionAnchors(specificationRaw);
+  // Extract section nodes
+  const introNodes = extractSectionNodes(nodes, "Introduction");
+  const summaryNodes = extractSectionNodes(nodes, "Summary");
+  const terminologyNodes = extractSectionNodes(nodes, "Terminology");
+  const specNodes = extractSectionNodes(nodes, "Git Common-Flow Specification");
+  const faqNodes = extractSectionNodes(nodes, "FAQ");
+  const aboutNodes = extractSectionNodes(nodes, "About");
+  const licenseNodes = extractSectionNodes(nodes, "License");

-  const faqContent = extractSection(content, "FAQ", ["About", "License"]);
+  // Extract spec sections from the first ordered list
+  const specSections = findSpecSections(specNodes);

-  const about = extractSection(content, "About", ["License"]);
+  // Add anchor IDs to spec list items
+  for (const node of specNodes) {
+    if (node.type === "list" && (node as List).ordered) {
+      addAnchorsToList(node as List, specSections);
+      break;
+    }
+  }

-  const license = extractSection(content, "License", []);
+  // Extract FAQ items structure
+  const faqItems = extractFAQFromNodes(faqNodes);

-  // Parse subsections
-  const specSections = extractSpecSections(specificationRaw);
-  const faq = extractFAQItems(faqContent);
+  // Collect FAQ answer nodes for each item
+  const faqAnswerNodes: RootContent[][] = [];
+  let currentAnswerNodes: RootContent[] = [];
+
+  for (const node of faqNodes) {
+    if (node.type === "heading" && (node as Heading).depth === 3) {
+      if (currentAnswerNodes.length > 0) {
+        faqAnswerNodes.push(currentAnswerNodes);
+      }
+      currentAnswerNodes = [];
+    } else {
+      currentAnswerNodes.push(node);
+    }
+  }
+  // Don't forget the last answer
+  if (currentAnswerNodes.length > 0) {
+    faqAnswerNodes.push(currentAnswerNodes);
+  }
+
+  // Convert sections to HTML
+  const [introduction, summary, terminology, specification, about, license] =
+    await Promise.all([
+      nodesToHtml(introNodes),
+      nodesToHtml(summaryNodes),
+      nodesToHtml(terminologyNodes),
+      nodesToHtml(specNodes),
+      nodesToHtml(aboutNodes),
+      nodesToHtml(licenseNodes),
+    ]);
+
+  // Convert FAQ answers to HTML
+  const faqAnswers = await Promise.all(
+    faqAnswerNodes.map((nodes) => nodesToHtml(nodes))
+  );
+
+  // Assign FAQ answers
+  const faq = faqItems.map((item, i) => ({
+    ...item,
+    answer: faqAnswers[i] || "",
+  }));

  const parsed: ParsedSpec = {
    svgPath,
    introduction,
    summary,
    terminology,
+    terminologyTitle,
    specification,
+    specificationTitle,
    specSections,
    faq,
    about,
@@ -302,7 +403,6 @@ export function parseSpecContent(html: string, version: string): ParsedSpec {
    tocItems: [],
  };

-  // Build TOC
  parsed.tocItems = buildTocItems(parsed);

  return parsed;