From 208219ca2c8f9e2416dbc6544b095c115c245cda Mon Sep 17 00:00:00 2001 From: Jim Myhrberg Date: Sat, 10 Jan 2026 21:52:45 +0000 Subject: [PATCH] wip: improve spec parsing --- docs/index.html | 4 +- docs/spec/1.0.0-rc.1/index.html | 14 +- docs/spec/1.0.0-rc.2/index.html | 14 +- docs/spec/1.0.0-rc.3/index.html | 14 +- docs/spec/1.0.0-rc.4/index.html | 4 +- docs/spec/1.0.0-rc.5/index.html | 4 +- src/components/SpecSection.astro | 7 +- src/pages/index.astro | 28 +- src/pages/spec/[version].astro | 28 +- src/utils/parseSpecContent.ts | 462 +++++++++++++++++++------------ 10 files changed, 324 insertions(+), 255 deletions(-) diff --git a/docs/index.html b/docs/index.html index 585c109..9720eab 100644 --- a/docs/index.html +++ b/docs/index.html @@ -199,7 +199,7 @@ its merge target, allowing others to review, discuss and approve the changes.
  • Release Branches - Used both for short-term preparations of a release, and also for long-term maintenance of older version.
  • -

    Git Common-Flow Specification

    The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", +

    Specification

    The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be interpreted as described in RFC 2119.

      @@ -387,7 +387,7 @@ in question. Meaning it MUST always be in a non-broken state, MUST NOT be force pushed to, etc.
    -
  • Bug Fixes & Rollback +
  • Bug Fixes & Rollback
    1. You MUST NOT under any circumstances force push to the master branch or to long-term release branches.
    2. diff --git a/docs/spec/1.0.0-rc.1/index.html b/docs/spec/1.0.0-rc.1/index.html index 01766a2..4d9d739 100644 --- a/docs/spec/1.0.0-rc.1/index.html +++ b/docs/spec/1.0.0-rc.1/index.html @@ -161,7 +161,7 @@ The complete Git Common-Flow specification lg:pr-8 lg:mr-8 lg:border-r border-gray-200 dark:border-gray-800" data-astro-cid-lfaoh65k>

      Terminology

        +

      Terminology

      • Master Branch - Must always have passing tests, is considered bleeding edge, and must be named master.
      • Change Branches - Any branch that introduces changes like a new feature, a @@ -193,7 +193,7 @@ and a git tag named according to the new version string placed on said commit.Maintenance Release - Just like a regular release, except the version bump commit and release tag are on a maintenance branch instead of the master branch.
      • -

      Git Common-Flow Specification

      The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", +

      Specification

      The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be interpreted as described in RFC 2119.

        @@ -209,7 +209,7 @@ release/production" state to reduce the friction of creating a new release.
      -
    3. Changes +
    4. Changes
      1. Changes MUST be performed on a separate branch that SHOULD be referred to as a "change branch". All change branches MUST have descriptive names. It @@ -219,7 +219,7 @@ push your work to the same named branch on the remote server.
      2. SHOULD be referred to as the "source branch". Each change branch also needs a designated "merge target branch", typically this will be the same as the source branch. -
      3. Change branches MUST be regularly updated with any changes from their +
      4. Change branches MUST be regularly updated with any changes from their source branch. This MUST be done by rebasing the change branch on top of the source branch. To be clear you MUST NOT merge a source branch into a change branch.
      5. @@ -293,7 +293,7 @@ of the tag annotation would read "Release 2.11.4". The second line must be blank, and the changelog MUST start on the third line.
    5. -
    6. Bug Fixes & Rollback +
    7. Bug Fixes & Rollback
      1. You MUST NOT under any circumstances force push to the master branch.
      2. If a change branch which has been merged in to the master branch is found @@ -306,7 +306,7 @@ merge commit itself. Effectively creating a new commit that reverses all the relevant changes.
    8. -
    9. Maintenance Releases +
    10. Maintenance Releases
      1. Any branch that has a name starting with "stable-" SHOULD be referred to as a "maintenance branch".
      2. diff --git a/docs/spec/1.0.0-rc.2/index.html b/docs/spec/1.0.0-rc.2/index.html index dc9d1f0..e312ce6 100644 --- a/docs/spec/1.0.0-rc.2/index.html +++ b/docs/spec/1.0.0-rc.2/index.html @@ -161,7 +161,7 @@ The complete Git Common-Flow specification lg:pr-8 lg:mr-8 lg:border-r border-gray-200 dark:border-gray-800" data-astro-cid-lfaoh65k>
    11. Terminology

        +

      Terminology

      • Master Branch - Must always have passing tests, is considered bleeding edge, and must be named master.
      • Change Branches - Any branch that introduces changes like a new feature, a @@ -188,7 +188,7 @@ its merge target, allowing others to review, discuss and approve the changes.
      • Release Branches - Used both for short-term preparations of a release, and also for long-term maintenance of older version.
      • -

      Git Common-Flow Specification

      The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", +

      Specification

      The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be interpreted as described in RFC 2119.

        @@ -277,7 +277,7 @@ of the tag annotation would read "Release 2.11.4". The second line must be blank, and the changelog MUST start on the third line.
      -
    12. Release Branches +
    13. Release Branches
      1. Any branch that has a name starting with "release-" SHOULD be referred to as a "release branch".
      2. @@ -288,7 +288,7 @@ downwards from the master branch. If a change needs to trickle back up into the master branch, that work should have happened against the master branch in the first place. One exception to this is version bump commits.
      3. There are two types of release branches; short-term, and long-term.
      4. -
      5. Short-Term Release Branches +
      6. Short-Term Release Branches
        1. Used for creating a specific versioned release.
        2. A short-term release branch is RECOMMENDED if there is a lengthy @@ -308,7 +308,7 @@ release branch MUST be merged back into its source branch and then deleted. Typically the source branch will be the master branch.
      7. -
      8. Long-Term Release Branches +
      9. Long-Term Release Branches
        1. Used for work on versions which are not currently part of the master branch. Typically this is useful when you need to create a new @@ -329,7 +329,7 @@ release tag. The security fix release will then end up being version
      10. -
      11. Bug Fixes & Rollback +
      12. Bug Fixes & Rollback
        1. You MUST NOT under any circumstances force push to the master branch.
        2. If a change branch which has been merged into the master branch is found diff --git a/docs/spec/1.0.0-rc.3/index.html b/docs/spec/1.0.0-rc.3/index.html index 433e0a1..17a1f06 100644 --- a/docs/spec/1.0.0-rc.3/index.html +++ b/docs/spec/1.0.0-rc.3/index.html @@ -161,7 +161,7 @@ The complete Git Common-Flow specification lg:pr-8 lg:mr-8 lg:border-r border-gray-200 dark:border-gray-800" data-astro-cid-lfaoh65k>
    14. Terminology

        +

      Terminology

      • Master Branch - Must be named "master", must always have passing tests, and is not guaranteed to always work in production environments.
      • Change Branches - Any branch that introduces changes like a new feature, a @@ -189,7 +189,7 @@ environments. Consists of a version bump commit, and a git tag named according to the new version string placed on said commit.
      • Release Branches - Used both for short-term preparations of a release, and also for long-term maintenance of older version.
      • -

      Git Common-Flow Specification

      The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", +

      Specification

      The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be interpreted as described in RFC 2119.

        @@ -286,7 +286,7 @@ of the tag annotation would read "Release 2.11.4". The second line must be blank, and the changelog MUST start on the third line.
      -
    15. Release Branches +
    16. Release Branches
      1. Any branch that has a name starting with "release-" SHOULD be referred to as a "release branch".
      2. @@ -297,7 +297,7 @@ downwards from the master branch. If a change needs to trickle back up into the master branch, that work should have happened against the master branch in the first place. One exception to this is version bump commits.
      3. There are two types of release branches; short-term, and long-term.
      4. -
      5. Short-Term Release Branches +
      6. Short-Term Release Branches
        1. Used for creating a specific versioned release.
        2. A short-term release branch is RECOMMENDED if there is a lengthy @@ -317,7 +317,7 @@ release branch MUST be merged back into its source branch and then deleted. Typically the source branch will be the master branch.
      7. -
      8. Long-Term Release Branches +
      9. Long-Term Release Branches
        1. Used for work on versions which are not currently part of the master branch. Typically this is useful when you need to create a new @@ -338,7 +338,7 @@ release tag. The security fix release will then end up being version
      10. -
      11. Bug Fixes & Rollback +
      12. Bug Fixes & Rollback
        1. You MUST NOT under any circumstances force push to the master branch.
        2. If a change branch which has been merged into the master branch is found diff --git a/docs/spec/1.0.0-rc.4/index.html b/docs/spec/1.0.0-rc.4/index.html index 9c710a1..76a7181 100644 --- a/docs/spec/1.0.0-rc.4/index.html +++ b/docs/spec/1.0.0-rc.4/index.html @@ -188,7 +188,7 @@ its merge target, allowing others to review, discuss and approve the changes.
        3. Release Branches - Used both for short-term preparations of a release, and also for long-term maintenance of older version.
        4. -
    17. Git Common-Flow Specification

      The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", +

      Specification

      The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be interpreted as described in RFC 2119.

        @@ -375,7 +375,7 @@ in question. Meaning it MUST always be in a non-broken state, MUST NOT be force pushed to, etc.
      -
    18. Bug Fixes & Rollback +
    19. Bug Fixes & Rollback
      1. You MUST NOT under any circumstances force push to the master branch or to long-term release branches.
      2. diff --git a/docs/spec/1.0.0-rc.5/index.html b/docs/spec/1.0.0-rc.5/index.html index 585c109..9720eab 100644 --- a/docs/spec/1.0.0-rc.5/index.html +++ b/docs/spec/1.0.0-rc.5/index.html @@ -199,7 +199,7 @@ its merge target, allowing others to review, discuss and approve the changes.
      3. Release Branches - Used both for short-term preparations of a release, and also for long-term maintenance of older version.
      4. -
    20. Git Common-Flow Specification

      The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", +

      Specification

      The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be interpreted as described in RFC 2119.

        @@ -387,7 +387,7 @@ in question. Meaning it MUST always be in a non-broken state, MUST NOT be force pushed to, etc.
      -
    21. Bug Fixes & Rollback +
    22. Bug Fixes & Rollback
      1. You MUST NOT under any circumstances force push to the master branch or to long-term release branches.
      2. diff --git a/src/components/SpecSection.astro b/src/components/SpecSection.astro index 7d878c9..65c436a 100644 --- a/src/components/SpecSection.astro +++ b/src/components/SpecSection.astro @@ -4,11 +4,12 @@ import type { TocItem } from "../utils/parseSpecContent"; interface Props { terminology: string; + terminologyTitle: string; specification: string; tocItems: TocItem[]; } -const { terminology, specification, tocItems } = Astro.props; +const { terminology, terminologyTitle, specification, tocItems } = Astro.props; ---
        @@ -33,13 +34,13 @@ const { terminology, specification, tocItems } = Astro.props;
        -

        Terminology

        +

        {terminologyTitle}

        -

        Git Common-Flow Specification

        +

        Specification

        diff --git a/src/pages/index.astro b/src/pages/index.astro index 0868461..507eec3 100644 --- a/src/pages/index.astro +++ b/src/pages/index.astro @@ -2,10 +2,6 @@ import { getCollection } from "astro:content"; import * as fs from "node:fs"; import * as path from "node:path"; -import { unified } from "unified"; -import remarkParse from "remark-parse"; -import remarkRehype from "remark-rehype"; -import rehypeStringify from "rehype-stringify"; import SinglePage from "../layouts/SinglePage.astro"; import Header from "../components/Header.astro"; @@ -25,28 +21,15 @@ if (!spec) { throw new Error(`Spec version ${version} not found`); } -// Read and process the markdown file -const filePath = path.join( - process.cwd(), - "src/content/spec", - `${version}.md` -); +// Read the markdown file +const filePath = path.join(process.cwd(), "src/content/spec", `${version}.md`); const content = fs.readFileSync(filePath, "utf-8"); // Remove frontmatter -const body = content.replace(/^---[\s\S]*?---\n/, ""); +const markdown = content.replace(/^---[\s\S]*?---\n/, ""); -// Process markdown to HTML -const result = await unified() - .use(remarkParse) - .use(remarkRehype, { allowDangerousHtml: true }) - .use(rehypeStringify, { allowDangerousHtml: true }) - .process(body); - -const html = String(result); - -// Parse the content into sections -const parsed = parseSpecContent(html, version); +// Parse the content into sections (handles markdown -> HTML internally) +const parsed = await parseSpecContent(markdown, version); --- @@ -64,6 +47,7 @@ const parsed = parseSpecContent(html, version); diff --git a/src/pages/spec/[version].astro b/src/pages/spec/[version].astro index 44f8330..c094b5c 100644 --- a/src/pages/spec/[version].astro +++ b/src/pages/spec/[version].astro @@ -2,10 +2,6 @@ import { getCollection } from "astro:content"; import * as fs from "node:fs"; import * as path from "node:path"; -import { unified } from "unified"; -import remarkParse from "remark-parse"; -import remarkRehype from "remark-rehype"; -import rehypeStringify from "rehype-stringify"; import SinglePage from "../../layouts/SinglePage.astro"; import Header from "../../components/Header.astro"; @@ -27,28 +23,15 @@ export async function getStaticPaths() { const { spec } = Astro.props; const version = spec.data.version; -// Read and process the markdown file -const filePath = path.join( - process.cwd(), - "src/content/spec", - `${version}.md` -); +// Read the markdown file +const filePath = path.join(process.cwd(), "src/content/spec", `${version}.md`); const content = fs.readFileSync(filePath, "utf-8"); // Remove frontmatter -const body = content.replace(/^---[\s\S]*?---\n/, ""); +const markdown = content.replace(/^---[\s\S]*?---\n/, ""); -// Process markdown to HTML -const result = await unified() - .use(remarkParse) - .use(remarkRehype, { allowDangerousHtml: true }) - .use(rehypeStringify, { allowDangerousHtml: true }) - .process(body); - -const html = String(result); - -// Parse the content into sections -const parsed = parseSpecContent(html, version); +// Parse the content into sections (handles markdown -> HTML internally) +const parsed = await parseSpecContent(markdown, version); --- @@ -66,6 +49,7 @@ const parsed = parseSpecContent(html, version); diff --git a/src/utils/parseSpecContent.ts b/src/utils/parseSpecContent.ts index e4609c0..1d7f864 100644 --- a/src/utils/parseSpecContent.ts +++ b/src/utils/parseSpecContent.ts @@ -1,8 +1,14 @@ /** - * Parses rendered spec HTML into structured sections for the single-page - * layout. + * Parses spec content using markdown AST for robust section extraction. */ +import { unified } from "unified"; +import remarkParse from "remark-parse"; +import remarkRehype from "remark-rehype"; +import rehypeStringify from "rehype-stringify"; +import type { Root, RootContent, Heading, List, ListItem } from "mdast"; +import type { Root as HastRoot } from "hast"; + export interface TocItem { id: string; title: string; @@ -26,7 +32,9 @@ export interface ParsedSpec { introduction: string; summary: string; terminology: string; + terminologyTitle: string; specification: string; + specificationTitle: string; specSections: SpecSection[]; faq: FAQItem[]; about: string; @@ -35,7 +43,7 @@ export interface ParsedSpec { } /** - * Convert a heading text to a URL-friendly ID + * Convert text to a URL-friendly ID */ function slugify(text: string): string { return text @@ -45,160 +53,212 @@ function slugify(text: string): string { .trim(); } +type MdastNode = Root | RootContent; + /** - * Extract content between two headings or to the end of the document + * Extract plain text from an mdast node tree */ -function extractSection( - html: string, - startHeading: string, - endHeadings: string[] = [] -): string { - // Find the heading (h2) - use partial match to handle additional text - // e.g., "Git Common-Flow Specification (Common-Flow)" - const headingPattern = new RegExp( - `]*>[^<]*${escapeRegex(startHeading)}[^<]*`, - "i" +function extractText(node: MdastNode): string { + if ("value" in node && typeof node.value === "string") { + return node.value; + } + if ("children" in node && Array.isArray(node.children)) { + return node.children.map((child) => extractText(child)).join(""); + } + return ""; +} + +/** + * Find index of heading containing specific text + */ +function findHeadingIndex( + nodes: RootContent[], + text: string, + depth: number = 2 +): number { + return nodes.findIndex( + (node) => + node.type === "heading" && + (node as Heading).depth === depth && + extractText(node).toLowerCase().includes(text.toLowerCase()) ); - const match = html.match(headingPattern); - if (!match || match.index === undefined) return ""; - - const startIdx = match.index + match[0].length; - - // Find the next section heading - let endIdx = html.length; - for (const endHeading of endHeadings) { - const endPattern = new RegExp( - `]*>\\s*${escapeRegex(endHeading)}\\s*`, - "i" - ); - const endMatch = html.slice(startIdx).match(endPattern); - if (endMatch && endMatch.index !== undefined) { - const possibleEnd = startIdx + endMatch.index; - if (possibleEnd < endIdx) { - endIdx = possibleEnd; - } - } - } - - // Also check for any h2 as a fallback - const anyH2 = html.slice(startIdx).match(/]*>/i); - if (anyH2 && anyH2.index !== undefined) { - const possibleEnd = startIdx + anyH2.index; - if (possibleEnd < endIdx) { - endIdx = possibleEnd; - } - } - - return html.slice(startIdx, endIdx).trim(); } -function escapeRegex(str: string): string { - return str.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); -} - -// Spec section titles in order (used for both ToC and anchor injection) -const SPEC_SECTION_TITLES = [ - "TL;DR", - "The Master Branch", - "Change Branches", - "Pull Requests", - "Versioning", - "Releases", - "Short-Term Release Branches", - "Long-term Release Branches", - "Bug Fixes & Rollback", - "Git Best Practices", -]; - /** - * Extract the numbered spec sections (1. TL;DR, 2. The Master Branch, etc.) + * Extract nodes between two headings */ -function extractSpecSections(specContent: string): SpecSection[] { +function extractSectionNodes( + nodes: RootContent[], + startText: string, + depth: number = 2 +): RootContent[] { + const startIdx = findHeadingIndex(nodes, startText, depth); + if (startIdx === -1) return []; + + // Find the next heading of same or higher level + let endIdx = nodes.length; + for (let i = startIdx + 1; i < nodes.length; i++) { + const node = nodes[i]; + if (node.type === "heading" && (node as Heading).depth <= depth) { + endIdx = i; + break; + } + } + + // Return nodes after the heading (not including the heading itself) + return nodes.slice(startIdx + 1, endIdx); +} + +/** + * Get the full heading text + */ +function getHeadingText( + nodes: RootContent[], + text: string, + depth: number = 2 +): string { + const idx = findHeadingIndex(nodes, text, depth); + if (idx === -1) return text; + return extractText(nodes[idx]); +} + +/** + * Convert mdast nodes to HTML string + */ +async function nodesToHtml(nodes: RootContent[]): Promise { + if (nodes.length === 0) return ""; + + // Create a root node with these children + const root: Root = { type: "root", children: nodes }; + + const result = await unified() + .use(remarkRehype, { allowDangerousHtml: true }) + .use(rehypeStringify, { allowDangerousHtml: true }) + .run(root); + + const html = await unified() + .use(rehypeStringify, { allowDangerousHtml: true }) + .stringify(result as HastRoot); + + return html; +} + +/** + * Extract top-level list item titles from an ordered list + */ +function extractListItemTitles(list: List): string[] { + const titles: string[] = []; + + for (const item of list.children) { + if (item.type !== "listItem") continue; + + // Get the first paragraph or text content of the list item + // The title is the text before any nested list + let title = ""; + for (const child of item.children) { + if (child.type === "list") break; // Stop at nested list + if (child.type === "paragraph") { + title = extractText(child); + break; + } + // Handle inline text directly in list item + title += extractText(child); + } + + title = title.split("\n")[0].trim(); + if (title) { + titles.push(title); + } + } + + return titles; +} + +/** + * Find the first ordered list in nodes and extract its structure + */ +function findSpecSections(nodes: RootContent[]): SpecSection[] { const sections: SpecSection[] = []; - // The spec uses an ordered list with nested items - // Each top-level li starts a new section - const olMatch = specContent.match(/]*>([\s\S]*?)<\/ol>/i); - if (!olMatch) return sections; - - // Find each section by looking for the title pattern - for (const title of SPEC_SECTION_TITLES) { - const id = slugify(title); - - // For the content, we'll just use the title for navigation - // The actual content stays in the main specification block - sections.push({ - id: `spec-${id}`, - title, - content: "", // Content handled inline - }); + for (const node of nodes) { + if (node.type === "list" && (node as List).ordered) { + const titles = extractListItemTitles(node as List); + for (const title of titles) { + sections.push({ + id: `spec-${slugify(title)}`, + title, + content: "", + }); + } + break; // Only process first ordered list + } } return sections; } /** - * Add anchor IDs to spec section list items. - * Finds top-level
      3. elements that start with section titles and adds IDs. + * Add anchor IDs to list items in the spec ordered list */ -function addSpecSectionAnchors(specContent: string): string { - let result = specContent; +function addAnchorsToList(list: List, sections: SpecSection[]): void { + const titleMap = new Map(sections.map((s) => [s.title, s.id])); - for (const title of SPEC_SECTION_TITLES) { - const id = `spec-${slugify(title)}`; - // Match
      4. followed by the section title (possibly with whitespace) - // The title appears right after
      5. in the rendered HTML - const pattern = new RegExp( - `(
      6. )(\\s*${escapeRegex(title)})`, - "i" - ); - result = result.replace(pattern, `
      7. $2`); + for (const item of list.children) { + if (item.type !== "listItem") continue; + + // Get the title of this item + let title = ""; + for (const child of item.children) { + if (child.type === "list") break; + if (child.type === "paragraph") { + title = extractText(child).split("\n")[0].trim(); + break; + } + title += extractText(child); + } + title = title.split("\n")[0].trim(); + + // Add ID as data attribute (will be processed by rehype) + const id = titleMap.get(title); + if (id) { + // Add hProperties for rehype to convert to HTML id attribute + (item as ListItem & { data?: { hProperties?: { id?: string } } }).data = { + hProperties: { id }, + }; + } } - - return result; } /** - * Extract FAQ items from the FAQ section HTML + * Extract FAQ items from FAQ section nodes */ -function extractFAQItems(faqContent: string): FAQItem[] { +function extractFAQFromNodes(nodes: RootContent[]): FAQItem[] { const items: FAQItem[] = []; + let currentQuestion = ""; + let currentId = ""; - // Split by h3 headings - const h3Pattern = /]*>([\s\S]*?)<\/h3>/gi; - let lastIndex = 0; - let lastQuestion = ""; - let lastId = ""; + for (const node of nodes) { + if (node.type === "heading" && (node as Heading).depth === 3) { + // Save previous FAQ item if we had one + if (currentQuestion) { + items.push({ + id: currentId, + question: currentQuestion, + answer: "", // Placeholder, will be filled later + }); + } - const matches = [...faqContent.matchAll(h3Pattern)]; - - for (let i = 0; i < matches.length; i++) { - const match = matches[i]; - const question = match[1].replace(/<[^>]+>/g, "").trim(); - const id = slugify(question).slice(0, 50); - - if (i > 0 && match.index !== undefined) { - // Get content between previous h3 and this one - const answer = faqContent.slice(lastIndex, match.index).trim(); - items.push({ - id: `faq-${lastId}`, - question: lastQuestion, - answer, - }); + currentQuestion = extractText(node); + currentId = `faq-${slugify(currentQuestion).slice(0, 50)}`; } - - lastQuestion = question; - lastId = id; - lastIndex = match.index! + match[0].length; } - // Don't forget the last FAQ item - if (lastQuestion) { - const answer = faqContent.slice(lastIndex).trim(); + // Don't forget the last item + if (currentQuestion) { items.push({ - id: `faq-${lastId}`, - question: lastQuestion, - answer, + id: currentId, + question: currentQuestion, + answer: "", }); } @@ -206,20 +266,25 @@ function extractFAQItems(faqContent: string): FAQItem[] { } /** - * Build table of contents from parsed sections. - * Only includes sections rendered in SpecSection (Terminology + Specification). - * Introduction/Summary are in AboutSection and excluded from this ToC. + * Build table of contents from parsed sections */ function buildTocItems(parsed: Partial): TocItem[] { const items: TocItem[] = []; if (parsed.terminology) { - items.push({ id: "terminology", title: "Terminology", level: 2 }); + items.push({ + id: "terminology", + title: parsed.terminologyTitle || "Terminology", + level: 2, + }); } if (parsed.specification) { - items.push({ id: "specification", title: "Specification", level: 2 }); + items.push({ + id: "specification", + title: "Specification", + level: 2, + }); - // Add spec subsections if (parsed.specSections) { for (const section of parsed.specSections) { items.push({ id: section.id, title: section.title, level: 3 }); @@ -231,70 +296,106 @@ function buildTocItems(parsed: Partial): TocItem[] { } /** - * Main parsing function - takes rendered HTML and returns structured content + * Main parsing function - takes markdown content and returns structured content */ -export function parseSpecContent(html: string, version: string): ParsedSpec { +export async function parseSpecContent( + markdown: string, + version: string +): Promise { const svgPath = `/spec/${version}.svg`; - // Remove the title (h1) and SVG from the content for parsing - let content = html; + // Parse markdown to AST + const tree = unified().use(remarkParse).parse(markdown) as Root; - // Remove the h1 title - content = content.replace(/]*>[\s\S]*?<\/h1>/i, ""); + // Remove title (h1) and SVG image from the tree + const nodes = tree.children.filter((node) => { + if (node.type === "heading" && (node as Heading).depth === 1) return false; + if (node.type === "paragraph") { + const text = extractText(node); + if (text.includes(".svg")) return false; + } + return true; + }); - // Remove the SVG img tag - content = content.replace(/]*\.svg[^>]*>/i, ""); - - // Extract each section - const introduction = extractSection(content, "Introduction", [ - "Summary", - "Terminology", - "Git Common-Flow", - "FAQ", - "About", - "License", - ]); - - const summary = extractSection(content, "Summary", [ - "Terminology", - "Git Common-Flow", - "FAQ", - "About", - "License", - ]); - - const terminology = extractSection(content, "Terminology", [ - "Git Common-Flow", - "FAQ", - "About", - "License", - ]); - - const specificationRaw = extractSection( - content, - "Git Common-Flow Specification", - ["FAQ", "About", "License"] + // Get heading titles + const terminologyTitle = getHeadingText(nodes, "Terminology"); + const specificationTitle = getHeadingText( + nodes, + "Git Common-Flow Specification" ); - // Add anchor IDs to spec section list items for ToC navigation - const specification = addSpecSectionAnchors(specificationRaw); + // Extract section nodes + const introNodes = extractSectionNodes(nodes, "Introduction"); + const summaryNodes = extractSectionNodes(nodes, "Summary"); + const terminologyNodes = extractSectionNodes(nodes, "Terminology"); + const specNodes = extractSectionNodes(nodes, "Git Common-Flow Specification"); + const faqNodes = extractSectionNodes(nodes, "FAQ"); + const aboutNodes = extractSectionNodes(nodes, "About"); + const licenseNodes = extractSectionNodes(nodes, "License"); - const faqContent = extractSection(content, "FAQ", ["About", "License"]); + // Extract spec sections from the first ordered list + const specSections = findSpecSections(specNodes); - const about = extractSection(content, "About", ["License"]); + // Add anchor IDs to spec list items + for (const node of specNodes) { + if (node.type === "list" && (node as List).ordered) { + addAnchorsToList(node as List, specSections); + break; + } + } - const license = extractSection(content, "License", []); + // Extract FAQ items structure + const faqItems = extractFAQFromNodes(faqNodes); - // Parse subsections - const specSections = extractSpecSections(specificationRaw); - const faq = extractFAQItems(faqContent); + // Collect FAQ answer nodes for each item + const faqAnswerNodes: RootContent[][] = []; + let currentAnswerNodes: RootContent[] = []; + + for (const node of faqNodes) { + if (node.type === "heading" && (node as Heading).depth === 3) { + if (currentAnswerNodes.length > 0) { + faqAnswerNodes.push(currentAnswerNodes); + } + currentAnswerNodes = []; + } else { + currentAnswerNodes.push(node); + } + } + // Don't forget the last answer + if (currentAnswerNodes.length > 0) { + faqAnswerNodes.push(currentAnswerNodes); + } + + // Convert sections to HTML + const [introduction, summary, terminology, specification, about, license] = + await Promise.all([ + nodesToHtml(introNodes), + nodesToHtml(summaryNodes), + nodesToHtml(terminologyNodes), + nodesToHtml(specNodes), + nodesToHtml(aboutNodes), + nodesToHtml(licenseNodes), + ]); + + // Convert FAQ answers to HTML + const faqAnswers = await Promise.all( + faqAnswerNodes.map((nodes) => nodesToHtml(nodes)) + ); + + // Assign FAQ answers + const faq = faqItems.map((item, i) => ({ + ...item, + answer: faqAnswers[i] || "", + })); const parsed: ParsedSpec = { svgPath, introduction, summary, terminology, + terminologyTitle, specification, + specificationTitle, specSections, faq, about, @@ -302,7 +403,6 @@ export function parseSpecContent(html: string, version: string): ParsedSpec { tocItems: [], }; - // Build TOC parsed.tocItems = buildTocItems(parsed); return parsed;