From bb88b2cd49bdaf2723804473b02e79c63cf45f8a Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Fri, 29 Jul 2022 22:35:03 +0000
Subject: [PATCH 01/18] Bump mistune from 0.8.1 to 2.0.3

Bumps [mistune](https://github.com/lepture/mistune) from 0.8.1 to 2.0.3.
- [Release notes](https://github.com/lepture/mistune/releases)
- [Changelog](https://github.com/lepture/mistune/blob/master/docs/changes.rst)
- [Commits](https://github.com/lepture/mistune/compare/v0.8.1...v2.0.3)

---
updated-dependencies:
- dependency-name: mistune
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 requirements.txt | 2 +-
 setup.py         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 1f202e5..c9e10bf 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1 @@
-mistune==0.8.1
+mistune==2.0.3
diff --git a/setup.py b/setup.py
index 2c28d23..088053b 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ def read(f):
 
 
 install_requires = [
-    'mistune <= 1',
+    'mistune < 3',
 ]
 
 tests_require = [

From 6ef989e7c48e381bbcf63c70c5c1c036ea230877 Mon Sep 17 00:00:00 2001
From: Philipp Berner <374326+philippb@users.noreply.github.com>
Date: Thu, 15 Jan 2026 10:15:23 -0800
Subject: [PATCH 02/18] =?UTF-8?q?=F0=9F=93=9D=20docs:=20expand=20README=20?=
 =?UTF-8?q?and=20add=20repository=20guidelines?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 AGENTS.md | 28 ++++++++++++++++++++++++++++
 README.md | 40 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 67 insertions(+), 1 deletion(-)
 create mode 100644 AGENTS.md

diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 0000000..bb38229
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,28 @@
+# Repository Guidelines
+
+## Project Structure & Module Organization
+The core library lives in `sdiff/` (parser, comparer, renderer, and models). Tests are in `tests/`, with shared fixtures in `tests/fixtures/`. Reference PDFs sit in `docs/`. Packaging and tooling are defined in `setup.py`, `setup.cfg`, and the `Makefile`; `CHANGELOG` tracks releases.
+
+## Build, Test, and Development Commands
+- `make env` creates the local `venv/` (Python 3.11+).
+- `make dev` installs the package plus test/dev extras (`.[tests,devtools]`) into the venv.
+- `make test` runs linting and the full pytest suite with coverage.
+- `make vtest` runs pytest verbosely.
+- `make flake` runs flake8 on `sdiff/` and `tests/`.
+- `make cov` prints the coverage report.
+- `make clean` removes build artifacts and the venv.
+
+Example flow:
+```sh
+make dev
+make test
+```
+
+## Coding Style & Naming Conventions
+Use standard Python conventions: 4-space indentation, `snake_case` for modules/functions/variables, and `PascalCase` for classes. Flake8 enforces a 120-character line limit (see `setup.cfg`). `autopep8` is available for formatting. Keep new modules in `sdiff/` and new tests in `tests/` with filenames like `test_<module>.py`.
+
+## Testing Guidelines
+The suite uses `pytest` with `coverage`. Coverage is expected to stay high (current config fails under 96%). Add or update tests for behavior changes, and prefer small, focused unit tests. Place reusable data in `tests/fixtures/`. Run `make test` before submitting changes.
+
+## Commit & Pull Request Guidelines
+Commit messages in this repo are short and often use a type prefix (e.g., `chore: ...`, `fixes: ...`, `hotfix: ...`, `refactors: ...`). Follow that pattern where practical, and keep the summary concise. For PRs, include a brief description, list tests run (e.g., `make test`), and link related issues or tickets when available.
diff --git a/README.md b/README.md
index b8bb2a8..7ab5d32 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,40 @@
 # md-sdiff
-Diffs to markdown texts only based on their structure. Ignores content. Helpful to diff 2 files that contain the same content in different languages.
+
+Structural diffs for Markdown. The library parses two Markdown inputs into a lightweight tree and compares the *shape* (headings, lists, paragraphs, links, etc.) instead of the text content. This is useful when you expect the same document structure across translations or when you want to validate formatting consistency without caring about the wording.
+
+## What it does
+- Parses Markdown into an AST-like node tree using `mistune`.
+- Compares trees node-by-node and flags insertions/deletions in structure.
+- Returns a rendered view of each document plus a list of structural errors.
+- Supports a Zendesk-specific parser (`ZendeskHelpMdParser`) for `<callout>`, `<steps>`, and `<tabs>` blocks.
+
+## Example usage
+```python
+from sdiff import diff, TextRenderer, MdParser
+
+left = "# Title\n\n- One\n- Two"
+right = "# Title\n\n- One\n- Two\n- Three"
+
+rendered_left, rendered_right, errors = diff(left, right, renderer=TextRenderer(), parser_cls=MdParser)
+print(errors[0])  # "There is a missing element `li`."
+```
+
+## Renderers
+`TextRenderer` returns the original Markdown structure as text. `HtmlRenderer` wraps the output and marks structural insertions/deletions with `<ins>` and `<del>`; a hedged sketch follows the Notes section below.
+
+## One-off usage
+```sh
+python - <<'PY'
+from sdiff import diff, TextRenderer
+
+left = open("left.md", "r", encoding="utf-8").read()
+right = open("right.md", "r", encoding="utf-8").read()
+_, _, errors = diff(left, right, renderer=TextRenderer())
+
+for err in errors:
+    print(err)
+PY
+```
+
+## Notes
+This project is a library (no CLI). If you need different token handling, you can provide a custom parser class that extends `MdParser`; see the sketch at the end of this README.
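+
+## HTML rendering sketch
+The following is a minimal, unverified sketch of driving the HTML renderer mentioned above. It assumes `HtmlRenderer` is exported from the package root like `TextRenderer` (otherwise import it from `sdiff.renderer`) and that it is constructed without arguments; check `sdiff/renderer.py` if your version differs.
+```python
+from sdiff import diff, HtmlRenderer
+
+left = "# Title\n\n- One\n- Two"
+right = "# Title\n\n- One"
+
+# Same call as with TextRenderer; only the rendered strings change,
+# the structural error list stays the same.
+html_left, html_right, errors = diff(left, right, renderer=HtmlRenderer())
+print(html_left)
+```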
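+
+## Custom parser sketch
+A hedged illustration of the `parser_cls` extension point mentioned in the Notes. `NormalizingMdParser` and its whitespace cleanup are hypothetical; the only parts taken from this library are that `diff` accepts a `parser_cls` argument and that parsers expose `parse(text)` returning a node tree.
+```python
+from sdiff import diff, MdParser
+
+
+class NormalizingMdParser(MdParser):
+    # Hypothetical preprocessing: strip trailing whitespace from each
+    # line before delegating to the stock MdParser pipeline.
+    def parse(self, text):
+        cleaned = "\n".join(line.rstrip() for line in text.splitlines())
+        return super().parse(cleaned)
+
+
+_, _, errors = diff("# A\n\ntext  ", "# A\n\ntext", parser_cls=NormalizingMdParser)
+print(errors)  # expected: no structural errors
+```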
From bf349a49639560336b929f8d063b2e2137380e5d Mon Sep 17 00:00:00 2001 From: Philipp Berner <374326+philippb@users.noreply.github.com> Date: Fri, 16 Jan 2026 21:37:39 -0800 Subject: [PATCH 03/18] =?UTF-8?q?=F0=9F=A7=B0=20chore(ci):=20switch=20to?= =?UTF-8?q?=20GitHub=20Actions=20and=20add=20lint=20hooks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .flake8 | 12 ++++++++++++ .github/workflows/ci.yml | 34 ++++++++++++++++++++++++++++++++++ .husky/pre-commit | 4 ++++ .travis.yml | 11 ----------- package.json | 10 ++++++++++ 5 files changed, 60 insertions(+), 11 deletions(-) create mode 100644 .flake8 create mode 100644 .github/workflows/ci.yml create mode 100755 .husky/pre-commit delete mode 100644 .travis.yml create mode 100644 package.json diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..aac8e09 --- /dev/null +++ b/.flake8 @@ -0,0 +1,12 @@ +[flake8] +max-line-length = 120 +max-complexity = 12 +select = E,F,W,C90 +extend-ignore = F403,F405 +exclude = + .git, + __pycache__, + venv, + build, + dist, + sdiff.egg-info diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..484ab83 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,34 @@ +name: CI + +on: + workflow_dispatch: + pull_request: + types: [opened, synchronize, reopened, ready_for_review] + push: + branches: [master] + +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + cache: "pip" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install .[tests] + + - name: Lint + run: python -m flake8 --config .flake8 sdiff tests + + - name: Test + run: python -m coverage run -m pytest -s --durations=3 --durations-min=0.005 + + - name: Coverage report + run: python -m coverage report -m diff --git a/.husky/pre-commit b/.husky/pre-commit new file mode 100755 index 0000000..b011f88 --- /dev/null +++ b/.husky/pre-commit @@ -0,0 +1,4 @@ +#!/usr/bin/env sh +. 
"$(dirname -- "$0")/_/husky.sh" + +python -m flake8 --config .flake8 sdiff tests diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index df31221..0000000 --- a/.travis.yml +++ /dev/null @@ -1,11 +0,0 @@ -language: python -dist: jammy -python: - - "3.11" -# command to install dependencies -install: - - make dev -# command to run tests -script: - - make test - - make coverage diff --git a/package.json b/package.json new file mode 100644 index 0000000..d682872 --- /dev/null +++ b/package.json @@ -0,0 +1,10 @@ +{ + "name": "html-structure-diff", + "private": true, + "devDependencies": { + "husky": "^9.0.0" + }, + "scripts": { + "prepare": "husky install" + } +} From 32f9a779da5ee46305ea10c89dff9334680e8603 Mon Sep 17 00:00:00 2001 From: Philipp Berner <374326+philippb@users.noreply.github.com> Date: Fri, 16 Jan 2026 21:38:18 -0800 Subject: [PATCH 04/18] =?UTF-8?q?=F0=9F=9B=A0=EF=B8=8F=20fix(parser):=20up?= =?UTF-8?q?date=20mistune=203=20parsing=20and=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements.txt | 2 +- sdiff/__init__.py | 5 +- sdiff/compare.py | 4 +- sdiff/parser.py | 532 +++++++++++++++++++++++++-------------- setup.py | 2 +- tests/test_compare.py | 12 +- tests/test_parser.py | 49 ++++ tests/test_sdiff.py | 42 +++- tests/test_tree_utils.py | 59 ++++- 9 files changed, 502 insertions(+), 205 deletions(-) diff --git a/requirements.txt b/requirements.txt index c9e10bf..a234623 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1 @@ -mistune==2.0.3 +mistune==3.2.0 diff --git a/sdiff/__init__.py b/sdiff/__init__.py index 853d12c..17319a9 100644 --- a/sdiff/__init__.py +++ b/sdiff/__init__.py @@ -8,9 +8,8 @@ def diff(md1, md2, renderer=TextRenderer(), parser_cls: type[MdParser] = MdParse tree2 = parse(md2, parser_cls) tree1, tree2, struct_errors = diff_struct(tree1, tree2) - # tree1, tree2, links_errors = diff_links(tree1, tree2) + tree1, tree2, links_errors = diff_links(tree1, tree2) - # errors = struct_errors + links_errors - errors = struct_errors + errors = struct_errors + links_errors return renderer.render(tree1), renderer.render(tree2), errors diff --git a/sdiff/compare.py b/sdiff/compare.py index 5958ada..5d4d19f 100644 --- a/sdiff/compare.py +++ b/sdiff/compare.py @@ -44,7 +44,9 @@ def _diff(tree1, tree2, include_symbols=None, exclude_symbols=None): def diff_links(tree1, tree2): - return _diff(tree1, tree2, include_symbols=['p', 'h', 'l', 'a']) + tree1, tree2, errors = _diff(tree1, tree2, exclude_symbols=['t', 'i']) + link_errors = [error for error in errors if error.node.symbol == 'a'] + return tree1, tree2, link_errors def diff_struct(tree1, tree2): diff --git a/sdiff/parser.py b/sdiff/parser.py index 93a4736..ad59b93 100644 --- a/sdiff/parser.py +++ b/sdiff/parser.py @@ -1,207 +1,351 @@ -from re import Match - -import mistune import re +import textwrap +from typing import Iterable -from .model import * - - -class InlineLexer(mistune.BlockLexer): - grammar_class = mistune.InlineGrammar - - default_rules = [ - 'linebreak', 'link', - 'reflink', 'text', - ] - - def __init__(self): - self.links = {} - self.grammar_class.text = re.compile(r'^ {1,}\n|^[\s\S]+?(?=[\[`~]| {2,}\n|$)') - super().__init__() - - def parse_autolink(self, m): - self.tokens.append(Link(m.group(0))) - - def parse_url(self, m): - self.tokens.append(Link(m.group(0))) - - def parse_link(self, m): - return self._process_link(m) - - def parse_reflink(self, m): - # TODO skip this check for now - # key = mistune._keyify(m.group(2) 
or m.group(1)) - # if key not in self.links: - # return None - # ret = self.links[key] - return self._process_link(m) - - def _process_link(self, m): - line = m.group(0) - if line[0] == '!': - node = Image(line) - else: - node = Link(line) - - self.tokens.append(node) - - def parse_linebreak(self, m): - node = NewLine() - self.tokens.append(node) +import mistune +from mistune import block_parser - def parse_text(self, m): - text = m.group(0) - if text.strip(): - escaped_text = mistune.escape(text) - node = Text(escaped_text) - self.tokens.append(node) +from .model import (Html, Image, Link, List, ListItem, NewLine, Paragraph, Root, + Text, Header, ZendeskHelpCallout, ZendeskHelpSteps, + ZendeskHelpTabs) +_BLOCK_TAGS = {tag.lower() for tag in block_parser.BLOCK_TAGS} +_HEADING_LINE_RE = re.compile(r'^(\s*)(#{1,6})(?!#)(?=\S)') +_REF_LINK_OR_IMAGE_RE = re.compile(r'!?\[[^\]]+\]\[[^\]]+\]') +_REF_DEF_LINE_RE = re.compile(r'^\s{0,3}\[[^\]]+\]:\s+\S+') +_FENCE_RE = re.compile(r'^\s*(```|~~~)') -class MdParser(mistune.BlockLexer): - default_rules = [ - 'newline', 'list_block', 'block_html', - 'heading', 'lheading', - 'paragraph', 'text', - ] - - list_rules = ( - 'newline', 'heading', 'lheading', - 'hrule', 'list_block', 'text', - ) +class MdParser: @classmethod def get_lexer(cls): return cls() def __init__(self): - super().__init__() - self.grammar_class.block_html = re.compile( - r'^\s* *(?:{}|{}|{}) *(?:\n{{1,}}|\s*$)'.format( - r'', - r'<({})((?:{})*?)>([\s\S]+?)<\/\1>'.format(mistune._block_tag, mistune._valid_attr), - r'<{}(?:{})*?>'.format(mistune._block_tag, mistune._valid_attr), - ) - ) - - def _parse_inline(self, text): - inline = InlineLexer() - return inline.parse(text) - - def parse_newline(self, m): - length = len(m.group(0)) - if length > 1: - self.tokens.append(NewLine()) - - def parse_heading(self, m): - level = len(m.group(1)) - node = Header(level) - node.add_nodes(self._parse_inline(m.group(2))) - self.tokens.append(node) - - def parse_lheading(self, m): - level = 1 if m.group(2) == '=' else 2 - text = m.group(1) - node = Header(level) - node.add_nodes(self._parse_inline(text)) - self.tokens.append(node) - - def parse_block_html(self, m): - text = m.group(0) - html = Html(text) - self.tokens.append(html) - - def parse_paragraph(self, m): - text = m.group(1).rstrip('\n') - node = Paragraph() - node.add_nodes(self._parse_inline(text)) - self.tokens.append(node) - - def parse_text(self, m): - text = m.group(0) - escaped_text = mistune.escape(text) - node = Text(escaped_text) - self.tokens.append(node) - - def parse_list_block(self, m): - bull = m.group(2) - cap = m.group(0) - ordered = '.' 
in bull - node = List(ordered) - node.add_nodes(self._process_list_item(cap, bull)) - self.tokens.append(node) - - def _process_list_item(self, cap, bull): - result = [] - cap = self.rules.list_item.findall(cap) - - _next = False - length = len(cap) - - for i in range(length): - item = cap[i][0] - - # remove the bullet - space = len(item) - item = self.rules.list_bullet.sub('', item) - - # outdent - if '\n ' in item: - space = space - len(item) - pattern = re.compile(r'^ {1,%d}' % space, flags=re.M) - item = pattern.sub('', item) - - # determine whether item is loose or not - loose = _next - if not loose and re.search(r'\n\n(?!\s*$)', item): - loose = True - - rest = len(item) - if i != length - 1 and rest: - _next = item[rest - 1] == '\n' - if not loose: - loose = _next - - node = ListItem() - block_lexer = self.get_lexer() - nodes = block_lexer.parse(item, self.list_rules) - node.add_nodes(nodes) - result.append(node) - return result + self._markdown = mistune.create_markdown(renderer='ast') + self._reference_definitions = {} + + def parse(self, text): + tokens = self._markdown(text) + return Root(self._convert_block_tokens(tokens)) + + def _set_reference_definitions(self, definitions): + self._reference_definitions = definitions + + def _convert_block_tokens(self, tokens: Iterable[dict]): + nodes = [] + for token in tokens: + nodes.extend(self._convert_block_token(token)) + return nodes + + def _convert_block_token(self, token): + token_type = token.get('type') + if token_type == 'paragraph': + return [self._convert_paragraph_or_heading(token.get('children', []))] + if token_type == 'heading': + return [self._convert_heading(token)] + if token_type == 'list': + return [self._convert_list(token)] + if token_type == 'list_item': + return [self._convert_list_item(token)] + if token_type == 'block_text': + return [self._convert_paragraph_or_heading(token.get('children', []))] + if token_type == 'block_html': + return self._convert_block_html(token) + if token_type in {'thematic_break', 'block_quote', 'block_code', 'fenced_code'}: + return self._convert_passthrough_block(token) + return self._convert_passthrough_block(token) + + def _convert_heading(self, token): + level = token.get('level') or token.get('attrs', {}).get('level', 1) + header = Header(level) + header.add_nodes(self._convert_inline_tokens(token.get('children', []))) + return header + + def _convert_list(self, token): + ordered = token.get('ordered') + if ordered is None: + ordered = token.get('attrs', {}).get('ordered', False) + list_node = List(bool(ordered)) + for item in token.get('children', []): + list_node.add_node(self._convert_list_item(item)) + return list_node + + def _convert_block_html(self, token): + raw = token.get('raw', '') + if _is_block_html(raw): + return [Html(raw)] + text = mistune.escape(raw) + if text.strip(): + return [Paragraph([Text(text)])] + return [] + + def _convert_passthrough_block(self, token): + child_nodes = self._convert_block_tokens(token.get('children', [])) + if child_nodes: + return child_nodes + raw = token.get('raw') or token.get('text') or '' + if raw.strip(): + return [Paragraph([Text(mistune.escape(raw))])] + return [] + + def _convert_list_item(self, token): + item = ListItem() + for child in token.get('children', []): + child_type = child.get('type') + if child_type in {'block_text', 'paragraph'}: + item.add_nodes(self._convert_list_block_nodes(child.get('children', []))) + else: + item.add_nodes(self._convert_block_tokens([child])) + return item + + def 
_convert_inline_tokens(self, tokens: Iterable[dict]): + nodes = [] + buffer = '' + + def flush_buffer(): + nonlocal buffer + if buffer: + self._split_reference_links(buffer, nodes) + buffer = '' + + for token in tokens: + token_type = token.get('type') + if token_type in {'text', 'inline_html', 'block_html'}: + buffer += token.get('raw', '') + elif token_type == 'codespan': + buffer += f"`{token.get('raw') or token.get('text') or ''}`" + elif token_type == 'softbreak': + buffer += ' ' + elif token_type == 'linebreak': + flush_buffer() + nodes.append(NewLine()) + elif token_type == 'link': + flush_buffer() + text = self._flatten_inline_text(token.get('children', [])) + url = token.get('attrs', {}).get('url', '') + nodes.append(Link(f"[{text}]({url})")) + elif token_type == 'image': + flush_buffer() + alt = token.get('attrs', {}).get('alt') or self._flatten_inline_text(token.get('children', [])) + url = token.get('attrs', {}).get('url', '') + nodes.append(Image(f"![{alt}]({url})")) + else: + flush_buffer() + children = token.get('children', []) + if children: + nodes.extend(self._convert_inline_tokens(children)) + else: + raw = token.get('raw') or token.get('text') or '' + if raw.strip(): + _append_text(nodes, mistune.escape(raw)) + + flush_buffer() + return nodes + + def _flatten_inline_text(self, tokens: Iterable[dict]): + parts = [] + for token in tokens: + token_type = token.get('type') + if token_type in {'text', 'inline_html', 'block_html'}: + parts.append(token.get('raw') or token.get('text') or '') + elif token_type == 'codespan': + parts.append(f"`{token.get('raw') or token.get('text') or ''}`") + elif token_type in {'linebreak', 'softbreak'}: + parts.append(' ') + else: + children = token.get('children', []) + if children: + parts.append(self._flatten_inline_text(children)) + else: + parts.append(token.get('raw') or token.get('text') or '') + return ''.join(parts).strip() + + def _convert_paragraph_or_heading(self, inline_tokens: Iterable[dict]): + ref_text = self._reference_definition_text(inline_tokens) + if ref_text is not None: + return Paragraph([Text(ref_text)]) + heading = self._heading_from_inline(inline_tokens) + if heading: + return heading + return Paragraph(self._convert_inline_tokens(inline_tokens)) + + def _convert_list_block_nodes(self, inline_tokens: Iterable[dict]): + heading = self._heading_from_inline(inline_tokens) + if heading: + return [heading] + return self._convert_inline_tokens(inline_tokens) + + def _heading_from_inline(self, inline_tokens: Iterable[dict]): + if len(inline_tokens) != 1: + return None + token = inline_tokens[0] + if token.get('type') != 'text': + return None + raw = token.get('raw', '') + match = _HEADING_LINE_RE.match(raw) + if not match: + return None + level = len(match.group(2)) + content = raw[match.end(2):].lstrip() + heading_tokens = self._markdown(f"{'#' * level} {content}") + if heading_tokens and heading_tokens[0].get('type') == 'heading': + children = heading_tokens[0].get('children', []) + else: + children = [{'type': 'text', 'raw': content}] + header = Header(level) + header.add_nodes(self._convert_inline_tokens(children)) + return header + + def _reference_definition_text(self, inline_tokens: Iterable[dict]): + if len(inline_tokens) != 1: + return None + token = inline_tokens[0] + if token.get('type') != 'text': + return None + raw = token.get('raw', '') + return self._reference_definitions.get(raw) + + def _split_reference_links(self, raw: str, nodes): + last = 0 + for match in _REF_LINK_OR_IMAGE_RE.finditer(raw): + if 
match.start() > last: + _append_text(nodes, mistune.escape(raw[last:match.start()])) + snippet = match.group(0) + if snippet.startswith('!['): + nodes.append(Image(snippet)) + else: + nodes.append(Link(snippet)) + last = match.end() + if last < len(raw): + _append_text(nodes, mistune.escape(raw[last:])) + return nodes class ZendeskHelpMdParser(MdParser): - TAG_CONTENT_GROUP = 'tag_content' - TAG_PATTERN = r'^\s*(<{tag_name}{attr_re}>(?P<%s>[\s\S]+?))\s*$' % TAG_CONTENT_GROUP - CALLOUT_STYLE_GROUP = 'style' - CALLOUT_ATTR_PATTERN = r'( (?P<%s>green|red|yellow))*' % CALLOUT_STYLE_GROUP - - def __init__(self): - super().__init__() - self.grammar_class.callout = re.compile(self.TAG_PATTERN.format(tag_name='callout', - attr_re=self.CALLOUT_ATTR_PATTERN)) - self.default_rules.insert(0, 'callout') - - self.grammar_class.steps = re.compile(self.TAG_PATTERN.format(tag_name='steps', attr_re='')) - self.default_rules.insert(0, 'steps') - - self.grammar_class.tabs = re.compile(self.TAG_PATTERN.format(tag_name='tabs', attr_re='')) - self.default_rules.insert(0, 'tabs') - - def parse_callout(self, m: Match[str]) -> None: - style = m.group(self.CALLOUT_STYLE_GROUP) - self._parse_nested(ZendeskHelpCallout(style), m) - - def parse_steps(self, m: Match[str]) -> None: - self._parse_nested(ZendeskHelpSteps(), m) - - def parse_tabs(self, m: Match[str]) -> None: - self._parse_nested(ZendeskHelpTabs(), m) - - def _parse_nested(self, node: Node, m: Match[str]) -> None: - nested_content = m.group(self.TAG_CONTENT_GROUP) - nested_nodes = self.get_lexer().parse(nested_content) - node.add_nodes(nested_nodes) - self.tokens.append(node) + _CALLOUT_PATTERN = re.compile( + r'(?s)green|red|yellow))?>(?P.*?)' + ) + _STEPS_PATTERN = re.compile(r'(?s)(?P.*?)') + _TABS_PATTERN = re.compile(r'(?s)(?P.*?)') + + def parse(self, text): + nodes = self._parse_nodes(text) + return Root(nodes) + + def _parse_nodes(self, text: str): + nodes = [] + remaining = text + while remaining: + tag_name, match = self._find_next_tag(remaining) + if not match: + nodes.extend(self._parse_markdown(_normalize_block_indentation(remaining))) + break + + if match.start() > 0: + prefix = remaining[:match.start()] + nodes.extend(self._parse_markdown(_normalize_block_indentation(prefix))) + + content = match.group('content') + if tag_name == 'callout': + node = ZendeskHelpCallout(match.group('style')) + elif tag_name == 'steps': + node = ZendeskHelpSteps() + else: + node = ZendeskHelpTabs() + + node.add_nodes(self._parse_nodes(content)) + nodes.append(node) + + remaining = remaining[match.end():] + return nodes + + def _find_next_tag(self, text: str): + matches = [] + for name, pattern in ( + ('callout', self._CALLOUT_PATTERN), + ('steps', self._STEPS_PATTERN), + ('tabs', self._TABS_PATTERN), + ): + match = pattern.search(text) + if match: + matches.append((match.start(), name, match)) + if not matches: + return None, None + _, name, match = min(matches, key=lambda item: item[0]) + return name, match + + def _parse_markdown(self, text: str): + normalized = _remove_spaces_from_empty_lines(text) + normalized = _remove_ltr_rtl_marks(normalized) + return self._convert_block_tokens(self._markdown(normalized)) + + +def _append_text(nodes, text): + if not text: + return + if nodes and isinstance(nodes[-1], Text): + nodes[-1].text += text + else: + nodes.append(Text(text)) + + +def _is_block_html(raw: str) -> bool: + stripped = raw.lstrip() + if stripped.startswith('' + r'|<(' + _LEGACY_BLOCK_TAG_RE + r')((?:' + _LEGACY_VALID_ATTR_RE + r')*?)>([\s\S]+?)<\/\1>' + 
r'|<' + _LEGACY_BLOCK_TAG_RE + r'(?:' + _LEGACY_VALID_ATTR_RE + r')*?>' + r') *(?:\n{1,}|\s*$)' +) + + +def _split_legacy_block_html(raw: str) -> tuple[str, str] | None: + """Split over-greedy HTML blocks produced by mistune 3. + + Mistune 0.x treats a line like `` as a single HTML block and continues parsing + following Markdown lines. Mistune 3 follows CommonMark and may consume subsequent lines + until a blank line, which changes our structural tree. + """ + if not raw or '\n' not in raw: + return None + match = _LEGACY_BLOCK_HTML_RE.match(raw) + if match is None: + return None + end = match.end() + if end >= len(raw): + return None + return raw[:end], raw[end:] + + +class _SdiffBlockParser(block_parser.BlockParser): + """Mistune block parser tweaked for legacy-compat structure diffs. + + The master branch (mistune 0.x) did not treat fenced code blocks or block quotes + as special blocks. We disable them so they are parsed as normal text and then + normalized in our conversion layer. + """ + + def parse_fenced_code(self, m, state): # noqa: ANN001 + return None + + def parse_block_quote(self, m, state): # noqa: ANN001 + return None + + def parse_raw_html(self, m, state): # noqa: ANN001 + """Parse raw HTML more like mistune 0.x. + + In mistune 3, unknown tags are "type 7" HTML blocks and may not interrupt + paragraphs. The legacy mistune 0.x parser used in `master` treats any + non-inline tag as block HTML and it can interrupt paragraphs. + """ + marker = m.group(0).strip() + + # Legacy parser does not recognize closing tags alone as block HTML. + if marker.startswith(' 1 else None + + def _convert_list_block_nodes(self, inline_tokens: Iterable[dict]): + text = self._flatten_inline_markup(inline_tokens, softbreak_as_newline=True) + if not text or not text.strip(): + return [] + + nodes = [] + for line in text.splitlines(): + if not line.strip(): + continue + + ref_text = self._reference_definitions.get(line) + if ref_text is not None: + nodes.append(Text(ref_text)) + continue + + heading = self._heading_from_inline([{'type': 'text', 'raw': line}]) + if heading: + nodes.append(heading) + continue + + nodes.append(Text(mistune.escape(line))) + + return nodes + + def _flatten_inline_markup(self, tokens: Iterable[dict], *, softbreak_as_newline: bool = False): + parts = [] + for token in tokens: + token_type = token.get('type') + if token_type in {'text', 'inline_html', 'block_html'}: + raw = token.get('raw') or token.get('text') or '' + parts.append(self._reference_definitions.get(raw, raw)) + elif token_type == 'link': + label = self._flatten_inline_markup( + token.get('children', []), + softbreak_as_newline=softbreak_as_newline, + ) + attrs = token.get('attrs', {}) + url = _unquote_url_if_template(attrs.get('url', '')) + title = attrs.get('title') + parts.append(_format_link_markup(label, url, title)) + elif token_type == 'image': + alt = token.get('attrs', {}).get('alt') or self._flatten_inline_markup( + token.get('children', []), + softbreak_as_newline=softbreak_as_newline, + ) + attrs = token.get('attrs', {}) + url = _unquote_url_if_template(attrs.get('url', '')) + title = attrs.get('title') + parts.append(_format_image_markup(alt, url, title)) + elif token_type == 'softbreak': + parts.append('\n' if softbreak_as_newline else ' ') + elif token_type == 'linebreak': + parts.append('\n') + elif token_type == 'codespan': + parts.append(f"`{token.get('raw') or token.get('text') or ''}`") + elif token_type in _INLINE_MARKERS: + marker = _INLINE_MARKERS[token_type] + inner = 
self._flatten_inline_markup( + token.get('children', []), + softbreak_as_newline=softbreak_as_newline, + ) + parts.append(f'{marker}{inner}{marker}') + else: + children = token.get('children', []) + if children: + parts.append(self._flatten_inline_markup(children, softbreak_as_newline=softbreak_as_newline)) + else: + parts.append(token.get('raw') or token.get('text') or '') + return ''.join(parts) def _heading_from_inline(self, inline_tokens: Iterable[dict]): if len(inline_tokens) != 1: @@ -334,11 +666,12 @@ def _split_reference_links(self, raw: str, nodes): class ZendeskHelpMdParser(MdParser): - _CALLOUT_PATTERN = re.compile( - r'(?s)green|red|yellow))?>(?P.*?)' - ) - _STEPS_PATTERN = re.compile(r'(?s)(?P.*?)') - _TABS_PATTERN = re.compile(r'(?s)(?P.*?)') + _CALLOUT_PATTERN_MIN = re.compile(r'(?sm)^[ \t]*[^>]*)>(?P.*?)') + _CALLOUT_PATTERN_MAX = re.compile(r'(?sm)^[ \t]*[^>]*)>(?P.*)') + _STEPS_PATTERN_MIN = re.compile(r'(?sm)^[ \t]*(?P.*?)') + _STEPS_PATTERN_MAX = re.compile(r'(?sm)^[ \t]*(?P.*)') + _TABS_PATTERN_MIN = re.compile(r'(?sm)^[ \t]*(?P.*?)') + _TABS_PATTERN_MAX = re.compile(r'(?sm)^[ \t]*(?P.*)') def parse(self, text, rules=None): """Parse Markdown with Zendesk tag support into a list of Node objects.""" @@ -349,7 +682,22 @@ def _parse_nodes(self, text: str): nodes = [] remaining = text while remaining: - tag_name, match = self._find_next_tag(remaining) + tag_name = None + match = None + search_at = 0 + while True: + tag_name, match = self._find_next_tag(remaining, start_at=search_at) + if not match: + break + absolute_start = (len(text) - len(remaining)) + match.start() + if _is_inside_list_block(text, absolute_start): + # The legacy mistune 0.x list parser treats block-level content + # lazily; Zendesk tags that appear inside list items become plain + # text and are not recognized structurally. Avoid splitting the + # input at such tags, since that would terminate the list early. + search_at = match.start() + 1 + continue + break if not match: nodes.extend(self._parse_markdown(_normalize_block_indentation(remaining))) break @@ -358,9 +706,46 @@ def _parse_nodes(self, text: str): prefix = remaining[:match.start()] nodes.extend(self._parse_markdown(_normalize_block_indentation(prefix))) - content = match.group('content') + # The legacy parser only recognizes Zendesk tags when they consume the + # remainder of the current parsing slice (it uses `\\s*$` in the rule + # regex). Because of this, it will also match *across* multiple tag + # blocks of the same kind if the last closing tag is at the end. + # + # We emulate this by preferring a greedy match when it is terminal. + terminal_match = None + tail = remaining[match.start():] if tag_name == 'callout': - node = ZendeskHelpCallout(match.group('style')) + m2 = self._CALLOUT_PATTERN_MAX.match(tail) + elif tag_name == 'steps': + m2 = self._STEPS_PATTERN_MAX.match(tail) + else: + m2 = self._TABS_PATTERN_MAX.match(tail) + if m2 is not None and not tail[m2.end():].strip(): + terminal_match = m2 + + if terminal_match is None: + # Non-terminal: treat the first (minimal) tag block as opaque HTML. 
+ nodes.append(Html(match.group(0))) + remaining = remaining[match.end():] + continue + + content = terminal_match.group('content') + trailing = tail[terminal_match.end():] + + if tag_name == 'callout': + attrs = (terminal_match.group('attrs') or '').strip() + styles = [part for part in attrs.split() if part] + if not styles: + node = ZendeskHelpCallout(None) + elif len(styles) == 1 and styles[0] in {'green', 'red', 'yellow'}: + node = ZendeskHelpCallout(styles[0]) + else: + # Invalid callout attrs: legacy parser does not treat this as a + # Zendesk callout block. Keep the first (minimal) tag as opaque + # HTML and continue parsing the remaining text. + nodes.append(Html(match.group(0))) + remaining = remaining[match.end():] + continue elif tag_name == 'steps': node = ZendeskHelpSteps() else: @@ -369,26 +754,26 @@ def _parse_nodes(self, text: str): node.add_nodes(self._parse_nodes(content)) nodes.append(node) - remaining = remaining[match.end():] + remaining = trailing return nodes - def _find_next_tag(self, text: str): - matches = [] + def _find_next_tag(self, text: str, start_at: int = 0): + best = None for name, pattern in ( - ('callout', self._CALLOUT_PATTERN), - ('steps', self._STEPS_PATTERN), - ('tabs', self._TABS_PATTERN), + ('callout', self._CALLOUT_PATTERN_MIN), + ('steps', self._STEPS_PATTERN_MIN), + ('tabs', self._TABS_PATTERN_MIN), ): - match = pattern.search(text) - if match: - matches.append((match.start(), name, match)) - if not matches: + for match in pattern.finditer(text, start_at): + candidate = (match.start(), name, match) + if best is None or candidate[0] < best[0]: + best = candidate + break + + if best is None: return None, None - matches.sort(key=lambda item: item[0]) - for _, name, match in matches: - if not _is_inside_fenced_block(text, match.start()): - return name, match - return None, None + _, name, match = best + return name, match def _parse_markdown(self, text: str): normalized = _remove_spaces_from_empty_lines(text) @@ -397,12 +782,33 @@ def _parse_markdown(self, text: str): def _append_text(nodes, text): - if not text: + if not text or not text.strip(): return - if nodes and isinstance(nodes[-1], Text): - nodes[-1].text += text - else: - nodes.append(Text(text)) + nodes.append(Text(text)) + + +def _split_text_on_legacy_markers(raw: str) -> list[str]: + """Split text into segments similar to mistune 0.x inline text tokenization. + + The legacy parser splits text at backticks and tildes (it stops before those + markers and then consumes them as separate text tokens). This matters for our + structural tree because each segment becomes its own Text node. + """ + if not raw: + return [] + markers = ('`', '~') + out = [] + i = 0 + n = len(raw) + while i < n: + j = n + for m in markers: + pos = raw.find(m, i + 1) + if pos != -1 and pos < j: + j = pos + out.append(raw[i:j]) + i = j + return out def _format_title(title: str) -> str: @@ -412,6 +818,21 @@ def _format_title(title: str) -> str: return f' "{escaped}"' +def _unquote_url_if_template(url: str) -> str: + """Undo Mistune's percent-encoding for template-like URLs. + + Mistune percent-encodes some characters in URLs (e.g. `{{url}}` becomes `%7B%7Burl%7D%7D`). + For structural diffs we don't care about URL contents, but we do want rendered markup to remain + readable and close to the original input. 
+ """ + if not url or '%' not in url: + return url + unquoted = unquote(url) + if unquoted != url and ('{' in unquoted or '}' in unquoted): + return unquoted + return url + + def _format_link_markup(text: str, url: str, title: str | None) -> str: return f'[{text}]({url}{_format_title(title)})' @@ -427,7 +848,8 @@ def _is_block_html(raw: str) -> bool: match = re.match(r'<\/?\s*([a-zA-Z0-9]+)', stripped) if not match: return False - return match.group(1).lower() in _BLOCK_TAGS + tag = match.group(1).lower() + return tag not in _LEGACY_INLINE_TAGS def _normalize_block_indentation(text: str) -> str: @@ -449,33 +871,212 @@ def _normalize_block_indentation(text: str) -> str: return '\n'.join(lines).strip() +def _normalize_atx_heading_spaces(text: str) -> str: + """Normalize ATX headings that omit the mandatory space after the # markers. + + Mistune 3 follows CommonMark and requires a space: `## Heading`. The legacy parser + (mistune 0.x) accepted `##Heading` and our fixtures rely on that. + + We also normalize headings that appear right after list markers (e.g. `1. ##Heading`) + to keep list-item heading parsing compatible. + """ + output = [] + for line in text.splitlines(True): + match = _LIST_ITEM_ATX_HEADING_NO_SPACE_RE.match(line) + if match: + end = match.end(2) + line = f'{line[:end]} {line[end:]}' + else: + match = _ATX_HEADING_NO_SPACE_RE.match(line) + if match: + end = match.end(2) + line = f'{line[:end]} {line[end:]}' + + output.append(line) + return ''.join(output) + + +def _normalize_double_blank_line_list_nesting(text: str) -> str: + """Emulate mistune 0.x list nesting triggered by double blank lines. + + The legacy parser nests a following list under the previous list item when there + are two consecutive blank lines between list marker lines. Mistune 3 does not + do this, so we indent the subsequent marker to force a nested list. + """ + out = [] + prev_nonblank_was_list = False + prev_list_indent = 0 + blank_lines = 0 + for line in text.splitlines(True): + if not line.strip(): + blank_lines += 1 + out.append(line) + continue + + stripped = line.lstrip(' ') + current_indent = len(line) - len(stripped) + is_list = bool(_LIST_MARKER_RE.match(line)) + if is_list and prev_nonblank_was_list and blank_lines >= 2: + desired_indent = prev_list_indent + 4 + if current_indent < desired_indent: + line = (' ' * desired_indent) + stripped + current_indent = desired_indent + + out.append(line) + prev_nonblank_was_list = is_list + if is_list: + prev_list_indent = current_indent + blank_lines = 0 + return ''.join(out) + + +def _normalize_ordered_list_marker_interrupts(text: str) -> str: + """Allow ordered list markers like `2.` to interrupt paragraphs (mistune 0.x compat). + + Mistune 3 follows CommonMark and does not allow an ordered list starting with a + number other than 1 to interrupt a paragraph. Mistune 0.x is more permissive and + will start a list for `2.` / `3.` etc. + + To emulate the legacy behavior we insert a blank line before such ordered list + marker lines when they immediately follow non-list, non-blank text and we're not + currently inside a list block. + """ + out = [] + in_list = False + pending_list_end = False + prev_blank = True + prev_was_list_marker = False + + for line in text.splitlines(): + if not line.strip(): + out.append(line) + prev_blank = True + prev_was_list_marker = False + if in_list: + pending_list_end = True + continue + + if pending_list_end: + if line[:1] in {' ', '\t'} or _LIST_MARKER_RE.match(line): + # Still inside the list block. 
+ pass + else: + in_list = False + pending_list_end = False + + if in_list and _REF_DEF_LINE_RE.match(line): + in_list = False + + ordered = _ORDERED_LIST_MARKER_RE.match(line) + if not in_list and not prev_blank and ordered: + number = int(ordered.group(1)) + if number != 1 and not prev_was_list_marker: + out.append('') + prev_blank = True + prev_was_list_marker = False + + out.append(line) + prev_blank = False + prev_was_list_marker = bool(_LIST_MARKER_RE.match(line)) + if prev_was_list_marker: + in_list = True + + return '\n'.join(out) + + +def _normalize_list_lazy_continuations(text: str) -> str: + """Emulate mistune 0.x lazy list continuations for block-start lines. + + Mistune 3 follows CommonMark and will break a list when it encounters a + block-start line (e.g. `###### Heading`) that is not indented as a list-item + continuation. Mistune 0.x is much more permissive and will keep consuming + unindented lines as part of the current list item until the list is closed + by a blank line. + + We emulate the legacy behavior by indenting unindented non-marker lines while + inside a list block so that mistune 3 keeps them as list-item continuation + lines. + """ + out = [] + in_list = False + pending_list_end = False + continue_prefix = '' + + for raw_line in text.splitlines(True): + has_nl = raw_line.endswith('\n') + line = raw_line[:-1] if has_nl else raw_line + + if not line.strip(): + out.append(raw_line) + if in_list: + pending_list_end = True + continue + + if pending_list_end: + if line[:1] in {' ', '\t'} or _LIST_MARKER_RE.match(line): + # Still inside the list block. + pass + else: + in_list = False + continue_prefix = '' + pending_list_end = False + + marker_match = _LIST_MARKER_RE.match(line) + if marker_match: + in_list = True + continue_prefix = ' ' * marker_match.end() + out.append(raw_line) + continue + + # Mistune 0.x list parsing stops before reference definition lines, even + # without blank lines. Treat those as list terminators so following blocks + # don't get indented into the list item. + if in_list and _REF_DEF_LINE_RE.match(line): + in_list = False + continue_prefix = '' + out.append(raw_line) + continue + + if in_list and line[:1] not in {' ', '\t'}: + normalized = f'{continue_prefix}{line}' + if has_nl: + normalized += '\n' + out.append(normalized) + continue + + out.append(raw_line) + + return ''.join(out) + + def _extract_reference_definitions(text: str): lines = text.splitlines() output = [] definitions = {} - fence = None - fence_len = 0 counter = 0 - for line in lines: - fence_match = _FENCE_RE.match(line) - if fence_match: - marker = fence_match.group(1) - marker_len = len(marker) - marker_char = marker[0] - if fence is None: - fence = marker_char - fence_len = marker_len - elif marker_char == fence and marker_len >= fence_len: - fence = None - fence_len = 0 - output.append(line) - continue - - if fence is None and _REF_DEF_LINE_RE.match(line): + for idx, line in enumerate(lines): + if _REF_DEF_LINE_RE.match(line): placeholder = f"SDIFF_REF_DEF_{counter}" counter += 1 definitions[placeholder] = line.strip() + # The legacy parser treats reference definition lines as their own blocks + # (even without blank lines) and they must also not become lazy-continuation + # lines inside list items. Force block separation. 
+ if output and output[-1].strip(): + output.append('') output.append(placeholder) + # Special-case: When a reference definition is followed by a fence-only line, + # and after blank lines another fence-only line begins, mistune 0.x tends to + # split the ref def into its own paragraph (it doesn't keep it glued to the + # closing fence marker). Insert a blank line after the placeholder to match. + if idx + 1 < len(lines) and _FENCE_ONLY_LINE_RE.match(lines[idx + 1]): + j = idx + 2 + # Only split when there is at least one blank line between fences. + if j < len(lines) and not lines[j].strip(): + while j < len(lines) and not lines[j].strip(): + j += 1 + if j < len(lines) and _FENCE_ONLY_LINE_RE.match(lines[j]): + output.append('') continue output.append(line) @@ -505,6 +1106,59 @@ def _is_inside_fenced_block(text: str, offset: int) -> bool: return False +def _is_inside_list_block(text: str, offset: int) -> bool: + """Best-effort mistune 0.x list-block detection. + + Mistune 0.x list parsing is permissive and supports lazy continuation lines. + For compatibility we treat everything following a list marker as being inside + the list block until a blank line is followed by a non-indented, non-list + marker line. + + We also treat reference definition lines as list terminators even without + blank lines (legacy behavior). + """ + in_list = False + pending_list_end = False + running = 0 + + for raw_line in text.splitlines(True): + line_len = len(raw_line) + line = raw_line[:-1] if raw_line.endswith('\n') else raw_line + + if not line.strip(): + if in_list: + pending_list_end = True + if running + line_len > offset: + return in_list + running += line_len + continue + + if pending_list_end: + if line[:1] in {' ', '\t'} or _LIST_MARKER_RE.match(line): + # Still inside the list block. + pass + else: + in_list = False + pending_list_end = False + + # Mistune 0.x list parsing stops before reference definition lines, even + # without blank lines. + if in_list and _REF_DEF_LINE_RE.match(line): + in_list = False + + line_is_list_marker = bool(_LIST_MARKER_RE.match(line)) + line_in_list = in_list or line_is_list_marker + if running + line_len > offset: + return line_in_list + + if line_is_list_marker: + in_list = True + + running += line_len + + return False + + def _remove_spaces_from_empty_lines(text): return '\n'.join([re.sub(r'^( {1,}|\t{1,})$', '\n', line) for line in text.splitlines()]) @@ -513,15 +1167,201 @@ def _remove_ltr_rtl_marks(text): return re.sub(r'(\u200e|\u200f)', '', text) +def _normalize_consecutive_fence_lines(text: str) -> str: + """Split consecutive fence-marker lines into separate blocks. + + The legacy parser tends to break paragraphs at repeated fence marker lines + like: + ~~~~ + ~~~~ + We insert a blank line between consecutive fence-only lines to keep block + structure compatible. + """ + out = [] + prev_was_fence = False + for line in text.splitlines(): + is_fence = bool(_FENCE_ONLY_LINE_RE.match(line)) + if is_fence and prev_was_fence and out and out[-1].strip(): + out.append('') + out.append(line) + prev_was_fence = is_fence + return '\n'.join(out) + + +def _normalize_consecutive_blockquote_lines(text: str) -> str: + """Split consecutive `>` quote lines into separate blocks. + + Mistune 0.x tends to break paragraphs on each quote-marker line when block quote + syntax isn't enabled in the lexer. We emulate that by inserting blank lines + between consecutive quote lines. 
+ """ + out = [] + in_list = False + pending_list_end = False + for line in text.splitlines(): + if not line.strip(): + out.append(line) + if in_list: + pending_list_end = True + continue + + if pending_list_end: + if line[:1] in {' ', '\t'} or _LIST_MARKER_RE.match(line): + # Still inside the list block. + pass + else: + in_list = False + pending_list_end = False + + # Mistune 0.x list parsing stops before reference definition lines, even + # without blank lines. Treat those as list terminators for normalization + # purposes. + if in_list and _REF_DEF_LINE_RE.match(line): + in_list = False + + is_quote = bool(_BLOCKQUOTE_LINE_RE.match(line)) + if is_quote and out and out[-1].strip() and not in_list: + out.append('') + out.append(line) + + if _LIST_MARKER_RE.match(line): + in_list = True + return '\n'.join(out) + + +def _normalize_fence_block_starts(text: str) -> str: + """Force mistune 0.x paragraph breaks before complete fence blocks. + + Mistune 0.x's `paragraph` regex stops when a *complete* fence block (as defined + by its `fences` regex) starts on the next line. We disable fence parsing, but + still need the same paragraph splitting behavior for structural diffs. + + We insert a blank line before any line that begins a fence block according to + the mistune 0.x `fences` regex. + + NOTE: This is intentionally restricted to non-indented lines to avoid + perturbing list-item parsing; legacy list items don't use paragraph parsing + either (they tokenize as plain text). + """ + if not text: + return text + + insert_positions = set() + in_list = False + pending_list_end = False + prev_blank = True + + offset = 0 + for raw_line in text.splitlines(True): + line_start = offset + offset += len(raw_line) + line = raw_line[:-1] if raw_line.endswith('\n') else raw_line + + if not line.strip(): + prev_blank = True + if in_list: + pending_list_end = True + continue + + if pending_list_end: + if line[:1] in {' ', '\t'} or _LIST_MARKER_RE.match(line): + # Still inside the list block. + pass + else: + in_list = False + pending_list_end = False + + # Mistune 0.x list parsing stops before reference definition lines, even + # without blank lines. Treat those as list terminators for normalization + # purposes. + if in_list and _REF_DEF_LINE_RE.match(line): + in_list = False + + if _LIST_MARKER_RE.match(line): + in_list = True + + first = line[:1] + if not in_list and not prev_blank and first in {'`', '~'} and first not in {' ', '\t'}: + if _MISTUNE08_FENCE_BLOCK_RE.match(text, line_start): + insert_positions.add(line_start) + + prev_blank = False + + if not insert_positions: + return text + + out = text + for start in sorted(insert_positions, reverse=True): + out = out[:start] + '\n' + out[start:] + return out + + +def _normalize_fence_only_lines_start_new_paragraphs(text: str) -> str: + """Force fence-only lines to start new paragraphs like mistune 0.x. + + The legacy parser breaks paragraphs when it encounters a fence-only marker line + (``` / ~~~) even though it doesn't parse fences as code blocks. Mistune 3 tends to + keep those markers inside a paragraph when fenced code parsing is disabled. 
+ """ + out = [] + prev_was_blank = True + in_fence_paragraph = False + for line in text.splitlines(True): + if not line.strip(): + out.append(line) + prev_was_blank = True + in_fence_paragraph = False + continue + + is_fence = bool(_FENCE_ONLY_LINE_RE.match(line)) + if is_fence and not prev_was_blank and not in_fence_paragraph: + out.append('\n') + prev_was_blank = True + + out.append(line) + if prev_was_blank: + in_fence_paragraph = is_fence + prev_was_blank = False + return ''.join(out) + + +def _merge_adjacent_lists(nodes): + """Merge directly-adjacent list blocks. + + The legacy parser is quite permissive and tends to merge adjacent lists even + when bullet markers or orderedness changes. Normalizing this reduces spurious + structural diffs vs `master`. + """ + merged = [] + for node in nodes: + # Recurse first. + if getattr(node, 'nodes', None): + node.nodes = _merge_adjacent_lists(node.nodes) + + if merged and isinstance(node, List) and isinstance(merged[-1], List): + merged[-1].add_nodes(node.nodes) + continue + merged.append(node) + return merged + + def parse(text, parser_cls: type[MdParser] = MdParser): """Parse Markdown into a Root node using the given parser class.""" text = _remove_spaces_from_empty_lines(text) text = _remove_ltr_rtl_marks(text) + text = _normalize_atx_heading_spaces(text) + text = _normalize_double_blank_line_list_nesting(text) + text = _normalize_ordered_list_marker_interrupts(text) + text = _normalize_list_lazy_continuations(text) + text = _normalize_consecutive_blockquote_lines(text) + text = _normalize_fence_block_starts(text) parser = parser_cls() if hasattr(parser, '_set_reference_definitions'): text, reference_definitions = _extract_reference_definitions(text) parser._set_reference_definitions(reference_definitions) result = parser.parse(text) if isinstance(result, list): - return Root(result) + root = Root(result) + root.nodes = _merge_adjacent_lists(root.nodes) + return root return result diff --git a/tests/test_parser.py b/tests/test_parser.py index 5d1c9c7..18f137e 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -69,9 +69,30 @@ def test_heading_text(self): actual = self._parse('### heading') self.assertEqual('heading', actual.nodes[0].nodes[0].text) + def test_heading_without_space_followed_by_text_parses_as_header(self): + actual = self._parse('##Heading\ntext') + self.assertEqual('2tpt', actual.print_all()) + + def test_heading_without_space_with_link_parses_as_header(self): + actual = self._parse('##[Verify email]({{url}})\ntext') + self.assertEqual('header', actual.nodes[0].name) + self.assertEqual(2, actual.nodes[0].level) + self.assertEqual('link', actual.nodes[0].nodes[0].name) + self.assertEqual('[Verify email]({{url}})', actual.nodes[0].nodes[0].text) + + def test_heading_without_space_in_list_item_followed_by_text(self): + actual = self._parse('1. 
##Heading\n text') + self.assertEqual('lm2tt', actual.print_all()) + def test_link_wrapped_in_text(self): self._run_and_assert('some text [link](url) new text', 'ptat') + def test_text_before_link_not_duplicated(self): + actual = self._parse('some text and [link](url)') + paragraph = actual.nodes[0] + self.assertEqual(['text', 'link'], [node.name for node in paragraph.nodes]) + self.assertEqual(['some text and '], [node.text for node in paragraph.nodes if node.name == 'text']) + def test_link_label_with_codespan(self): actual = self._parse('[use `foo`](url)') self.assertEqual('[use `foo`](url)', actual.nodes[0].nodes[0].text) @@ -99,7 +120,8 @@ def test_reference_definition_inside_list_item_preserved(self): data = '- item\n [id]: https://example.com' tree = self._parse(data) list_item = tree.nodes[0].nodes[0] - self.assertIn('[id]: https://example.com', list_item.nodes[0].text) + self.assertEqual('item', list_item.nodes[0].text) + self.assertEqual('[id]: https://example.com', tree.nodes[1].nodes[0].text) def test_reference_links_with_whitespace_and_empty_id(self): data = 'See [API][] and [Ref] [id].\n\n[API]: https://example.com\n[id]: https://example.com' @@ -114,7 +136,7 @@ def test_reference_definition_inside_fence_is_text(self): [link][id] ```""" tree = self._parse(data) - self.assertEqual('pt', tree.print_all()) + self.assertEqual('ptttptattt', tree.print_all()) def test_reference_definition_inside_long_fence_is_text(self): data = """```` @@ -122,7 +144,7 @@ def test_reference_definition_inside_long_fence_is_text(self): [link][id] ````""" tree = self._parse(data) - self.assertEqual('pt', tree.print_all()) + self.assertEqual('pttttptatttt', tree.print_all()) def test_softbreak_preserves_space(self): actual = self._parse('hello\nworld') @@ -134,18 +156,40 @@ def test_block_quote_preserves_marker(self): def test_fenced_code_preserves_fences(self): actual = self._parse('```\ncode\n```') - self.assertEqual('```\ncode\n```', actual.nodes[0].nodes[0].text) + self.assertEqual('ptttttt', actual.print_all()) + text = ''.join(node.text for node in actual.nodes[0].nodes) + self.assertTrue(text.startswith('```')) + self.assertTrue(text.endswith('```')) def test_ordered_list_parses_as_ordered(self): tree = self._parse('1. one\n2. two') list_node = tree.nodes[0] self.assertTrue(list_node.ordered) + def test_ordered_list_marker_other_than_1_interrupts_paragraph(self): + self._run_and_assert('para\n2. item\n', 'ptlmt') + + def test_list_item_allows_unindented_heading_lazy_continuation(self): + tree = self._parse('* a\n###### b\n') + self.assertEqual(1, len(tree.nodes)) + self.assertEqual('list', tree.nodes[0].name) + item = tree.nodes[0].nodes[0] + self.assertEqual(['text', 'header'], [node.name for node in item.nodes]) + self.assertEqual('a', item.nodes[0].text) + self.assertEqual(6, item.nodes[1].level) + self.assertEqual('b', item.nodes[1].nodes[0].text) + def test_unordered_list_parses_as_unordered(self): tree = self._parse('- one\n- two') list_node = tree.nodes[0] self.assertFalse(list_node.ordered) + def test_double_blank_lines_between_list_items_nests_next_list(self): + self._run_and_assert('* a\n\n\n* b\n', 'lmtlmt') + + def test_double_blank_lines_between_ordered_list_items_nests_next_list(self): + self._run_and_assert('1. a\n\n\n1. 
b\n', 'lmtlmt') + class TestZendeskParser(ParserTestCase): def setUp(self) -> None: @@ -181,6 +225,22 @@ def test_callout_invalid_style(self): actual = self._parse(fixture) self.assertNotEqual(actual.nodes[0].name, 'callout') + def test_callout_invalid_style_does_not_swallow_trailing_closing_tag(self): + fixture = '\n# title\ncontent\n\n\n' + self._run_and_assert(fixture, 'xpt') + + def test_callout_tags_inside_list_item_are_text_and_allow_headings(self): + fixture = '1. item\n\n# title\ncontent\n\n' + tree = self._parse(fixture) + self.assertEqual(1, len(tree.nodes)) + self.assertEqual('list', tree.nodes[0].name) + item = tree.nodes[0].nodes[0] + self.assertEqual(['text', 'text', 'header', 'text', 'text'], [node.name for node in item.nodes]) + self.assertEqual('<callout>', item.nodes[1].text) + self.assertEqual(1, item.nodes[2].level) + self.assertEqual('title', item.nodes[2].nodes[0].text) + self.assertEqual('</callout>', item.nodes[-1].text) + def test_tabs(self): fixture = """ @@ -192,12 +252,12 @@ def test_tabs(self): """ self._run_and_assert(fixture, 'T1tpt1tpt') - def test_inline_callout_is_structural(self): + def test_inline_callout_is_not_structural(self): fixture = """intro # title content outro""" - self._run_and_assert(fixture, 'ptC1tptpt') + self._run_and_assert(fixture, 'pt1tpt') def test_zendesk_tags_inside_fenced_code_are_text(self): fixture = """``` @@ -214,9 +274,26 @@ def test_zendesk_tags_inside_fenced_code_are_text(self): ```""" tree = self._parse(fixture) - self.assertEqual('pt', tree.print_all()) + self.assertEqual('ptttxxxpttt', tree.print_all()) self.assertFalse(any(node.name in {'callout', 'steps', 'tabs'} for node in tree.nodes)) + def test_zendesk_tags_after_fenced_code_are_parsed(self): + fixture = """``` + +# title +content + +``` + + +# title +content + +""" + tree = self._parse(fixture) + self.assertTrue(any(node.name == 'callout' for node in tree.nodes)) + self.assertEqual(1, tree.print_all().count('C')) + def test_steps(self): steps_fixture = """ diff --git a/tests/test_sdiff.py b/tests/test_sdiff.py index db8bf45..55d3079 100644 --- a/tests/test_sdiff.py +++ b/tests/test_sdiff.py @@ -63,6 +63,18 @@ def test_softbreaks_ignored_in_structure(self): _, _, errors = sdiff.diff(left, right) self.assertEqual([], errors) + def test_heading_without_space_matches_heading_with_space(self): + left = '##Heading\ntext' + right = '## Heading\ntext' + _, _, errors = sdiff.diff(left, right) + self.assertEqual([], errors) + + def test_list_heading_without_space_matches_heading_with_space(self): + left = '1. ##Heading\n text' + right = '1. ## Heading\n text' + _, _, errors = sdiff.diff(left, right) + self.assertEqual([], errors) + def test_reference_definition_missing_is_reported(self): left = 'See [API][id].\n\n[id]: https://example.com' right = 'See [API][id].' 
@@ -78,3 +90,22 @@ def test_code_block_content_ignored_in_structure(self): ```""" _, _, errors = sdiff.diff(left, right) self.assertEqual([], errors) + + def test_invalid_callout_followed_by_fence_does_not_depend_on_blank_line(self): + left = """ +# title +content + + +``` +code +```""" + right = """ +# title +content + +``` +code +```""" + _, _, errors = sdiff.diff(left, right, parser_cls=ZendeskHelpMdParser) + self.assertEqual([], errors) From 2f68918016b7aeb96d886b19771f86b1c25734bd Mon Sep 17 00:00:00 2001 From: Philipp Berner <374326+philippb@users.noreply.github.com> Date: Fri, 6 Feb 2026 12:16:10 -0800 Subject: [PATCH 15/18] tests: cover inline buffer flush duplication --- tests/test_parser.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/test_parser.py b/tests/test_parser.py index 18f137e..965e8f5 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1,6 +1,7 @@ from unittest import TestCase from sdiff import parser, MdParser, ZendeskHelpMdParser from sdiff.model import Paragraph, Root, Text, ZendeskHelpSteps +from sdiff.renderer import TextRenderer class ParserTestCase(TestCase): @@ -87,6 +88,33 @@ def test_heading_without_space_in_list_item_followed_by_text(self): def test_link_wrapped_in_text(self): self._run_and_assert('some text [link](url) new text', 'ptat') + def test_link_with_trailing_text_does_not_duplicate_buffer(self): + actual = self._parse('some text [link](url) new text') + paragraph = actual.nodes[0] + self.assertEqual(['text', 'link', 'text'], [node.name for node in paragraph.nodes]) + self.assertEqual('some text ', paragraph.nodes[0].text) + self.assertEqual('[link](url)', paragraph.nodes[1].text) + self.assertEqual(' new text', paragraph.nodes[2].text) + + def test_image_with_trailing_text_does_not_duplicate_buffer(self): + actual = self._parse('some ![alt](url) new') + paragraph = actual.nodes[0] + self.assertEqual(['text', 'image', 'text'], [node.name for node in paragraph.nodes]) + self.assertEqual('some ', paragraph.nodes[0].text) + self.assertEqual('![alt](url)', paragraph.nodes[1].text) + self.assertEqual(' new', paragraph.nodes[2].text) + + def test_inline_marker_does_not_duplicate_buffer(self): + actual = self._parse('some **bold** text') + self.assertEqual('some **bold** text', TextRenderer().render(actual)) + + def test_inline_linebreak_does_not_duplicate_buffer(self): + actual = self._parse('a\\\nb') + paragraph = actual.nodes[0] + self.assertEqual(['text', 'new-line', 'text'], [node.name for node in paragraph.nodes]) + self.assertEqual('a', paragraph.nodes[0].text) + self.assertEqual('b', paragraph.nodes[2].text) + def test_text_before_link_not_duplicated(self): actual = self._parse('some text and [link](url)') paragraph = actual.nodes[0] From 4e447aef3aad1a0e0de75d41d0cafdb8e2ff0668 Mon Sep 17 00:00:00 2001 From: Philipp Berner <374326+philippb@users.noreply.github.com> Date: Fri, 6 Feb 2026 13:23:33 -0800 Subject: [PATCH 16/18] chore: align flake8 config with repo defaults --- .flake8 | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.flake8 b/.flake8 index aac8e09..7148fd6 100644 --- a/.flake8 +++ b/.flake8 @@ -1,8 +1,6 @@ [flake8] max-line-length = 120 -max-complexity = 12 -select = E,F,W,C90 -extend-ignore = F403,F405 +ignore = F403,F405 exclude = .git, __pycache__, From 2684d49306f1dfa25a96927bff7fb01cd881b95a Mon Sep 17 00:00:00 2001 From: Philipp Berner <374326+philippb@users.noreply.github.com> Date: Fri, 6 Feb 2026 13:23:39 -0800 Subject: [PATCH 17/18] tests: raise 
coverage for parser and helpers --- tests/test_coverage.py | 416 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 416 insertions(+) create mode 100644 tests/test_coverage.py diff --git a/tests/test_coverage.py b/tests/test_coverage.py new file mode 100644 index 0000000..92e88c4 --- /dev/null +++ b/tests/test_coverage.py @@ -0,0 +1,416 @@ +from unittest import TestCase + +import sdiff.compare as compare_mod +from sdiff import MdParser, parser +from sdiff.compare import diff_struct +from sdiff.errors import InsertError +from sdiff.model import Header, Link, List, ListItem, NewLine, Root, Text, ZendeskHelpCallout +from sdiff.renderer import TextRenderer +from tests.fixtures import trees + + +class TestCoverageMisc(TestCase): + def test_diff_error_str_uses_message(self): + err = InsertError(Text("x")) + self.assertIn("missing element", str(err)) + + def test_node_str_repr_and_eq(self): + node = Root([Text("x")]) + self.assertTrue(str(node)) + self.assertIn("root", repr(node)) + self.assertNotEqual(node, "not-a-node") + + def test_header_str_repr(self): + header = Header(3, [Text("x")]) + self.assertEqual("3", str(header)) + self.assertIn("level", repr(header)) + + def test_list_and_link_repr_and_eq_branches(self): + self.assertFalse(List(False) == "nope") # noqa: E711 + self.assertIn("ordered", repr(List(False))) + self.assertIn("link", repr(Link("x"))) + self.assertIn("new-line", repr(NewLine())) + self.assertIn("callout", repr(ZendeskHelpCallout("green"))) + self.assertFalse(ZendeskHelpCallout("green") == "nope") # noqa: E711 + + def test_fixture_empty_tree(self): + self.assertEqual("", trees.empty_tree().print_all()) + + def test_diff_struct_ignores_single_space_nodes(self): + # Cover the "ignore single space errors" branch in compare.py. + tree1 = Root([Text(" "), Text("x")]) + tree2 = Root([Text("x")]) + _, _, errors = diff_struct(tree1, tree2) + self.assertEqual(0, len(errors)) + + def test_apply_diff_ranges_ignores_single_space_nodes(self): + # Cover the "ignore single space errors" branches in compare.py explicitly. + delete_only = [("x", 0, 1, 0, 0)] + insert_only = [("x", 0, 0, 0, 1)] + + errors = compare_mod._apply_diff_ranges(delete_only, [Text(" ")], []) + self.assertEqual([], errors) + + errors = compare_mod._apply_diff_ranges(insert_only, [], [Text(" ")]) + self.assertEqual([], errors) + + errors = compare_mod._apply_diff_ranges(delete_only, [Text("x")], []) + self.assertEqual(1, len(errors)) + self.assertIn("additional element", str(errors[0])) + + errors = compare_mod._apply_diff_ranges(insert_only, [], [Text("x")]) + self.assertEqual(1, len(errors)) + self.assertIn("missing element", str(errors[0])) + + +class TestCoverageParserHelpers(TestCase): + def test_split_legacy_block_html_variants(self): + self.assertIsNone(parser._split_legacy_block_html("")) + self.assertIsNone(parser._split_legacy_block_html("not html\n")) + + # Exact match should return None (no suffix to split). + self.assertIsNone(parser._split_legacy_block_html("
<pre>hi</pre>\n"))
+
+        prefix, suffix = parser._split_legacy_block_html("<pre>hi</pre>\n\nnext")
+        self.assertTrue(prefix.startswith("<pre>hi</pre>"))
+        self.assertEqual("next", suffix)
+
+    def test_block_parser_disabled_rules_return_none(self):
+        block = parser._SdiffBlockParser()
+        self.assertIsNone(block.parse_fenced_code(None, None))
+        self.assertIsNone(block.parse_block_quote(None, None))
+
+    def test_mdparser_get_lexer_returns_instance(self):
+        self.assertIsInstance(MdParser.get_lexer(), MdParser)
+
+    def test_split_text_on_legacy_markers(self):
+        self.assertEqual([], parser._split_text_on_legacy_markers(""))
+        self.assertEqual(["a", "`b", "`c"], parser._split_text_on_legacy_markers("a`b`c"))
+
+    def test_unquote_url_if_template(self):
+        url = "https://example.com/%7B%7Burl%7D%7D"
+        self.assertIn("{{url}}", parser._unquote_url_if_template(url))
+        # Percent-encoded but not template-like => keep as-is.
+        self.assertEqual("https://example.com/%2F", parser._unquote_url_if_template("https://example.com/%2F"))
+
+    def test_is_block_html(self):
+        self.assertTrue(parser._is_block_html("<pre></pre>"))
+        self.assertFalse(parser._is_block_html("text"))
+        self.assertTrue(parser._is_block_html("<pre>text</pre>"))
+        self.assertFalse(parser._is_block_html("nope"))
+
+    def test_normalize_block_indentation(self):
+        # Only non-HTML lines should be considered for min-indent normalization.
+        raw = "<pre>\n x\n</pre>\n y"
+        normalized = parser._normalize_block_indentation(raw)
+        self.assertIn("y", normalized)
+
+    def test_extract_reference_definitions_fence_special_case(self):
+        raw = "[id]: https://example.com\n```\n\n```"
+        text, defs = parser._extract_reference_definitions(raw)
+        self.assertEqual(1, len(defs))
+        # The special-case inserts a blank line after the placeholder.
+        self.assertTrue(text.startswith("SDIFF_REF_DEF_0\n\n"))
+
+    def test_extract_reference_definitions_fence_special_case_not_triggered_without_blank_line(self):
+        raw = "[id]: https://example.com\n```\n```"
+        text, defs = parser._extract_reference_definitions(raw)
+        self.assertEqual(1, len(defs))
+        self.assertEqual("SDIFF_REF_DEF_0\n```\n```", text)
+
+    def test_is_inside_fenced_block(self):
+        raw = "```\ncode\n```\noutside"
+        # Offset inside "code".
+        self.assertTrue(parser._is_inside_fenced_block(raw, raw.index("code")))
+        # Offset inside "outside".
+        self.assertFalse(parser._is_inside_fenced_block(raw, raw.index("outside")))
+        # Offset past end => fall through.
+        self.assertFalse(parser._is_inside_fenced_block(raw, len(raw) + 1))
+
+    def test_is_inside_list_block(self):
+        raw = "- a\n b\n\nc"
+        self.assertTrue(parser._is_inside_list_block(raw, raw.index("b")))
+        self.assertFalse(parser._is_inside_list_block(raw, raw.index("c")))
+        # Offset past end => fall through.
+        self.assertFalse(parser._is_inside_list_block(raw, len(raw) + 1))
+
+    def test_normalize_consecutive_fence_lines(self):
+        raw = "```\n```\ntext"
+        normalized = parser._normalize_consecutive_fence_lines(raw)
+        self.assertIn("```\n\n```", normalized)
+
+    def test_normalize_consecutive_blockquote_lines(self):
+        raw = "> a\n> b\nc"
+        normalized = parser._normalize_consecutive_blockquote_lines(raw)
+        self.assertIn("> a\n\n> b", normalized)
+
+    def test_normalize_fence_only_lines_start_new_paragraphs(self):
+        raw = "a\n```\nb"
+        normalized = parser._normalize_fence_only_lines_start_new_paragraphs(raw)
+        self.assertIn("a\n\n```", normalized)
+        # Blank line resets state.
+ normalized = parser._normalize_fence_only_lines_start_new_paragraphs("a\n\n```\n\n```") + self.assertIn("\n\n```\n\n```", normalized) + + def test_normalize_double_blank_line_list_nesting_does_not_overindent(self): + raw = "* a\n\n\n * b\n" + normalized = parser._normalize_double_blank_line_list_nesting(raw) + self.assertEqual(raw, normalized) + + def test_merge_adjacent_lists(self): + l1 = List(False, [ListItem([Text("a")])]) + l2 = List(True, [ListItem([Text("b")])]) + root = Root([l1, l2]) + merged = parser._merge_adjacent_lists(root.nodes) + self.assertEqual(1, len(merged)) + self.assertEqual(2, len(merged[0].nodes)) + + def test_parse_passthrough_when_parser_returns_non_list(self): + class _Dummy(MdParser): + def parse(self, text, rules=None): # noqa: ANN001 + return Root([Text("x")]) + + parsed = parser.parse("x", parser_cls=_Dummy) + self.assertIsInstance(parsed, Root) + + +class TestCoverageParserConversions(TestCase): + def setUp(self) -> None: + super().setUp() + self.p = MdParser() + + def test_convert_block_token_branches(self): + item = self.p._convert_block_token( + { + "type": "list_item", + "children": [{"type": "paragraph", "children": [{"type": "text", "raw": "x"}]}], + } + )[0] + self.assertEqual("list-item", item.name) + + block_text = self.p._convert_block_token({"type": "block_text", "children": [{"type": "text", "raw": "x"}]})[0] + self.assertEqual("paragraph", block_text.name) + + quote = self.p._convert_block_token( + {"type": "block_quote", "children": [{"type": "paragraph", "children": [{"type": "text", "raw": "q"}]}]} + )[0] + self.assertEqual("paragraph", quote.name) + self.assertIn(">", quote.nodes[0].text) + + code = self.p._convert_block_token({"type": "block_code", "raw": "code\n", "marker": "```"})[0] + self.assertTrue(code.nodes[0].text.startswith("```")) + + def test_convert_list_ordered_attr_fallback(self): + lst = self.p._convert_list({"type": "list", "attrs": {"ordered": True}, "children": []}) + self.assertTrue(lst.ordered) + + def test_convert_block_html_with_suffix(self): + token = {"type": "block_html", "raw": "
<pre>hi</pre>\n\ntext"}
+        nodes = self.p._convert_block_html(token)
+        self.assertEqual("html", nodes[0].name)
+        self.assertEqual("paragraph", nodes[1].name)
+
+        # Split happens, but suffix is whitespace-only => no extra nodes.
+        token = {"type": "block_html", "raw": "<pre>hi</pre>\n\n "}
+        nodes = self.p._convert_block_html(token)
+        self.assertEqual(1, len(nodes))
+
+        # Whitespace-only raw => empty conversion.
+        self.assertEqual([], self.p._convert_block_html({"type": "block_html", "raw": " "}))
+
+    def test_convert_passthrough_block_children_and_raw(self):
+        out = self.p._convert_passthrough_block(
+            {"type": "unknown", "children": [{"type": "paragraph", "children": [{"type": "text", "raw": "x"}]}]}
+        )
+        self.assertEqual("paragraph", out[0].name)
+        out2 = self.p._convert_passthrough_block({"type": "unknown", "raw": "raw"})
+        self.assertEqual("paragraph", out2[0].name)
+
+    def test_convert_block_quote_early_returns(self):
+        self.assertEqual([], self.p._convert_block_quote({"type": "block_quote", "children": []}))
+        self.assertEqual(
+            [],
+            self.p._convert_block_quote({"type": "block_quote", "children": [{"type": "paragraph", "children": []}]}),
+        )
+
+    def test_render_inline_children_unknown_child_type(self):
+        out = self.p._render_inline_children([{"type": "thematic_break", "raw": "---"}])
+        self.assertEqual("---", out)
+
+    def test_inline_other_and_codespan_text_fallback(self):
+        tokens = [{"type": "codespan", "text": "x"}, {"type": "unknown", "raw": "<x>"}]
+        out = self.p._convert_inline_tokens(tokens)
+        self.assertEqual("`x`<x>", "".join(node.text for node in out))
+
+    def test_inline_marker_without_children_and_inline_other_with_children(self):
+        out = self.p._convert_inline_tokens([{"type": "strong", "children": []}])
+        self.assertEqual(["text", "text"], [n.name for n in out])
+
+        out = self.p._convert_inline_tokens([{"type": "unknown", "children": [{"type": "text", "raw": "x"}]}])
+        self.assertEqual("x", out[0].text)
+
+        out = self.p._convert_inline_tokens([{"type": "unknown", "raw": " "}])
+        self.assertEqual([], out)
+
+    def test_flatten_inline_text_unknown_branches(self):
+        text = self.p._flatten_inline_text(
+            [
+                {"type": "codespan", "raw": "x"},
+                {"type": "unknown", "children": [{"type": "text", "raw": "y"}]},
+                {"type": "unknown", "raw": "z"},
+            ]
+        )
+        self.assertIn("`x`", text)
+        self.assertTrue(text.endswith("z"))
+
+    def test_flatten_inline_markup_link_and_image(self):
+        tokens = [
+            {"type": "text", "raw": "a"},
+            {"type": "softbreak"},
+            {"type": "link", "children": [{"type": "text", "raw": "L"}], "attrs": {"url": "%7B%7Burl%7D%7D"}},
+            {"type": "softbreak"},
+            {"type": "image", "children": [{"type": "text", "raw": "A"}], "attrs": {"url": "u", "title": 't"'}},
+        ]
+        s = self.p._flatten_inline_markup(tokens, softbreak_as_newline=True)
+        self.assertIn("[L]({{url}})", s)
+        self.assertIn('![A](u "t\\"")', s)
+
+    def test_flatten_inline_markup_unknown_branches(self):
+        tokens = [
+            {"type": "unknown", "children": [{"type": "text", "raw": "x"}]},
+            {"type": "unknown", "raw": "y"},
+        ]
+        s = self.p._flatten_inline_markup(tokens)
+        self.assertEqual("xy", s)
+
+    def test_convert_list_block_nodes_ref_heading_and_text(self):
+        self.p._set_reference_definitions(
+            {
+                "SDIFF_REF_DEF_0": "[id]: https://example.com",
+                "[id]: https://example.com": "[id]: https://example.com",
+            }
+        )
+        tokens = [
+            {"type": "text", "raw": "SDIFF_REF_DEF_0"},
+            {"type": "softbreak"},
+            {"type": "text", "raw": "###header"},
+            {"type": "softbreak"},
+            {"type": "text", "raw": " "},
+            {"type": "softbreak"},
+            {"type": "text", "raw": "plain"},
+        ]
+        nodes = self.p._convert_list_block_nodes(tokens)
+        self.assertEqual(["text", "header", "text"], [n.name for n in nodes])
+
+    def test_convert_list_block_nodes_empty(self):
+        self.assertEqual([], self.p._convert_list_block_nodes([]))
+
+    def test_heading_from_inline_fallback_branch(self):
+        class _NoHeading(MdParser):
+            def __init__(self):
+                super().__init__()
+                self._markdown = lambda _: [{"type": "paragraph", "children": []}]  # noqa: E731
+
+        p = _NoHeading()
+        heading = p._heading_from_inline([{"type": "text", "raw": "###header"}])
+        self.assertEqual("header", heading.name)
+        self.assertEqual("text", heading.nodes[0].name)
+
+    def test_convert_paragraph_or_heading_ref_and_heading(self):
+        self.p._set_reference_definitions({"SDIFF_REF_DEF_0": "[id]: https://example.com"})
+        node = self.p._convert_paragraph_or_heading([{"type": "text", "raw": "SDIFF_REF_DEF_0"}])
+        self.assertEqual("paragraph", node.name)
+
+        node = self.p._convert_paragraph_or_heading([{"type": "text", "raw": "###header"}])
+        self.assertEqual("header", node.name)
+
+        node = self.p._convert_paragraph_token([{"type": "text", "raw": "###header"}])[0]
+        self.assertEqual("header", node.name)
+
+    def test_split_paragraph_inline_on_fence_variants(self):
+        self.assertIsNone(self.p._split_paragraph_inline_on_fence([]))
+        self.assertIsNone(self.p._split_paragraph_inline_on_fence([{"type": "text", "raw": "x"}]))
+
+        # First line is a fence-only marker => do not split.
+        tokens = [{"type": "text", "raw": "```"}, {"type": "softbreak"}, {"type": "text", "raw": "x"}]
+        self.assertIsNone(self.p._split_paragraph_inline_on_fence(tokens))
+
+        # Tail is fence markers but not a complete fence block => do not split.
+        tokens = [
+            {"type": "text", "raw": "a"},
+            {"type": "softbreak"},
+            {"type": "text", "raw": "```"},
+            {"type": "softbreak"},
+            {"type": "text", "raw": "```"},
+        ]
+        self.assertIsNone(self.p._split_paragraph_inline_on_fence(tokens))
+
+        # Complete fence block tail => split.
+        tokens = [
+            {"type": "text", "raw": "a"},
+            {"type": "softbreak"},
+            {"type": "text", "raw": "```"},
+            {"type": "softbreak"},
+            {"type": "text", "raw": "code"},
+            {"type": "softbreak"},
+            {"type": "text", "raw": "```"},
+        ]
+        parts = self.p._split_paragraph_inline_on_fence(tokens)
+        self.assertEqual(2, len(parts))
+
+        nodes = self.p._convert_paragraph_token(tokens)
+        self.assertEqual(2, len(nodes))
+
+    def test_split_paragraph_inline_on_fence_first_part_includes_seps(self):
+        tokens = [
+            {"type": "text", "raw": "a"},
+            {"type": "softbreak"},
+            {"type": "text", "raw": "b"},
+            {"type": "softbreak"},
+            {"type": "text", "raw": "```"},
+            {"type": "softbreak"},
+            {"type": "text", "raw": "code"},
+            {"type": "softbreak"},
+            {"type": "text", "raw": "```"},
+        ]
+        parts = self.p._split_paragraph_inline_on_fence(tokens)
+        self.assertEqual(2, len(parts))
+
+    def test_convert_list_item_block_html_text_smoke(self):
+        # Exercise conversion of text following a (hypothetical) HTML block inside a list item.
+        nodes = self.p._convert_list_item_block_html_text("text\n\n# h\n\n- a\n")
+        self.assertTrue(any(n.name == "header" for n in nodes))
+        self.assertTrue(any(n.name == "list" for n in nodes))
+
+    def test_convert_list_item_with_block_html_child(self):
+        token = {
+            "type": "list_item",
+            "children": [
+                {"type": "block_html", "raw": "<pre>hi</pre>"},
+            ],
+        }
+        item = self.p._convert_list_item(token)
+        self.assertTrue(item.nodes)
+
+    def test_convert_list_item_block_html_variants(self):
+        self.assertEqual([], self.p._convert_list_item_block_html({"type": "block_html", "raw": " "}))
+
+        nodes = self.p._convert_list_item_block_html({"type": "block_html", "raw": "not html\n"})
+        self.assertTrue(nodes)
+
+        nodes = self.p._convert_list_item_block_html({"type": "block_html", "raw": "<pre>hi</pre>\n\n "})
+        self.assertTrue(nodes)
+
+    def test_convert_list_item_block_html_text_with_block_html_and_raw(self):
+        nodes = self.p._convert_list_item_block_html_text("<pre>hi</pre>\n\n---\n")
+        self.assertTrue(any(n.name == "text" for n in nodes))
+
+    def test_convert_list_item_block_html_smoke(self):
+        token = {"type": "block_html", "raw": "<pre>hi</pre>\n\ntext"}
+        nodes = self.p._convert_list_item_block_html(token)
+        self.assertTrue(any(isinstance(n, Text) for n in nodes))
+
+    def test_rendering_roundtrip_smoke(self):
+        md = "some text [link](url) new text"
+        tree = parser.parse(md, parser_cls=MdParser)
+        self.assertEqual(md, TextRenderer().render(tree))

From ada6821b16f56c20490e8d2f44879c75d902cad3 Mon Sep 17 00:00:00 2001
From: Philipp Berner <374326+philippb@users.noreply.github.com>
Date: Fri, 6 Feb 2026 13:30:29 -0800
Subject: [PATCH 18/18] chore: ignore node_modules

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index ce77503..8f655ec 100644
--- a/.gitignore
+++ b/.gitignore
@@ -56,3 +56,4 @@ target/
 venv/
 .DS_Store
 .idea/
+node_modules/