diff --git a/.flake8 b/.flake8
new file mode 100644
index 0000000..7148fd6
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,10 @@
+[flake8]
+max-line-length = 120
+ignore = F403,F405
+exclude =
+ .git,
+ __pycache__,
+ venv,
+ build,
+ dist,
+ sdiff.egg-info
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..bf9a1e8
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,37 @@
+name: CI
+
+on:
+ workflow_dispatch:
+ pull_request:
+ types: [opened, synchronize, reopened, ready_for_review]
+ push:
+ branches: [master]
+
+jobs:
+ test:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+
+ - name: Set up Python
+ uses: actions/setup-python@v5
+ with:
+ python-version: "3.11"
+ cache: "pip"
+
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ python -m pip install .[tests]
+
+ - name: Format check
+ run: python -m autopep8 --exit-code --diff --max-line-length 120 -r sdiff tests
+
+ - name: Lint
+ run: python -m flake8 --config .flake8 sdiff tests
+
+ - name: Test
+ run: python -m coverage run -m pytest -s --durations=3 --durations-min=0.005
+
+ - name: Coverage report
+ run: python -m coverage report -m
diff --git a/.gitignore b/.gitignore
index ce77503..8f655ec 100644
--- a/.gitignore
+++ b/.gitignore
@@ -56,3 +56,4 @@ target/
venv/
.DS_Store
.idea/
+node_modules/
diff --git a/.husky/pre-commit b/.husky/pre-commit
new file mode 100755
index 0000000..bc7696e
--- /dev/null
+++ b/.husky/pre-commit
@@ -0,0 +1,5 @@
+#!/usr/bin/env sh
+. "$(dirname -- "$0")/_/husky.sh"
+
+python -m autopep8 --exit-code --diff --max-line-length 120 -r sdiff tests
+python -m flake8 --config .flake8 sdiff tests
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index df31221..0000000
--- a/.travis.yml
+++ /dev/null
@@ -1,11 +0,0 @@
-language: python
-dist: jammy
-python:
- - "3.11"
-# command to install dependencies
-install:
- - make dev
-# command to run tests
-script:
- - make test
- - make coverage
diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 0000000..526549e
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,32 @@
+# Repository Guidelines
+
+## Project Structure & Module Organization
+The core library lives in `sdiff/` (parser, comparer, renderer, and models). Tests are in `tests/`, with shared fixtures in `tests/fixtures/`. Reference PDFs sit in `docs/`. Packaging and tooling are defined in `setup.py`, `setup.cfg`, and the `Makefile`; `CHANGELOG` tracks releases.
+
+## Build, Test, and Development Commands
+- `make env` creates the local `venv/` (Python 3.11+).
+- `make dev` installs the package plus test/dev extras (`.[tests,devtools]`) into the venv.
+- `make test` runs linting and the full pytest suite with coverage.
+- `make vtest` runs pytest verbosely.
+- `make flake` runs the autopep8 format check and flake8 on `sdiff/` and `tests/`.
+- `make format` applies autopep8 formatting to `sdiff/` and `tests/`.
+- `make cov` prints the coverage report.
+- `make clean` removes build artifacts and the venv.
+- `make hooks` installs Husky git hooks (requires Node/npm; `make dev` runs this).
+
+Lint parity: CI and the Husky pre-commit hook both run the same checks as `make flake` (autopep8 check + flake8). Run `make flake` or `make test` locally to mirror CI.
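+
+The exact commands, as wired into both `ci.yml` and `.husky/pre-commit`:
+```sh
+python -m autopep8 --exit-code --diff --max-line-length 120 -r sdiff tests
+python -m flake8 --config .flake8 sdiff tests
+```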
+
+Example flow:
+```sh
+make dev
+make test
+```
+
+## Coding Style & Naming Conventions
+Use standard Python conventions: 4-space indentation, `snake_case` for modules/functions/variables, and `PascalCase` for classes. Flake8 enforces a 120-character line limit (see `.flake8`). `autopep8` is available for formatting. Keep new modules in `sdiff/` and new tests in `tests/` with filenames like `test_<name>.py`.
+
+## Testing Guidelines
+The suite uses `pytest` with `coverage`. Coverage is expected to stay high (current config fails under 96%). Add or update tests for behavior changes, and prefer small, focused unit tests. Place reusable data in `tests/fixtures/`. Run `make test` before submitting changes.
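+
+To iterate on a single file, run pytest from the venv directly (illustrative; assumes the `venv/` created by `make env`):
+```sh
+venv/bin/python -m pytest tests/test_parser.py -v
+```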
+
+## Commit & Pull Request Guidelines
+Commit messages in this repo are short and often use a type prefix (e.g., `chore: ...`, `fixes: ...`, `hotfix: ...`, `refactors: ...`). Follow that pattern where practical, and keep the summary concise. For PRs, include a brief description, list tests run (e.g., `make test`), and link related issues or tickets when available.
diff --git a/Makefile b/Makefile
index 6eeb1e2..4be00c9 100644
--- a/Makefile
+++ b/Makefile
@@ -19,6 +19,7 @@ env:
dev: env update
$(PIP) install .[tests,devtools]
+ @$(MAKE) hooks
install: env update
@@ -28,8 +29,20 @@ publish:
$(TWINE) upload --verbose --sign --username developer --repository-url http://$(PYPICLOUD_HOST)/simple/ dist/*.whl
flake:
+ $(PYTHON) -m autopep8 --exit-code --diff --max-line-length 120 -r sdiff tests
$(FLAKE) sdiff tests
+format:
+ $(PYTHON) -m autopep8 --in-place --max-line-length 120 -r sdiff tests
+
+hooks:
+ @if command -v npm >/dev/null 2>&1; then \
+ npm install --no-package-lock --silent; \
+ npm run --silent prepare; \
+ else \
+ echo "npm not found; skipping husky install"; \
+ fi
+
test: flake
$(COVERAGE) run -m pytest $(TEST_RUNNER_FLAGS)
@@ -57,4 +70,4 @@ clean:
rm -rf venv
-.PHONY: all build env linux run pep test vtest testloop cov clean
+.PHONY: all build env linux run pep test vtest testloop cov clean hooks format
diff --git a/README.md b/README.md
index b8bb2a8..7ab5d32 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,40 @@
# md-sdiff
-Diffs to markdown texts only based on their structure. Ignores content. Helpful to diff 2 files that contain the same content in different languages.
+
+Structural diffs for Markdown. The library parses two Markdown inputs into a lightweight tree and compares the *shape* (headings, lists, paragraphs, links, etc.) instead of the text content. This is useful when you expect the same document structure across translations or when you want to validate formatting consistency without caring about the wording.
+
+## What it does
+- Parses Markdown into an AST-like node tree using `mistune`.
+- Compares trees node-by-node and flags insertions/deletions in structure.
+- Returns a rendered view of each document plus a list of structural errors.
+- Supports a Zendesk-specific parser (`ZendeskHelpMdParser`) for `<callout>`, `<steps>`, and `<tabs>` blocks.
+
+## Example usage
+```python
+from sdiff import diff, TextRenderer, MdParser
+
+left = "# Title\n\n- One\n- Two"
+right = "# Title\n\n- One\n- Two\n- Three"
+
+rendered_left, rendered_right, errors = diff(left, right, renderer=TextRenderer(), parser_cls=MdParser)
+print(errors[0]) # "There is a missing element `li`."
+```
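+
+For Zendesk Help Center content, the same call works with `ZendeskHelpMdParser` (a minimal sketch; the `<steps>` fixtures are illustrative):
+```python
+from sdiff import diff, TextRenderer, ZendeskHelpMdParser
+
+left = "<steps>\n# Step one\n</steps>"
+right = "<steps>\n# Step one\n# Step two\n</steps>"
+_, _, errors = diff(left, right, renderer=TextRenderer(), parser_cls=ZendeskHelpMdParser)
+```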
+
+## Renderers
+`TextRenderer` returns the original Markdown structure as text. `HtmlRenderer` wraps the output and marks structural insertions/deletions with `<ins>` and `<del>`.
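+
+A minimal sketch (reusing `left`/`right` from the example above, and assuming `HtmlRenderer` is importable from the package root like `TextRenderer`):
+```python
+from sdiff import diff, HtmlRenderer
+
+# The rendered outputs carry <ins>/<del> markers around structural changes.
+html_left, html_right, errors = diff(left, right, renderer=HtmlRenderer())
+```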
+
+## One-off usage
+```sh
+python - <<'PY'
+from sdiff import diff, TextRenderer
+
+left = open("left.md", "r", encoding="utf-8").read()
+right = open("right.md", "r", encoding="utf-8").read()
+_, _, errors = diff(left, right, renderer=TextRenderer())
+
+for err in errors:
+ print(err)
+PY
+```
+
+## Notes
+This project is a library (no CLI). If you need different token handling, you can provide a custom parser class that extends `MdParser`.
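+
+A minimal sketch of such a subclass (the pre-processing step is illustrative, not part of the library):
+```python
+from sdiff import diff, MdParser
+
+class NormalizingParser(MdParser):
+    def parse(self, text, rules=None):
+        # Hypothetical pre-processing before the standard structural parse.
+        return super().parse(text.replace("\r\n", "\n"))
+
+_, _, errors = diff("# A", "# B", parser_cls=NormalizingParser)
+```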
diff --git a/package.json b/package.json
new file mode 100644
index 0000000..d682872
--- /dev/null
+++ b/package.json
@@ -0,0 +1,10 @@
+{
+ "name": "html-structure-diff",
+ "private": true,
+ "devDependencies": {
+ "husky": "^9.0.0"
+ },
+ "scripts": {
+ "prepare": "husky install"
+ }
+}
diff --git a/requirements.txt b/requirements.txt
index 1f202e5..a234623 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1 @@
-mistune==0.8.1
+mistune==3.2.0
diff --git a/sdiff/__init__.py b/sdiff/__init__.py
index 853d12c..85b6af4 100644
--- a/sdiff/__init__.py
+++ b/sdiff/__init__.py
@@ -4,13 +4,21 @@
def diff(md1, md2, renderer=TextRenderer(), parser_cls: type[MdParser] = MdParser):
+ """Compare two Markdown strings by structure and return rendered outputs + errors.
+
+ Args:
+ md1: Left Markdown string.
+ md2: Right Markdown string.
+ renderer: Renderer instance used to format the output (TextRenderer by default).
+ parser_cls: Parser class to use (MdParser by default).
+
+ Returns:
+ (rendered_left, rendered_right, errors)
+ """
tree1 = parse(md1, parser_cls)
tree2 = parse(md2, parser_cls)
tree1, tree2, struct_errors = diff_struct(tree1, tree2)
- # tree1, tree2, links_errors = diff_links(tree1, tree2)
-
- # errors = struct_errors + links_errors
errors = struct_errors
return renderer.render(tree1), renderer.render(tree2), errors
diff --git a/sdiff/compare.py b/sdiff/compare.py
index 5958ada..34d75ca 100644
--- a/sdiff/compare.py
+++ b/sdiff/compare.py
@@ -44,8 +44,10 @@ def _diff(tree1, tree2, include_symbols=None, exclude_symbols=None):
def diff_links(tree1, tree2):
+ """Diff only link-relevant structure (paragraphs/headers/lists/links)."""
return _diff(tree1, tree2, include_symbols=['p', 'h', 'l', 'a'])
def diff_struct(tree1, tree2):
+ """Diff overall structure, ignoring link and image content."""
return _diff(tree1, tree2, exclude_symbols=['a', 'i'])
diff --git a/sdiff/parser.py b/sdiff/parser.py
index 93a4736..765ef12 100644
--- a/sdiff/parser.py
+++ b/sdiff/parser.py
@@ -1,207 +1,1162 @@
-from re import Match
+import re
+import textwrap
+from typing import Iterable
+from urllib.parse import unquote
import mistune
-import re
+from mistune import block_parser
-from .model import *
+from .model import (Html, Image, Link, List, ListItem, NewLine, Paragraph, Root,
+ Text, Header, ZendeskHelpCallout, ZendeskHelpSteps,
+ ZendeskHelpTabs)
+_HEADING_LINE_RE = re.compile(r'^(\s*)(#{1,6})(?!#)(?=\S)')
+_ATX_HEADING_NO_SPACE_RE = re.compile(r'^(\s{0,3})(#{1,6})(?!#)(?=\S)')
+_LIST_ITEM_ATX_HEADING_NO_SPACE_RE = re.compile(r'^(\s{0,3}(?:[*+-]|\d+[.)])\s+)(#{1,6})(?!#)(?=\S)')
+_LIST_MARKER_RE = re.compile(r'^\s{0,3}(?:[*+-]|\d+[.)])\s+')
+_ORDERED_LIST_MARKER_RE = re.compile(r'^\s{0,3}(\d+)[.)]\s+')
+_REF_LINK_OR_IMAGE_RE = re.compile(r'!?\[[^\]]+\]\s*\[[^\]]*\]')
+_REF_DEF_LINE_RE = re.compile(r'^\s{0,3}\[[^\]]+\]:\s+\S+')
+_FENCE_RE = re.compile(r'^\s*(`{3,}|~{3,})')
+_FENCE_ONLY_LINE_RE = re.compile(r'^\s*(`{3,}|~{3,})\s*$')
+_BLOCKQUOTE_LINE_RE = re.compile(r'^\s{0,3}>\s?.*')
+_MISTUNE08_FENCE_BLOCK_RE = re.compile(
+ r'^ *(`{3,}|~{3,}) *(\S+)? *\n' # opening fence (+ optional info)
+ r'([\s\S]+?)\s*' # content (must be non-empty; mistune 0.x quirk)
+ r'\1 *(?:\n+|$)', # closing fence
+ flags=re.M,
+)
+_INLINE_MARKERS = {
+ 'strong': '**',
+ 'emphasis': '*',
+ 'strikethrough': '~~',
+}
-class InlineLexer(mistune.BlockLexer):
- grammar_class = mistune.InlineGrammar
+_LEGACY_INLINE_TAGS = {
+ # Copied from mistune 0.8.1's `_block_tag` negative lookahead.
+ 'a',
+ 'em',
+ 'strong',
+ 'small',
+ 's',
+ 'cite',
+ 'q',
+ 'dfn',
+ 'abbr',
+ 'data',
+ 'time',
+ 'code',
+ 'var',
+ 'samp',
+ 'kbd',
+ 'sub',
+ 'sup',
+ 'i',
+ 'b',
+ 'u',
+ 'mark',
+ 'ruby',
+ 'rt',
+ 'rp',
+ 'bdi',
+ 'bdo',
+ 'span',
+ 'br',
+ 'wbr',
+ 'ins',
+ 'del',
+ 'img',
+ 'font',
+}
- default_rules = [
- 'linebreak', 'link',
- 'reflink', 'text',
- ]
+_MISTUNE_BLOCK_OR_PRE_TAGS = set(block_parser.BLOCK_TAGS) | set(block_parser.PRE_TAGS)
- def __init__(self):
- self.links = {}
- self.grammar_class.text = re.compile(r'^ {1,}\n|^[\s\S]+?(?=[\[`~]| {2,}\n|$)')
- super().__init__()
-
- def parse_autolink(self, m):
- self.tokens.append(Link(m.group(0)))
-
- def parse_url(self, m):
- self.tokens.append(Link(m.group(0)))
-
- def parse_link(self, m):
- return self._process_link(m)
-
- def parse_reflink(self, m):
- # TODO skip this check for now
- # key = mistune._keyify(m.group(2) or m.group(1))
- # if key not in self.links:
- # return None
- # ret = self.links[key]
- return self._process_link(m)
-
- def _process_link(self, m):
- line = m.group(0)
- if line[0] == '!':
- node = Image(line)
- else:
- node = Link(line)
+_LEGACY_VALID_ATTR_RE = r"\s*[a-zA-Z\-](?:\=(?:\"[^\"]*\"|'[^']*'|[^\s'\">]+))?"
+_LEGACY_BLOCK_TAG_RE = (
+ r"(?!(?:%s)\b)\w+(?!:/|[^\w\s@]*@)\b" % "|".join(sorted(_LEGACY_INLINE_TAGS))
+)
+_LEGACY_BLOCK_HTML_RE = re.compile(
+ r'^\s* *(?:'
+    r'<!--[\s\S]*?-->'
+ r'|<(' + _LEGACY_BLOCK_TAG_RE + r')((?:' + _LEGACY_VALID_ATTR_RE + r')*?)>([\s\S]+?)<\/\1>'
+ r'|<' + _LEGACY_BLOCK_TAG_RE + r'(?:' + _LEGACY_VALID_ATTR_RE + r')*?>'
+ r') *(?:\n{1,}|\s*$)'
+)
- self.tokens.append(node)
- def parse_linebreak(self, m):
- node = NewLine()
- self.tokens.append(node)
+def _split_legacy_block_html(raw: str) -> tuple[str, str] | None:
+ """Split over-greedy HTML blocks produced by mistune 3.
- def parse_text(self, m):
- text = m.group(0)
- if text.strip():
- escaped_text = mistune.escape(text)
- node = Text(escaped_text)
- self.tokens.append(node)
+    Mistune 0.x treats a line like `<hr>` as a single HTML block and continues parsing
+ following Markdown lines. Mistune 3 follows CommonMark and may consume subsequent lines
+ until a blank line, which changes our structural tree.
+ """
+ if not raw or '\n' not in raw:
+ return None
+ match = _LEGACY_BLOCK_HTML_RE.match(raw)
+ if match is None:
+ return None
+ end = match.end()
+ if end >= len(raw):
+ return None
+ return raw[:end], raw[end:]
+
+
+class _SdiffBlockParser(block_parser.BlockParser):
+ """Mistune block parser tweaked for legacy-compat structure diffs.
+
+ The master branch (mistune 0.x) did not treat fenced code blocks or block quotes
+ as special blocks. We disable them so they are parsed as normal text and then
+ normalized in our conversion layer.
+ """
+
+ def parse_fenced_code(self, m, state): # noqa: ANN001
+ return None
+
+ def parse_block_quote(self, m, state): # noqa: ANN001
+ return None
+
+ def parse_raw_html(self, m, state): # noqa: ANN001
+ """Parse raw HTML more like mistune 0.x.
+
+ In mistune 3, unknown tags are "type 7" HTML blocks and may not interrupt
+ paragraphs. The legacy mistune 0.x parser used in `master` treats any
+ non-inline tag as block HTML and it can interrupt paragraphs.
+ """
+ marker = m.group(0).strip()
+ # Legacy parser does not recognize closing tags alone as block HTML.
+        if marker.startswith('</'):
+ return None
-class MdParser(mistune.BlockLexer):
- default_rules = [
- 'newline', 'list_block', 'block_html',
- 'heading', 'lheading',
- 'paragraph', 'text',
- ]
+        # Defer to the upstream implementation for comments and other directives.
+        return super().parse_raw_html(m, state)
- r'<({})((?:{})*?)>([\s\S]+?)<\/\1>'.format(mistune._block_tag, mistune._valid_attr),
- r'<{}(?:{})*?>'.format(mistune._block_tag, mistune._valid_attr),
- )
- )
-
- def _parse_inline(self, text):
- inline = InlineLexer()
- return inline.parse(text)
-
- def parse_newline(self, m):
- length = len(m.group(0))
- if length > 1:
- self.tokens.append(NewLine())
-
- def parse_heading(self, m):
- level = len(m.group(1))
- node = Header(level)
- node.add_nodes(self._parse_inline(m.group(2)))
- self.tokens.append(node)
-
- def parse_lheading(self, m):
- level = 1 if m.group(2) == '=' else 2
- text = m.group(1)
- node = Header(level)
- node.add_nodes(self._parse_inline(text))
- self.tokens.append(node)
-
- def parse_block_html(self, m):
- text = m.group(0)
- html = Html(text)
- self.tokens.append(html)
-
- def parse_paragraph(self, m):
- text = m.group(1).rstrip('\n')
- node = Paragraph()
- node.add_nodes(self._parse_inline(text))
- self.tokens.append(node)
-
- def parse_text(self, m):
- text = m.group(0)
- escaped_text = mistune.escape(text)
- node = Text(escaped_text)
- self.tokens.append(node)
-
- def parse_list_block(self, m):
- bull = m.group(2)
- cap = m.group(0)
- ordered = '.' in bull
- node = List(ordered)
- node.add_nodes(self._process_list_item(cap, bull))
- self.tokens.append(node)
-
- def _process_list_item(self, cap, bull):
- result = []
- cap = self.rules.list_item.findall(cap)
-
- _next = False
- length = len(cap)
-
- for i in range(length):
- item = cap[i][0]
-
- # remove the bullet
- space = len(item)
- item = self.rules.list_bullet.sub('', item)
-
- # outdent
- if '\n ' in item:
- space = space - len(item)
- pattern = re.compile(r'^ {1,%d}' % space, flags=re.M)
- item = pattern.sub('', item)
-
- # determine whether item is loose or not
- loose = _next
- if not loose and re.search(r'\n\n(?!\s*$)', item):
- loose = True
-
- rest = len(item)
- if i != length - 1 and rest:
- _next = item[rest - 1] == '\n'
- if not loose:
- loose = _next
-
- node = ListItem()
- block_lexer = self.get_lexer()
- nodes = block_lexer.parse(item, self.list_rules)
- node.add_nodes(nodes)
- result.append(node)
- return result
+
+
+class MdParser:
+    def __init__(self):
+ block = _SdiffBlockParser()
+ # Don't recognize fences/quotes as block-level syntax; see _SdiffBlockParser.
+ for rule in ('fenced_code', 'block_quote'):
+ if rule in block.rules:
+ block.rules.remove(rule)
+
+ # In mistune 0.x the list parser does not include the `block_html` / `raw_html`
+ # rule, so HTML-like lines inside list items become plain text (not Html nodes)
+ # and don't swallow following Markdown.
+ if 'raw_html' in getattr(block, 'list_rules', []):
+ block.list_rules.remove('raw_html')
+
+ inline = mistune.InlineParser()
+ # Prevent code spans from consuming legacy fence markers like ```...```.
+ if 'codespan' in inline.rules:
+ inline.rules.remove('codespan')
+
+ self._markdown = mistune.Markdown(renderer=None, block=block, inline=inline)
+ self._reference_definitions = {}
+
+ def parse(self, text, rules=None):
+ """Parse Markdown text into a list of Node objects.
+
+ Args:
+ text: Markdown string.
+ rules: Optional rules argument kept for compatibility.
+
+ Returns:
+ list[Node]
+ """
+ tokens = self._markdown(text)
+ return self._convert_block_tokens(tokens)
+
+ def _set_reference_definitions(self, definitions):
+ self._reference_definitions = definitions
+
+ def _convert_block_tokens(self, tokens: Iterable[dict]):
+ nodes = []
+ for token in tokens:
+ nodes.extend(self._convert_block_token(token))
+ return nodes
+
+ def _convert_block_token(self, token):
+ token_type = token.get('type')
+ if token_type == 'paragraph':
+ return self._convert_paragraph_token(token.get('children', []))
+ if token_type == 'heading':
+ return [self._convert_heading(token)]
+ if token_type == 'list':
+ return [self._convert_list(token)]
+ if token_type == 'list_item':
+ return [self._convert_list_item(token)]
+ if token_type == 'block_text':
+ return [self._convert_paragraph_or_heading(token.get('children', []))]
+ if token_type == 'block_html':
+ return self._convert_block_html(token)
+ if token_type == 'block_quote':
+ return self._convert_block_quote(token)
+ if token_type == 'block_code':
+ return self._convert_block_code(token)
+ if token_type == 'thematic_break':
+ return self._convert_passthrough_block(token)
+ return self._convert_passthrough_block(token)
+
+ def _convert_heading(self, token):
+ level = token.get('level') or token.get('attrs', {}).get('level', 1)
+ header = Header(level)
+ header.add_nodes(self._convert_inline_tokens(token.get('children', [])))
+ return header
+
+ def _convert_list(self, token):
+ ordered = token.get('ordered')
+ if ordered is None:
+ ordered = token.get('attrs', {}).get('ordered', False)
+ list_node = List(bool(ordered))
+ for item in token.get('children', []):
+ list_node.add_node(self._convert_list_item(item))
+ return list_node
+
+ def _convert_block_html(self, token):
+ raw = token.get('raw', '')
+ if _is_block_html(raw):
+ split = _split_legacy_block_html(raw)
+ if split is None:
+ return [Html(raw)]
+ prefix, suffix = split
+ nodes = [Html(prefix)]
+ if suffix and suffix.strip():
+ nodes.extend(self._convert_block_tokens(self._markdown(suffix)))
+ return nodes
+ text = mistune.escape(raw)
+ if text.strip():
+ return [Paragraph([Text(text)])]
+ return []
+
+ def _convert_passthrough_block(self, token):
+ child_nodes = self._convert_block_tokens(token.get('children', []))
+ if child_nodes:
+ return child_nodes
+ raw = token.get('raw') or token.get('text') or ''
+ if raw.strip():
+ return [Paragraph([Text(mistune.escape(raw))])]
+ return []
+
+ def _convert_block_quote(self, token):
+ children = token.get('children', [])
+ if not children:
+ return []
+ content = self._render_inline_children(children)
+ if not content.strip():
+ return []
+ lines = content.splitlines()
+ quoted = '\n'.join([f'> {line}' if line.strip() else '>' for line in lines])
+ return [Paragraph([Text(mistune.escape(quoted))])]
+
+ def _convert_block_code(self, token):
+ raw = token.get('raw') or ''
+ marker = token.get('marker') or '```'
+ fence = marker if marker else '```'
+ content = raw.rstrip('\n')
+ code_block = f'{fence}\n{content}\n{fence}'
+ return [Paragraph([Text(mistune.escape(code_block))])]
+
+ def _render_inline_children(self, children):
+ parts = []
+ for child in children:
+ child_type = child.get('type')
+ if child_type in {'paragraph', 'block_text'}:
+ parts.append(self._flatten_inline_text(child.get('children', [])))
+ else:
+ raw = child.get('raw') or child.get('text') or ''
+ if raw:
+ parts.append(raw)
+ return '\n'.join([part for part in parts if part is not None])
+
+ def _convert_list_item(self, token):
+ item = ListItem()
+ for child in token.get('children', []):
+ child_type = child.get('type')
+ if child_type in {'block_text', 'paragraph'}:
+ item.add_nodes(self._convert_list_block_nodes(child.get('children', [])))
+ elif child_type == 'block_html':
+ item.add_nodes(self._convert_list_item_block_html(child))
+ else:
+ item.add_nodes(self._convert_block_tokens([child]))
+ return item
+
+ def _convert_list_item_block_html(self, token):
+ # In mistune 0.x the list parser does not include the `block_html` rule,
+ # so HTML-like lines inside list items become plain text (not Html nodes).
+ raw = token.get('raw', '') or ''
+ if not raw.strip():
+ return []
+
+ split = _split_legacy_block_html(raw)
+ if split is None:
+ prefix, suffix = raw, ''
+ else:
+ prefix, suffix = split
+
+ nodes = []
+ _append_text(nodes, mistune.escape(prefix))
+ if suffix and suffix.strip():
+ nodes.extend(self._convert_list_item_block_html_text(suffix))
+ return nodes
+
+ def _convert_list_item_block_html_text(self, text: str):
+ nodes = []
+ for child in self._markdown(text):
+ child_type = child.get('type')
+ if child_type in {'block_text', 'paragraph'}:
+ nodes.extend(self._convert_list_block_nodes(child.get('children', [])))
+ elif child_type == 'heading':
+ nodes.append(self._convert_heading(child))
+ elif child_type == 'list':
+ nodes.append(self._convert_list(child))
+ elif child_type == 'list_item':
+ nodes.append(self._convert_list_item(child))
+ elif child_type == 'block_html':
+ nodes.extend(self._convert_list_item_block_html(child))
+ else:
+ raw = child.get('raw') or child.get('text') or ''
+ if raw.strip():
+ _append_text(nodes, mistune.escape(raw))
+ return nodes
+
+ def _convert_inline_tokens(self, tokens: Iterable[dict]):
+ nodes = []
+ buffer = ''
+
+ def flush_buffer():
+ nonlocal buffer
+ if buffer:
+ for part in _split_text_on_legacy_markers(buffer):
+ self._split_reference_links(part, nodes)
+ buffer = ''
+
+ handlers = {
+ 'text': self._handle_inline_text,
+ 'inline_html': self._handle_inline_text,
+ 'block_html': self._handle_inline_text,
+ 'codespan': self._handle_inline_codespan,
+ 'softbreak': self._handle_inline_softbreak,
+ 'linebreak': self._handle_inline_linebreak,
+ 'link': self._handle_inline_link,
+ 'image': self._handle_inline_image,
+ 'strong': self._handle_inline_marker,
+ 'emphasis': self._handle_inline_marker,
+ 'strikethrough': self._handle_inline_marker,
+ }
+
+ for token in tokens:
+ token_type = token.get('type')
+ handler = handlers.get(token_type)
+ if handler:
+ buffer = handler(token, nodes, buffer, flush_buffer)
+ else:
+ buffer = self._handle_inline_other(token, nodes, buffer, flush_buffer)
+
+ flush_buffer()
+ return nodes
+
+ def _handle_inline_text(self, token, nodes, buffer, flush_buffer):
+ raw = token.get('raw', '')
+ buffer += self._reference_definitions.get(raw, raw)
+ return buffer
+
+ def _handle_inline_codespan(self, token, nodes, buffer, flush_buffer):
+ buffer += f"`{token.get('raw') or token.get('text') or ''}`"
+ return buffer
+
+ def _handle_inline_softbreak(self, token, nodes, buffer, flush_buffer):
+ buffer += ' '
+ return buffer
+
+ def _handle_inline_linebreak(self, token, nodes, buffer, flush_buffer):
+ flush_buffer()
+ nodes.append(NewLine())
+ return ''
+
+ def _handle_inline_link(self, token, nodes, buffer, flush_buffer):
+ flush_buffer()
+ text = self._flatten_inline_text(token.get('children', []))
+ attrs = token.get('attrs', {})
+ url = _unquote_url_if_template(attrs.get('url', ''))
+ title = attrs.get('title')
+ nodes.append(Link(_format_link_markup(text, url, title)))
+ return ''
+
+ def _handle_inline_image(self, token, nodes, buffer, flush_buffer):
+ flush_buffer()
+ alt = token.get('attrs', {}).get('alt') or self._flatten_inline_text(token.get('children', []))
+ attrs = token.get('attrs', {})
+ url = _unquote_url_if_template(attrs.get('url', ''))
+ title = attrs.get('title')
+ nodes.append(Image(_format_image_markup(alt, url, title)))
+ return ''
+
+ def _handle_inline_marker(self, token, nodes, buffer, flush_buffer):
+ flush_buffer()
+ marker = _INLINE_MARKERS[token.get('type')]
+ _append_text(nodes, marker)
+ children = token.get('children', [])
+ if children:
+ nodes.extend(self._convert_inline_tokens(children))
+ _append_text(nodes, marker)
+ return ''
+
+ def _handle_inline_other(self, token, nodes, buffer, flush_buffer):
+ flush_buffer()
+ children = token.get('children', [])
+ if children:
+ nodes.extend(self._convert_inline_tokens(children))
+ else:
+ raw = token.get('raw') or token.get('text') or ''
+ if raw.strip():
+ _append_text(nodes, mistune.escape(raw))
+ return ''
+
+ def _flatten_inline_text(self, tokens: Iterable[dict]):
+ parts = []
+ for token in tokens:
+ token_type = token.get('type')
+ if token_type in {'text', 'inline_html', 'block_html'}:
+ raw = token.get('raw') or token.get('text') or ''
+ parts.append(self._reference_definitions.get(raw, raw))
+ elif token_type == 'codespan':
+ parts.append(f"`{token.get('raw') or token.get('text') or ''}`")
+ elif token_type in _INLINE_MARKERS:
+ marker = _INLINE_MARKERS[token_type]
+ inner = self._flatten_inline_text(token.get('children', []))
+ parts.append(f'{marker}{inner}{marker}')
+ elif token_type in {'linebreak', 'softbreak'}:
+ parts.append(' ')
+ else:
+ children = token.get('children', [])
+ if children:
+ parts.append(self._flatten_inline_text(children))
+ else:
+ parts.append(token.get('raw') or token.get('text') or '')
+ return ''.join(parts).strip()
+
+ def _convert_paragraph_or_heading(self, inline_tokens: Iterable[dict]):
+ ref_text = self._reference_definition_text(inline_tokens)
+ if ref_text is not None:
+ return Paragraph([Text(ref_text)])
+ heading = self._heading_from_inline(inline_tokens)
+ if heading:
+ return heading
+ return Paragraph(self._convert_inline_tokens(inline_tokens))
+
+ def _convert_paragraph_token(self, inline_tokens: Iterable[dict]):
+ ref_text = self._reference_definition_text(inline_tokens)
+ if ref_text is not None:
+ return [Paragraph([Text(ref_text)])]
+ heading = self._heading_from_inline(inline_tokens)
+ if heading:
+ return [heading]
+
+ split = self._split_paragraph_inline_on_fence(inline_tokens)
+ if split is not None:
+ nodes = []
+ for part in split:
+ children = self._convert_inline_tokens(part)
+ if children:
+ nodes.append(Paragraph(children))
+ if nodes:
+ return nodes
+
+ return [Paragraph(self._convert_inline_tokens(inline_tokens))]
+
+ def _split_paragraph_inline_on_fence(self, inline_tokens: Iterable[dict]):
+ # Legacy mistune 0.x breaks paragraphs when it encounters a fence-only marker
+ # line (``` / ~~~), even though we treat fences as plain text blocks.
+ if not inline_tokens:
+ return None
+
+ lines = [[]]
+ seps = []
+ for token in inline_tokens:
+ token_type = token.get('type')
+ if token_type in {'softbreak', 'linebreak'}:
+ seps.append(token)
+ lines.append([])
+ else:
+ lines[-1].append(token)
+
+ if len(lines) <= 1:
+ return None
+
+ line_texts = [self._flatten_inline_markup(line) for line in lines]
+
+ def fence_marker(tokens):
+ raw = self._flatten_inline_markup(tokens).strip()
+ match = _FENCE_ONLY_LINE_RE.match(raw)
+ if match is None:
+ return None
+ return match.group(1)
+
+ if fence_marker(lines[0]) is not None:
+ return None
+
+ split_idx = None
+ for idx in range(1, len(lines)):
+ marker = fence_marker(lines[idx])
+ if marker is None:
+ continue
+ # Only split when this fence line begins a complete fence block according
+ # to mistune 0.x's `fences` regex. This avoids breaking on sequences like
+ # ```\n``` which mistune 0.x does not treat as a fence block (no content).
+ tail = '\n'.join(line_texts[idx:])
+ if _MISTUNE08_FENCE_BLOCK_RE.match(tail):
+ split_idx = idx
+ break
+
+ if split_idx is None:
+ return None
+
+ first = []
+ for idx, line in enumerate(lines[:split_idx]):
+ first.extend(line)
+ if idx < split_idx - 1:
+ first.append(seps[idx])
+
+ second = []
+ for line_idx in range(split_idx, len(lines)):
+ second.extend(lines[line_idx])
+ if line_idx < len(lines) - 1:
+ second.append(seps[line_idx])
+
+ parts = []
+ if first:
+ parts.append(first)
+ if second:
+ parts.append(second)
+ return parts if len(parts) > 1 else None
+
+ def _convert_list_block_nodes(self, inline_tokens: Iterable[dict]):
+ text = self._flatten_inline_markup(inline_tokens, softbreak_as_newline=True)
+ if not text or not text.strip():
+ return []
+
+ nodes = []
+ for line in text.splitlines():
+ if not line.strip():
+ continue
+
+ ref_text = self._reference_definitions.get(line)
+ if ref_text is not None:
+ nodes.append(Text(ref_text))
+ continue
+
+ heading = self._heading_from_inline([{'type': 'text', 'raw': line}])
+ if heading:
+ nodes.append(heading)
+ continue
+
+ nodes.append(Text(mistune.escape(line)))
+
+ return nodes
+
+ def _flatten_inline_markup(self, tokens: Iterable[dict], *, softbreak_as_newline: bool = False):
+ parts = []
+ for token in tokens:
+ token_type = token.get('type')
+ if token_type in {'text', 'inline_html', 'block_html'}:
+ raw = token.get('raw') or token.get('text') or ''
+ parts.append(self._reference_definitions.get(raw, raw))
+ elif token_type == 'link':
+ label = self._flatten_inline_markup(
+ token.get('children', []),
+ softbreak_as_newline=softbreak_as_newline,
+ )
+ attrs = token.get('attrs', {})
+ url = _unquote_url_if_template(attrs.get('url', ''))
+ title = attrs.get('title')
+ parts.append(_format_link_markup(label, url, title))
+ elif token_type == 'image':
+ alt = token.get('attrs', {}).get('alt') or self._flatten_inline_markup(
+ token.get('children', []),
+ softbreak_as_newline=softbreak_as_newline,
+ )
+ attrs = token.get('attrs', {})
+ url = _unquote_url_if_template(attrs.get('url', ''))
+ title = attrs.get('title')
+ parts.append(_format_image_markup(alt, url, title))
+ elif token_type == 'softbreak':
+ parts.append('\n' if softbreak_as_newline else ' ')
+ elif token_type == 'linebreak':
+ parts.append('\n')
+ elif token_type == 'codespan':
+ parts.append(f"`{token.get('raw') or token.get('text') or ''}`")
+ elif token_type in _INLINE_MARKERS:
+ marker = _INLINE_MARKERS[token_type]
+ inner = self._flatten_inline_markup(
+ token.get('children', []),
+ softbreak_as_newline=softbreak_as_newline,
+ )
+ parts.append(f'{marker}{inner}{marker}')
+ else:
+ children = token.get('children', [])
+ if children:
+ parts.append(self._flatten_inline_markup(children, softbreak_as_newline=softbreak_as_newline))
+ else:
+ parts.append(token.get('raw') or token.get('text') or '')
+ return ''.join(parts)
+
+ def _heading_from_inline(self, inline_tokens: Iterable[dict]):
+ if len(inline_tokens) != 1:
+ return None
+ token = inline_tokens[0]
+ if token.get('type') != 'text':
+ return None
+ raw = token.get('raw', '')
+ match = _HEADING_LINE_RE.match(raw)
+ if not match:
+ return None
+ level = len(match.group(2))
+ content = raw[match.end(2):].lstrip()
+ heading_tokens = self._markdown(f"{'#' * level} {content}")
+ if heading_tokens and heading_tokens[0].get('type') == 'heading':
+ children = heading_tokens[0].get('children', [])
+ else:
+ children = [{'type': 'text', 'raw': content}]
+ header = Header(level)
+ header.add_nodes(self._convert_inline_tokens(children))
+ return header
+
+ def _reference_definition_text(self, inline_tokens: Iterable[dict]):
+ if len(inline_tokens) != 1:
+ return None
+ token = inline_tokens[0]
+ if token.get('type') != 'text':
+ return None
+ raw = token.get('raw', '')
+ return self._reference_definitions.get(raw)
+
+ def _split_reference_links(self, raw: str, nodes):
+ last = 0
+ for match in _REF_LINK_OR_IMAGE_RE.finditer(raw):
+ if match.start() > last:
+ _append_text(nodes, mistune.escape(raw[last:match.start()]))
+ snippet = match.group(0)
+ if snippet.startswith('!['):
+ nodes.append(Image(snippet))
+ else:
+ nodes.append(Link(snippet))
+ last = match.end()
+ if last < len(raw):
+ _append_text(nodes, mistune.escape(raw[last:]))
+ return nodes
class ZendeskHelpMdParser(MdParser):
- TAG_CONTENT_GROUP = 'tag_content'
-    TAG_PATTERN = r'^\s*(<{tag_name}{attr_re}>(?P<%s>[\s\S]+?)</{tag_name}>)\s*$' % TAG_CONTENT_GROUP
- CALLOUT_STYLE_GROUP = 'style'
- CALLOUT_ATTR_PATTERN = r'( (?P<%s>green|red|yellow))*' % CALLOUT_STYLE_GROUP
+    _CALLOUT_PATTERN_MIN = re.compile(r'(?sm)^[ \t]*<callout(?P<attrs>[^>]*)>(?P<content>.*?)</callout>')
+    _CALLOUT_PATTERN_MAX = re.compile(r'(?sm)^[ \t]*<callout(?P<attrs>[^>]*)>(?P<content>.*)</callout>')
+    _STEPS_PATTERN_MIN = re.compile(r'(?sm)^[ \t]*<steps>(?P<content>.*?)</steps>')
+    _STEPS_PATTERN_MAX = re.compile(r'(?sm)^[ \t]*<steps>(?P<content>.*)</steps>')
+    _TABS_PATTERN_MIN = re.compile(r'(?sm)^[ \t]*<tabs>(?P<content>.*?)</tabs>')
+    _TABS_PATTERN_MAX = re.compile(r'(?sm)^[ \t]*<tabs>(?P<content>.*)</tabs>')
+
+ def parse(self, text, rules=None):
+ """Parse Markdown with Zendesk tag support into a list of Node objects."""
+ nodes = self._parse_nodes(text)
+ return nodes
+
+ def _parse_nodes(self, text: str):
+ nodes = []
+ remaining = text
+ while remaining:
+ tag_name = None
+ match = None
+ search_at = 0
+ while True:
+ tag_name, match = self._find_next_tag(remaining, start_at=search_at)
+ if not match:
+ break
+ absolute_start = (len(text) - len(remaining)) + match.start()
+ if _is_inside_list_block(text, absolute_start):
+ # The legacy mistune 0.x list parser treats block-level content
+ # lazily; Zendesk tags that appear inside list items become plain
+ # text and are not recognized structurally. Avoid splitting the
+ # input at such tags, since that would terminate the list early.
+ search_at = match.start() + 1
+ continue
+ break
+ if not match:
+ nodes.extend(self._parse_markdown(_normalize_block_indentation(remaining)))
+ break
+
+ if match.start() > 0:
+ prefix = remaining[:match.start()]
+ nodes.extend(self._parse_markdown(_normalize_block_indentation(prefix)))
+
+ # The legacy parser only recognizes Zendesk tags when they consume the
+ # remainder of the current parsing slice (it uses `\\s*$` in the rule
+ # regex). Because of this, it will also match *across* multiple tag
+ # blocks of the same kind if the last closing tag is at the end.
+ #
+ # We emulate this by preferring a greedy match when it is terminal.
+ terminal_match = None
+ tail = remaining[match.start():]
+ if tag_name == 'callout':
+ m2 = self._CALLOUT_PATTERN_MAX.match(tail)
+ elif tag_name == 'steps':
+ m2 = self._STEPS_PATTERN_MAX.match(tail)
+ else:
+ m2 = self._TABS_PATTERN_MAX.match(tail)
+ if m2 is not None and not tail[m2.end():].strip():
+ terminal_match = m2
+
+ if terminal_match is None:
+ # Non-terminal: treat the first (minimal) tag block as opaque HTML.
+ nodes.append(Html(match.group(0)))
+ remaining = remaining[match.end():]
+ continue
+
+ content = terminal_match.group('content')
+ trailing = tail[terminal_match.end():]
+
+ if tag_name == 'callout':
+ attrs = (terminal_match.group('attrs') or '').strip()
+ styles = [part for part in attrs.split() if part]
+ if not styles:
+ node = ZendeskHelpCallout(None)
+ elif len(styles) == 1 and styles[0] in {'green', 'red', 'yellow'}:
+ node = ZendeskHelpCallout(styles[0])
+ else:
+ # Invalid callout attrs: legacy parser does not treat this as a
+ # Zendesk callout block. Keep the first (minimal) tag as opaque
+ # HTML and continue parsing the remaining text.
+ nodes.append(Html(match.group(0)))
+ remaining = remaining[match.end():]
+ continue
+ elif tag_name == 'steps':
+ node = ZendeskHelpSteps()
+ else:
+ node = ZendeskHelpTabs()
+
+ node.add_nodes(self._parse_nodes(content))
+ nodes.append(node)
+
+ remaining = trailing
+ return nodes
+
+ def _find_next_tag(self, text: str, start_at: int = 0):
+ best = None
+ for name, pattern in (
+ ('callout', self._CALLOUT_PATTERN_MIN),
+ ('steps', self._STEPS_PATTERN_MIN),
+ ('tabs', self._TABS_PATTERN_MIN),
+ ):
+ for match in pattern.finditer(text, start_at):
+ candidate = (match.start(), name, match)
+ if best is None or candidate[0] < best[0]:
+ best = candidate
+ break
+
+ if best is None:
+ return None, None
+ _, name, match = best
+ return name, match
+
+ def _parse_markdown(self, text: str):
+ normalized = _remove_spaces_from_empty_lines(text)
+ normalized = _remove_ltr_rtl_marks(normalized)
+ return self._convert_block_tokens(self._markdown(normalized))
+
+
+def _append_text(nodes, text):
+ if not text or not text.strip():
+ return
+ nodes.append(Text(text))
- def __init__(self):
- super().__init__()
- self.grammar_class.callout = re.compile(self.TAG_PATTERN.format(tag_name='callout',
- attr_re=self.CALLOUT_ATTR_PATTERN))
- self.default_rules.insert(0, 'callout')
- self.grammar_class.steps = re.compile(self.TAG_PATTERN.format(tag_name='steps', attr_re=''))
- self.default_rules.insert(0, 'steps')
+def _split_text_on_legacy_markers(raw: str) -> list[str]:
+ """Split text into segments similar to mistune 0.x inline text tokenization.
- self.grammar_class.tabs = re.compile(self.TAG_PATTERN.format(tag_name='tabs', attr_re=''))
- self.default_rules.insert(0, 'tabs')
+ The legacy parser splits text at backticks and tildes (it stops before those
+ markers and then consumes them as separate text tokens). This matters for our
+ structural tree because each segment becomes its own Text node.
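+
+    Example (illustrative): 'a `b` c' splits into ['a ', '`b', '` c'].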
+ """
+ if not raw:
+ return []
+ markers = ('`', '~')
+ out = []
+ i = 0
+ n = len(raw)
+ while i < n:
+ j = n
+ for m in markers:
+ pos = raw.find(m, i + 1)
+ if pos != -1 and pos < j:
+ j = pos
+ out.append(raw[i:j])
+ i = j
+ return out
- def parse_callout(self, m: Match[str]) -> None:
- style = m.group(self.CALLOUT_STYLE_GROUP)
- self._parse_nested(ZendeskHelpCallout(style), m)
- def parse_steps(self, m: Match[str]) -> None:
- self._parse_nested(ZendeskHelpSteps(), m)
+def _format_title(title: str) -> str:
+ if title is None:
+ return ''
+ escaped = title.replace('"', '\\"')
+ return f' "{escaped}"'
- def parse_tabs(self, m: Match[str]) -> None:
- self._parse_nested(ZendeskHelpTabs(), m)
- def _parse_nested(self, node: Node, m: Match[str]) -> None:
- nested_content = m.group(self.TAG_CONTENT_GROUP)
- nested_nodes = self.get_lexer().parse(nested_content)
- node.add_nodes(nested_nodes)
- self.tokens.append(node)
+def _unquote_url_if_template(url: str) -> str:
+ """Undo Mistune's percent-encoding for template-like URLs.
+
+ Mistune percent-encodes some characters in URLs (e.g. `{{url}}` becomes `%7B%7Burl%7D%7D`).
+ For structural diffs we don't care about URL contents, but we do want rendered markup to remain
+ readable and close to the original input.
+ """
+ if not url or '%' not in url:
+ return url
+ unquoted = unquote(url)
+ if unquoted != url and ('{' in unquoted or '}' in unquoted):
+ return unquoted
+ return url
+
+
+def _format_link_markup(text: str, url: str, title: str | None) -> str:
+ return f'[{text}]({url}{_format_title(title)})'
+
+
+def _format_image_markup(alt: str, url: str, title: str | None) -> str:
+    return f'![{alt}]({url}{_format_title(title)})'
+
+
+def _is_block_html(raw: str) -> bool:
+    stripped = raw.lstrip()
+    if stripped.startswith('<!--'):
+        return True
+    return _LEGACY_BLOCK_HTML_RE.match(raw) is not None
diff --git a/tests/test_coverage_parser.py b/tests/test_coverage_parser.py
new file mode 100644
--- /dev/null
+++ b/tests/test_coverage_parser.py
@@ -0,0 +1,320 @@
+from unittest import TestCase
+
+from sdiff import MdParser, parser
+from sdiff.model import List, ListItem, Root, Text
+from sdiff.renderer import TextRenderer
+
+
+class TestCoverageParserHelpers(TestCase):
+    def test_is_block_html(self):
+        self.assertTrue(parser._is_block_html("<!-- comment -->"))
+        self.assertFalse(parser._is_block_html("<em>text</em>"))
+        self.assertTrue(parser._is_block_html("<div>text</div>"))
+        self.assertFalse(parser._is_block_html("nope"))
+
+ def test_normalize_block_indentation(self):
+ # Only non-HTML lines should be considered for min-indent normalization.
+        raw = "  <div>\n    x\n  </div>\n   y"
+ normalized = parser._normalize_block_indentation(raw)
+ self.assertIn("y", normalized)
+
+ def test_extract_reference_definitions_fence_special_case(self):
+ raw = "[id]: https://example.com\n```\n\n```"
+ text, defs = parser._extract_reference_definitions(raw)
+ self.assertEqual(1, len(defs))
+ # The special-case inserts a blank line after the placeholder.
+ self.assertTrue(text.startswith("SDIFF_REF_DEF_0\n\n"))
+
+ def test_extract_reference_definitions_fence_special_case_not_triggered_without_blank_line(self):
+ raw = "[id]: https://example.com\n```\n```"
+ text, defs = parser._extract_reference_definitions(raw)
+ self.assertEqual(1, len(defs))
+ self.assertEqual("SDIFF_REF_DEF_0\n```\n```", text)
+
+ def test_is_inside_fenced_block(self):
+ raw = "```\ncode\n```\noutside"
+ # Offset inside "code".
+ self.assertTrue(parser._is_inside_fenced_block(raw, raw.index("code")))
+ # Offset inside "outside".
+ self.assertFalse(parser._is_inside_fenced_block(raw, raw.index("outside")))
+ # Offset past end => fall through.
+ self.assertFalse(parser._is_inside_fenced_block(raw, len(raw) + 1))
+
+ def test_is_inside_list_block(self):
+ raw = "- a\n b\n\nc"
+ self.assertTrue(parser._is_inside_list_block(raw, raw.index("b")))
+ self.assertFalse(parser._is_inside_list_block(raw, raw.index("c")))
+ # Offset past end => fall through.
+ self.assertFalse(parser._is_inside_list_block(raw, len(raw) + 1))
+
+ def test_normalize_consecutive_fence_lines(self):
+ raw = "```\n```\ntext"
+ normalized = parser._normalize_consecutive_fence_lines(raw)
+ self.assertIn("```\n\n```", normalized)
+
+ def test_normalize_consecutive_blockquote_lines(self):
+ raw = "> a\n> b\nc"
+ normalized = parser._normalize_consecutive_blockquote_lines(raw)
+ self.assertIn("> a\n\n> b", normalized)
+
+ def test_normalize_fence_only_lines_start_new_paragraphs(self):
+ raw = "a\n```\nb"
+ normalized = parser._normalize_fence_only_lines_start_new_paragraphs(raw)
+ self.assertIn("a\n\n```", normalized)
+ # Blank line resets state.
+ normalized = parser._normalize_fence_only_lines_start_new_paragraphs("a\n\n```\n\n```")
+ self.assertIn("\n\n```\n\n```", normalized)
+
+ def test_normalize_double_blank_line_list_nesting_does_not_overindent(self):
+ raw = "* a\n\n\n * b\n"
+ normalized = parser._normalize_double_blank_line_list_nesting(raw)
+ self.assertEqual(raw, normalized)
+
+ def test_merge_adjacent_lists(self):
+ l1 = List(False, [ListItem([Text("a")])])
+ l2 = List(True, [ListItem([Text("b")])])
+ root = Root([l1, l2])
+ merged = parser._merge_adjacent_lists(root.nodes)
+ self.assertEqual(1, len(merged))
+ self.assertEqual(2, len(merged[0].nodes))
+
+ def test_parse_passthrough_when_parser_returns_non_list(self):
+ class _Dummy(MdParser):
+ def parse(self, text, rules=None): # noqa: ANN001
+ return Root([Text("x")])
+
+ parsed = parser.parse("x", parser_cls=_Dummy)
+ self.assertIsInstance(parsed, Root)
+
+
+class TestCoverageParserConversions(TestCase):
+ def setUp(self) -> None:
+ super().setUp()
+ self.p = MdParser()
+
+ def test_convert_block_token_branches(self):
+ item = self.p._convert_block_token(
+ {
+ "type": "list_item",
+ "children": [{"type": "paragraph", "children": [{"type": "text", "raw": "x"}]}],
+ }
+ )[0]
+ self.assertEqual("list-item", item.name)
+
+ block_text = self.p._convert_block_token({"type": "block_text", "children": [{"type": "text", "raw": "x"}]})[0]
+ self.assertEqual("paragraph", block_text.name)
+
+ quote = self.p._convert_block_token(
+ {"type": "block_quote", "children": [{"type": "paragraph", "children": [{"type": "text", "raw": "q"}]}]}
+ )[0]
+ self.assertEqual("paragraph", quote.name)
+ self.assertIn(">", quote.nodes[0].text)
+
+ code = self.p._convert_block_token({"type": "block_code", "raw": "code\n", "marker": "```"})[0]
+ self.assertTrue(code.nodes[0].text.startswith("```"))
+
+ def test_convert_list_ordered_attr_fallback(self):
+ lst = self.p._convert_list({"type": "list", "attrs": {"ordered": True}, "children": []})
+ self.assertTrue(lst.ordered)
+
+ def test_convert_block_html_with_suffix(self):
+        token = {"type": "block_html", "raw": "<div>hi</div>\n\ntext"}
+ nodes = self.p._convert_block_html(token)
+ self.assertEqual("html", nodes[0].name)
+ self.assertEqual("paragraph", nodes[1].name)
+
+ # Split happens, but suffix is whitespace-only => no extra nodes.
+        token = {"type": "block_html", "raw": "<div>hi</div>\n\n  "}
+ nodes = self.p._convert_block_html(token)
+ self.assertEqual(1, len(nodes))
+
+ # Whitespace-only raw => empty conversion.
+ self.assertEqual([], self.p._convert_block_html({"type": "block_html", "raw": " "}))
+
+ def test_convert_passthrough_block_children_and_raw(self):
+ out = self.p._convert_passthrough_block(
+ {"type": "unknown", "children": [{"type": "paragraph", "children": [{"type": "text", "raw": "x"}]}]}
+ )
+ self.assertEqual("paragraph", out[0].name)
+ out2 = self.p._convert_passthrough_block({"type": "unknown", "raw": "raw"})
+ self.assertEqual("paragraph", out2[0].name)
+
+ def test_convert_block_quote_early_returns(self):
+ self.assertEqual([], self.p._convert_block_quote({"type": "block_quote", "children": []}))
+ self.assertEqual(
+ [],
+ self.p._convert_block_quote({"type": "block_quote", "children": [{"type": "paragraph", "children": []}]}),
+ )
+
+ def test_render_inline_children_unknown_child_type(self):
+ out = self.p._render_inline_children([{"type": "thematic_break", "raw": "---"}])
+ self.assertEqual("---", out)
+
+ def test_inline_other_and_codespan_text_fallback(self):
+        tokens = [{"type": "codespan", "text": "x"}, {"type": "unknown", "raw": "<x>"}]
+        out = self.p._convert_inline_tokens(tokens)
+        self.assertEqual("`x`&lt;x&gt;", "".join(node.text for node in out))
+
+ def test_inline_marker_without_children_and_inline_other_with_children(self):
+ out = self.p._convert_inline_tokens([{"type": "strong", "children": []}])
+ self.assertEqual(["text", "text"], [n.name for n in out])
+
+ out = self.p._convert_inline_tokens([{"type": "unknown", "children": [{"type": "text", "raw": "x"}]}])
+ self.assertEqual("x", out[0].text)
+
+ out = self.p._convert_inline_tokens([{"type": "unknown", "raw": " "}])
+ self.assertEqual([], out)
+
+ def test_flatten_inline_text_unknown_branches(self):
+ text = self.p._flatten_inline_text(
+ [
+ {"type": "codespan", "raw": "x"},
+ {"type": "unknown", "children": [{"type": "text", "raw": "y"}]},
+ {"type": "unknown", "raw": "z"},
+ ]
+ )
+ self.assertIn("`x`", text)
+ self.assertTrue(text.endswith("z"))
+
+ def test_flatten_inline_markup_link_and_image(self):
+ tokens = [
+ {"type": "text", "raw": "a"},
+ {"type": "softbreak"},
+ {"type": "link", "children": [{"type": "text", "raw": "L"}], "attrs": {"url": "%7B%7Burl%7D%7D"}},
+ {"type": "softbreak"},
+ {"type": "image", "children": [{"type": "text", "raw": "A"}], "attrs": {"url": "u", "title": 't"'}},
+ ]
+ s = self.p._flatten_inline_markup(tokens, softbreak_as_newline=True)
+ self.assertIn("[L]({{url}})", s)
+        self.assertIn('![A](u "t\\"")', s)
+
+ def test_flatten_inline_markup_unknown_branches(self):
+ tokens = [
+ {"type": "unknown", "children": [{"type": "text", "raw": "x"}]},
+ {"type": "unknown", "raw": "y"},
+ ]
+ s = self.p._flatten_inline_markup(tokens)
+ self.assertEqual("xy", s)
+
+ def test_convert_list_block_nodes_ref_heading_and_text(self):
+ self.p._set_reference_definitions(
+ {
+ "SDIFF_REF_DEF_0": "[id]: https://example.com",
+ "[id]: https://example.com": "[id]: https://example.com",
+ }
+ )
+ tokens = [
+ {"type": "text", "raw": "SDIFF_REF_DEF_0"},
+ {"type": "softbreak"},
+ {"type": "text", "raw": "###header"},
+ {"type": "softbreak"},
+ {"type": "text", "raw": " "},
+ {"type": "softbreak"},
+ {"type": "text", "raw": "plain"},
+ ]
+ nodes = self.p._convert_list_block_nodes(tokens)
+ self.assertEqual(["text", "header", "text"], [n.name for n in nodes])
+
+ def test_convert_list_block_nodes_empty(self):
+ self.assertEqual([], self.p._convert_list_block_nodes([]))
+
+ def test_heading_from_inline_fallback_branch(self):
+ class _NoHeading(MdParser):
+ def __init__(self):
+ super().__init__()
+ self._markdown = lambda _: [{"type": "paragraph", "children": []}] # noqa: E731
+
+ p = _NoHeading()
+ heading = p._heading_from_inline([{"type": "text", "raw": "###header"}])
+ self.assertEqual("header", heading.name)
+ self.assertEqual("text", heading.nodes[0].name)
+
+ def test_convert_paragraph_or_heading_ref_and_heading(self):
+ self.p._set_reference_definitions({"SDIFF_REF_DEF_0": "[id]: https://example.com"})
+ node = self.p._convert_paragraph_or_heading([{"type": "text", "raw": "SDIFF_REF_DEF_0"}])
+ self.assertEqual("paragraph", node.name)
+
+ node = self.p._convert_paragraph_or_heading([{"type": "text", "raw": "###header"}])
+ self.assertEqual("header", node.name)
+
+ node = self.p._convert_paragraph_token([{"type": "text", "raw": "###header"}])[0]
+ self.assertEqual("header", node.name)
+
+ def test_split_paragraph_inline_on_fence_variants(self):
+ self.assertIsNone(self.p._split_paragraph_inline_on_fence([]))
+ self.assertIsNone(self.p._split_paragraph_inline_on_fence([{"type": "text", "raw": "x"}]))
+
+ # First line is a fence-only marker => do not split.
+ tokens = [{"type": "text", "raw": "```"}, {"type": "softbreak"}, {"type": "text", "raw": "x"}]
+ self.assertIsNone(self.p._split_paragraph_inline_on_fence(tokens))
+
+ # Tail is fence markers but not a complete fence block => do not split.
+ tokens = [
+ {"type": "text", "raw": "a"},
+ {"type": "softbreak"},
+ {"type": "text", "raw": "```"},
+ {"type": "softbreak"},
+ {"type": "text", "raw": "```"},
+ ]
+ self.assertIsNone(self.p._split_paragraph_inline_on_fence(tokens))
+
+ # Complete fence block tail => split.
+ tokens = [
+ {"type": "text", "raw": "a"},
+ {"type": "softbreak"},
+ {"type": "text", "raw": "```"},
+ {"type": "softbreak"},
+ {"type": "text", "raw": "code"},
+ {"type": "softbreak"},
+ {"type": "text", "raw": "```"},
+ ]
+ parts = self.p._split_paragraph_inline_on_fence(tokens)
+ self.assertEqual(2, len(parts))
+
+ nodes = self.p._convert_paragraph_token(tokens)
+ self.assertEqual(2, len(nodes))
+
+ def test_split_paragraph_inline_on_fence_first_part_includes_seps(self):
+ tokens = [
+ {"type": "text", "raw": "a"},
+ {"type": "softbreak"},
+ {"type": "text", "raw": "b"},
+ {"type": "softbreak"},
+ {"type": "text", "raw": "```"},
+ {"type": "softbreak"},
+ {"type": "text", "raw": "code"},
+ {"type": "softbreak"},
+ {"type": "text", "raw": "```"},
+ ]
+ parts = self.p._split_paragraph_inline_on_fence(tokens)
+ self.assertEqual(2, len(parts))
+
+ def test_convert_list_item_block_html_text_smoke(self):
+ # Exercise conversion of text following a (hypothetical) HTML block inside a list item.
+ nodes = self.p._convert_list_item_block_html_text("text\n\n# h\n\n- a\n")
+ self.assertTrue(any(n.name == "header" for n in nodes))
+ self.assertTrue(any(n.name == "list" for n in nodes))
+
+ def test_convert_list_item_with_block_html_child(self):
+ token = {
+ "type": "list_item",
+ "children": [
+                {"type": "block_html", "raw": "<div>hi</div>"},
+ ],
+ }
+ item = self.p._convert_list_item(token)
+ self.assertTrue(item.nodes)
+
+ def test_convert_list_item_block_html_variants(self):
+ self.assertEqual([], self.p._convert_list_item_block_html({"type": "block_html", "raw": " "}))
+
+ nodes = self.p._convert_list_item_block_html({"type": "block_html", "raw": "not html\n"})
+ self.assertTrue(nodes)
+
+        nodes = self.p._convert_list_item_block_html({"type": "block_html", "raw": "<div>hi</div>\n\n  "})
+ self.assertTrue(nodes)
+
+ def test_convert_list_item_block_html_text_with_block_html_and_raw(self):
+        nodes = self.p._convert_list_item_block_html_text("<div>hi</div>\n\n---\n")
+ self.assertTrue(any(n.name == "text" for n in nodes))
+
+ def test_convert_list_item_block_html_smoke(self):
+        token = {"type": "block_html", "raw": "<div>hi</div>\n\ntext"}
+ nodes = self.p._convert_list_item_block_html(token)
+ self.assertTrue(any(isinstance(n, Text) for n in nodes))
+
+ def test_rendering_roundtrip_smoke(self):
+ md = "some text [link](url) new text"
+ tree = parser.parse(md, parser_cls=MdParser)
+ self.assertEqual(md, TextRenderer().render(tree))
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 498c070..965e8f5 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -1,6 +1,7 @@
from unittest import TestCase
from sdiff import parser, MdParser, ZendeskHelpMdParser
-from sdiff.model import ZendeskHelpSteps
+from sdiff.model import Paragraph, Root, Text, ZendeskHelpSteps
+from sdiff.renderer import TextRenderer
class ParserTestCase(TestCase):
@@ -28,9 +29,13 @@ def test_header_in_list(self):
def test_link(self):
self._run_and_assert('[link](url)', 'pa')
+ actual = self._parse('[link](url)')
+ self.assertEqual('[link](url)', actual.nodes[0].nodes[0].text)
def test_image(self):
self._run_and_assert('![Alt text][url/to/image]', 'pi')
+ actual = self._parse('![Alt text][url/to/image]')
+ self.assertEqual('![Alt text][url/to/image]', actual.nodes[0].nodes[0].text)
def test_broken_link_space(self):
self._run_and_assert('[link] (http://www.google.com)', 'pt')
@@ -65,9 +70,154 @@ def test_heading_text(self):
actual = self._parse('### heading')
self.assertEqual('heading', actual.nodes[0].nodes[0].text)
+ def test_heading_without_space_followed_by_text_parses_as_header(self):
+ actual = self._parse('##Heading\ntext')
+ self.assertEqual('2tpt', actual.print_all())
+
+ def test_heading_without_space_with_link_parses_as_header(self):
+ actual = self._parse('##[Verify email]({{url}})\ntext')
+ self.assertEqual('header', actual.nodes[0].name)
+ self.assertEqual(2, actual.nodes[0].level)
+ self.assertEqual('link', actual.nodes[0].nodes[0].name)
+ self.assertEqual('[Verify email]({{url}})', actual.nodes[0].nodes[0].text)
+
+ def test_heading_without_space_in_list_item_followed_by_text(self):
+ actual = self._parse('1. ##Heading\n text')
+ self.assertEqual('lm2tt', actual.print_all())
+
def test_link_wrapped_in_text(self):
self._run_and_assert('some text [link](url) new text', 'ptat')
+ def test_link_with_trailing_text_does_not_duplicate_buffer(self):
+ actual = self._parse('some text [link](url) new text')
+ paragraph = actual.nodes[0]
+ self.assertEqual(['text', 'link', 'text'], [node.name for node in paragraph.nodes])
+ self.assertEqual('some text ', paragraph.nodes[0].text)
+ self.assertEqual('[link](url)', paragraph.nodes[1].text)
+ self.assertEqual(' new text', paragraph.nodes[2].text)
+
+ def test_image_with_trailing_text_does_not_duplicate_buffer(self):
+        actual = self._parse('some ![A](u) new')
+ paragraph = actual.nodes[0]
+ self.assertEqual(['text', 'image', 'text'], [node.name for node in paragraph.nodes])
+ self.assertEqual('some ', paragraph.nodes[0].text)
+        self.assertEqual('![A](u)', paragraph.nodes[1].text)
+ self.assertEqual(' new', paragraph.nodes[2].text)
+
+ def test_inline_marker_does_not_duplicate_buffer(self):
+ actual = self._parse('some **bold** text')
+ self.assertEqual('some **bold** text', TextRenderer().render(actual))
+
+ def test_inline_linebreak_does_not_duplicate_buffer(self):
+ actual = self._parse('a\\\nb')
+ paragraph = actual.nodes[0]
+ self.assertEqual(['text', 'new-line', 'text'], [node.name for node in paragraph.nodes])
+ self.assertEqual('a', paragraph.nodes[0].text)
+ self.assertEqual('b', paragraph.nodes[2].text)
+
+ def test_text_before_link_not_duplicated(self):
+ actual = self._parse('some text and [link](url)')
+ paragraph = actual.nodes[0]
+ self.assertEqual(['text', 'link'], [node.name for node in paragraph.nodes])
+ self.assertEqual(['some text and '], [node.text for node in paragraph.nodes if node.name == 'text'])
+
+ def test_link_label_with_codespan(self):
+ actual = self._parse('[use `foo`](url)')
+ self.assertEqual('[use `foo`](url)', actual.nodes[0].nodes[0].text)
+
+ def test_link_label_with_strong_preserves_markers(self):
+ actual = self._parse('[**bold**](url)')
+ self.assertEqual('[**bold**](url)', actual.nodes[0].nodes[0].text)
+
+ def test_link_title_preserved(self):
+ actual = self._parse('[label](https://example.com "Title Here")')
+ self.assertEqual('[label](https://example.com "Title Here")', actual.nodes[0].nodes[0].text)
+
+ def test_image_title_preserved(self):
+        actual = self._parse('![alt](https://example.com "Title Here")')
+        self.assertEqual('![alt](https://example.com "Title Here")', actual.nodes[0].nodes[0].text)
+
+ def test_reference_definition_preserved(self):
+ data = 'See [API][id].\n\n[id]: https://example.com'
+ tree = self._parse(data)
+ link = next(node for node in tree.nodes[0].nodes if node.name == 'link')
+ self.assertEqual('[API][id]', link.text)
+ self.assertEqual('[id]: https://example.com', tree.nodes[1].nodes[0].text)
+
+ def test_reference_definition_inside_list_item_preserved(self):
+ data = '- item\n [id]: https://example.com'
+ tree = self._parse(data)
+ list_item = tree.nodes[0].nodes[0]
+ self.assertEqual('item', list_item.nodes[0].text)
+ self.assertEqual('[id]: https://example.com', tree.nodes[1].nodes[0].text)
+
+ def test_reference_links_with_whitespace_and_empty_id(self):
+ data = 'See [API][] and [Ref] [id].\n\n[API]: https://example.com\n[id]: https://example.com'
+ tree = self._parse(data)
+ link_texts = [node.text for node in tree.nodes[0].nodes if node.name == 'link']
+ self.assertIn('[API][]', link_texts)
+ self.assertIn('[Ref] [id]', link_texts)
+
+ def test_reference_definition_inside_fence_is_text(self):
+ data = """```
+[id]: https://example.com
+[link][id]
+```"""
+ tree = self._parse(data)
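+        # the fence keeps the '[id]:' definition line as plain text, while the inline [link][id] still prints as a link ('a')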
+ self.assertEqual('ptttptattt', tree.print_all())
+
+ def test_reference_definition_inside_long_fence_is_text(self):
+ data = """````
+[id]: https://example.com
+[link][id]
+````"""
+ tree = self._parse(data)
+ self.assertEqual('pttttptatttt', tree.print_all())
+
+ def test_softbreak_preserves_space(self):
+ actual = self._parse('hello\nworld')
+ self.assertEqual('hello world', actual.nodes[0].nodes[0].text)
+
+ def test_block_quote_preserves_marker(self):
+ actual = self._parse('> quote')
+ self.assertEqual('> quote', actual.nodes[0].nodes[0].text)
+
+ def test_fenced_code_preserves_fences(self):
+ actual = self._parse('```\ncode\n```')
+ self.assertEqual('ptttttt', actual.print_all())
+ text = ''.join(node.text for node in actual.nodes[0].nodes)
+ self.assertTrue(text.startswith('```'))
+ self.assertTrue(text.endswith('```'))
+
+ def test_ordered_list_parses_as_ordered(self):
+ tree = self._parse('1. one\n2. two')
+ list_node = tree.nodes[0]
+ self.assertTrue(list_node.ordered)
+
+ def test_ordered_list_marker_other_than_1_interrupts_paragraph(self):
+ self._run_and_assert('para\n2. item\n', 'ptlmt')
+
+ def test_list_item_allows_unindented_heading_lazy_continuation(self):
+ tree = self._parse('* a\n###### b\n')
+ self.assertEqual(1, len(tree.nodes))
+ self.assertEqual('list', tree.nodes[0].name)
+ item = tree.nodes[0].nodes[0]
+ self.assertEqual(['text', 'header'], [node.name for node in item.nodes])
+ self.assertEqual('a', item.nodes[0].text)
+ self.assertEqual(6, item.nodes[1].level)
+ self.assertEqual('b', item.nodes[1].nodes[0].text)
+
+ def test_unordered_list_parses_as_unordered(self):
+ tree = self._parse('- one\n- two')
+ list_node = tree.nodes[0]
+ self.assertFalse(list_node.ordered)
+
+ def test_double_blank_lines_between_list_items_nests_next_list(self):
+ self._run_and_assert('* a\n\n\n* b\n', 'lmtlmt')
+
+ def test_double_blank_lines_between_ordered_list_items_nests_next_list(self):
+ self._run_and_assert('1. a\n\n\n1. b\n', 'lmtlmt')
+
 
 class TestZendeskParser(ParserTestCase):
def setUp(self) -> None:
@@ -103,6 +253,22 @@ def test_callout_invalid_style(self):
actual = self._parse(fixture)
self.assertNotEqual(actual.nodes[0].name, 'callout')
+
+    def test_callout_invalid_style_does_not_swallow_trailing_closing_tag(self):
+        fixture = '<callout style="invalid">\n# title\ncontent\n</callout>\n</callout>\n'
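+        # 'xpt': the invalid callout is consumed up to the first </callout>, and the trailing
+        # closing tag survives as paragraph text (the style attribute spelling is an assumption)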
+ self._run_and_assert(fixture, 'xpt')
+
+ def test_callout_tags_inside_list_item_are_text_and_allow_headings(self):
+        fixture = '1. item\n<callout>\n# title\ncontent\n</callout>\n'
+ tree = self._parse(fixture)
+ self.assertEqual(1, len(tree.nodes))
+ self.assertEqual('list', tree.nodes[0].name)
+ item = tree.nodes[0].nodes[0]
+ self.assertEqual(['text', 'text', 'header', 'text', 'text'], [node.name for node in item.nodes])
+ self.assertEqual('<callout>', item.nodes[1].text)
+ self.assertEqual(1, item.nodes[2].level)
+ self.assertEqual('title', item.nodes[2].nodes[0].text)
+ self.assertEqual('</callout>', item.nodes[-1].text)
+
def test_tabs(self):
fixture = """
@@ -114,6 +280,48 @@ def test_tabs(self):
"""
self._run_and_assert(fixture, 'T1tpt1tpt')
+
+    def test_inline_callout_is_not_structural(self):
+ fixture = """intro
+# title
+content
+</callout> outro"""
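+        # both callout tags stay inline text, so only the heading is structural ('pt1tpt')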
+ self._run_and_assert(fixture, 'pt1tpt')
+
+ def test_zendesk_tags_inside_fenced_code_are_text(self):
+ fixture = """```
+<callout>
+# title
+content
+</callout>
+<steps>
+1. one
+</steps>
+<tabs>
+# tab
+content
+</tabs>
+```"""
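+        # all tags sit inside the fence, so none should produce callout/steps/tabs nodes
+        # (tag names follow the assertion below; exact attributes are assumed)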
+ tree = self._parse(fixture)
+ self.assertEqual('ptttxxxpttt', tree.print_all())
+ self.assertFalse(any(node.name in {'callout', 'steps', 'tabs'} for node in tree.nodes))
+
+ def test_zendesk_tags_after_fenced_code_are_parsed(self):
+ fixture = """```
+<callout>
+# title
+content
+</callout>
+```
+
+<callout>
+# title
+content
+</callout>
+"""
+ tree = self._parse(fixture)
+ self.assertTrue(any(node.name == 'callout' for node in tree.nodes))
+ self.assertEqual(1, tree.print_all().count('C'))
+
def test_steps(self):
steps_fixture = """
@@ -166,3 +374,33 @@ def test_leave_spaces_with_text(self):
text = 'test \n test'
actual = parser._remove_spaces_from_empty_lines(text)
self.assertEqual(text, actual)
+
+ def test_remove_ltr_rtl_marks(self):
+ text = 'a\u200eb\u200f'
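+        # \u200e and \u200f are the Unicode left-to-right and right-to-left marks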
+ actual = parser._remove_ltr_rtl_marks(text)
+ self.assertEqual('ab', actual)
+
+
+class DummyParser:
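+    """Stub parser that records the text it receives and returns a single-paragraph node list."""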
+ last_text = None
+
+ def parse(self, text, rules=None):
+ DummyParser.last_text = text
+ return [Paragraph([Text(text)])]
+
+
+class TestParseWrapper(TestCase):
+ def test_wraps_list_parser_output(self):
+ tree = parser.parse('hello', parser_cls=DummyParser)
+ self.assertIsInstance(tree, Root)
+ self.assertEqual('pt', tree.print_all())
+
+ def test_custom_parser_input_not_mutated_by_ref_defs(self):
+ data = 'See [API][id].\n\n[id]: https://example.com'
+ parser.parse(data, parser_cls=DummyParser)
+ self.assertIn('[id]: https://example.com', DummyParser.last_text)
+
+ def test_mdparser_parse_accepts_rules_argument(self):
+ md_parser = MdParser()
+ nodes = md_parser.parse('1. one', MdParser.list_rules)
+ self.assertIsInstance(nodes, list)
diff --git a/tests/test_sdiff.py b/tests/test_sdiff.py
index a132509..55d3079 100644
--- a/tests/test_sdiff.py
+++ b/tests/test_sdiff.py
@@ -8,7 +8,7 @@
def _load_fixture(*path):
- return open(os.path.join('tests/fixtures', *path)).read()
+ return open(os.path.join('tests/fixtures', *path), encoding='utf-8').read()
 
 
 def _read_test_files(dirpath):
@@ -36,3 +36,76 @@ def test_different(self):
_, _, errors = sdiff.diff(_load_fixture('different', path1), _load_fixture('different', path2),
parser_cls=ZendeskHelpMdParser)
self.assertNotEqual([], errors, msg=case)
+
+ def test_ignores_link_content(self):
+ left = '[Link](http://example.com)'
+ right = '[Different](http://example.org)'
+ _, _, errors = sdiff.diff(left, right)
+ self.assertEqual([], errors)
+
+ def test_missing_link_is_reported(self):
+ left = 'text [Link](http://example.com)'
+ right = 'text'
+ tree1 = sdiff.parse(left)
+ tree2 = sdiff.parse(right)
+ _, _, errors = sdiff.diff_links(tree1, tree2)
+ self.assertTrue(any(error.node.name == 'link' for error in errors))
+
+ def test_extra_paragraph_has_paragraph_error(self):
+ left = _load_fixture('different', 'extra_paragraph.en.md')
+ right = _load_fixture('different', 'extra_paragraph.de.md')
+ _, _, errors = sdiff.diff(left, right, parser_cls=ZendeskHelpMdParser)
+ self.assertTrue(any(error.node.name == 'paragraph' for error in errors))
+
+ def test_softbreaks_ignored_in_structure(self):
+ left = 'hello\nworld'
+ right = 'hello world'
+ _, _, errors = sdiff.diff(left, right)
+ self.assertEqual([], errors)
+
+ def test_heading_without_space_matches_heading_with_space(self):
+ left = '##Heading\ntext'
+ right = '## Heading\ntext'
+ _, _, errors = sdiff.diff(left, right)
+ self.assertEqual([], errors)
+
+ def test_list_heading_without_space_matches_heading_with_space(self):
+ left = '1. ##Heading\n text'
+ right = '1. ## Heading\n text'
+ _, _, errors = sdiff.diff(left, right)
+ self.assertEqual([], errors)
+
+ def test_reference_definition_missing_is_reported(self):
+ left = 'See [API][id].\n\n[id]: https://example.com'
+ right = 'See [API][id].'
+ _, _, errors = sdiff.diff(left, right)
+ self.assertTrue(any(error.node.name == 'paragraph' for error in errors))
+
+ def test_code_block_content_ignored_in_structure(self):
+ left = """```
+code sample
+```"""
+ right = """```
+different code sample
+```"""
+ _, _, errors = sdiff.diff(left, right)
+ self.assertEqual([], errors)
+
+ def test_invalid_callout_followed_by_fence_does_not_depend_on_blank_line(self):
+ left = """
+# title
+content
+
+
+```
+code
+```"""
+ right = """
+# title
+content
+
+```
+code
+```"""
+ _, _, errors = sdiff.diff(left, right, parser_cls=ZendeskHelpMdParser)
+ self.assertEqual([], errors)
diff --git a/tests/test_tree_utils.py b/tests/test_tree_utils.py
index ff8a226..dc4f0ab 100644
--- a/tests/test_tree_utils.py
+++ b/tests/test_tree_utils.py
@@ -1,7 +1,62 @@
from unittest import TestCase
+from sdiff.model import Header, Link, Paragraph, Root, Text
+from sdiff.tree_utils import traverse
+
 
 class TestTraverse(TestCase):
- def test_name(self):
- pass
+ def test_preorder_traversal(self):
+ tree = Root([
+ Paragraph([
+ Text('one'),
+ Link('link'),
+ ]),
+ Header(2, [
+ Text('heading'),
+ ]),
+ ])
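+        # expect the paragraph subtree first (p, t, a), then the header and its text (h, t)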
+ symbols = [node.symbol for node in traverse(tree)]
+ self.assertEqual(['p', 't', 'a', 'h', 't'], symbols)
+
+ def test_consecutive_text_nodes_coalesced(self):
+ tree = Root([
+ Paragraph([
+ Text('one'),
+ Text('two'),
+ Link('link'),
+ Text('three'),
+ Text('four'),
+ ]),
+ ])
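+        # adjacent Text nodes coalesce, so only the first of each run ('one', 'three') is yielded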
+ texts = [node.text for node in traverse(tree) if isinstance(node, Text)]
+ self.assertEqual(['one', 'three'], texts)
+
+ def test_exclude_symbols_prunes_children(self):
+ tree = Root([
+ Paragraph([
+ Text('one'),
+ Link('link'),
+ ]),
+ ])
+ symbols = [node.symbol for node in traverse(tree, exclude_symbols=['a'])]
+ self.assertEqual(['p', 't'], symbols)
+
+ def test_include_symbols_filters_children(self):
+ tree = Root([
+ Paragraph([
+ Text('one'),
+ Link('link'),
+ ]),
+ ])
+ symbols = [node.symbol for node in traverse(tree, include_symbols=['a'])]
+ self.assertEqual(['p', 'a'], symbols)
+
+ def test_include_exclude_conflict_excludes(self):
+ tree = Root([
+ Paragraph([
+ Link('link'),
+ ]),
+ ])
+ symbols = [node.symbol for node in traverse(tree, include_symbols=['a'], exclude_symbols=['a'])]
+ self.assertEqual(['p'], symbols)