From bb88b2cd49bdaf2723804473b02e79c63cf45f8a Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Fri, 29 Jul 2022 22:35:03 +0000
Subject: [PATCH 01/18] Bump mistune from 0.8.1 to 2.0.3

Bumps [mistune](https://github.com/lepture/mistune) from 0.8.1 to 2.0.3.
- [Release notes](https://github.com/lepture/mistune/releases)
- [Changelog](https://github.com/lepture/mistune/blob/master/docs/changes.rst)
- [Commits](https://github.com/lepture/mistune/compare/v0.8.1...v2.0.3)

---
updated-dependencies:
- dependency-name: mistune
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
---
 requirements.txt | 2 +-
 setup.py         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 1f202e5..c9e10bf 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1 @@
-mistune==0.8.1
+mistune==2.0.3
diff --git a/setup.py b/setup.py
index 2c28d23..088053b 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ def read(f):
 
 
 install_requires = [
-    'mistune <= 1',
+    'mistune < 3',
 ]
 
 tests_require = [

From 6ef989e7c48e381bbcf63c70c5c1c036ea230877 Mon Sep 17 00:00:00 2001
From: Philipp Berner <374326+philippb@users.noreply.github.com>
Date: Thu, 15 Jan 2026 10:15:23 -0800
Subject: [PATCH 02/18] =?UTF-8?q?=F0=9F=93=9D=20docs:=20expand=20README=20?=
 =?UTF-8?q?and=20add=20repository=20guidelines?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 AGENTS.md | 28 ++++++++++++++++++++++++++++
 README.md | 40 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 67 insertions(+), 1 deletion(-)
 create mode 100644 AGENTS.md

diff --git a/AGENTS.md b/AGENTS.md
new file mode 100644
index 0000000..bb38229
--- /dev/null
+++ b/AGENTS.md
@@ -0,0 +1,28 @@
+# Repository Guidelines
+
+## Project Structure & Module Organization
+The core library lives in `sdiff/` (parser, comparer, renderer, and models). Tests are in `tests/`, with shared fixtures in `tests/fixtures/`. Reference PDFs sit in `docs/`. Packaging and tooling are defined in `setup.py`, `setup.cfg`, and the `Makefile`; `CHANGELOG` tracks releases.
+
+## Build, Test, and Development Commands
+- `make env` creates the local `venv/` (Python 3.11+).
+- `make dev` installs the package plus test/dev extras (`.[tests,devtools]`) into the venv.
+- `make test` runs linting and the full pytest suite with coverage.
+- `make vtest` runs pytest verbosely.
+- `make flake` runs flake8 on `sdiff/` and `tests/`.
+- `make cov` prints the coverage report.
+- `make clean` removes build artifacts and the venv.
+
+Example flow:
+```sh
+make dev
+make test
+```
+
+## Coding Style & Naming Conventions
+Use standard Python conventions: 4-space indentation, `snake_case` for modules/functions/variables, and `PascalCase` for classes. Flake8 enforces a 120-character line limit (see `setup.cfg`). `autopep8` is available for formatting. Keep new modules in `sdiff/` and new tests in `tests/` with filenames like `test_<module>.py`.
+
+## Testing Guidelines
+The suite uses `pytest` with `coverage`. Coverage is expected to stay high (current config fails under 96%). Add or update tests for behavior changes, and prefer small, focused unit tests. Place reusable data in `tests/fixtures/`. Run `make test` before submitting changes.
+
+## Commit & Pull Request Guidelines
+Commit messages in this repo are short and often use a type prefix (e.g., `chore: ...`, `fixes: ...`, `hotfix: ...`, `refactors: ...`). Follow that pattern where practical, and keep the summary concise. For PRs, include a brief description, list tests run (e.g., `make test`), and link related issues or tickets when available.
diff --git a/README.md b/README.md
index b8bb2a8..7ab5d32 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,40 @@
 # md-sdiff
-Diffs to markdown texts only based on their structure. Ignores content. Helpful to diff 2 files that contain the same content in different languages.
+
+Structural diffs for Markdown. The library parses two Markdown inputs into a lightweight tree and compares the *shape* (headings, lists, paragraphs, links, etc.) instead of the text content. This is useful when you expect the same document structure across translations or when you want to validate formatting consistency without caring about the wording.
+
+## What it does
+- Parses Markdown into an AST-like node tree using `mistune`.
+- Compares trees node-by-node and flags insertions/deletions in structure.
+- Returns a rendered view of each document plus a list of structural errors.
+- Supports a Zendesk-specific parser (`ZendeskHelpMdParser`) for `<callout>`, `<steps>`, and `<tabs>` blocks.
+
+## Example usage
+```python
+from sdiff import diff, TextRenderer, MdParser
+
+left = "# Title\n\n- One\n- Two"
+right = "# Title\n\n- One\n- Two\n- Three"
+
+rendered_left, rendered_right, errors = diff(left, right, renderer=TextRenderer(), parser_cls=MdParser)
+print(errors[0])  # "There is a missing element `li`."
+```
+
+## Renderers
+`TextRenderer` returns the original Markdown structure as text. `HtmlRenderer` wraps the output and marks structural insertions/deletions with `<ins>` and `<del>`; a hedged sketch follows the Notes section below.
+
+## One-off usage
+```sh
+python - <<'PY'
+from sdiff import diff, TextRenderer
+
+left = open("left.md", "r", encoding="utf-8").read()
+right = open("right.md", "r", encoding="utf-8").read()
+_, _, errors = diff(left, right, renderer=TextRenderer())
+
+for err in errors:
+    print(err)
+PY
+```
+
+## Notes
+This project is a library (no CLI). If you need different token handling, you can provide a custom parser class that extends `MdParser`; see the sketch at the end of this README.
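+
+## HTML rendering sketch
+The following is a minimal, unverified sketch of driving the HTML renderer mentioned above. It assumes `HtmlRenderer` is exported from the package root like `TextRenderer` (otherwise import it from `sdiff.renderer`) and that it is constructed without arguments; check `sdiff/renderer.py` if your version differs.
+```python
+from sdiff import diff, HtmlRenderer
+
+left = "# Title\n\n- One\n- Two"
+right = "# Title\n\n- One"
+
+# Same call as with TextRenderer; only the rendered strings change,
+# the structural error list stays the same.
+html_left, html_right, errors = diff(left, right, renderer=HtmlRenderer())
+print(html_left)
+```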
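+
+## Custom parser sketch
+A hedged illustration of the `parser_cls` extension point mentioned in the Notes. `NormalizingMdParser` and its whitespace cleanup are hypothetical; the only parts taken from this library are that `diff` accepts a `parser_cls` argument and that parsers expose `parse(text)` returning a node tree.
+```python
+from sdiff import diff, MdParser
+
+
+class NormalizingMdParser(MdParser):
+    # Hypothetical preprocessing: strip trailing whitespace from each
+    # line before delegating to the stock MdParser pipeline.
+    def parse(self, text):
+        cleaned = "\n".join(line.rstrip() for line in text.splitlines())
+        return super().parse(cleaned)
+
+
+_, _, errors = diff("# A\n\ntext  ", "# A\n\ntext", parser_cls=NormalizingMdParser)
+print(errors)  # expected: no structural errors
+```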
From bf349a49639560336b929f8d063b2e2137380e5d Mon Sep 17 00:00:00 2001 From: Philipp Berner <374326+philippb@users.noreply.github.com> Date: Fri, 16 Jan 2026 21:37:39 -0800 Subject: [PATCH 03/18] =?UTF-8?q?=F0=9F=A7=B0=20chore(ci):=20switch=20to?= =?UTF-8?q?=20GitHub=20Actions=20and=20add=20lint=20hooks?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .flake8 | 12 ++++++++++++ .github/workflows/ci.yml | 34 ++++++++++++++++++++++++++++++++++ .husky/pre-commit | 4 ++++ .travis.yml | 11 ----------- package.json | 10 ++++++++++ 5 files changed, 60 insertions(+), 11 deletions(-) create mode 100644 .flake8 create mode 100644 .github/workflows/ci.yml create mode 100755 .husky/pre-commit delete mode 100644 .travis.yml create mode 100644 package.json diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..aac8e09 --- /dev/null +++ b/.flake8 @@ -0,0 +1,12 @@ +[flake8] +max-line-length = 120 +max-complexity = 12 +select = E,F,W,C90 +extend-ignore = F403,F405 +exclude = + .git, + __pycache__, + venv, + build, + dist, + sdiff.egg-info diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..484ab83 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,34 @@ +name: CI + +on: + workflow_dispatch: + pull_request: + types: [opened, synchronize, reopened, ready_for_review] + push: + branches: [master] + +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + cache: "pip" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install .[tests] + + - name: Lint + run: python -m flake8 --config .flake8 sdiff tests + + - name: Test + run: python -m coverage run -m pytest -s --durations=3 --durations-min=0.005 + + - name: Coverage report + run: python -m coverage report -m diff --git a/.husky/pre-commit b/.husky/pre-commit new file mode 100755 index 0000000..b011f88 --- /dev/null +++ b/.husky/pre-commit @@ -0,0 +1,4 @@ +#!/usr/bin/env sh +. 
"$(dirname -- "$0")/_/husky.sh" + +python -m flake8 --config .flake8 sdiff tests diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index df31221..0000000 --- a/.travis.yml +++ /dev/null @@ -1,11 +0,0 @@ -language: python -dist: jammy -python: - - "3.11" -# command to install dependencies -install: - - make dev -# command to run tests -script: - - make test - - make coverage diff --git a/package.json b/package.json new file mode 100644 index 0000000..d682872 --- /dev/null +++ b/package.json @@ -0,0 +1,10 @@ +{ + "name": "html-structure-diff", + "private": true, + "devDependencies": { + "husky": "^9.0.0" + }, + "scripts": { + "prepare": "husky install" + } +} From 32f9a779da5ee46305ea10c89dff9334680e8603 Mon Sep 17 00:00:00 2001 From: Philipp Berner <374326+philippb@users.noreply.github.com> Date: Fri, 16 Jan 2026 21:38:18 -0800 Subject: [PATCH 04/18] =?UTF-8?q?=F0=9F=9B=A0=EF=B8=8F=20fix(parser):=20up?= =?UTF-8?q?date=20mistune=203=20parsing=20and=20tests?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- requirements.txt | 2 +- sdiff/__init__.py | 5 +- sdiff/compare.py | 4 +- sdiff/parser.py | 532 +++++++++++++++++++++++++-------------- setup.py | 2 +- tests/test_compare.py | 12 +- tests/test_parser.py | 49 ++++ tests/test_sdiff.py | 42 +++- tests/test_tree_utils.py | 59 ++++- 9 files changed, 502 insertions(+), 205 deletions(-) diff --git a/requirements.txt b/requirements.txt index c9e10bf..a234623 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1 @@ -mistune==2.0.3 +mistune==3.2.0 diff --git a/sdiff/__init__.py b/sdiff/__init__.py index 853d12c..17319a9 100644 --- a/sdiff/__init__.py +++ b/sdiff/__init__.py @@ -8,9 +8,8 @@ def diff(md1, md2, renderer=TextRenderer(), parser_cls: type[MdParser] = MdParse tree2 = parse(md2, parser_cls) tree1, tree2, struct_errors = diff_struct(tree1, tree2) - # tree1, tree2, links_errors = diff_links(tree1, tree2) + tree1, tree2, links_errors = diff_links(tree1, tree2) - # errors = struct_errors + links_errors - errors = struct_errors + errors = struct_errors + links_errors return renderer.render(tree1), renderer.render(tree2), errors diff --git a/sdiff/compare.py b/sdiff/compare.py index 5958ada..5d4d19f 100644 --- a/sdiff/compare.py +++ b/sdiff/compare.py @@ -44,7 +44,9 @@ def _diff(tree1, tree2, include_symbols=None, exclude_symbols=None): def diff_links(tree1, tree2): - return _diff(tree1, tree2, include_symbols=['p', 'h', 'l', 'a']) + tree1, tree2, errors = _diff(tree1, tree2, exclude_symbols=['t', 'i']) + link_errors = [error for error in errors if error.node.symbol == 'a'] + return tree1, tree2, link_errors def diff_struct(tree1, tree2): diff --git a/sdiff/parser.py b/sdiff/parser.py index 93a4736..ad59b93 100644 --- a/sdiff/parser.py +++ b/sdiff/parser.py @@ -1,207 +1,351 @@ -from re import Match - -import mistune import re +import textwrap +from typing import Iterable -from .model import * - - -class InlineLexer(mistune.BlockLexer): - grammar_class = mistune.InlineGrammar - - default_rules = [ - 'linebreak', 'link', - 'reflink', 'text', - ] - - def __init__(self): - self.links = {} - self.grammar_class.text = re.compile(r'^ {1,}\n|^[\s\S]+?(?=[\[`~]| {2,}\n|$)') - super().__init__() - - def parse_autolink(self, m): - self.tokens.append(Link(m.group(0))) - - def parse_url(self, m): - self.tokens.append(Link(m.group(0))) - - def parse_link(self, m): - return self._process_link(m) - - def parse_reflink(self, m): - # TODO skip this check for now - # key = mistune._keyify(m.group(2) 
or m.group(1)) - # if key not in self.links: - # return None - # ret = self.links[key] - return self._process_link(m) - - def _process_link(self, m): - line = m.group(0) - if line[0] == '!': - node = Image(line) - else: - node = Link(line) - - self.tokens.append(node) - - def parse_linebreak(self, m): - node = NewLine() - self.tokens.append(node) +import mistune +from mistune import block_parser - def parse_text(self, m): - text = m.group(0) - if text.strip(): - escaped_text = mistune.escape(text) - node = Text(escaped_text) - self.tokens.append(node) +from .model import (Html, Image, Link, List, ListItem, NewLine, Paragraph, Root, + Text, Header, ZendeskHelpCallout, ZendeskHelpSteps, + ZendeskHelpTabs) +_BLOCK_TAGS = {tag.lower() for tag in block_parser.BLOCK_TAGS} +_HEADING_LINE_RE = re.compile(r'^(\s*)(#{1,6})(?!#)(?=\S)') +_REF_LINK_OR_IMAGE_RE = re.compile(r'!?\[[^\]]+\]\[[^\]]+\]') +_REF_DEF_LINE_RE = re.compile(r'^\s{0,3}\[[^\]]+\]:\s+\S+') +_FENCE_RE = re.compile(r'^\s*(```|~~~)') -class MdParser(mistune.BlockLexer): - default_rules = [ - 'newline', 'list_block', 'block_html', - 'heading', 'lheading', - 'paragraph', 'text', - ] - - list_rules = ( - 'newline', 'heading', 'lheading', - 'hrule', 'list_block', 'text', - ) +class MdParser: @classmethod def get_lexer(cls): return cls() def __init__(self): - super().__init__() - self.grammar_class.block_html = re.compile( - r'^\s* *(?:{}|{}|{}) *(?:\n{{1,}}|\s*$)'.format( - r'', - r'<({})((?:{})*?)>([\s\S]+?)<\/\1>'.format(mistune._block_tag, mistune._valid_attr), - r'<{}(?:{})*?>'.format(mistune._block_tag, mistune._valid_attr), - ) - ) - - def _parse_inline(self, text): - inline = InlineLexer() - return inline.parse(text) - - def parse_newline(self, m): - length = len(m.group(0)) - if length > 1: - self.tokens.append(NewLine()) - - def parse_heading(self, m): - level = len(m.group(1)) - node = Header(level) - node.add_nodes(self._parse_inline(m.group(2))) - self.tokens.append(node) - - def parse_lheading(self, m): - level = 1 if m.group(2) == '=' else 2 - text = m.group(1) - node = Header(level) - node.add_nodes(self._parse_inline(text)) - self.tokens.append(node) - - def parse_block_html(self, m): - text = m.group(0) - html = Html(text) - self.tokens.append(html) - - def parse_paragraph(self, m): - text = m.group(1).rstrip('\n') - node = Paragraph() - node.add_nodes(self._parse_inline(text)) - self.tokens.append(node) - - def parse_text(self, m): - text = m.group(0) - escaped_text = mistune.escape(text) - node = Text(escaped_text) - self.tokens.append(node) - - def parse_list_block(self, m): - bull = m.group(2) - cap = m.group(0) - ordered = '.' 
in bull - node = List(ordered) - node.add_nodes(self._process_list_item(cap, bull)) - self.tokens.append(node) - - def _process_list_item(self, cap, bull): - result = [] - cap = self.rules.list_item.findall(cap) - - _next = False - length = len(cap) - - for i in range(length): - item = cap[i][0] - - # remove the bullet - space = len(item) - item = self.rules.list_bullet.sub('', item) - - # outdent - if '\n ' in item: - space = space - len(item) - pattern = re.compile(r'^ {1,%d}' % space, flags=re.M) - item = pattern.sub('', item) - - # determine whether item is loose or not - loose = _next - if not loose and re.search(r'\n\n(?!\s*$)', item): - loose = True - - rest = len(item) - if i != length - 1 and rest: - _next = item[rest - 1] == '\n' - if not loose: - loose = _next - - node = ListItem() - block_lexer = self.get_lexer() - nodes = block_lexer.parse(item, self.list_rules) - node.add_nodes(nodes) - result.append(node) - return result + self._markdown = mistune.create_markdown(renderer='ast') + self._reference_definitions = {} + + def parse(self, text): + tokens = self._markdown(text) + return Root(self._convert_block_tokens(tokens)) + + def _set_reference_definitions(self, definitions): + self._reference_definitions = definitions + + def _convert_block_tokens(self, tokens: Iterable[dict]): + nodes = [] + for token in tokens: + nodes.extend(self._convert_block_token(token)) + return nodes + + def _convert_block_token(self, token): + token_type = token.get('type') + if token_type == 'paragraph': + return [self._convert_paragraph_or_heading(token.get('children', []))] + if token_type == 'heading': + return [self._convert_heading(token)] + if token_type == 'list': + return [self._convert_list(token)] + if token_type == 'list_item': + return [self._convert_list_item(token)] + if token_type == 'block_text': + return [self._convert_paragraph_or_heading(token.get('children', []))] + if token_type == 'block_html': + return self._convert_block_html(token) + if token_type in {'thematic_break', 'block_quote', 'block_code', 'fenced_code'}: + return self._convert_passthrough_block(token) + return self._convert_passthrough_block(token) + + def _convert_heading(self, token): + level = token.get('level') or token.get('attrs', {}).get('level', 1) + header = Header(level) + header.add_nodes(self._convert_inline_tokens(token.get('children', []))) + return header + + def _convert_list(self, token): + ordered = token.get('ordered') + if ordered is None: + ordered = token.get('attrs', {}).get('ordered', False) + list_node = List(bool(ordered)) + for item in token.get('children', []): + list_node.add_node(self._convert_list_item(item)) + return list_node + + def _convert_block_html(self, token): + raw = token.get('raw', '') + if _is_block_html(raw): + return [Html(raw)] + text = mistune.escape(raw) + if text.strip(): + return [Paragraph([Text(text)])] + return [] + + def _convert_passthrough_block(self, token): + child_nodes = self._convert_block_tokens(token.get('children', [])) + if child_nodes: + return child_nodes + raw = token.get('raw') or token.get('text') or '' + if raw.strip(): + return [Paragraph([Text(mistune.escape(raw))])] + return [] + + def _convert_list_item(self, token): + item = ListItem() + for child in token.get('children', []): + child_type = child.get('type') + if child_type in {'block_text', 'paragraph'}: + item.add_nodes(self._convert_list_block_nodes(child.get('children', []))) + else: + item.add_nodes(self._convert_block_tokens([child])) + return item + + def 
_convert_inline_tokens(self, tokens: Iterable[dict]): + nodes = [] + buffer = '' + + def flush_buffer(): + nonlocal buffer + if buffer: + self._split_reference_links(buffer, nodes) + buffer = '' + + for token in tokens: + token_type = token.get('type') + if token_type in {'text', 'inline_html', 'block_html'}: + buffer += token.get('raw', '') + elif token_type == 'codespan': + buffer += f"`{token.get('raw') or token.get('text') or ''}`" + elif token_type == 'softbreak': + buffer += ' ' + elif token_type == 'linebreak': + flush_buffer() + nodes.append(NewLine()) + elif token_type == 'link': + flush_buffer() + text = self._flatten_inline_text(token.get('children', [])) + url = token.get('attrs', {}).get('url', '') + nodes.append(Link(f"[{text}]({url})")) + elif token_type == 'image': + flush_buffer() + alt = token.get('attrs', {}).get('alt') or self._flatten_inline_text(token.get('children', [])) + url = token.get('attrs', {}).get('url', '') + nodes.append(Image(f"![{alt}]({url})")) + else: + flush_buffer() + children = token.get('children', []) + if children: + nodes.extend(self._convert_inline_tokens(children)) + else: + raw = token.get('raw') or token.get('text') or '' + if raw.strip(): + _append_text(nodes, mistune.escape(raw)) + + flush_buffer() + return nodes + + def _flatten_inline_text(self, tokens: Iterable[dict]): + parts = [] + for token in tokens: + token_type = token.get('type') + if token_type in {'text', 'inline_html', 'block_html'}: + parts.append(token.get('raw') or token.get('text') or '') + elif token_type == 'codespan': + parts.append(f"`{token.get('raw') or token.get('text') or ''}`") + elif token_type in {'linebreak', 'softbreak'}: + parts.append(' ') + else: + children = token.get('children', []) + if children: + parts.append(self._flatten_inline_text(children)) + else: + parts.append(token.get('raw') or token.get('text') or '') + return ''.join(parts).strip() + + def _convert_paragraph_or_heading(self, inline_tokens: Iterable[dict]): + ref_text = self._reference_definition_text(inline_tokens) + if ref_text is not None: + return Paragraph([Text(ref_text)]) + heading = self._heading_from_inline(inline_tokens) + if heading: + return heading + return Paragraph(self._convert_inline_tokens(inline_tokens)) + + def _convert_list_block_nodes(self, inline_tokens: Iterable[dict]): + heading = self._heading_from_inline(inline_tokens) + if heading: + return [heading] + return self._convert_inline_tokens(inline_tokens) + + def _heading_from_inline(self, inline_tokens: Iterable[dict]): + if len(inline_tokens) != 1: + return None + token = inline_tokens[0] + if token.get('type') != 'text': + return None + raw = token.get('raw', '') + match = _HEADING_LINE_RE.match(raw) + if not match: + return None + level = len(match.group(2)) + content = raw[match.end(2):].lstrip() + heading_tokens = self._markdown(f"{'#' * level} {content}") + if heading_tokens and heading_tokens[0].get('type') == 'heading': + children = heading_tokens[0].get('children', []) + else: + children = [{'type': 'text', 'raw': content}] + header = Header(level) + header.add_nodes(self._convert_inline_tokens(children)) + return header + + def _reference_definition_text(self, inline_tokens: Iterable[dict]): + if len(inline_tokens) != 1: + return None + token = inline_tokens[0] + if token.get('type') != 'text': + return None + raw = token.get('raw', '') + return self._reference_definitions.get(raw) + + def _split_reference_links(self, raw: str, nodes): + last = 0 + for match in _REF_LINK_OR_IMAGE_RE.finditer(raw): + if 
match.start() > last: + _append_text(nodes, mistune.escape(raw[last:match.start()])) + snippet = match.group(0) + if snippet.startswith('!['): + nodes.append(Image(snippet)) + else: + nodes.append(Link(snippet)) + last = match.end() + if last < len(raw): + _append_text(nodes, mistune.escape(raw[last:])) + return nodes class ZendeskHelpMdParser(MdParser): - TAG_CONTENT_GROUP = 'tag_content' - TAG_PATTERN = r'^\s*(<{tag_name}{attr_re}>(?P<%s>[\s\S]+?))\s*$' % TAG_CONTENT_GROUP - CALLOUT_STYLE_GROUP = 'style' - CALLOUT_ATTR_PATTERN = r'( (?P<%s>green|red|yellow))*' % CALLOUT_STYLE_GROUP - - def __init__(self): - super().__init__() - self.grammar_class.callout = re.compile(self.TAG_PATTERN.format(tag_name='callout', - attr_re=self.CALLOUT_ATTR_PATTERN)) - self.default_rules.insert(0, 'callout') - - self.grammar_class.steps = re.compile(self.TAG_PATTERN.format(tag_name='steps', attr_re='')) - self.default_rules.insert(0, 'steps') - - self.grammar_class.tabs = re.compile(self.TAG_PATTERN.format(tag_name='tabs', attr_re='')) - self.default_rules.insert(0, 'tabs') - - def parse_callout(self, m: Match[str]) -> None: - style = m.group(self.CALLOUT_STYLE_GROUP) - self._parse_nested(ZendeskHelpCallout(style), m) - - def parse_steps(self, m: Match[str]) -> None: - self._parse_nested(ZendeskHelpSteps(), m) - - def parse_tabs(self, m: Match[str]) -> None: - self._parse_nested(ZendeskHelpTabs(), m) - - def _parse_nested(self, node: Node, m: Match[str]) -> None: - nested_content = m.group(self.TAG_CONTENT_GROUP) - nested_nodes = self.get_lexer().parse(nested_content) - node.add_nodes(nested_nodes) - self.tokens.append(node) + _CALLOUT_PATTERN = re.compile( + r'(?s)green|red|yellow))?>(?P.*?)' + ) + _STEPS_PATTERN = re.compile(r'(?s)(?P.*?)') + _TABS_PATTERN = re.compile(r'(?s)(?P.*?)') + + def parse(self, text): + nodes = self._parse_nodes(text) + return Root(nodes) + + def _parse_nodes(self, text: str): + nodes = [] + remaining = text + while remaining: + tag_name, match = self._find_next_tag(remaining) + if not match: + nodes.extend(self._parse_markdown(_normalize_block_indentation(remaining))) + break + + if match.start() > 0: + prefix = remaining[:match.start()] + nodes.extend(self._parse_markdown(_normalize_block_indentation(prefix))) + + content = match.group('content') + if tag_name == 'callout': + node = ZendeskHelpCallout(match.group('style')) + elif tag_name == 'steps': + node = ZendeskHelpSteps() + else: + node = ZendeskHelpTabs() + + node.add_nodes(self._parse_nodes(content)) + nodes.append(node) + + remaining = remaining[match.end():] + return nodes + + def _find_next_tag(self, text: str): + matches = [] + for name, pattern in ( + ('callout', self._CALLOUT_PATTERN), + ('steps', self._STEPS_PATTERN), + ('tabs', self._TABS_PATTERN), + ): + match = pattern.search(text) + if match: + matches.append((match.start(), name, match)) + if not matches: + return None, None + _, name, match = min(matches, key=lambda item: item[0]) + return name, match + + def _parse_markdown(self, text: str): + normalized = _remove_spaces_from_empty_lines(text) + normalized = _remove_ltr_rtl_marks(normalized) + return self._convert_block_tokens(self._markdown(normalized)) + + +def _append_text(nodes, text): + if not text: + return + if nodes and isinstance(nodes[-1], Text): + nodes[-1].text += text + else: + nodes.append(Text(text)) + + +def _is_block_html(raw: str) -> bool: + stripped = raw.lstrip() + if stripped.startswith('' + r'|<(' + _LEGACY_BLOCK_TAG_RE + r')((?:' + _LEGACY_VALID_ATTR_RE + r')*?)>([\s\S]+?)<\/\1>' + 
r'|<' + _LEGACY_BLOCK_TAG_RE + r'(?:' + _LEGACY_VALID_ATTR_RE + r')*?>' + r') *(?:\n{1,}|\s*$)' +) + + +def _split_legacy_block_html(raw: str) -> tuple[str, str] | None: + """Split over-greedy HTML blocks produced by mistune 3. + + Mistune 0.x treats a line like `` as a single HTML block and continues parsing + following Markdown lines. Mistune 3 follows CommonMark and may consume subsequent lines + until a blank line, which changes our structural tree. + """ + if not raw or '\n' not in raw: + return None + match = _LEGACY_BLOCK_HTML_RE.match(raw) + if match is None: + return None + end = match.end() + if end >= len(raw): + return None + return raw[:end], raw[end:] + + +class _SdiffBlockParser(block_parser.BlockParser): + """Mistune block parser tweaked for legacy-compat structure diffs. + + The master branch (mistune 0.x) did not treat fenced code blocks or block quotes + as special blocks. We disable them so they are parsed as normal text and then + normalized in our conversion layer. + """ + + def parse_fenced_code(self, m, state): # noqa: ANN001 + return None + + def parse_block_quote(self, m, state): # noqa: ANN001 + return None + + def parse_raw_html(self, m, state): # noqa: ANN001 + """Parse raw HTML more like mistune 0.x. + + In mistune 3, unknown tags are "type 7" HTML blocks and may not interrupt + paragraphs. The legacy mistune 0.x parser used in `master` treats any + non-inline tag as block HTML and it can interrupt paragraphs. + """ + marker = m.group(0).strip() + + # Legacy parser does not recognize closing tags alone as block HTML. + if marker.startswith(' 1 else None + + def _convert_list_block_nodes(self, inline_tokens: Iterable[dict]): + text = self._flatten_inline_markup(inline_tokens, softbreak_as_newline=True) + if not text or not text.strip(): + return [] + + nodes = [] + for line in text.splitlines(): + if not line.strip(): + continue + + ref_text = self._reference_definitions.get(line) + if ref_text is not None: + nodes.append(Text(ref_text)) + continue + + heading = self._heading_from_inline([{'type': 'text', 'raw': line}]) + if heading: + nodes.append(heading) + continue + + nodes.append(Text(mistune.escape(line))) + + return nodes + + def _flatten_inline_markup(self, tokens: Iterable[dict], *, softbreak_as_newline: bool = False): + parts = [] + for token in tokens: + token_type = token.get('type') + if token_type in {'text', 'inline_html', 'block_html'}: + raw = token.get('raw') or token.get('text') or '' + parts.append(self._reference_definitions.get(raw, raw)) + elif token_type == 'link': + label = self._flatten_inline_markup( + token.get('children', []), + softbreak_as_newline=softbreak_as_newline, + ) + attrs = token.get('attrs', {}) + url = _unquote_url_if_template(attrs.get('url', '')) + title = attrs.get('title') + parts.append(_format_link_markup(label, url, title)) + elif token_type == 'image': + alt = token.get('attrs', {}).get('alt') or self._flatten_inline_markup( + token.get('children', []), + softbreak_as_newline=softbreak_as_newline, + ) + attrs = token.get('attrs', {}) + url = _unquote_url_if_template(attrs.get('url', '')) + title = attrs.get('title') + parts.append(_format_image_markup(alt, url, title)) + elif token_type == 'softbreak': + parts.append('\n' if softbreak_as_newline else ' ') + elif token_type == 'linebreak': + parts.append('\n') + elif token_type == 'codespan': + parts.append(f"`{token.get('raw') or token.get('text') or ''}`") + elif token_type in _INLINE_MARKERS: + marker = _INLINE_MARKERS[token_type] + inner = 
self._flatten_inline_markup( + token.get('children', []), + softbreak_as_newline=softbreak_as_newline, + ) + parts.append(f'{marker}{inner}{marker}') + else: + children = token.get('children', []) + if children: + parts.append(self._flatten_inline_markup(children, softbreak_as_newline=softbreak_as_newline)) + else: + parts.append(token.get('raw') or token.get('text') or '') + return ''.join(parts) def _heading_from_inline(self, inline_tokens: Iterable[dict]): if len(inline_tokens) != 1: @@ -334,11 +666,12 @@ def _split_reference_links(self, raw: str, nodes): class ZendeskHelpMdParser(MdParser): - _CALLOUT_PATTERN = re.compile( - r'(?s)green|red|yellow))?>(?P.*?)' - ) - _STEPS_PATTERN = re.compile(r'(?s)(?P.*?)') - _TABS_PATTERN = re.compile(r'(?s)(?P.*?)') + _CALLOUT_PATTERN_MIN = re.compile(r'(?sm)^[ \t]*[^>]*)>(?P.*?)') + _CALLOUT_PATTERN_MAX = re.compile(r'(?sm)^[ \t]*[^>]*)>(?P.*)') + _STEPS_PATTERN_MIN = re.compile(r'(?sm)^[ \t]*(?P.*?)') + _STEPS_PATTERN_MAX = re.compile(r'(?sm)^[ \t]*(?P.*)') + _TABS_PATTERN_MIN = re.compile(r'(?sm)^[ \t]*(?P.*?)') + _TABS_PATTERN_MAX = re.compile(r'(?sm)^[ \t]*(?P.*)') def parse(self, text, rules=None): """Parse Markdown with Zendesk tag support into a list of Node objects.""" @@ -349,7 +682,22 @@ def _parse_nodes(self, text: str): nodes = [] remaining = text while remaining: - tag_name, match = self._find_next_tag(remaining) + tag_name = None + match = None + search_at = 0 + while True: + tag_name, match = self._find_next_tag(remaining, start_at=search_at) + if not match: + break + absolute_start = (len(text) - len(remaining)) + match.start() + if _is_inside_list_block(text, absolute_start): + # The legacy mistune 0.x list parser treats block-level content + # lazily; Zendesk tags that appear inside list items become plain + # text and are not recognized structurally. Avoid splitting the + # input at such tags, since that would terminate the list early. + search_at = match.start() + 1 + continue + break if not match: nodes.extend(self._parse_markdown(_normalize_block_indentation(remaining))) break @@ -358,9 +706,46 @@ def _parse_nodes(self, text: str): prefix = remaining[:match.start()] nodes.extend(self._parse_markdown(_normalize_block_indentation(prefix))) - content = match.group('content') + # The legacy parser only recognizes Zendesk tags when they consume the + # remainder of the current parsing slice (it uses `\\s*$` in the rule + # regex). Because of this, it will also match *across* multiple tag + # blocks of the same kind if the last closing tag is at the end. + # + # We emulate this by preferring a greedy match when it is terminal. + terminal_match = None + tail = remaining[match.start():] if tag_name == 'callout': - node = ZendeskHelpCallout(match.group('style')) + m2 = self._CALLOUT_PATTERN_MAX.match(tail) + elif tag_name == 'steps': + m2 = self._STEPS_PATTERN_MAX.match(tail) + else: + m2 = self._TABS_PATTERN_MAX.match(tail) + if m2 is not None and not tail[m2.end():].strip(): + terminal_match = m2 + + if terminal_match is None: + # Non-terminal: treat the first (minimal) tag block as opaque HTML. 
+ nodes.append(Html(match.group(0))) + remaining = remaining[match.end():] + continue + + content = terminal_match.group('content') + trailing = tail[terminal_match.end():] + + if tag_name == 'callout': + attrs = (terminal_match.group('attrs') or '').strip() + styles = [part for part in attrs.split() if part] + if not styles: + node = ZendeskHelpCallout(None) + elif len(styles) == 1 and styles[0] in {'green', 'red', 'yellow'}: + node = ZendeskHelpCallout(styles[0]) + else: + # Invalid callout attrs: legacy parser does not treat this as a + # Zendesk callout block. Keep the first (minimal) tag as opaque + # HTML and continue parsing the remaining text. + nodes.append(Html(match.group(0))) + remaining = remaining[match.end():] + continue elif tag_name == 'steps': node = ZendeskHelpSteps() else: @@ -369,26 +754,26 @@ def _parse_nodes(self, text: str): node.add_nodes(self._parse_nodes(content)) nodes.append(node) - remaining = remaining[match.end():] + remaining = trailing return nodes - def _find_next_tag(self, text: str): - matches = [] + def _find_next_tag(self, text: str, start_at: int = 0): + best = None for name, pattern in ( - ('callout', self._CALLOUT_PATTERN), - ('steps', self._STEPS_PATTERN), - ('tabs', self._TABS_PATTERN), + ('callout', self._CALLOUT_PATTERN_MIN), + ('steps', self._STEPS_PATTERN_MIN), + ('tabs', self._TABS_PATTERN_MIN), ): - match = pattern.search(text) - if match: - matches.append((match.start(), name, match)) - if not matches: + for match in pattern.finditer(text, start_at): + candidate = (match.start(), name, match) + if best is None or candidate[0] < best[0]: + best = candidate + break + + if best is None: return None, None - matches.sort(key=lambda item: item[0]) - for _, name, match in matches: - if not _is_inside_fenced_block(text, match.start()): - return name, match - return None, None + _, name, match = best + return name, match def _parse_markdown(self, text: str): normalized = _remove_spaces_from_empty_lines(text) @@ -397,12 +782,33 @@ def _parse_markdown(self, text: str): def _append_text(nodes, text): - if not text: + if not text or not text.strip(): return - if nodes and isinstance(nodes[-1], Text): - nodes[-1].text += text - else: - nodes.append(Text(text)) + nodes.append(Text(text)) + + +def _split_text_on_legacy_markers(raw: str) -> list[str]: + """Split text into segments similar to mistune 0.x inline text tokenization. + + The legacy parser splits text at backticks and tildes (it stops before those + markers and then consumes them as separate text tokens). This matters for our + structural tree because each segment becomes its own Text node. + """ + if not raw: + return [] + markers = ('`', '~') + out = [] + i = 0 + n = len(raw) + while i < n: + j = n + for m in markers: + pos = raw.find(m, i + 1) + if pos != -1 and pos < j: + j = pos + out.append(raw[i:j]) + i = j + return out def _format_title(title: str) -> str: @@ -412,6 +818,21 @@ def _format_title(title: str) -> str: return f' "{escaped}"' +def _unquote_url_if_template(url: str) -> str: + """Undo Mistune's percent-encoding for template-like URLs. + + Mistune percent-encodes some characters in URLs (e.g. `{{url}}` becomes `%7B%7Burl%7D%7D`). + For structural diffs we don't care about URL contents, but we do want rendered markup to remain + readable and close to the original input. 
+ """ + if not url or '%' not in url: + return url + unquoted = unquote(url) + if unquoted != url and ('{' in unquoted or '}' in unquoted): + return unquoted + return url + + def _format_link_markup(text: str, url: str, title: str | None) -> str: return f'[{text}]({url}{_format_title(title)})' @@ -427,7 +848,8 @@ def _is_block_html(raw: str) -> bool: match = re.match(r'<\/?\s*([a-zA-Z0-9]+)', stripped) if not match: return False - return match.group(1).lower() in _BLOCK_TAGS + tag = match.group(1).lower() + return tag not in _LEGACY_INLINE_TAGS def _normalize_block_indentation(text: str) -> str: @@ -449,33 +871,212 @@ def _normalize_block_indentation(text: str) -> str: return '\n'.join(lines).strip() +def _normalize_atx_heading_spaces(text: str) -> str: + """Normalize ATX headings that omit the mandatory space after the # markers. + + Mistune 3 follows CommonMark and requires a space: `## Heading`. The legacy parser + (mistune 0.x) accepted `##Heading` and our fixtures rely on that. + + We also normalize headings that appear right after list markers (e.g. `1. ##Heading`) + to keep list-item heading parsing compatible. + """ + output = [] + for line in text.splitlines(True): + match = _LIST_ITEM_ATX_HEADING_NO_SPACE_RE.match(line) + if match: + end = match.end(2) + line = f'{line[:end]} {line[end:]}' + else: + match = _ATX_HEADING_NO_SPACE_RE.match(line) + if match: + end = match.end(2) + line = f'{line[:end]} {line[end:]}' + + output.append(line) + return ''.join(output) + + +def _normalize_double_blank_line_list_nesting(text: str) -> str: + """Emulate mistune 0.x list nesting triggered by double blank lines. + + The legacy parser nests a following list under the previous list item when there + are two consecutive blank lines between list marker lines. Mistune 3 does not + do this, so we indent the subsequent marker to force a nested list. + """ + out = [] + prev_nonblank_was_list = False + prev_list_indent = 0 + blank_lines = 0 + for line in text.splitlines(True): + if not line.strip(): + blank_lines += 1 + out.append(line) + continue + + stripped = line.lstrip(' ') + current_indent = len(line) - len(stripped) + is_list = bool(_LIST_MARKER_RE.match(line)) + if is_list and prev_nonblank_was_list and blank_lines >= 2: + desired_indent = prev_list_indent + 4 + if current_indent < desired_indent: + line = (' ' * desired_indent) + stripped + current_indent = desired_indent + + out.append(line) + prev_nonblank_was_list = is_list + if is_list: + prev_list_indent = current_indent + blank_lines = 0 + return ''.join(out) + + +def _normalize_ordered_list_marker_interrupts(text: str) -> str: + """Allow ordered list markers like `2.` to interrupt paragraphs (mistune 0.x compat). + + Mistune 3 follows CommonMark and does not allow an ordered list starting with a + number other than 1 to interrupt a paragraph. Mistune 0.x is more permissive and + will start a list for `2.` / `3.` etc. + + To emulate the legacy behavior we insert a blank line before such ordered list + marker lines when they immediately follow non-list, non-blank text and we're not + currently inside a list block. + """ + out = [] + in_list = False + pending_list_end = False + prev_blank = True + prev_was_list_marker = False + + for line in text.splitlines(): + if not line.strip(): + out.append(line) + prev_blank = True + prev_was_list_marker = False + if in_list: + pending_list_end = True + continue + + if pending_list_end: + if line[:1] in {' ', '\t'} or _LIST_MARKER_RE.match(line): + # Still inside the list block. 
+ pass + else: + in_list = False + pending_list_end = False + + if in_list and _REF_DEF_LINE_RE.match(line): + in_list = False + + ordered = _ORDERED_LIST_MARKER_RE.match(line) + if not in_list and not prev_blank and ordered: + number = int(ordered.group(1)) + if number != 1 and not prev_was_list_marker: + out.append('') + prev_blank = True + prev_was_list_marker = False + + out.append(line) + prev_blank = False + prev_was_list_marker = bool(_LIST_MARKER_RE.match(line)) + if prev_was_list_marker: + in_list = True + + return '\n'.join(out) + + +def _normalize_list_lazy_continuations(text: str) -> str: + """Emulate mistune 0.x lazy list continuations for block-start lines. + + Mistune 3 follows CommonMark and will break a list when it encounters a + block-start line (e.g. `###### Heading`) that is not indented as a list-item + continuation. Mistune 0.x is much more permissive and will keep consuming + unindented lines as part of the current list item until the list is closed + by a blank line. + + We emulate the legacy behavior by indenting unindented non-marker lines while + inside a list block so that mistune 3 keeps them as list-item continuation + lines. + """ + out = [] + in_list = False + pending_list_end = False + continue_prefix = '' + + for raw_line in text.splitlines(True): + has_nl = raw_line.endswith('\n') + line = raw_line[:-1] if has_nl else raw_line + + if not line.strip(): + out.append(raw_line) + if in_list: + pending_list_end = True + continue + + if pending_list_end: + if line[:1] in {' ', '\t'} or _LIST_MARKER_RE.match(line): + # Still inside the list block. + pass + else: + in_list = False + continue_prefix = '' + pending_list_end = False + + marker_match = _LIST_MARKER_RE.match(line) + if marker_match: + in_list = True + continue_prefix = ' ' * marker_match.end() + out.append(raw_line) + continue + + # Mistune 0.x list parsing stops before reference definition lines, even + # without blank lines. Treat those as list terminators so following blocks + # don't get indented into the list item. + if in_list and _REF_DEF_LINE_RE.match(line): + in_list = False + continue_prefix = '' + out.append(raw_line) + continue + + if in_list and line[:1] not in {' ', '\t'}: + normalized = f'{continue_prefix}{line}' + if has_nl: + normalized += '\n' + out.append(normalized) + continue + + out.append(raw_line) + + return ''.join(out) + + def _extract_reference_definitions(text: str): lines = text.splitlines() output = [] definitions = {} - fence = None - fence_len = 0 counter = 0 - for line in lines: - fence_match = _FENCE_RE.match(line) - if fence_match: - marker = fence_match.group(1) - marker_len = len(marker) - marker_char = marker[0] - if fence is None: - fence = marker_char - fence_len = marker_len - elif marker_char == fence and marker_len >= fence_len: - fence = None - fence_len = 0 - output.append(line) - continue - - if fence is None and _REF_DEF_LINE_RE.match(line): + for idx, line in enumerate(lines): + if _REF_DEF_LINE_RE.match(line): placeholder = f"SDIFF_REF_DEF_{counter}" counter += 1 definitions[placeholder] = line.strip() + # The legacy parser treats reference definition lines as their own blocks + # (even without blank lines) and they must also not become lazy-continuation + # lines inside list items. Force block separation. 
+ if output and output[-1].strip(): + output.append('') output.append(placeholder) + # Special-case: When a reference definition is followed by a fence-only line, + # and after blank lines another fence-only line begins, mistune 0.x tends to + # split the ref def into its own paragraph (it doesn't keep it glued to the + # closing fence marker). Insert a blank line after the placeholder to match. + if idx + 1 < len(lines) and _FENCE_ONLY_LINE_RE.match(lines[idx + 1]): + j = idx + 2 + # Only split when there is at least one blank line between fences. + if j < len(lines) and not lines[j].strip(): + while j < len(lines) and not lines[j].strip(): + j += 1 + if j < len(lines) and _FENCE_ONLY_LINE_RE.match(lines[j]): + output.append('') continue output.append(line) @@ -505,6 +1106,59 @@ def _is_inside_fenced_block(text: str, offset: int) -> bool: return False +def _is_inside_list_block(text: str, offset: int) -> bool: + """Best-effort mistune 0.x list-block detection. + + Mistune 0.x list parsing is permissive and supports lazy continuation lines. + For compatibility we treat everything following a list marker as being inside + the list block until a blank line is followed by a non-indented, non-list + marker line. + + We also treat reference definition lines as list terminators even without + blank lines (legacy behavior). + """ + in_list = False + pending_list_end = False + running = 0 + + for raw_line in text.splitlines(True): + line_len = len(raw_line) + line = raw_line[:-1] if raw_line.endswith('\n') else raw_line + + if not line.strip(): + if in_list: + pending_list_end = True + if running + line_len > offset: + return in_list + running += line_len + continue + + if pending_list_end: + if line[:1] in {' ', '\t'} or _LIST_MARKER_RE.match(line): + # Still inside the list block. + pass + else: + in_list = False + pending_list_end = False + + # Mistune 0.x list parsing stops before reference definition lines, even + # without blank lines. + if in_list and _REF_DEF_LINE_RE.match(line): + in_list = False + + line_is_list_marker = bool(_LIST_MARKER_RE.match(line)) + line_in_list = in_list or line_is_list_marker + if running + line_len > offset: + return line_in_list + + if line_is_list_marker: + in_list = True + + running += line_len + + return False + + def _remove_spaces_from_empty_lines(text): return '\n'.join([re.sub(r'^( {1,}|\t{1,})$', '\n', line) for line in text.splitlines()]) @@ -513,15 +1167,201 @@ def _remove_ltr_rtl_marks(text): return re.sub(r'(\u200e|\u200f)', '', text) +def _normalize_consecutive_fence_lines(text: str) -> str: + """Split consecutive fence-marker lines into separate blocks. + + The legacy parser tends to break paragraphs at repeated fence marker lines + like: + ~~~~ + ~~~~ + We insert a blank line between consecutive fence-only lines to keep block + structure compatible. + """ + out = [] + prev_was_fence = False + for line in text.splitlines(): + is_fence = bool(_FENCE_ONLY_LINE_RE.match(line)) + if is_fence and prev_was_fence and out and out[-1].strip(): + out.append('') + out.append(line) + prev_was_fence = is_fence + return '\n'.join(out) + + +def _normalize_consecutive_blockquote_lines(text: str) -> str: + """Split consecutive `>` quote lines into separate blocks. + + Mistune 0.x tends to break paragraphs on each quote-marker line when block quote + syntax isn't enabled in the lexer. We emulate that by inserting blank lines + between consecutive quote lines. 
+ """ + out = [] + in_list = False + pending_list_end = False + for line in text.splitlines(): + if not line.strip(): + out.append(line) + if in_list: + pending_list_end = True + continue + + if pending_list_end: + if line[:1] in {' ', '\t'} or _LIST_MARKER_RE.match(line): + # Still inside the list block. + pass + else: + in_list = False + pending_list_end = False + + # Mistune 0.x list parsing stops before reference definition lines, even + # without blank lines. Treat those as list terminators for normalization + # purposes. + if in_list and _REF_DEF_LINE_RE.match(line): + in_list = False + + is_quote = bool(_BLOCKQUOTE_LINE_RE.match(line)) + if is_quote and out and out[-1].strip() and not in_list: + out.append('') + out.append(line) + + if _LIST_MARKER_RE.match(line): + in_list = True + return '\n'.join(out) + + +def _normalize_fence_block_starts(text: str) -> str: + """Force mistune 0.x paragraph breaks before complete fence blocks. + + Mistune 0.x's `paragraph` regex stops when a *complete* fence block (as defined + by its `fences` regex) starts on the next line. We disable fence parsing, but + still need the same paragraph splitting behavior for structural diffs. + + We insert a blank line before any line that begins a fence block according to + the mistune 0.x `fences` regex. + + NOTE: This is intentionally restricted to non-indented lines to avoid + perturbing list-item parsing; legacy list items don't use paragraph parsing + either (they tokenize as plain text). + """ + if not text: + return text + + insert_positions = set() + in_list = False + pending_list_end = False + prev_blank = True + + offset = 0 + for raw_line in text.splitlines(True): + line_start = offset + offset += len(raw_line) + line = raw_line[:-1] if raw_line.endswith('\n') else raw_line + + if not line.strip(): + prev_blank = True + if in_list: + pending_list_end = True + continue + + if pending_list_end: + if line[:1] in {' ', '\t'} or _LIST_MARKER_RE.match(line): + # Still inside the list block. + pass + else: + in_list = False + pending_list_end = False + + # Mistune 0.x list parsing stops before reference definition lines, even + # without blank lines. Treat those as list terminators for normalization + # purposes. + if in_list and _REF_DEF_LINE_RE.match(line): + in_list = False + + if _LIST_MARKER_RE.match(line): + in_list = True + + first = line[:1] + if not in_list and not prev_blank and first in {'`', '~'} and first not in {' ', '\t'}: + if _MISTUNE08_FENCE_BLOCK_RE.match(text, line_start): + insert_positions.add(line_start) + + prev_blank = False + + if not insert_positions: + return text + + out = text + for start in sorted(insert_positions, reverse=True): + out = out[:start] + '\n' + out[start:] + return out + + +def _normalize_fence_only_lines_start_new_paragraphs(text: str) -> str: + """Force fence-only lines to start new paragraphs like mistune 0.x. + + The legacy parser breaks paragraphs when it encounters a fence-only marker line + (``` / ~~~) even though it doesn't parse fences as code blocks. Mistune 3 tends to + keep those markers inside a paragraph when fenced code parsing is disabled. 
+ """ + out = [] + prev_was_blank = True + in_fence_paragraph = False + for line in text.splitlines(True): + if not line.strip(): + out.append(line) + prev_was_blank = True + in_fence_paragraph = False + continue + + is_fence = bool(_FENCE_ONLY_LINE_RE.match(line)) + if is_fence and not prev_was_blank and not in_fence_paragraph: + out.append('\n') + prev_was_blank = True + + out.append(line) + if prev_was_blank: + in_fence_paragraph = is_fence + prev_was_blank = False + return ''.join(out) + + +def _merge_adjacent_lists(nodes): + """Merge directly-adjacent list blocks. + + The legacy parser is quite permissive and tends to merge adjacent lists even + when bullet markers or orderedness changes. Normalizing this reduces spurious + structural diffs vs `master`. + """ + merged = [] + for node in nodes: + # Recurse first. + if getattr(node, 'nodes', None): + node.nodes = _merge_adjacent_lists(node.nodes) + + if merged and isinstance(node, List) and isinstance(merged[-1], List): + merged[-1].add_nodes(node.nodes) + continue + merged.append(node) + return merged + + def parse(text, parser_cls: type[MdParser] = MdParser): """Parse Markdown into a Root node using the given parser class.""" text = _remove_spaces_from_empty_lines(text) text = _remove_ltr_rtl_marks(text) + text = _normalize_atx_heading_spaces(text) + text = _normalize_double_blank_line_list_nesting(text) + text = _normalize_ordered_list_marker_interrupts(text) + text = _normalize_list_lazy_continuations(text) + text = _normalize_consecutive_blockquote_lines(text) + text = _normalize_fence_block_starts(text) parser = parser_cls() if hasattr(parser, '_set_reference_definitions'): text, reference_definitions = _extract_reference_definitions(text) parser._set_reference_definitions(reference_definitions) result = parser.parse(text) if isinstance(result, list): - return Root(result) + root = Root(result) + root.nodes = _merge_adjacent_lists(root.nodes) + return root return result diff --git a/tests/test_parser.py b/tests/test_parser.py index 5d1c9c7..18f137e 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -69,9 +69,30 @@ def test_heading_text(self): actual = self._parse('### heading') self.assertEqual('heading', actual.nodes[0].nodes[0].text) + def test_heading_without_space_followed_by_text_parses_as_header(self): + actual = self._parse('##Heading\ntext') + self.assertEqual('2tpt', actual.print_all()) + + def test_heading_without_space_with_link_parses_as_header(self): + actual = self._parse('##[Verify email]({{url}})\ntext') + self.assertEqual('header', actual.nodes[0].name) + self.assertEqual(2, actual.nodes[0].level) + self.assertEqual('link', actual.nodes[0].nodes[0].name) + self.assertEqual('[Verify email]({{url}})', actual.nodes[0].nodes[0].text) + + def test_heading_without_space_in_list_item_followed_by_text(self): + actual = self._parse('1. 
##Heading\n text') + self.assertEqual('lm2tt', actual.print_all()) + def test_link_wrapped_in_text(self): self._run_and_assert('some text [link](url) new text', 'ptat') + def test_text_before_link_not_duplicated(self): + actual = self._parse('some text and [link](url)') + paragraph = actual.nodes[0] + self.assertEqual(['text', 'link'], [node.name for node in paragraph.nodes]) + self.assertEqual(['some text and '], [node.text for node in paragraph.nodes if node.name == 'text']) + def test_link_label_with_codespan(self): actual = self._parse('[use `foo`](url)') self.assertEqual('[use `foo`](url)', actual.nodes[0].nodes[0].text) @@ -99,7 +120,8 @@ def test_reference_definition_inside_list_item_preserved(self): data = '- item\n [id]: https://example.com' tree = self._parse(data) list_item = tree.nodes[0].nodes[0] - self.assertIn('[id]: https://example.com', list_item.nodes[0].text) + self.assertEqual('item', list_item.nodes[0].text) + self.assertEqual('[id]: https://example.com', tree.nodes[1].nodes[0].text) def test_reference_links_with_whitespace_and_empty_id(self): data = 'See [API][] and [Ref] [id].\n\n[API]: https://example.com\n[id]: https://example.com' @@ -114,7 +136,7 @@ def test_reference_definition_inside_fence_is_text(self): [link][id] ```""" tree = self._parse(data) - self.assertEqual('pt', tree.print_all()) + self.assertEqual('ptttptattt', tree.print_all()) def test_reference_definition_inside_long_fence_is_text(self): data = """```` @@ -122,7 +144,7 @@ def test_reference_definition_inside_long_fence_is_text(self): [link][id] ````""" tree = self._parse(data) - self.assertEqual('pt', tree.print_all()) + self.assertEqual('pttttptatttt', tree.print_all()) def test_softbreak_preserves_space(self): actual = self._parse('hello\nworld') @@ -134,18 +156,40 @@ def test_block_quote_preserves_marker(self): def test_fenced_code_preserves_fences(self): actual = self._parse('```\ncode\n```') - self.assertEqual('```\ncode\n```', actual.nodes[0].nodes[0].text) + self.assertEqual('ptttttt', actual.print_all()) + text = ''.join(node.text for node in actual.nodes[0].nodes) + self.assertTrue(text.startswith('```')) + self.assertTrue(text.endswith('```')) def test_ordered_list_parses_as_ordered(self): tree = self._parse('1. one\n2. two') list_node = tree.nodes[0] self.assertTrue(list_node.ordered) + def test_ordered_list_marker_other_than_1_interrupts_paragraph(self): + self._run_and_assert('para\n2. item\n', 'ptlmt') + + def test_list_item_allows_unindented_heading_lazy_continuation(self): + tree = self._parse('* a\n###### b\n') + self.assertEqual(1, len(tree.nodes)) + self.assertEqual('list', tree.nodes[0].name) + item = tree.nodes[0].nodes[0] + self.assertEqual(['text', 'header'], [node.name for node in item.nodes]) + self.assertEqual('a', item.nodes[0].text) + self.assertEqual(6, item.nodes[1].level) + self.assertEqual('b', item.nodes[1].nodes[0].text) + def test_unordered_list_parses_as_unordered(self): tree = self._parse('- one\n- two') list_node = tree.nodes[0] self.assertFalse(list_node.ordered) + def test_double_blank_lines_between_list_items_nests_next_list(self): + self._run_and_assert('* a\n\n\n* b\n', 'lmtlmt') + + def test_double_blank_lines_between_ordered_list_items_nests_next_list(self): + self._run_and_assert('1. a\n\n\n1. 
b\n', 'lmtlmt') + class TestZendeskParser(ParserTestCase): def setUp(self) -> None: @@ -181,6 +225,22 @@ def test_callout_invalid_style(self): actual = self._parse(fixture) self.assertNotEqual(actual.nodes[0].name, 'callout') + def test_callout_invalid_style_does_not_swallow_trailing_closing_tag(self): + fixture = '\n# title\ncontent\n\n\n' + self._run_and_assert(fixture, 'xpt') + + def test_callout_tags_inside_list_item_are_text_and_allow_headings(self): + fixture = '1. item\n\n# title\ncontent\n\n' + tree = self._parse(fixture) + self.assertEqual(1, len(tree.nodes)) + self.assertEqual('list', tree.nodes[0].name) + item = tree.nodes[0].nodes[0] + self.assertEqual(['text', 'text', 'header', 'text', 'text'], [node.name for node in item.nodes]) + self.assertEqual('<callout>', item.nodes[1].text) + self.assertEqual(1, item.nodes[2].level) + self.assertEqual('title', item.nodes[2].nodes[0].text) + self.assertEqual('</callout>', item.nodes[-1].text) + def test_tabs(self): fixture = """ @@ -192,12 +252,12 @@ def test_tabs(self): """ self._run_and_assert(fixture, 'T1tpt1tpt') - def test_inline_callout_is_structural(self): + def test_inline_callout_is_not_structural(self): fixture = """intro # title content outro""" - self._run_and_assert(fixture, 'ptC1tptpt') + self._run_and_assert(fixture, 'pt1tpt') def test_zendesk_tags_inside_fenced_code_are_text(self): fixture = """``` @@ -214,9 +274,26 @@ def test_zendesk_tags_inside_fenced_code_are_text(self): ```""" tree = self._parse(fixture) - self.assertEqual('pt', tree.print_all()) + self.assertEqual('ptttxxxpttt', tree.print_all()) self.assertFalse(any(node.name in {'callout', 'steps', 'tabs'} for node in tree.nodes)) + def test_zendesk_tags_after_fenced_code_are_parsed(self): + fixture = """``` + +# title +content + +``` + + +# title +content + +""" + tree = self._parse(fixture) + self.assertTrue(any(node.name == 'callout' for node in tree.nodes)) + self.assertEqual(1, tree.print_all().count('C')) + def test_steps(self): steps_fixture = """ diff --git a/tests/test_sdiff.py b/tests/test_sdiff.py index db8bf45..55d3079 100644 --- a/tests/test_sdiff.py +++ b/tests/test_sdiff.py @@ -63,6 +63,18 @@ def test_softbreaks_ignored_in_structure(self): _, _, errors = sdiff.diff(left, right) self.assertEqual([], errors) + def test_heading_without_space_matches_heading_with_space(self): + left = '##Heading\ntext' + right = '## Heading\ntext' + _, _, errors = sdiff.diff(left, right) + self.assertEqual([], errors) + + def test_list_heading_without_space_matches_heading_with_space(self): + left = '1. ##Heading\n text' + right = '1. ## Heading\n text' + _, _, errors = sdiff.diff(left, right) + self.assertEqual([], errors) + def test_reference_definition_missing_is_reported(self): left = 'See [API][id].\n\n[id]: https://example.com' right = 'See [API][id].' 
@@ -78,3 +90,22 @@ def test_code_block_content_ignored_in_structure(self): ```""" _, _, errors = sdiff.diff(left, right) self.assertEqual([], errors) + + def test_invalid_callout_followed_by_fence_does_not_depend_on_blank_line(self): + left = """ +# title +content + + +``` +code +```""" + right = """ +# title +content + +``` +code +```""" + _, _, errors = sdiff.diff(left, right, parser_cls=ZendeskHelpMdParser) + self.assertEqual([], errors) From 2f68918016b7aeb96d886b19771f86b1c25734bd Mon Sep 17 00:00:00 2001 From: Philipp Berner <374326+philippb@users.noreply.github.com> Date: Fri, 6 Feb 2026 12:16:10 -0800 Subject: [PATCH 15/18] tests: cover inline buffer flush duplication --- tests/test_parser.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/test_parser.py b/tests/test_parser.py index 18f137e..965e8f5 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1,6 +1,7 @@ from unittest import TestCase from sdiff import parser, MdParser, ZendeskHelpMdParser from sdiff.model import Paragraph, Root, Text, ZendeskHelpSteps +from sdiff.renderer import TextRenderer class ParserTestCase(TestCase): @@ -87,6 +88,33 @@ def test_heading_without_space_in_list_item_followed_by_text(self): def test_link_wrapped_in_text(self): self._run_and_assert('some text [link](url) new text', 'ptat') + def test_link_with_trailing_text_does_not_duplicate_buffer(self): + actual = self._parse('some text [link](url) new text') + paragraph = actual.nodes[0] + self.assertEqual(['text', 'link', 'text'], [node.name for node in paragraph.nodes]) + self.assertEqual('some text ', paragraph.nodes[0].text) + self.assertEqual('[link](url)', paragraph.nodes[1].text) + self.assertEqual(' new text', paragraph.nodes[2].text) + + def test_image_with_trailing_text_does_not_duplicate_buffer(self): + actual = self._parse('some ![alt](url) new') + paragraph = actual.nodes[0] + self.assertEqual(['text', 'image', 'text'], [node.name for node in paragraph.nodes]) + self.assertEqual('some ', paragraph.nodes[0].text) + self.assertEqual('![alt](url)', paragraph.nodes[1].text) + self.assertEqual(' new', paragraph.nodes[2].text) + + def test_inline_marker_does_not_duplicate_buffer(self): + actual = self._parse('some **bold** text') + self.assertEqual('some **bold** text', TextRenderer().render(actual)) + + def test_inline_linebreak_does_not_duplicate_buffer(self): + actual = self._parse('a\\\nb') + paragraph = actual.nodes[0] + self.assertEqual(['text', 'new-line', 'text'], [node.name for node in paragraph.nodes]) + self.assertEqual('a', paragraph.nodes[0].text) + self.assertEqual('b', paragraph.nodes[2].text) + def test_text_before_link_not_duplicated(self): actual = self._parse('some text and [link](url)') paragraph = actual.nodes[0] From 4e447aef3aad1a0e0de75d41d0cafdb8e2ff0668 Mon Sep 17 00:00:00 2001 From: Philipp Berner <374326+philippb@users.noreply.github.com> Date: Fri, 6 Feb 2026 13:23:33 -0800 Subject: [PATCH 16/18] chore: align flake8 config with repo defaults --- .flake8 | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.flake8 b/.flake8 index aac8e09..7148fd6 100644 --- a/.flake8 +++ b/.flake8 @@ -1,8 +1,6 @@ [flake8] max-line-length = 120 -max-complexity = 12 -select = E,F,W,C90 -extend-ignore = F403,F405 +ignore = F403,F405 exclude = .git, __pycache__, From 2684d49306f1dfa25a96927bff7fb01cd881b95a Mon Sep 17 00:00:00 2001 From: Philipp Berner <374326+philippb@users.noreply.github.com> Date: Fri, 6 Feb 2026 13:23:39 -0800 Subject: [PATCH 17/18] tests: raise 
coverage for parser and helpers --- tests/test_coverage.py | 416 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 416 insertions(+) create mode 100644 tests/test_coverage.py diff --git a/tests/test_coverage.py b/tests/test_coverage.py new file mode 100644 index 0000000..92e88c4 --- /dev/null +++ b/tests/test_coverage.py @@ -0,0 +1,416 @@ +from unittest import TestCase + +import sdiff.compare as compare_mod +from sdiff import MdParser, parser +from sdiff.compare import diff_struct +from sdiff.errors import InsertError +from sdiff.model import Header, Link, List, ListItem, NewLine, Root, Text, ZendeskHelpCallout +from sdiff.renderer import TextRenderer +from tests.fixtures import trees + + +class TestCoverageMisc(TestCase): + def test_diff_error_str_uses_message(self): + err = InsertError(Text("x")) + self.assertIn("missing element", str(err)) + + def test_node_str_repr_and_eq(self): + node = Root([Text("x")]) + self.assertTrue(str(node)) + self.assertIn("root", repr(node)) + self.assertNotEqual(node, "not-a-node") + + def test_header_str_repr(self): + header = Header(3, [Text("x")]) + self.assertEqual("3", str(header)) + self.assertIn("level", repr(header)) + + def test_list_and_link_repr_and_eq_branches(self): + self.assertFalse(List(False) == "nope") # noqa: E711 + self.assertIn("ordered", repr(List(False))) + self.assertIn("link", repr(Link("x"))) + self.assertIn("new-line", repr(NewLine())) + self.assertIn("callout", repr(ZendeskHelpCallout("green"))) + self.assertFalse(ZendeskHelpCallout("green") == "nope") # noqa: E711 + + def test_fixture_empty_tree(self): + self.assertEqual("", trees.empty_tree().print_all()) + + def test_diff_struct_ignores_single_space_nodes(self): + # Cover the "ignore single space errors" branch in compare.py. + tree1 = Root([Text(" "), Text("x")]) + tree2 = Root([Text("x")]) + _, _, errors = diff_struct(tree1, tree2) + self.assertEqual(0, len(errors)) + + def test_apply_diff_ranges_ignores_single_space_nodes(self): + # Cover the "ignore single space errors" branches in compare.py explicitly. + delete_only = [("x", 0, 1, 0, 0)] + insert_only = [("x", 0, 0, 0, 1)] + + errors = compare_mod._apply_diff_ranges(delete_only, [Text(" ")], []) + self.assertEqual([], errors) + + errors = compare_mod._apply_diff_ranges(insert_only, [], [Text(" ")]) + self.assertEqual([], errors) + + errors = compare_mod._apply_diff_ranges(delete_only, [Text("x")], []) + self.assertEqual(1, len(errors)) + self.assertIn("additional element", str(errors[0])) + + errors = compare_mod._apply_diff_ranges(insert_only, [], [Text("x")]) + self.assertEqual(1, len(errors)) + self.assertIn("missing element", str(errors[0])) + + +class TestCoverageParserHelpers(TestCase): + def test_split_legacy_block_html_variants(self): + self.assertIsNone(parser._split_legacy_block_html("")) + self.assertIsNone(parser._split_legacy_block_html("not html\n")) + + # Exact match should return None (no suffix to split). + self.assertIsNone(parser._split_legacy_block_html("
<pre>hi</pre>\n"))
+
+        prefix, suffix = parser._split_legacy_block_html("<pre>hi</pre>\n\nnext")
+        self.assertTrue(prefix.startswith("<pre>hi</pre>"))
+        self.assertEqual("next", suffix)
+
+    def test_block_parser_disabled_rules_return_none(self):
+        block = parser._SdiffBlockParser()
+        self.assertIsNone(block.parse_fenced_code(None, None))
+        self.assertIsNone(block.parse_block_quote(None, None))
+
+    def test_mdparser_get_lexer_returns_instance(self):
+        self.assertIsInstance(MdParser.get_lexer(), MdParser)
+
+    def test_split_text_on_legacy_markers(self):
+        self.assertEqual([], parser._split_text_on_legacy_markers(""))
+        self.assertEqual(["a", "`b", "`c"], parser._split_text_on_legacy_markers("a`b`c"))
+
+    def test_unquote_url_if_template(self):
+        url = "https://example.com/%7B%7Burl%7D%7D"
+        self.assertIn("{{url}}", parser._unquote_url_if_template(url))
+        # Percent-encoded but not template-like => keep as-is.
+        self.assertEqual("https://example.com/%2F", parser._unquote_url_if_template("https://example.com/%2F"))
+
+    def test_is_block_html(self):
+        self.assertTrue(parser._is_block_html("<pre></pre>"))
+        self.assertFalse(parser._is_block_html("text"))
+        self.assertTrue(parser._is_block_html("<pre>text</pre>"))
+        self.assertFalse(parser._is_block_html("nope"))
+
+    def test_normalize_block_indentation(self):
+        # Only non-HTML lines should be considered for min-indent normalization.
+        raw = "<pre>\n x\n</pre>\n y"
+        normalized = parser._normalize_block_indentation(raw)
+        self.assertIn("y", normalized)
+
+    def test_extract_reference_definitions_fence_special_case(self):
+        raw = "[id]: https://example.com\n```\n\n```"
+        text, defs = parser._extract_reference_definitions(raw)
+        self.assertEqual(1, len(defs))
+        # The special-case inserts a blank line after the placeholder.
+        self.assertTrue(text.startswith("SDIFF_REF_DEF_0\n\n"))
+
+    def test_extract_reference_definitions_fence_special_case_not_triggered_without_blank_line(self):
+        raw = "[id]: https://example.com\n```\n```"
+        text, defs = parser._extract_reference_definitions(raw)
+        self.assertEqual(1, len(defs))
+        self.assertEqual("SDIFF_REF_DEF_0\n```\n```", text)
+
+    def test_is_inside_fenced_block(self):
+        raw = "```\ncode\n```\noutside"
+        # Offset inside "code".
+        self.assertTrue(parser._is_inside_fenced_block(raw, raw.index("code")))
+        # Offset inside "outside".
+        self.assertFalse(parser._is_inside_fenced_block(raw, raw.index("outside")))
+        # Offset past end => fall through.
+        self.assertFalse(parser._is_inside_fenced_block(raw, len(raw) + 1))
+
+    def test_is_inside_list_block(self):
+        raw = "- a\n b\n\nc"
+        self.assertTrue(parser._is_inside_list_block(raw, raw.index("b")))
+        self.assertFalse(parser._is_inside_list_block(raw, raw.index("c")))
+        # Offset past end => fall through.
+        self.assertFalse(parser._is_inside_list_block(raw, len(raw) + 1))
+
+    def test_normalize_consecutive_fence_lines(self):
+        raw = "```\n```\ntext"
+        normalized = parser._normalize_consecutive_fence_lines(raw)
+        self.assertIn("```\n\n```", normalized)
+
+    def test_normalize_consecutive_blockquote_lines(self):
+        raw = "> a\n> b\nc"
+        normalized = parser._normalize_consecutive_blockquote_lines(raw)
+        self.assertIn("> a\n\n> b", normalized)
+
+    def test_normalize_fence_only_lines_start_new_paragraphs(self):
+        raw = "a\n```\nb"
+        normalized = parser._normalize_fence_only_lines_start_new_paragraphs(raw)
+        self.assertIn("a\n\n```", normalized)
+        # Blank line resets state.
+ normalized = parser._normalize_fence_only_lines_start_new_paragraphs("a\n\n```\n\n```") + self.assertIn("\n\n```\n\n```", normalized) + + def test_normalize_double_blank_line_list_nesting_does_not_overindent(self): + raw = "* a\n\n\n * b\n" + normalized = parser._normalize_double_blank_line_list_nesting(raw) + self.assertEqual(raw, normalized) + + def test_merge_adjacent_lists(self): + l1 = List(False, [ListItem([Text("a")])]) + l2 = List(True, [ListItem([Text("b")])]) + root = Root([l1, l2]) + merged = parser._merge_adjacent_lists(root.nodes) + self.assertEqual(1, len(merged)) + self.assertEqual(2, len(merged[0].nodes)) + + def test_parse_passthrough_when_parser_returns_non_list(self): + class _Dummy(MdParser): + def parse(self, text, rules=None): # noqa: ANN001 + return Root([Text("x")]) + + parsed = parser.parse("x", parser_cls=_Dummy) + self.assertIsInstance(parsed, Root) + + +class TestCoverageParserConversions(TestCase): + def setUp(self) -> None: + super().setUp() + self.p = MdParser() + + def test_convert_block_token_branches(self): + item = self.p._convert_block_token( + { + "type": "list_item", + "children": [{"type": "paragraph", "children": [{"type": "text", "raw": "x"}]}], + } + )[0] + self.assertEqual("list-item", item.name) + + block_text = self.p._convert_block_token({"type": "block_text", "children": [{"type": "text", "raw": "x"}]})[0] + self.assertEqual("paragraph", block_text.name) + + quote = self.p._convert_block_token( + {"type": "block_quote", "children": [{"type": "paragraph", "children": [{"type": "text", "raw": "q"}]}]} + )[0] + self.assertEqual("paragraph", quote.name) + self.assertIn(">", quote.nodes[0].text) + + code = self.p._convert_block_token({"type": "block_code", "raw": "code\n", "marker": "```"})[0] + self.assertTrue(code.nodes[0].text.startswith("```")) + + def test_convert_list_ordered_attr_fallback(self): + lst = self.p._convert_list({"type": "list", "attrs": {"ordered": True}, "children": []}) + self.assertTrue(lst.ordered) + + def test_convert_block_html_with_suffix(self): + token = {"type": "block_html", "raw": "
<pre>hi</pre>\n\ntext"}
+        nodes = self.p._convert_block_html(token)
+        self.assertEqual("html", nodes[0].name)
+        self.assertEqual("paragraph", nodes[1].name)
+
+        # Split happens, but suffix is whitespace-only => no extra nodes.
+        token = {"type": "block_html", "raw": "<pre>hi</pre>\n\n "}
+        nodes = self.p._convert_block_html(token)
+        self.assertEqual(1, len(nodes))
+
+        # Whitespace-only raw => empty conversion.
+        self.assertEqual([], self.p._convert_block_html({"type": "block_html", "raw": " "}))
+
+    def test_convert_passthrough_block_children_and_raw(self):
+        out = self.p._convert_passthrough_block(
+            {"type": "unknown", "children": [{"type": "paragraph", "children": [{"type": "text", "raw": "x"}]}]}
+        )
+        self.assertEqual("paragraph", out[0].name)
+        out2 = self.p._convert_passthrough_block({"type": "unknown", "raw": "raw"})
+        self.assertEqual("paragraph", out2[0].name)
+
+    def test_convert_block_quote_early_returns(self):
+        self.assertEqual([], self.p._convert_block_quote({"type": "block_quote", "children": []}))
+        self.assertEqual(
+            [],
+            self.p._convert_block_quote({"type": "block_quote", "children": [{"type": "paragraph", "children": []}]}),
+        )
+
+    def test_render_inline_children_unknown_child_type(self):
+        out = self.p._render_inline_children([{"type": "thematic_break", "raw": "---"}])
+        self.assertEqual("---", out)
+
+    def test_inline_other_and_codespan_text_fallback(self):
+        tokens = [{"type": "codespan", "text": "x"}, {"type": "unknown", "raw": "<x>"}]
+        out = self.p._convert_inline_tokens(tokens)
+        self.assertEqual("`x`<x>", "".join(node.text for node in out))
+
+    def test_inline_marker_without_children_and_inline_other_with_children(self):
+        out = self.p._convert_inline_tokens([{"type": "strong", "children": []}])
+        self.assertEqual(["text", "text"], [n.name for n in out])
+
+        out = self.p._convert_inline_tokens([{"type": "unknown", "children": [{"type": "text", "raw": "x"}]}])
+        self.assertEqual("x", out[0].text)
+
+        out = self.p._convert_inline_tokens([{"type": "unknown", "raw": " "}])
+        self.assertEqual([], out)
+
+    def test_flatten_inline_text_unknown_branches(self):
+        text = self.p._flatten_inline_text(
+            [
+                {"type": "codespan", "raw": "x"},
+                {"type": "unknown", "children": [{"type": "text", "raw": "y"}]},
+                {"type": "unknown", "raw": "z"},
+            ]
+        )
+        self.assertIn("`x`", text)
+        self.assertTrue(text.endswith("z"))
+
+    def test_flatten_inline_markup_link_and_image(self):
+        tokens = [
+            {"type": "text", "raw": "a"},
+            {"type": "softbreak"},
+            {"type": "link", "children": [{"type": "text", "raw": "L"}], "attrs": {"url": "%7B%7Burl%7D%7D"}},
+            {"type": "softbreak"},
+            {"type": "image", "children": [{"type": "text", "raw": "A"}], "attrs": {"url": "u", "title": 't"'}},
+        ]
+        s = self.p._flatten_inline_markup(tokens, softbreak_as_newline=True)
+        self.assertIn("[L]({{url}})", s)
+        self.assertIn('![A](u "t\\"")', s)
+
+    def test_flatten_inline_markup_unknown_branches(self):
+        tokens = [
+            {"type": "unknown", "children": [{"type": "text", "raw": "x"}]},
+            {"type": "unknown", "raw": "y"},
+        ]
+        s = self.p._flatten_inline_markup(tokens)
+        self.assertEqual("xy", s)
+
+    def test_convert_list_block_nodes_ref_heading_and_text(self):
+        self.p._set_reference_definitions(
+            {
+                "SDIFF_REF_DEF_0": "[id]: https://example.com",
+                "[id]: https://example.com": "[id]: https://example.com",
+            }
+        )
+        tokens = [
+            {"type": "text", "raw": "SDIFF_REF_DEF_0"},
+            {"type": "softbreak"},
+            {"type": "text", "raw": "###header"},
+            {"type": "softbreak"},
+            {"type": "text", "raw": " "},
+            {"type": "softbreak"},
+            {"type": "text", "raw": "plain"},
+        ]
+        nodes = self.p._convert_list_block_nodes(tokens)
+        self.assertEqual(["text", "header", "text"], [n.name for n in nodes])
+
+    def test_convert_list_block_nodes_empty(self):
+        self.assertEqual([], self.p._convert_list_block_nodes([]))
+
+    def test_heading_from_inline_fallback_branch(self):
+        class _NoHeading(MdParser):
+            def __init__(self):
+                super().__init__()
+                self._markdown = lambda _: [{"type": "paragraph", "children": []}]  # noqa: E731
+
+        p = _NoHeading()
+        heading = p._heading_from_inline([{"type": "text", "raw": "###header"}])
+        self.assertEqual("header", heading.name)
+        self.assertEqual("text", heading.nodes[0].name)
+
+    def test_convert_paragraph_or_heading_ref_and_heading(self):
+        self.p._set_reference_definitions({"SDIFF_REF_DEF_0": "[id]: https://example.com"})
+        node = self.p._convert_paragraph_or_heading([{"type": "text", "raw": "SDIFF_REF_DEF_0"}])
+        self.assertEqual("paragraph", node.name)
+
+        node = self.p._convert_paragraph_or_heading([{"type": "text", "raw": "###header"}])
+        self.assertEqual("header", node.name)
+
+        node = self.p._convert_paragraph_token([{"type": "text", "raw": "###header"}])[0]
+        self.assertEqual("header", node.name)
+
+    def test_split_paragraph_inline_on_fence_variants(self):
+        self.assertIsNone(self.p._split_paragraph_inline_on_fence([]))
+        self.assertIsNone(self.p._split_paragraph_inline_on_fence([{"type": "text", "raw": "x"}]))
+
+        # First line is a fence-only marker => do not split.
+        tokens = [{"type": "text", "raw": "```"}, {"type": "softbreak"}, {"type": "text", "raw": "x"}]
+        self.assertIsNone(self.p._split_paragraph_inline_on_fence(tokens))
+
+        # Tail is fence markers but not a complete fence block => do not split.
+        tokens = [
+            {"type": "text", "raw": "a"},
+            {"type": "softbreak"},
+            {"type": "text", "raw": "```"},
+            {"type": "softbreak"},
+            {"type": "text", "raw": "```"},
+        ]
+        self.assertIsNone(self.p._split_paragraph_inline_on_fence(tokens))
+
+        # Complete fence block tail => split.
+        tokens = [
+            {"type": "text", "raw": "a"},
+            {"type": "softbreak"},
+            {"type": "text", "raw": "```"},
+            {"type": "softbreak"},
+            {"type": "text", "raw": "code"},
+            {"type": "softbreak"},
+            {"type": "text", "raw": "```"},
+        ]
+        parts = self.p._split_paragraph_inline_on_fence(tokens)
+        self.assertEqual(2, len(parts))
+
+        nodes = self.p._convert_paragraph_token(tokens)
+        self.assertEqual(2, len(nodes))
+
+    def test_split_paragraph_inline_on_fence_first_part_includes_seps(self):
+        tokens = [
+            {"type": "text", "raw": "a"},
+            {"type": "softbreak"},
+            {"type": "text", "raw": "b"},
+            {"type": "softbreak"},
+            {"type": "text", "raw": "```"},
+            {"type": "softbreak"},
+            {"type": "text", "raw": "code"},
+            {"type": "softbreak"},
+            {"type": "text", "raw": "```"},
+        ]
+        parts = self.p._split_paragraph_inline_on_fence(tokens)
+        self.assertEqual(2, len(parts))
+
+    def test_convert_list_item_block_html_text_smoke(self):
+        # Exercise conversion of text following a (hypothetical) HTML block inside a list item.
+        nodes = self.p._convert_list_item_block_html_text("text\n\n# h\n\n- a\n")
+        self.assertTrue(any(n.name == "header" for n in nodes))
+        self.assertTrue(any(n.name == "list" for n in nodes))
+
+    def test_convert_list_item_with_block_html_child(self):
+        token = {
+            "type": "list_item",
+            "children": [
+                {"type": "block_html", "raw": "<pre>hi</pre>"},
+            ],
+        }
+        item = self.p._convert_list_item(token)
+        self.assertTrue(item.nodes)
+
+    def test_convert_list_item_block_html_variants(self):
+        self.assertEqual([], self.p._convert_list_item_block_html({"type": "block_html", "raw": " "}))
+
+        nodes = self.p._convert_list_item_block_html({"type": "block_html", "raw": "not html\n"})
+        self.assertTrue(nodes)
+
+        nodes = self.p._convert_list_item_block_html({"type": "block_html", "raw": "<pre>hi</pre>\n\n "})
+        self.assertTrue(nodes)
+
+    def test_convert_list_item_block_html_text_with_block_html_and_raw(self):
+        nodes = self.p._convert_list_item_block_html_text("<pre>hi</pre>\n\n---\n")
+        self.assertTrue(any(n.name == "text" for n in nodes))
+
+    def test_convert_list_item_block_html_smoke(self):
+        token = {"type": "block_html", "raw": "<pre>hi</pre>\n\ntext"}
+        nodes = self.p._convert_list_item_block_html(token)
+        self.assertTrue(any(isinstance(n, Text) for n in nodes))
+
+    def test_rendering_roundtrip_smoke(self):
+        md = "some text [link](url) new text"
+        tree = parser.parse(md, parser_cls=MdParser)
+        self.assertEqual(md, TextRenderer().render(tree))

From ada6821b16f56c20490e8d2f44879c75d902cad3 Mon Sep 17 00:00:00 2001
From: Philipp Berner <374326+philippb@users.noreply.github.com>
Date: Fri, 6 Feb 2026 13:30:29 -0800
Subject: [PATCH 18/18] chore: ignore node_modules

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index ce77503..8f655ec 100644
--- a/.gitignore
+++ b/.gitignore
@@ -56,3 +56,4 @@ target/
 venv/
 .DS_Store
 .idea/
+node_modules/