diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..7148fd6 --- /dev/null +++ b/.flake8 @@ -0,0 +1,10 @@ +[flake8] +max-line-length = 120 +ignore = F403,F405 +exclude = + .git, + __pycache__, + venv, + build, + dist, + sdiff.egg-info diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..bf9a1e8 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,37 @@ +name: CI + +on: + workflow_dispatch: + pull_request: + types: [opened, synchronize, reopened, ready_for_review] + push: + branches: [master] + +jobs: + test: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.11" + cache: "pip" + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + python -m pip install .[tests] + + - name: Format check + run: python -m autopep8 --exit-code --diff --max-line-length 120 -r sdiff tests + + - name: Lint + run: python -m flake8 --config .flake8 sdiff tests + + - name: Test + run: python -m coverage run -m pytest -s --durations=3 --durations-min=0.005 + + - name: Coverage report + run: python -m coverage report -m diff --git a/.gitignore b/.gitignore index ce77503..8f655ec 100644 --- a/.gitignore +++ b/.gitignore @@ -56,3 +56,4 @@ target/ venv/ .DS_Store .idea/ +node_modules/ diff --git a/.husky/pre-commit b/.husky/pre-commit new file mode 100755 index 0000000..bc7696e --- /dev/null +++ b/.husky/pre-commit @@ -0,0 +1,5 @@ +#!/usr/bin/env sh +. "$(dirname -- "$0")/_/husky.sh" + +python -m autopep8 --exit-code --diff --max-line-length 120 -r sdiff tests +python -m flake8 --config .flake8 sdiff tests diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index df31221..0000000 --- a/.travis.yml +++ /dev/null @@ -1,11 +0,0 @@ -language: python -dist: jammy -python: - - "3.11" -# command to install dependencies -install: - - make dev -# command to run tests -script: - - make test - - make coverage diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..526549e --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,32 @@ +# Repository Guidelines + +## Project Structure & Module Organization +The core library lives in `sdiff/` (parser, comparer, renderer, and models). Tests are in `tests/`, with shared fixtures in `tests/fixtures/`. Reference PDFs sit in `docs/`. Packaging and tooling are defined in `setup.py`, `setup.cfg`, and the `Makefile`; `CHANGELOG` tracks releases. + +## Build, Test, and Development Commands +- `make env` creates the local `venv/` (Python 3.11+). +- `make dev` installs the package plus test/dev extras (`.[tests,devtools]`) into the venv. +- `make test` runs linting and the full pytest suite with coverage. +- `make vtest` runs pytest verbosely. +- `make flake` runs the autopep8 format check and flake8 on `sdiff/` and `tests/`. +- `make format` applies autopep8 formatting to `sdiff/` and `tests/`. +- `make cov` prints the coverage report. +- `make clean` removes build artifacts and the venv. +- `make hooks` installs Husky git hooks (requires Node/npm; `make dev` runs this). + +Lint parity: CI and the Husky pre-commit hook both run the same checks as `make flake` (autopep8 check + flake8). Run `make flake` or `make test` locally to mirror CI. + +Example flow: +```sh +make dev +make test +``` + +## Coding Style & Naming Conventions +Use standard Python conventions: 4-space indentation, `snake_case` for modules/functions/variables, and `PascalCase` for classes. 
Flake8 enforces a 120-character line limit (see `.flake8`). `autopep8` is available for formatting. Keep new modules in `sdiff/` and new tests in `tests/` with filenames like `test_<name>.py`.
+
+## Testing Guidelines
+The suite uses `pytest` with `coverage`. Coverage is expected to stay high (the current configuration fails below 96%). Add or update tests for behavior changes, and prefer small, focused unit tests. Place reusable data in `tests/fixtures/`. Run `make test` before submitting changes.
+
+## Commit & Pull Request Guidelines
+Commit messages in this repo are short and often use a type prefix (e.g., `chore: ...`, `fixes: ...`, `hotfix: ...`, `refactors: ...`). Follow that pattern where practical, and keep the summary concise. For PRs, include a brief description, list the tests run (e.g., `make test`), and link related issues or tickets when available.
diff --git a/Makefile b/Makefile
index 6eeb1e2..4be00c9 100644
--- a/Makefile
+++ b/Makefile
@@ -19,6 +19,7 @@ env:
 dev: env update
 	$(PIP) install .[tests,devtools]
+	@$(MAKE) hooks
 
 install: env update
@@ -28,8 +29,20 @@ publish:
 	$(TWINE) upload --verbose --sign --username developer --repository-url http://$(PYPICLOUD_HOST)/simple/ dist/*.whl
 
 flake:
+	$(PYTHON) -m autopep8 --exit-code --diff --max-line-length 120 -r sdiff tests
 	$(FLAKE) sdiff tests
 
+format:
+	$(PYTHON) -m autopep8 --in-place --max-line-length 120 -r sdiff tests
+
+hooks:
+	@if command -v npm >/dev/null 2>&1; then \
+		npm install --no-package-lock --silent; \
+		npm run --silent prepare; \
+	else \
+		echo "npm not found; skipping husky install"; \
+	fi
+
 test: flake
 	$(COVERAGE) run -m pytest $(TEST_RUNNER_FLAGS)
 
@@ -57,4 +70,4 @@ clean:
 	rm -rf venv
 
-.PHONY: all build env linux run pep test vtest testloop cov clean
+.PHONY: all build env linux run pep test vtest testloop cov clean hooks format
diff --git a/README.md b/README.md
index b8bb2a8..7ab5d32 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,40 @@
 # md-sdiff
-Diffs to markdown texts only based on their structure. Ignores content. Helpful to diff 2 files that contain the same content in different languages.
+
+Structural diffs for Markdown. The library parses two Markdown inputs into a lightweight tree and compares the *shape* (headings, lists, paragraphs, links, etc.) instead of the text content. This is useful when you expect the same document structure across translations, or when you want to validate formatting consistency without caring about the wording.
+
+## What it does
+- Parses Markdown into an AST-like node tree using `mistune`.
+- Compares trees node-by-node and flags insertions/deletions in structure.
+- Returns a rendered view of each document plus a list of structural errors.
+- Supports a Zendesk-specific parser (`ZendeskHelpMdParser`) for `<callout>`, `<steps>`, and `<tabs>` blocks.
+
+## Example usage
+```python
+from sdiff import diff, TextRenderer, MdParser
+
+left = "# Title\n\n- One\n- Two"
+right = "# Title\n\n- One\n- Two\n- Three"
+
+rendered_left, rendered_right, errors = diff(left, right, renderer=TextRenderer(), parser_cls=MdParser)
+print(errors[0])  # "There is a missing element `li`."
+```
+
+## Renderers
+`TextRenderer` returns the original Markdown structure as text. `HtmlRenderer` wraps the output and marks structural insertions/deletions with `<ins>` and `<del>`.
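+
+A minimal sketch of the HTML-marked flow (assumptions: `HtmlRenderer` is importable from `sdiff.renderer` like `TextRenderer`, and takes no constructor arguments):
+
+```python
+from sdiff import diff
+from sdiff.renderer import HtmlRenderer  # assumed import path, mirroring TextRenderer
+
+left = "# Title\n\nShared paragraph"
+right = "# Title"
+
+# Same call signature as with TextRenderer; only the renderer changes.
+rendered_left, rendered_right, errors = diff(left, right, renderer=HtmlRenderer())
+print(rendered_left)   # structure of the left document, structural deletions marked
+print(rendered_right)  # structure of the right document, structural insertions marked
+```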
+ +## One-off usage +```sh +python - <<'PY' +from sdiff import diff, TextRenderer + +left = open("left.md", "r", encoding="utf-8").read() +right = open("right.md", "r", encoding="utf-8").read() +_, _, errors = diff(left, right, renderer=TextRenderer()) + +for err in errors: + print(err) +PY +``` + +## Notes +This project is a library (no CLI). If you need different token handling, you can provide a custom parser class that extends `MdParser`. diff --git a/package.json b/package.json new file mode 100644 index 0000000..d682872 --- /dev/null +++ b/package.json @@ -0,0 +1,10 @@ +{ + "name": "html-structure-diff", + "private": true, + "devDependencies": { + "husky": "^9.0.0" + }, + "scripts": { + "prepare": "husky install" + } +} diff --git a/requirements.txt b/requirements.txt index 1f202e5..a234623 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1 @@ -mistune==0.8.1 +mistune==3.2.0 diff --git a/sdiff/__init__.py b/sdiff/__init__.py index 853d12c..85b6af4 100644 --- a/sdiff/__init__.py +++ b/sdiff/__init__.py @@ -4,13 +4,21 @@ def diff(md1, md2, renderer=TextRenderer(), parser_cls: type[MdParser] = MdParser): + """Compare two Markdown strings by structure and return rendered outputs + errors. + + Args: + md1: Left Markdown string. + md2: Right Markdown string. + renderer: Renderer instance used to format the output (TextRenderer by default). + parser_cls: Parser class to use (MdParser by default). + + Returns: + (rendered_left, rendered_right, errors) + """ tree1 = parse(md1, parser_cls) tree2 = parse(md2, parser_cls) tree1, tree2, struct_errors = diff_struct(tree1, tree2) - # tree1, tree2, links_errors = diff_links(tree1, tree2) - - # errors = struct_errors + links_errors errors = struct_errors return renderer.render(tree1), renderer.render(tree2), errors diff --git a/sdiff/compare.py b/sdiff/compare.py index 5958ada..34d75ca 100644 --- a/sdiff/compare.py +++ b/sdiff/compare.py @@ -44,8 +44,10 @@ def _diff(tree1, tree2, include_symbols=None, exclude_symbols=None): def diff_links(tree1, tree2): + """Diff only link-relevant structure (paragraphs/headers/lists/links).""" return _diff(tree1, tree2, include_symbols=['p', 'h', 'l', 'a']) def diff_struct(tree1, tree2): + """Diff overall structure, ignoring link and image content.""" return _diff(tree1, tree2, exclude_symbols=['a', 'i']) diff --git a/sdiff/parser.py b/sdiff/parser.py index 93a4736..765ef12 100644 --- a/sdiff/parser.py +++ b/sdiff/parser.py @@ -1,207 +1,1162 @@ -from re import Match +import re +import textwrap +from typing import Iterable +from urllib.parse import unquote import mistune -import re +from mistune import block_parser -from .model import * +from .model import (Html, Image, Link, List, ListItem, NewLine, Paragraph, Root, + Text, Header, ZendeskHelpCallout, ZendeskHelpSteps, + ZendeskHelpTabs) +_HEADING_LINE_RE = re.compile(r'^(\s*)(#{1,6})(?!#)(?=\S)') +_ATX_HEADING_NO_SPACE_RE = re.compile(r'^(\s{0,3})(#{1,6})(?!#)(?=\S)') +_LIST_ITEM_ATX_HEADING_NO_SPACE_RE = re.compile(r'^(\s{0,3}(?:[*+-]|\d+[.)])\s+)(#{1,6})(?!#)(?=\S)') +_LIST_MARKER_RE = re.compile(r'^\s{0,3}(?:[*+-]|\d+[.)])\s+') +_ORDERED_LIST_MARKER_RE = re.compile(r'^\s{0,3}(\d+)[.)]\s+') +_REF_LINK_OR_IMAGE_RE = re.compile(r'!?\[[^\]]+\]\s*\[[^\]]*\]') +_REF_DEF_LINE_RE = re.compile(r'^\s{0,3}\[[^\]]+\]:\s+\S+') +_FENCE_RE = re.compile(r'^\s*(`{3,}|~{3,})') +_FENCE_ONLY_LINE_RE = re.compile(r'^\s*(`{3,}|~{3,})\s*$') +_BLOCKQUOTE_LINE_RE = re.compile(r'^\s{0,3}>\s?.*') +_MISTUNE08_FENCE_BLOCK_RE = re.compile( + r'^ *(`{3,}|~{3,}) *(\S+)? 
*\n' # opening fence (+ optional info) + r'([\s\S]+?)\s*' # content (must be non-empty; mistune 0.x quirk) + r'\1 *(?:\n+|$)', # closing fence + flags=re.M, +) +_INLINE_MARKERS = { + 'strong': '**', + 'emphasis': '*', + 'strikethrough': '~~', +} -class InlineLexer(mistune.BlockLexer): - grammar_class = mistune.InlineGrammar +_LEGACY_INLINE_TAGS = { + # Copied from mistune 0.8.1's `_block_tag` negative lookahead. + 'a', + 'em', + 'strong', + 'small', + 's', + 'cite', + 'q', + 'dfn', + 'abbr', + 'data', + 'time', + 'code', + 'var', + 'samp', + 'kbd', + 'sub', + 'sup', + 'i', + 'b', + 'u', + 'mark', + 'ruby', + 'rt', + 'rp', + 'bdi', + 'bdo', + 'span', + 'br', + 'wbr', + 'ins', + 'del', + 'img', + 'font', +} - default_rules = [ - 'linebreak', 'link', - 'reflink', 'text', - ] +_MISTUNE_BLOCK_OR_PRE_TAGS = set(block_parser.BLOCK_TAGS) | set(block_parser.PRE_TAGS) - def __init__(self): - self.links = {} - self.grammar_class.text = re.compile(r'^ {1,}\n|^[\s\S]+?(?=[\[`~]| {2,}\n|$)') - super().__init__() - - def parse_autolink(self, m): - self.tokens.append(Link(m.group(0))) - - def parse_url(self, m): - self.tokens.append(Link(m.group(0))) - - def parse_link(self, m): - return self._process_link(m) - - def parse_reflink(self, m): - # TODO skip this check for now - # key = mistune._keyify(m.group(2) or m.group(1)) - # if key not in self.links: - # return None - # ret = self.links[key] - return self._process_link(m) - - def _process_link(self, m): - line = m.group(0) - if line[0] == '!': - node = Image(line) - else: - node = Link(line) +_LEGACY_VALID_ATTR_RE = r"\s*[a-zA-Z\-](?:\=(?:\"[^\"]*\"|'[^']*'|[^\s'\">]+))?" +_LEGACY_BLOCK_TAG_RE = ( + r"(?!(?:%s)\b)\w+(?!:/|[^\w\s@]*@)\b" % "|".join(sorted(_LEGACY_INLINE_TAGS)) +) +_LEGACY_BLOCK_HTML_RE = re.compile( + r'^\s* *(?:' + r'' + r'|<(' + _LEGACY_BLOCK_TAG_RE + r')((?:' + _LEGACY_VALID_ATTR_RE + r')*?)>([\s\S]+?)<\/\1>' + r'|<' + _LEGACY_BLOCK_TAG_RE + r'(?:' + _LEGACY_VALID_ATTR_RE + r')*?>' + r') *(?:\n{1,}|\s*$)' +) - self.tokens.append(node) - def parse_linebreak(self, m): - node = NewLine() - self.tokens.append(node) +def _split_legacy_block_html(raw: str) -> tuple[str, str] | None: + """Split over-greedy HTML blocks produced by mistune 3. - def parse_text(self, m): - text = m.group(0) - if text.strip(): - escaped_text = mistune.escape(text) - node = Text(escaped_text) - self.tokens.append(node) + Mistune 0.x treats a line like `` as a single HTML block and continues parsing + following Markdown lines. Mistune 3 follows CommonMark and may consume subsequent lines + until a blank line, which changes our structural tree. + """ + if not raw or '\n' not in raw: + return None + match = _LEGACY_BLOCK_HTML_RE.match(raw) + if match is None: + return None + end = match.end() + if end >= len(raw): + return None + return raw[:end], raw[end:] + + +class _SdiffBlockParser(block_parser.BlockParser): + """Mistune block parser tweaked for legacy-compat structure diffs. + + The master branch (mistune 0.x) did not treat fenced code blocks or block quotes + as special blocks. We disable them so they are parsed as normal text and then + normalized in our conversion layer. + """ + + def parse_fenced_code(self, m, state): # noqa: ANN001 + return None + + def parse_block_quote(self, m, state): # noqa: ANN001 + return None + + def parse_raw_html(self, m, state): # noqa: ANN001 + """Parse raw HTML more like mistune 0.x. + + In mistune 3, unknown tags are "type 7" HTML blocks and may not interrupt + paragraphs. 
The legacy mistune 0.x parser used in `master` treats any + non-inline tag as block HTML and it can interrupt paragraphs. + """ + marker = m.group(0).strip() + # Legacy parser does not recognize closing tags alone as block HTML. + if marker.startswith('', - r'<({})((?:{})*?)>([\s\S]+?)<\/\1>'.format(mistune._block_tag, mistune._valid_attr), - r'<{}(?:{})*?>'.format(mistune._block_tag, mistune._valid_attr), - ) - ) - - def _parse_inline(self, text): - inline = InlineLexer() - return inline.parse(text) - - def parse_newline(self, m): - length = len(m.group(0)) - if length > 1: - self.tokens.append(NewLine()) - - def parse_heading(self, m): - level = len(m.group(1)) - node = Header(level) - node.add_nodes(self._parse_inline(m.group(2))) - self.tokens.append(node) - - def parse_lheading(self, m): - level = 1 if m.group(2) == '=' else 2 - text = m.group(1) - node = Header(level) - node.add_nodes(self._parse_inline(text)) - self.tokens.append(node) - - def parse_block_html(self, m): - text = m.group(0) - html = Html(text) - self.tokens.append(html) - - def parse_paragraph(self, m): - text = m.group(1).rstrip('\n') - node = Paragraph() - node.add_nodes(self._parse_inline(text)) - self.tokens.append(node) - - def parse_text(self, m): - text = m.group(0) - escaped_text = mistune.escape(text) - node = Text(escaped_text) - self.tokens.append(node) - - def parse_list_block(self, m): - bull = m.group(2) - cap = m.group(0) - ordered = '.' in bull - node = List(ordered) - node.add_nodes(self._process_list_item(cap, bull)) - self.tokens.append(node) - - def _process_list_item(self, cap, bull): - result = [] - cap = self.rules.list_item.findall(cap) - - _next = False - length = len(cap) - - for i in range(length): - item = cap[i][0] - - # remove the bullet - space = len(item) - item = self.rules.list_bullet.sub('', item) - - # outdent - if '\n ' in item: - space = space - len(item) - pattern = re.compile(r'^ {1,%d}' % space, flags=re.M) - item = pattern.sub('', item) - - # determine whether item is loose or not - loose = _next - if not loose and re.search(r'\n\n(?!\s*$)', item): - loose = True - - rest = len(item) - if i != length - 1 and rest: - _next = item[rest - 1] == '\n' - if not loose: - loose = _next - - node = ListItem() - block_lexer = self.get_lexer() - nodes = block_lexer.parse(item, self.list_rules) - node.add_nodes(nodes) - result.append(node) - return result + block = _SdiffBlockParser() + # Don't recognize fences/quotes as block-level syntax; see _SdiffBlockParser. + for rule in ('fenced_code', 'block_quote'): + if rule in block.rules: + block.rules.remove(rule) + + # In mistune 0.x the list parser does not include the `block_html` / `raw_html` + # rule, so HTML-like lines inside list items become plain text (not Html nodes) + # and don't swallow following Markdown. + if 'raw_html' in getattr(block, 'list_rules', []): + block.list_rules.remove('raw_html') + + inline = mistune.InlineParser() + # Prevent code spans from consuming legacy fence markers like ```...```. + if 'codespan' in inline.rules: + inline.rules.remove('codespan') + + self._markdown = mistune.Markdown(renderer=None, block=block, inline=inline) + self._reference_definitions = {} + + def parse(self, text, rules=None): + """Parse Markdown text into a list of Node objects. + + Args: + text: Markdown string. + rules: Optional rules argument kept for compatibility. 
+ + Returns: + list[Node] + """ + tokens = self._markdown(text) + return self._convert_block_tokens(tokens) + + def _set_reference_definitions(self, definitions): + self._reference_definitions = definitions + + def _convert_block_tokens(self, tokens: Iterable[dict]): + nodes = [] + for token in tokens: + nodes.extend(self._convert_block_token(token)) + return nodes + + def _convert_block_token(self, token): + token_type = token.get('type') + if token_type == 'paragraph': + return self._convert_paragraph_token(token.get('children', [])) + if token_type == 'heading': + return [self._convert_heading(token)] + if token_type == 'list': + return [self._convert_list(token)] + if token_type == 'list_item': + return [self._convert_list_item(token)] + if token_type == 'block_text': + return [self._convert_paragraph_or_heading(token.get('children', []))] + if token_type == 'block_html': + return self._convert_block_html(token) + if token_type == 'block_quote': + return self._convert_block_quote(token) + if token_type == 'block_code': + return self._convert_block_code(token) + if token_type == 'thematic_break': + return self._convert_passthrough_block(token) + return self._convert_passthrough_block(token) + + def _convert_heading(self, token): + level = token.get('level') or token.get('attrs', {}).get('level', 1) + header = Header(level) + header.add_nodes(self._convert_inline_tokens(token.get('children', []))) + return header + + def _convert_list(self, token): + ordered = token.get('ordered') + if ordered is None: + ordered = token.get('attrs', {}).get('ordered', False) + list_node = List(bool(ordered)) + for item in token.get('children', []): + list_node.add_node(self._convert_list_item(item)) + return list_node + + def _convert_block_html(self, token): + raw = token.get('raw', '') + if _is_block_html(raw): + split = _split_legacy_block_html(raw) + if split is None: + return [Html(raw)] + prefix, suffix = split + nodes = [Html(prefix)] + if suffix and suffix.strip(): + nodes.extend(self._convert_block_tokens(self._markdown(suffix))) + return nodes + text = mistune.escape(raw) + if text.strip(): + return [Paragraph([Text(text)])] + return [] + + def _convert_passthrough_block(self, token): + child_nodes = self._convert_block_tokens(token.get('children', [])) + if child_nodes: + return child_nodes + raw = token.get('raw') or token.get('text') or '' + if raw.strip(): + return [Paragraph([Text(mistune.escape(raw))])] + return [] + + def _convert_block_quote(self, token): + children = token.get('children', []) + if not children: + return [] + content = self._render_inline_children(children) + if not content.strip(): + return [] + lines = content.splitlines() + quoted = '\n'.join([f'> {line}' if line.strip() else '>' for line in lines]) + return [Paragraph([Text(mistune.escape(quoted))])] + + def _convert_block_code(self, token): + raw = token.get('raw') or '' + marker = token.get('marker') or '```' + fence = marker if marker else '```' + content = raw.rstrip('\n') + code_block = f'{fence}\n{content}\n{fence}' + return [Paragraph([Text(mistune.escape(code_block))])] + + def _render_inline_children(self, children): + parts = [] + for child in children: + child_type = child.get('type') + if child_type in {'paragraph', 'block_text'}: + parts.append(self._flatten_inline_text(child.get('children', []))) + else: + raw = child.get('raw') or child.get('text') or '' + if raw: + parts.append(raw) + return '\n'.join([part for part in parts if part is not None]) + + def _convert_list_item(self, token): + item = 
ListItem() + for child in token.get('children', []): + child_type = child.get('type') + if child_type in {'block_text', 'paragraph'}: + item.add_nodes(self._convert_list_block_nodes(child.get('children', []))) + elif child_type == 'block_html': + item.add_nodes(self._convert_list_item_block_html(child)) + else: + item.add_nodes(self._convert_block_tokens([child])) + return item + + def _convert_list_item_block_html(self, token): + # In mistune 0.x the list parser does not include the `block_html` rule, + # so HTML-like lines inside list items become plain text (not Html nodes). + raw = token.get('raw', '') or '' + if not raw.strip(): + return [] + + split = _split_legacy_block_html(raw) + if split is None: + prefix, suffix = raw, '' + else: + prefix, suffix = split + + nodes = [] + _append_text(nodes, mistune.escape(prefix)) + if suffix and suffix.strip(): + nodes.extend(self._convert_list_item_block_html_text(suffix)) + return nodes + + def _convert_list_item_block_html_text(self, text: str): + nodes = [] + for child in self._markdown(text): + child_type = child.get('type') + if child_type in {'block_text', 'paragraph'}: + nodes.extend(self._convert_list_block_nodes(child.get('children', []))) + elif child_type == 'heading': + nodes.append(self._convert_heading(child)) + elif child_type == 'list': + nodes.append(self._convert_list(child)) + elif child_type == 'list_item': + nodes.append(self._convert_list_item(child)) + elif child_type == 'block_html': + nodes.extend(self._convert_list_item_block_html(child)) + else: + raw = child.get('raw') or child.get('text') or '' + if raw.strip(): + _append_text(nodes, mistune.escape(raw)) + return nodes + + def _convert_inline_tokens(self, tokens: Iterable[dict]): + nodes = [] + buffer = '' + + def flush_buffer(): + nonlocal buffer + if buffer: + for part in _split_text_on_legacy_markers(buffer): + self._split_reference_links(part, nodes) + buffer = '' + + handlers = { + 'text': self._handle_inline_text, + 'inline_html': self._handle_inline_text, + 'block_html': self._handle_inline_text, + 'codespan': self._handle_inline_codespan, + 'softbreak': self._handle_inline_softbreak, + 'linebreak': self._handle_inline_linebreak, + 'link': self._handle_inline_link, + 'image': self._handle_inline_image, + 'strong': self._handle_inline_marker, + 'emphasis': self._handle_inline_marker, + 'strikethrough': self._handle_inline_marker, + } + + for token in tokens: + token_type = token.get('type') + handler = handlers.get(token_type) + if handler: + buffer = handler(token, nodes, buffer, flush_buffer) + else: + buffer = self._handle_inline_other(token, nodes, buffer, flush_buffer) + + flush_buffer() + return nodes + + def _handle_inline_text(self, token, nodes, buffer, flush_buffer): + raw = token.get('raw', '') + buffer += self._reference_definitions.get(raw, raw) + return buffer + + def _handle_inline_codespan(self, token, nodes, buffer, flush_buffer): + buffer += f"`{token.get('raw') or token.get('text') or ''}`" + return buffer + + def _handle_inline_softbreak(self, token, nodes, buffer, flush_buffer): + buffer += ' ' + return buffer + + def _handle_inline_linebreak(self, token, nodes, buffer, flush_buffer): + flush_buffer() + nodes.append(NewLine()) + return '' + + def _handle_inline_link(self, token, nodes, buffer, flush_buffer): + flush_buffer() + text = self._flatten_inline_text(token.get('children', [])) + attrs = token.get('attrs', {}) + url = _unquote_url_if_template(attrs.get('url', '')) + title = attrs.get('title') + 
nodes.append(Link(_format_link_markup(text, url, title))) + return '' + + def _handle_inline_image(self, token, nodes, buffer, flush_buffer): + flush_buffer() + alt = token.get('attrs', {}).get('alt') or self._flatten_inline_text(token.get('children', [])) + attrs = token.get('attrs', {}) + url = _unquote_url_if_template(attrs.get('url', '')) + title = attrs.get('title') + nodes.append(Image(_format_image_markup(alt, url, title))) + return '' + + def _handle_inline_marker(self, token, nodes, buffer, flush_buffer): + flush_buffer() + marker = _INLINE_MARKERS[token.get('type')] + _append_text(nodes, marker) + children = token.get('children', []) + if children: + nodes.extend(self._convert_inline_tokens(children)) + _append_text(nodes, marker) + return '' + + def _handle_inline_other(self, token, nodes, buffer, flush_buffer): + flush_buffer() + children = token.get('children', []) + if children: + nodes.extend(self._convert_inline_tokens(children)) + else: + raw = token.get('raw') or token.get('text') or '' + if raw.strip(): + _append_text(nodes, mistune.escape(raw)) + return '' + + def _flatten_inline_text(self, tokens: Iterable[dict]): + parts = [] + for token in tokens: + token_type = token.get('type') + if token_type in {'text', 'inline_html', 'block_html'}: + raw = token.get('raw') or token.get('text') or '' + parts.append(self._reference_definitions.get(raw, raw)) + elif token_type == 'codespan': + parts.append(f"`{token.get('raw') or token.get('text') or ''}`") + elif token_type in _INLINE_MARKERS: + marker = _INLINE_MARKERS[token_type] + inner = self._flatten_inline_text(token.get('children', [])) + parts.append(f'{marker}{inner}{marker}') + elif token_type in {'linebreak', 'softbreak'}: + parts.append(' ') + else: + children = token.get('children', []) + if children: + parts.append(self._flatten_inline_text(children)) + else: + parts.append(token.get('raw') or token.get('text') or '') + return ''.join(parts).strip() + + def _convert_paragraph_or_heading(self, inline_tokens: Iterable[dict]): + ref_text = self._reference_definition_text(inline_tokens) + if ref_text is not None: + return Paragraph([Text(ref_text)]) + heading = self._heading_from_inline(inline_tokens) + if heading: + return heading + return Paragraph(self._convert_inline_tokens(inline_tokens)) + + def _convert_paragraph_token(self, inline_tokens: Iterable[dict]): + ref_text = self._reference_definition_text(inline_tokens) + if ref_text is not None: + return [Paragraph([Text(ref_text)])] + heading = self._heading_from_inline(inline_tokens) + if heading: + return [heading] + + split = self._split_paragraph_inline_on_fence(inline_tokens) + if split is not None: + nodes = [] + for part in split: + children = self._convert_inline_tokens(part) + if children: + nodes.append(Paragraph(children)) + if nodes: + return nodes + + return [Paragraph(self._convert_inline_tokens(inline_tokens))] + + def _split_paragraph_inline_on_fence(self, inline_tokens: Iterable[dict]): + # Legacy mistune 0.x breaks paragraphs when it encounters a fence-only marker + # line (``` / ~~~), even though we treat fences as plain text blocks. 
+ if not inline_tokens: + return None + + lines = [[]] + seps = [] + for token in inline_tokens: + token_type = token.get('type') + if token_type in {'softbreak', 'linebreak'}: + seps.append(token) + lines.append([]) + else: + lines[-1].append(token) + + if len(lines) <= 1: + return None + + line_texts = [self._flatten_inline_markup(line) for line in lines] + + def fence_marker(tokens): + raw = self._flatten_inline_markup(tokens).strip() + match = _FENCE_ONLY_LINE_RE.match(raw) + if match is None: + return None + return match.group(1) + + if fence_marker(lines[0]) is not None: + return None + + split_idx = None + for idx in range(1, len(lines)): + marker = fence_marker(lines[idx]) + if marker is None: + continue + # Only split when this fence line begins a complete fence block according + # to mistune 0.x's `fences` regex. This avoids breaking on sequences like + # ```\n``` which mistune 0.x does not treat as a fence block (no content). + tail = '\n'.join(line_texts[idx:]) + if _MISTUNE08_FENCE_BLOCK_RE.match(tail): + split_idx = idx + break + + if split_idx is None: + return None + + first = [] + for idx, line in enumerate(lines[:split_idx]): + first.extend(line) + if idx < split_idx - 1: + first.append(seps[idx]) + + second = [] + for line_idx in range(split_idx, len(lines)): + second.extend(lines[line_idx]) + if line_idx < len(lines) - 1: + second.append(seps[line_idx]) + + parts = [] + if first: + parts.append(first) + if second: + parts.append(second) + return parts if len(parts) > 1 else None + + def _convert_list_block_nodes(self, inline_tokens: Iterable[dict]): + text = self._flatten_inline_markup(inline_tokens, softbreak_as_newline=True) + if not text or not text.strip(): + return [] + + nodes = [] + for line in text.splitlines(): + if not line.strip(): + continue + + ref_text = self._reference_definitions.get(line) + if ref_text is not None: + nodes.append(Text(ref_text)) + continue + + heading = self._heading_from_inline([{'type': 'text', 'raw': line}]) + if heading: + nodes.append(heading) + continue + + nodes.append(Text(mistune.escape(line))) + + return nodes + + def _flatten_inline_markup(self, tokens: Iterable[dict], *, softbreak_as_newline: bool = False): + parts = [] + for token in tokens: + token_type = token.get('type') + if token_type in {'text', 'inline_html', 'block_html'}: + raw = token.get('raw') or token.get('text') or '' + parts.append(self._reference_definitions.get(raw, raw)) + elif token_type == 'link': + label = self._flatten_inline_markup( + token.get('children', []), + softbreak_as_newline=softbreak_as_newline, + ) + attrs = token.get('attrs', {}) + url = _unquote_url_if_template(attrs.get('url', '')) + title = attrs.get('title') + parts.append(_format_link_markup(label, url, title)) + elif token_type == 'image': + alt = token.get('attrs', {}).get('alt') or self._flatten_inline_markup( + token.get('children', []), + softbreak_as_newline=softbreak_as_newline, + ) + attrs = token.get('attrs', {}) + url = _unquote_url_if_template(attrs.get('url', '')) + title = attrs.get('title') + parts.append(_format_image_markup(alt, url, title)) + elif token_type == 'softbreak': + parts.append('\n' if softbreak_as_newline else ' ') + elif token_type == 'linebreak': + parts.append('\n') + elif token_type == 'codespan': + parts.append(f"`{token.get('raw') or token.get('text') or ''}`") + elif token_type in _INLINE_MARKERS: + marker = _INLINE_MARKERS[token_type] + inner = self._flatten_inline_markup( + token.get('children', []), + softbreak_as_newline=softbreak_as_newline, + 
)
+                parts.append(f'{marker}{inner}{marker}')
+            else:
+                children = token.get('children', [])
+                if children:
+                    parts.append(self._flatten_inline_markup(children, softbreak_as_newline=softbreak_as_newline))
+                else:
+                    parts.append(token.get('raw') or token.get('text') or '')
+        return ''.join(parts)
+
+    def _heading_from_inline(self, inline_tokens: Iterable[dict]):
+        if len(inline_tokens) != 1:
+            return None
+        token = inline_tokens[0]
+        if token.get('type') != 'text':
+            return None
+        raw = token.get('raw', '')
+        match = _HEADING_LINE_RE.match(raw)
+        if not match:
+            return None
+        level = len(match.group(2))
+        content = raw[match.end(2):].lstrip()
+        heading_tokens = self._markdown(f"{'#' * level} {content}")
+        if heading_tokens and heading_tokens[0].get('type') == 'heading':
+            children = heading_tokens[0].get('children', [])
+        else:
+            children = [{'type': 'text', 'raw': content}]
+        header = Header(level)
+        header.add_nodes(self._convert_inline_tokens(children))
+        return header
+
+    def _reference_definition_text(self, inline_tokens: Iterable[dict]):
+        if len(inline_tokens) != 1:
+            return None
+        token = inline_tokens[0]
+        if token.get('type') != 'text':
+            return None
+        raw = token.get('raw', '')
+        return self._reference_definitions.get(raw)
+
+    def _split_reference_links(self, raw: str, nodes):
+        last = 0
+        for match in _REF_LINK_OR_IMAGE_RE.finditer(raw):
+            if match.start() > last:
+                _append_text(nodes, mistune.escape(raw[last:match.start()]))
+            snippet = match.group(0)
+            if snippet.startswith('!['):
+                nodes.append(Image(snippet))
+            else:
+                nodes.append(Link(snippet))
+            last = match.end()
+        if last < len(raw):
+            _append_text(nodes, mistune.escape(raw[last:]))
+        return nodes
 
 
 class ZendeskHelpMdParser(MdParser):
-    TAG_CONTENT_GROUP = 'tag_content'
-    TAG_PATTERN = r'^\s*(<{tag_name}{attr_re}>(?P<%s>[\s\S]+?))\s*$' % TAG_CONTENT_GROUP
-    CALLOUT_STYLE_GROUP = 'style'
-    CALLOUT_ATTR_PATTERN = r'( (?P<%s>green|red|yellow))*' % CALLOUT_STYLE_GROUP
+    _CALLOUT_PATTERN_MIN = re.compile(r'(?sm)^[ \t]*<callout(?P<attrs>[^>]*)>(?P<content>.*?)</callout>')
+    _CALLOUT_PATTERN_MAX = re.compile(r'(?sm)^[ \t]*<callout(?P<attrs>[^>]*)>(?P<content>.*)</callout>')
+    _STEPS_PATTERN_MIN = re.compile(r'(?sm)^[ \t]*<steps>(?P<content>.*?)</steps>')
+    _STEPS_PATTERN_MAX = re.compile(r'(?sm)^[ \t]*<steps>(?P<content>.*)</steps>')
+    _TABS_PATTERN_MIN = re.compile(r'(?sm)^[ \t]*<tabs>(?P<content>.*?)</tabs>')
+    _TABS_PATTERN_MAX = re.compile(r'(?sm)^[ \t]*<tabs>(?P<content>.*)</tabs>')
+
+    def parse(self, text, rules=None):
+        """Parse Markdown with Zendesk tag support into a list of Node objects."""
+        nodes = self._parse_nodes(text)
+        return nodes
+
+    def _parse_nodes(self, text: str):
+        nodes = []
+        remaining = text
+        while remaining:
+            tag_name = None
+            match = None
+            search_at = 0
+            while True:
+                tag_name, match = self._find_next_tag(remaining, start_at=search_at)
+                if not match:
+                    break
+                absolute_start = (len(text) - len(remaining)) + match.start()
+                if _is_inside_list_block(text, absolute_start):
+                    # The legacy mistune 0.x list parser treats block-level content
+                    # lazily; Zendesk tags that appear inside list items become plain
+                    # text and are not recognized structurally. Avoid splitting the
+                    # input at such tags, since that would terminate the list early.
+ search_at = match.start() + 1 + continue + break + if not match: + nodes.extend(self._parse_markdown(_normalize_block_indentation(remaining))) + break + + if match.start() > 0: + prefix = remaining[:match.start()] + nodes.extend(self._parse_markdown(_normalize_block_indentation(prefix))) + + # The legacy parser only recognizes Zendesk tags when they consume the + # remainder of the current parsing slice (it uses `\\s*$` in the rule + # regex). Because of this, it will also match *across* multiple tag + # blocks of the same kind if the last closing tag is at the end. + # + # We emulate this by preferring a greedy match when it is terminal. + terminal_match = None + tail = remaining[match.start():] + if tag_name == 'callout': + m2 = self._CALLOUT_PATTERN_MAX.match(tail) + elif tag_name == 'steps': + m2 = self._STEPS_PATTERN_MAX.match(tail) + else: + m2 = self._TABS_PATTERN_MAX.match(tail) + if m2 is not None and not tail[m2.end():].strip(): + terminal_match = m2 + + if terminal_match is None: + # Non-terminal: treat the first (minimal) tag block as opaque HTML. + nodes.append(Html(match.group(0))) + remaining = remaining[match.end():] + continue + + content = terminal_match.group('content') + trailing = tail[terminal_match.end():] + + if tag_name == 'callout': + attrs = (terminal_match.group('attrs') or '').strip() + styles = [part for part in attrs.split() if part] + if not styles: + node = ZendeskHelpCallout(None) + elif len(styles) == 1 and styles[0] in {'green', 'red', 'yellow'}: + node = ZendeskHelpCallout(styles[0]) + else: + # Invalid callout attrs: legacy parser does not treat this as a + # Zendesk callout block. Keep the first (minimal) tag as opaque + # HTML and continue parsing the remaining text. + nodes.append(Html(match.group(0))) + remaining = remaining[match.end():] + continue + elif tag_name == 'steps': + node = ZendeskHelpSteps() + else: + node = ZendeskHelpTabs() + + node.add_nodes(self._parse_nodes(content)) + nodes.append(node) + + remaining = trailing + return nodes + + def _find_next_tag(self, text: str, start_at: int = 0): + best = None + for name, pattern in ( + ('callout', self._CALLOUT_PATTERN_MIN), + ('steps', self._STEPS_PATTERN_MIN), + ('tabs', self._TABS_PATTERN_MIN), + ): + for match in pattern.finditer(text, start_at): + candidate = (match.start(), name, match) + if best is None or candidate[0] < best[0]: + best = candidate + break + + if best is None: + return None, None + _, name, match = best + return name, match + + def _parse_markdown(self, text: str): + normalized = _remove_spaces_from_empty_lines(text) + normalized = _remove_ltr_rtl_marks(normalized) + return self._convert_block_tokens(self._markdown(normalized)) + + +def _append_text(nodes, text): + if not text or not text.strip(): + return + nodes.append(Text(text)) - def __init__(self): - super().__init__() - self.grammar_class.callout = re.compile(self.TAG_PATTERN.format(tag_name='callout', - attr_re=self.CALLOUT_ATTR_PATTERN)) - self.default_rules.insert(0, 'callout') - self.grammar_class.steps = re.compile(self.TAG_PATTERN.format(tag_name='steps', attr_re='')) - self.default_rules.insert(0, 'steps') +def _split_text_on_legacy_markers(raw: str) -> list[str]: + """Split text into segments similar to mistune 0.x inline text tokenization. 
- self.grammar_class.tabs = re.compile(self.TAG_PATTERN.format(tag_name='tabs', attr_re='')) - self.default_rules.insert(0, 'tabs') + The legacy parser splits text at backticks and tildes (it stops before those + markers and then consumes them as separate text tokens). This matters for our + structural tree because each segment becomes its own Text node. + """ + if not raw: + return [] + markers = ('`', '~') + out = [] + i = 0 + n = len(raw) + while i < n: + j = n + for m in markers: + pos = raw.find(m, i + 1) + if pos != -1 and pos < j: + j = pos + out.append(raw[i:j]) + i = j + return out - def parse_callout(self, m: Match[str]) -> None: - style = m.group(self.CALLOUT_STYLE_GROUP) - self._parse_nested(ZendeskHelpCallout(style), m) - def parse_steps(self, m: Match[str]) -> None: - self._parse_nested(ZendeskHelpSteps(), m) +def _format_title(title: str) -> str: + if title is None: + return '' + escaped = title.replace('"', '\\"') + return f' "{escaped}"' - def parse_tabs(self, m: Match[str]) -> None: - self._parse_nested(ZendeskHelpTabs(), m) - def _parse_nested(self, node: Node, m: Match[str]) -> None: - nested_content = m.group(self.TAG_CONTENT_GROUP) - nested_nodes = self.get_lexer().parse(nested_content) - node.add_nodes(nested_nodes) - self.tokens.append(node) +def _unquote_url_if_template(url: str) -> str: + """Undo Mistune's percent-encoding for template-like URLs. + + Mistune percent-encodes some characters in URLs (e.g. `{{url}}` becomes `%7B%7Burl%7D%7D`). + For structural diffs we don't care about URL contents, but we do want rendered markup to remain + readable and close to the original input. + """ + if not url or '%' not in url: + return url + unquoted = unquote(url) + if unquoted != url and ('{' in unquoted or '}' in unquoted): + return unquoted + return url + + +def _format_link_markup(text: str, url: str, title: str | None) -> str: + return f'[{text}]({url}{_format_title(title)})' + + +def _format_image_markup(alt: str, url: str, title: str | None) -> str: + return f'![{alt}]({url}{_format_title(title)})' + + +def _is_block_html(raw: str) -> bool: + stripped = raw.lstrip() + if stripped.startswith('")) + self.assertFalse(parser._is_block_html("text")) + self.assertTrue(parser._is_block_html("
<div>text</div>
")) + self.assertFalse(parser._is_block_html("nope")) + + def test_normalize_block_indentation(self): + # Only non-HTML lines should be considered for min-indent normalization. + raw = "
<div>\n x\n</div>
\n y" + normalized = parser._normalize_block_indentation(raw) + self.assertIn("y", normalized) + + def test_extract_reference_definitions_fence_special_case(self): + raw = "[id]: https://example.com\n```\n\n```" + text, defs = parser._extract_reference_definitions(raw) + self.assertEqual(1, len(defs)) + # The special-case inserts a blank line after the placeholder. + self.assertTrue(text.startswith("SDIFF_REF_DEF_0\n\n")) + + def test_extract_reference_definitions_fence_special_case_not_triggered_without_blank_line(self): + raw = "[id]: https://example.com\n```\n```" + text, defs = parser._extract_reference_definitions(raw) + self.assertEqual(1, len(defs)) + self.assertEqual("SDIFF_REF_DEF_0\n```\n```", text) + + def test_is_inside_fenced_block(self): + raw = "```\ncode\n```\noutside" + # Offset inside "code". + self.assertTrue(parser._is_inside_fenced_block(raw, raw.index("code"))) + # Offset inside "outside". + self.assertFalse(parser._is_inside_fenced_block(raw, raw.index("outside"))) + # Offset past end => fall through. + self.assertFalse(parser._is_inside_fenced_block(raw, len(raw) + 1)) + + def test_is_inside_list_block(self): + raw = "- a\n b\n\nc" + self.assertTrue(parser._is_inside_list_block(raw, raw.index("b"))) + self.assertFalse(parser._is_inside_list_block(raw, raw.index("c"))) + # Offset past end => fall through. + self.assertFalse(parser._is_inside_list_block(raw, len(raw) + 1)) + + def test_normalize_consecutive_fence_lines(self): + raw = "```\n```\ntext" + normalized = parser._normalize_consecutive_fence_lines(raw) + self.assertIn("```\n\n```", normalized) + + def test_normalize_consecutive_blockquote_lines(self): + raw = "> a\n> b\nc" + normalized = parser._normalize_consecutive_blockquote_lines(raw) + self.assertIn("> a\n\n> b", normalized) + + def test_normalize_fence_only_lines_start_new_paragraphs(self): + raw = "a\n```\nb" + normalized = parser._normalize_fence_only_lines_start_new_paragraphs(raw) + self.assertIn("a\n\n```", normalized) + # Blank line resets state. 
+        normalized = parser._normalize_fence_only_lines_start_new_paragraphs("a\n\n```\n\n```")
+        self.assertIn("\n\n```\n\n```", normalized)
+
+    def test_normalize_double_blank_line_list_nesting_does_not_overindent(self):
+        raw = "* a\n\n\n * b\n"
+        normalized = parser._normalize_double_blank_line_list_nesting(raw)
+        self.assertEqual(raw, normalized)
+
+    def test_merge_adjacent_lists(self):
+        l1 = List(False, [ListItem([Text("a")])])
+        l2 = List(True, [ListItem([Text("b")])])
+        root = Root([l1, l2])
+        merged = parser._merge_adjacent_lists(root.nodes)
+        self.assertEqual(1, len(merged))
+        self.assertEqual(2, len(merged[0].nodes))
+
+    def test_parse_passthrough_when_parser_returns_non_list(self):
+        class _Dummy(MdParser):
+            def parse(self, text, rules=None):  # noqa: ANN001
+                return Root([Text("x")])
+
+        parsed = parser.parse("x", parser_cls=_Dummy)
+        self.assertIsInstance(parsed, Root)
+
+
+class TestCoverageParserConversions(TestCase):
+    def setUp(self) -> None:
+        super().setUp()
+        self.p = MdParser()
+
+    def test_convert_block_token_branches(self):
+        item = self.p._convert_block_token(
+            {
+                "type": "list_item",
+                "children": [{"type": "paragraph", "children": [{"type": "text", "raw": "x"}]}],
+            }
+        )[0]
+        self.assertEqual("list-item", item.name)
+
+        block_text = self.p._convert_block_token({"type": "block_text", "children": [{"type": "text", "raw": "x"}]})[0]
+        self.assertEqual("paragraph", block_text.name)
+
+        quote = self.p._convert_block_token(
+            {"type": "block_quote", "children": [{"type": "paragraph", "children": [{"type": "text", "raw": "q"}]}]}
+        )[0]
+        self.assertEqual("paragraph", quote.name)
+        self.assertIn("&gt;", quote.nodes[0].text)
+
+        code = self.p._convert_block_token({"type": "block_code", "raw": "code\n", "marker": "```"})[0]
+        self.assertTrue(code.nodes[0].text.startswith("```"))
+
+    def test_convert_list_ordered_attr_fallback(self):
+        lst = self.p._convert_list({"type": "list", "attrs": {"ordered": True}, "children": []})
+        self.assertTrue(lst.ordered)
+
+    def test_convert_block_html_with_suffix(self):
+        token = {"type": "block_html", "raw": "
<div>hi</div>
\n\ntext"} + nodes = self.p._convert_block_html(token) + self.assertEqual("html", nodes[0].name) + self.assertEqual("paragraph", nodes[1].name) + + # Split happens, but suffix is whitespace-only => no extra nodes. + token = {"type": "block_html", "raw": "
<div>hi</div>
\n\n "}
+        nodes = self.p._convert_block_html(token)
+        self.assertEqual(1, len(nodes))
+
+        # Whitespace-only raw => empty conversion.
+        self.assertEqual([], self.p._convert_block_html({"type": "block_html", "raw": " "}))
+
+    def test_convert_passthrough_block_children_and_raw(self):
+        out = self.p._convert_passthrough_block(
+            {"type": "unknown", "children": [{"type": "paragraph", "children": [{"type": "text", "raw": "x"}]}]}
+        )
+        self.assertEqual("paragraph", out[0].name)
+        out2 = self.p._convert_passthrough_block({"type": "unknown", "raw": "raw"})
+        self.assertEqual("paragraph", out2[0].name)
+
+    def test_convert_block_quote_early_returns(self):
+        self.assertEqual([], self.p._convert_block_quote({"type": "block_quote", "children": []}))
+        self.assertEqual(
+            [],
+            self.p._convert_block_quote({"type": "block_quote", "children": [{"type": "paragraph", "children": []}]}),
+        )
+
+    def test_render_inline_children_unknown_child_type(self):
+        out = self.p._render_inline_children([{"type": "thematic_break", "raw": "---"}])
+        self.assertEqual("---", out)
+
+    def test_inline_other_and_codespan_text_fallback(self):
+        tokens = [{"type": "codespan", "text": "x"}, {"type": "unknown", "raw": "<x>"}]
+        out = self.p._convert_inline_tokens(tokens)
+        self.assertEqual("`x`&lt;x&gt;", "".join(node.text for node in out))
+
+    def test_inline_marker_without_children_and_inline_other_with_children(self):
+        out = self.p._convert_inline_tokens([{"type": "strong", "children": []}])
+        self.assertEqual(["text", "text"], [n.name for n in out])
+
+        out = self.p._convert_inline_tokens([{"type": "unknown", "children": [{"type": "text", "raw": "x"}]}])
+        self.assertEqual("x", out[0].text)
+
+        out = self.p._convert_inline_tokens([{"type": "unknown", "raw": " "}])
+        self.assertEqual([], out)
+
+    def test_flatten_inline_text_unknown_branches(self):
+        text = self.p._flatten_inline_text(
+            [
+                {"type": "codespan", "raw": "x"},
+                {"type": "unknown", "children": [{"type": "text", "raw": "y"}]},
+                {"type": "unknown", "raw": "z"},
+            ]
+        )
+        self.assertIn("`x`", text)
+        self.assertTrue(text.endswith("z"))
+
+    def test_flatten_inline_markup_link_and_image(self):
+        tokens = [
+            {"type": "text", "raw": "a"},
+            {"type": "softbreak"},
+            {"type": "link", "children": [{"type": "text", "raw": "L"}], "attrs": {"url": "%7B%7Burl%7D%7D"}},
+            {"type": "softbreak"},
+            {"type": "image", "children": [{"type": "text", "raw": "A"}], "attrs": {"url": "u", "title": 't"'}},
+        ]
+        s = self.p._flatten_inline_markup(tokens, softbreak_as_newline=True)
+        self.assertIn("[L]({{url}})", s)
+        self.assertIn('![A](u "t\\"")', s)
+
+    def test_flatten_inline_markup_unknown_branches(self):
+        tokens = [
+            {"type": "unknown", "children": [{"type": "text", "raw": "x"}]},
+            {"type": "unknown", "raw": "y"},
+        ]
+        s = self.p._flatten_inline_markup(tokens)
+        self.assertEqual("xy", s)
+
+    def test_convert_list_block_nodes_ref_heading_and_text(self):
+        self.p._set_reference_definitions(
+            {
+                "SDIFF_REF_DEF_0": "[id]: https://example.com",
+                "[id]: https://example.com": "[id]: https://example.com",
+            }
+        )
+        tokens = [
+            {"type": "text", "raw": "SDIFF_REF_DEF_0"},
+            {"type": "softbreak"},
+            {"type": "text", "raw": "###header"},
+            {"type": "softbreak"},
+            {"type": "text", "raw": " "},
+            {"type": "softbreak"},
+            {"type": "text", "raw": "plain"},
+        ]
+        nodes = self.p._convert_list_block_nodes(tokens)
+        self.assertEqual(["text", "header", "text"], [n.name for n in nodes])
+
+    def test_convert_list_block_nodes_empty(self):
+        self.assertEqual([], self.p._convert_list_block_nodes([]))
+
+    
def test_heading_from_inline_fallback_branch(self): + class _NoHeading(MdParser): + def __init__(self): + super().__init__() + self._markdown = lambda _: [{"type": "paragraph", "children": []}] # noqa: E731 + + p = _NoHeading() + heading = p._heading_from_inline([{"type": "text", "raw": "###header"}]) + self.assertEqual("header", heading.name) + self.assertEqual("text", heading.nodes[0].name) + + def test_convert_paragraph_or_heading_ref_and_heading(self): + self.p._set_reference_definitions({"SDIFF_REF_DEF_0": "[id]: https://example.com"}) + node = self.p._convert_paragraph_or_heading([{"type": "text", "raw": "SDIFF_REF_DEF_0"}]) + self.assertEqual("paragraph", node.name) + + node = self.p._convert_paragraph_or_heading([{"type": "text", "raw": "###header"}]) + self.assertEqual("header", node.name) + + node = self.p._convert_paragraph_token([{"type": "text", "raw": "###header"}])[0] + self.assertEqual("header", node.name) + + def test_split_paragraph_inline_on_fence_variants(self): + self.assertIsNone(self.p._split_paragraph_inline_on_fence([])) + self.assertIsNone(self.p._split_paragraph_inline_on_fence([{"type": "text", "raw": "x"}])) + + # First line is a fence-only marker => do not split. + tokens = [{"type": "text", "raw": "```"}, {"type": "softbreak"}, {"type": "text", "raw": "x"}] + self.assertIsNone(self.p._split_paragraph_inline_on_fence(tokens)) + + # Tail is fence markers but not a complete fence block => do not split. + tokens = [ + {"type": "text", "raw": "a"}, + {"type": "softbreak"}, + {"type": "text", "raw": "```"}, + {"type": "softbreak"}, + {"type": "text", "raw": "```"}, + ] + self.assertIsNone(self.p._split_paragraph_inline_on_fence(tokens)) + + # Complete fence block tail => split. + tokens = [ + {"type": "text", "raw": "a"}, + {"type": "softbreak"}, + {"type": "text", "raw": "```"}, + {"type": "softbreak"}, + {"type": "text", "raw": "code"}, + {"type": "softbreak"}, + {"type": "text", "raw": "```"}, + ] + parts = self.p._split_paragraph_inline_on_fence(tokens) + self.assertEqual(2, len(parts)) + + nodes = self.p._convert_paragraph_token(tokens) + self.assertEqual(2, len(nodes)) + + def test_split_paragraph_inline_on_fence_first_part_includes_seps(self): + tokens = [ + {"type": "text", "raw": "a"}, + {"type": "softbreak"}, + {"type": "text", "raw": "b"}, + {"type": "softbreak"}, + {"type": "text", "raw": "```"}, + {"type": "softbreak"}, + {"type": "text", "raw": "code"}, + {"type": "softbreak"}, + {"type": "text", "raw": "```"}, + ] + parts = self.p._split_paragraph_inline_on_fence(tokens) + self.assertEqual(2, len(parts)) + + def test_convert_list_item_block_html_text_smoke(self): + # Exercise conversion of text following a (hypothetical) HTML block inside a list item. + nodes = self.p._convert_list_item_block_html_text("text\n\n# h\n\n- a\n") + self.assertTrue(any(n.name == "header" for n in nodes)) + self.assertTrue(any(n.name == "list" for n in nodes)) + + def test_convert_list_item_with_block_html_child(self): + token = { + "type": "list_item", + "children": [ + {"type": "block_html", "raw": "
<div>hi</div>
"}, + ], + } + item = self.p._convert_list_item(token) + self.assertTrue(item.nodes) + + def test_convert_list_item_block_html_variants(self): + self.assertEqual([], self.p._convert_list_item_block_html({"type": "block_html", "raw": " "})) + + nodes = self.p._convert_list_item_block_html({"type": "block_html", "raw": "not html\n"}) + self.assertTrue(nodes) + + nodes = self.p._convert_list_item_block_html({"type": "block_html", "raw": "
<div>hi</div>
\n\n "}) + self.assertTrue(nodes) + + def test_convert_list_item_block_html_text_with_block_html_and_raw(self): + nodes = self.p._convert_list_item_block_html_text("
<div>hi</div>
\n\n---\n") + self.assertTrue(any(n.name == "text" for n in nodes)) + + def test_convert_list_item_block_html_smoke(self): + token = {"type": "block_html", "raw": "
<div>hi</div>
\n\ntext"} + nodes = self.p._convert_list_item_block_html(token) + self.assertTrue(any(isinstance(n, Text) for n in nodes)) + + def test_rendering_roundtrip_smoke(self): + md = "some text [link](url) new text" + tree = parser.parse(md, parser_cls=MdParser) + self.assertEqual(md, TextRenderer().render(tree)) diff --git a/tests/test_parser.py b/tests/test_parser.py index 498c070..965e8f5 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1,6 +1,7 @@ from unittest import TestCase from sdiff import parser, MdParser, ZendeskHelpMdParser -from sdiff.model import ZendeskHelpSteps +from sdiff.model import Paragraph, Root, Text, ZendeskHelpSteps +from sdiff.renderer import TextRenderer class ParserTestCase(TestCase): @@ -28,9 +29,13 @@ def test_header_in_list(self): def test_link(self): self._run_and_assert('[link](url)', 'pa') + actual = self._parse('[link](url)') + self.assertEqual('[link](url)', actual.nodes[0].nodes[0].text) def test_image(self): self._run_and_assert('![Alt text][url/to/image]', 'pi') + actual = self._parse('![Alt text][url/to/image]') + self.assertEqual('![Alt text][url/to/image]', actual.nodes[0].nodes[0].text) def test_broken_link_space(self): self._run_and_assert('[link] (http://www.google.com)', 'pt') @@ -65,9 +70,154 @@ def test_heading_text(self): actual = self._parse('### heading') self.assertEqual('heading', actual.nodes[0].nodes[0].text) + def test_heading_without_space_followed_by_text_parses_as_header(self): + actual = self._parse('##Heading\ntext') + self.assertEqual('2tpt', actual.print_all()) + + def test_heading_without_space_with_link_parses_as_header(self): + actual = self._parse('##[Verify email]({{url}})\ntext') + self.assertEqual('header', actual.nodes[0].name) + self.assertEqual(2, actual.nodes[0].level) + self.assertEqual('link', actual.nodes[0].nodes[0].name) + self.assertEqual('[Verify email]({{url}})', actual.nodes[0].nodes[0].text) + + def test_heading_without_space_in_list_item_followed_by_text(self): + actual = self._parse('1. 
##Heading\n text') + self.assertEqual('lm2tt', actual.print_all()) + def test_link_wrapped_in_text(self): self._run_and_assert('some text [link](url) new text', 'ptat') + def test_link_with_trailing_text_does_not_duplicate_buffer(self): + actual = self._parse('some text [link](url) new text') + paragraph = actual.nodes[0] + self.assertEqual(['text', 'link', 'text'], [node.name for node in paragraph.nodes]) + self.assertEqual('some text ', paragraph.nodes[0].text) + self.assertEqual('[link](url)', paragraph.nodes[1].text) + self.assertEqual(' new text', paragraph.nodes[2].text) + + def test_image_with_trailing_text_does_not_duplicate_buffer(self): + actual = self._parse('some ![alt](url) new') + paragraph = actual.nodes[0] + self.assertEqual(['text', 'image', 'text'], [node.name for node in paragraph.nodes]) + self.assertEqual('some ', paragraph.nodes[0].text) + self.assertEqual('![alt](url)', paragraph.nodes[1].text) + self.assertEqual(' new', paragraph.nodes[2].text) + + def test_inline_marker_does_not_duplicate_buffer(self): + actual = self._parse('some **bold** text') + self.assertEqual('some **bold** text', TextRenderer().render(actual)) + + def test_inline_linebreak_does_not_duplicate_buffer(self): + actual = self._parse('a\\\nb') + paragraph = actual.nodes[0] + self.assertEqual(['text', 'new-line', 'text'], [node.name for node in paragraph.nodes]) + self.assertEqual('a', paragraph.nodes[0].text) + self.assertEqual('b', paragraph.nodes[2].text) + + def test_text_before_link_not_duplicated(self): + actual = self._parse('some text and [link](url)') + paragraph = actual.nodes[0] + self.assertEqual(['text', 'link'], [node.name for node in paragraph.nodes]) + self.assertEqual(['some text and '], [node.text for node in paragraph.nodes if node.name == 'text']) + + def test_link_label_with_codespan(self): + actual = self._parse('[use `foo`](url)') + self.assertEqual('[use `foo`](url)', actual.nodes[0].nodes[0].text) + + def test_link_label_with_strong_preserves_markers(self): + actual = self._parse('[**bold**](url)') + self.assertEqual('[**bold**](url)', actual.nodes[0].nodes[0].text) + + def test_link_title_preserved(self): + actual = self._parse('[label](https://example.com "Title Here")') + self.assertEqual('[label](https://example.com "Title Here")', actual.nodes[0].nodes[0].text) + + def test_image_title_preserved(self): + actual = self._parse('![alt](https://img "Img Title")') + self.assertEqual('![alt](https://img "Img Title")', actual.nodes[0].nodes[0].text) + + def test_reference_definition_preserved(self): + data = 'See [API][id].\n\n[id]: https://example.com' + tree = self._parse(data) + link = next(node for node in tree.nodes[0].nodes if node.name == 'link') + self.assertEqual('[API][id]', link.text) + self.assertEqual('[id]: https://example.com', tree.nodes[1].nodes[0].text) + + def test_reference_definition_inside_list_item_preserved(self): + data = '- item\n [id]: https://example.com' + tree = self._parse(data) + list_item = tree.nodes[0].nodes[0] + self.assertEqual('item', list_item.nodes[0].text) + self.assertEqual('[id]: https://example.com', tree.nodes[1].nodes[0].text) + + def test_reference_links_with_whitespace_and_empty_id(self): + data = 'See [API][] and [Ref] [id].\n\n[API]: https://example.com\n[id]: https://example.com' + tree = self._parse(data) + link_texts = [node.text for node in tree.nodes[0].nodes if node.name == 'link'] + self.assertIn('[API][]', link_texts) + self.assertIn('[Ref] [id]', link_texts) + + def 
test_reference_definition_inside_fence_is_text(self):
+        data = """```
+[id]: https://example.com
+[link][id]
+```"""
+        tree = self._parse(data)
+        self.assertEqual('ptttptattt', tree.print_all())
+
+    def test_reference_definition_inside_long_fence_is_text(self):
+        data = """````
+[id]: https://example.com
+[link][id]
+````"""
+        tree = self._parse(data)
+        self.assertEqual('pttttptatttt', tree.print_all())
+
+    def test_softbreak_preserves_space(self):
+        actual = self._parse('hello\nworld')
+        self.assertEqual('hello world', actual.nodes[0].nodes[0].text)
+
+    def test_block_quote_preserves_marker(self):
+        actual = self._parse('> quote')
+        self.assertEqual('> quote', actual.nodes[0].nodes[0].text)
+
+    def test_fenced_code_preserves_fences(self):
+        actual = self._parse('```\ncode\n```')
+        self.assertEqual('ptttttt', actual.print_all())
+        text = ''.join(node.text for node in actual.nodes[0].nodes)
+        self.assertTrue(text.startswith('```'))
+        self.assertTrue(text.endswith('```'))
+
+    def test_ordered_list_parses_as_ordered(self):
+        tree = self._parse('1. one\n2. two')
+        list_node = tree.nodes[0]
+        self.assertTrue(list_node.ordered)
+
+    def test_ordered_list_marker_other_than_1_interrupts_paragraph(self):
+        self._run_and_assert('para\n2. item\n', 'ptlmt')
+
+    def test_list_item_allows_unindented_heading_lazy_continuation(self):
+        tree = self._parse('* a\n###### b\n')
+        self.assertEqual(1, len(tree.nodes))
+        self.assertEqual('list', tree.nodes[0].name)
+        item = tree.nodes[0].nodes[0]
+        self.assertEqual(['text', 'header'], [node.name for node in item.nodes])
+        self.assertEqual('a', item.nodes[0].text)
+        self.assertEqual(6, item.nodes[1].level)
+        self.assertEqual('b', item.nodes[1].nodes[0].text)
+
+    def test_unordered_list_parses_as_unordered(self):
+        tree = self._parse('- one\n- two')
+        list_node = tree.nodes[0]
+        self.assertFalse(list_node.ordered)
+
+    def test_double_blank_lines_between_list_items_nests_next_list(self):
+        self._run_and_assert('* a\n\n\n* b\n', 'lmtlmt')
+
+    def test_double_blank_lines_between_ordered_list_items_nests_next_list(self):
+        self._run_and_assert('1. a\n\n\n1. b\n', 'lmtlmt')
+
 
 class TestZendeskParser(ParserTestCase):
     def setUp(self) -> None:
@@ -103,6 +253,22 @@ def test_callout_invalid_style(self):
         actual = self._parse(fixture)
         self.assertNotEqual(actual.nodes[0].name, 'callout')
 
+    def test_callout_invalid_style_does_not_swallow_trailing_closing_tag(self):
+        fixture = '<callout style="invalid">\n# title\ncontent\n</callout>\n</callout>\n'
+        self._run_and_assert(fixture, 'xpt')
+
+    def test_callout_tags_inside_list_item_are_text_and_allow_headings(self):
+        fixture = '1. item\n<callout>\n# title\ncontent\n</callout>\n'
+        tree = self._parse(fixture)
+        self.assertEqual(1, len(tree.nodes))
+        self.assertEqual('list', tree.nodes[0].name)
+        item = tree.nodes[0].nodes[0]
+        self.assertEqual(['text', 'text', 'header', 'text', 'text'], [node.name for node in item.nodes])
+        self.assertEqual('<callout>', item.nodes[1].text)
+        self.assertEqual(1, item.nodes[2].level)
+        self.assertEqual('title', item.nodes[2].nodes[0].text)
+        self.assertEqual('</callout>', item.nodes[-1].text)
+
     def test_tabs(self):
         fixture = """
 <tabs>
@@ -114,6 +280,48 @@ def test_tabs(self):
 """
         self._run_and_assert(fixture, 'T1tpt1tpt')
 
+    def test_inline_callout_is_not_structural(self):
+        fixture = """intro <callout>
+# title
+content
+</callout> outro"""
+        self._run_and_assert(fixture, 'pt1tpt')
+
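+    # The two tests below pin down fence handling for Zendesk tags: tags that
+    # appear inside an open fenced code block must stay plain text, while tags
+    # that follow a properly closed fence are parsed as structural nodes again.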
+    def test_zendesk_tags_inside_fenced_code_are_text(self):
+        fixture = """```
+<callout>
+# title
+content
+</callout>
+<steps>
+1. one
+</steps>
+<tabs>
+# tab
+content
+</tabs>
+```"""
+        tree = self._parse(fixture)
+        self.assertEqual('ptttxxxpttt', tree.print_all())
+        self.assertFalse(any(node.name in {'callout', 'steps', 'tabs'} for node in tree.nodes))
+
+    def test_zendesk_tags_after_fenced_code_are_parsed(self):
+        fixture = """```
+<callout>
+# title
+content
+</callout>
+```
+
+<callout>
+# title
+content
+</callout>
+"""
+        tree = self._parse(fixture)
+        self.assertTrue(any(node.name == 'callout' for node in tree.nodes))
+        self.assertEqual(1, tree.print_all().count('C'))
+
     def test_steps(self):
         steps_fixture = """
 <steps>
@@ -166,3 +374,33 @@ def test_leave_spaces_with_text(self):
         text = 'test \n test'
         actual = parser._remove_spaces_from_empty_lines(text)
         self.assertEqual(text, actual)
+
+    def test_remove_ltr_rtl_marks(self):
+        text = 'a\u200eb\u200f'
+        actual = parser._remove_ltr_rtl_marks(text)
+        self.assertEqual('ab', actual)
+
+
+class DummyParser:
+    last_text = None
+
+    def parse(self, text, rules=None):
+        DummyParser.last_text = text
+        return [Paragraph([Text(text)])]
+
+
+class TestParseWrapper(TestCase):
+    def test_wraps_list_parser_output(self):
+        tree = parser.parse('hello', parser_cls=DummyParser)
+        self.assertIsInstance(tree, Root)
+        self.assertEqual('pt', tree.print_all())
+
+    def test_custom_parser_input_not_mutated_by_ref_defs(self):
+        data = 'See [API][id].\n\n[id]: https://example.com'
+        parser.parse(data, parser_cls=DummyParser)
+        self.assertIn('[id]: https://example.com', DummyParser.last_text)
+
+    def test_mdparser_parse_accepts_rules_argument(self):
+        md_parser = MdParser()
+        nodes = md_parser.parse('1. one', MdParser.list_rules)
+        self.assertIsInstance(nodes, list)
diff --git a/tests/test_sdiff.py b/tests/test_sdiff.py
index a132509..55d3079 100644
--- a/tests/test_sdiff.py
+++ b/tests/test_sdiff.py
@@ -8,7 +8,7 @@
 
 
 def _load_fixture(*path):
-    return open(os.path.join('tests/fixtures', *path)).read()
+    return open(os.path.join('tests/fixtures', *path), encoding='utf-8').read()
 
 
 def _read_test_files(dirpath):
@@ -36,3 +36,76 @@ def test_different(self):
             _, _, errors = sdiff.diff(_load_fixture('different', path1), _load_fixture('different', path2),
                                       parser_cls=ZendeskHelpMdParser)
             self.assertNotEqual([], errors, msg=case)
+
+    def test_ignores_link_content(self):
+        left = '[Link](http://example.com)'
+        right = '[Different](http://example.org)'
+        _, _, errors = sdiff.diff(left, right)
+        self.assertEqual([], errors)
+
+    def test_missing_link_is_reported(self):
+        left = 'text [Link](http://example.com)'
+        right = 'text'
+        tree1 = sdiff.parse(left)
+        tree2 = sdiff.parse(right)
+        _, _, errors = sdiff.diff_links(tree1, tree2)
+        self.assertTrue(any(error.node.name == 'link' for error in errors))
+
+    def test_extra_paragraph_has_paragraph_error(self):
+        left = _load_fixture('different', 'extra_paragraph.en.md')
+        right = _load_fixture('different', 'extra_paragraph.de.md')
+        _, _, errors = sdiff.diff(left, right, parser_cls=ZendeskHelpMdParser)
+        self.assertTrue(any(error.node.name == 'paragraph' for error in errors))
+
+    def test_softbreaks_ignored_in_structure(self):
+        left = 'hello\nworld'
+        right = 'hello world'
+        _, _, errors = sdiff.diff(left, right)
+        self.assertEqual([], errors)
+
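+    # A heading marker without a trailing space ("##Heading") should diff as
+    # structurally equal to its spaced form ("## Heading"); the next two tests
+    # cover the top-level and list-item variants.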
+    def test_heading_without_space_matches_heading_with_space(self):
+        left = '##Heading\ntext'
+        right = '## Heading\ntext'
+        _, _, errors = sdiff.diff(left, right)
+        self.assertEqual([], errors)
+
+    def test_list_heading_without_space_matches_heading_with_space(self):
+        left = '1. ##Heading\n text'
+        right = '1. ## Heading\n text'
+        _, _, errors = sdiff.diff(left, right)
+        self.assertEqual([], errors)
+
+    def test_reference_definition_missing_is_reported(self):
+        left = 'See [API][id].\n\n[id]: https://example.com'
+        right = 'See [API][id].'
+        _, _, errors = sdiff.diff(left, right)
+        self.assertTrue(any(error.node.name == 'paragraph' for error in errors))
+
+    def test_code_block_content_ignored_in_structure(self):
+        left = """```
+code sample
+```"""
+        right = """```
+different code sample
+```"""
+        _, _, errors = sdiff.diff(left, right)
+        self.assertEqual([], errors)
+
+    def test_invalid_callout_followed_by_fence_does_not_depend_on_blank_line(self):
+        left = """<callout style="invalid">
+# title
+content
+</callout>
+
+```
+code
+```"""
+        right = """<callout style="invalid">
+# title
+content
+</callout>
+```
+code
+```"""
+        _, _, errors = sdiff.diff(left, right, parser_cls=ZendeskHelpMdParser)
+        self.assertEqual([], errors)
diff --git a/tests/test_tree_utils.py b/tests/test_tree_utils.py
index ff8a226..dc4f0ab 100644
--- a/tests/test_tree_utils.py
+++ b/tests/test_tree_utils.py
@@ -1,7 +1,62 @@
 from unittest import TestCase
 
+from sdiff.model import Header, Link, Paragraph, Root, Text
+from sdiff.tree_utils import traverse
+
 
 class TestTraverse(TestCase):
-    def test_name(self):
-        pass
+    def test_preorder_traversal(self):
+        tree = Root([
+            Paragraph([
+                Text('one'),
+                Link('link'),
+            ]),
+            Header(2, [
+                Text('heading'),
+            ]),
+        ])
+        symbols = [node.symbol for node in traverse(tree)]
+        self.assertEqual(['p', 't', 'a', 'h', 't'], symbols)
+
+    def test_consecutive_text_nodes_coalesced(self):
+        tree = Root([
+            Paragraph([
+                Text('one'),
+                Text('two'),
+                Link('link'),
+                Text('three'),
+                Text('four'),
+            ]),
+        ])
+        texts = [node.text for node in traverse(tree) if isinstance(node, Text)]
+        self.assertEqual(['one', 'three'], texts)
+
+    def test_exclude_symbols_prunes_children(self):
+        tree = Root([
+            Paragraph([
+                Text('one'),
+                Link('link'),
+            ]),
+        ])
+        symbols = [node.symbol for node in traverse(tree, exclude_symbols=['a'])]
+        self.assertEqual(['p', 't'], symbols)
+
+    def test_include_symbols_filters_children(self):
+        tree = Root([
+            Paragraph([
+                Text('one'),
+                Link('link'),
+            ]),
+        ])
+        symbols = [node.symbol for node in traverse(tree, include_symbols=['a'])]
+        self.assertEqual(['p', 'a'], symbols)
+
+    def test_include_exclude_conflict_excludes(self):
+        tree = Root([
+            Paragraph([
+                Link('link'),
+            ]),
+        ])
+        symbols = [node.symbol for node in traverse(tree, include_symbols=['a'], exclude_symbols=['a'])]
+        self.assertEqual(['p'], symbols)
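+
+    # Note: when a symbol appears in both include_symbols and exclude_symbols,
+    # exclusion wins, as the conflict test above asserts.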