From 2de9978da63383e9dfdf3c3279b81a7d4c5b2425 Mon Sep 17 00:00:00 2001 From: JARVIS-coding-Agent Date: Tue, 21 Apr 2026 13:07:23 +0000 Subject: [PATCH 1/6] feat(markdown): add table conversion pipeline with pulldown-cmark MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Re-introduce the markdown table conversion feature from PR #180, which was reverted in PR #284 due to accidentally deleted CI workflows during rebase. This is a clean re-implementation adapted to the current v0.8.0 codebase: - Add src/markdown.rs: pulldown-cmark AST parser with three rendering modes (code/bullets/off) for table detection and conversion - Add MarkdownConfig to config.rs with serde(default) for backward compatibility — existing configs without [markdown] section default to code mode - Integrate convert_tables() in AdapterRouter.stream_prompt() so both Discord and Slack adapters benefit (was discord.rs-only in #180) - Make split_message() code-fence-aware: auto-close/reopen fences across chunk boundaries so code blocks render correctly - Add pulldown-cmark 0.13 dependency Closes #178 Supersedes #180 --- Cargo.toml | 1 + config.toml.example | 6 + src/adapter.rs | 7 +- src/config.rs | 17 +++ src/format.rs | 22 +++ src/main.rs | 3 +- src/markdown.rs | 352 ++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 406 insertions(+), 2 deletions(-) create mode 100644 src/markdown.rs diff --git a/Cargo.toml b/Cargo.toml index 391ffd91..a8b2f48a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,6 +24,7 @@ reqwest = { version = "0.12", default-features = false, features = ["rustls-tls" base64 = "0.22" image = { version = "0.25", default-features = false, features = ["jpeg", "png", "gif", "webp"] } unicode-width = "0.2" +pulldown-cmark = "0.13" tokio-tungstenite = { version = "0.21", features = ["rustls-tls-webpki-roots"] } [target.'cfg(unix)'.dependencies] diff --git a/config.toml.example b/config.toml.example index ef3d7201..17b01d1b 100644 --- a/config.toml.example +++ b/config.toml.example @@ -79,6 +79,12 @@ working_dir = "/home/agent" max_sessions = 10 session_ttl_hours = 24 +[markdown] +tables = "code" # "code" (default) | "bullets" | "off" + # code: wrap tables in fenced code blocks with aligned columns + # bullets: convert each row into bullet points (• Header: Value) + # off: pass through unchanged + [reactions] enabled = true remove_after_reply = false diff --git a/src/adapter.rs b/src/adapter.rs index 0b5b2c1a..c61a6a30 100644 --- a/src/adapter.rs +++ b/src/adapter.rs @@ -8,6 +8,7 @@ use crate::acp::{classify_notification, AcpEvent, ContentBlock, SessionPool}; use crate::config::ReactionsConfig; use crate::error_display::{format_coded_error, format_user_error}; use crate::format; +use crate::markdown::{self, TableMode}; use crate::reactions::StatusReactionController; // --- Platform-agnostic types --- @@ -97,13 +98,15 @@ pub trait ChatAdapter: Send + Sync + 'static { pub struct AdapterRouter { pool: Arc, reactions_config: ReactionsConfig, + table_mode: TableMode, } impl AdapterRouter { - pub fn new(pool: Arc, reactions_config: ReactionsConfig) -> Self { + pub fn new(pool: Arc, reactions_config: ReactionsConfig, table_mode: TableMode) -> Self { Self { pool, reactions_config, + table_mode, } } @@ -229,6 +232,7 @@ impl AdapterRouter { let thread_channel = thread_channel.clone(); let message_limit = adapter.message_limit(); let streaming = adapter.use_streaming(other_bot_present); + let table_mode = self.table_mode; self.pool .with_connection(thread_key, |conn| { @@ -380,6 +384,7 @@ impl AdapterRouter { final_content }; + let final_content = markdown::convert_tables(&final_content, table_mode); let chunks = format::split_message(&final_content, message_limit); if let Some(msg) = placeholder_msg { // Streaming: edit first chunk into placeholder, send rest as new messages diff --git a/src/config.rs b/src/config.rs index bf6a20a6..352b68b3 100644 --- a/src/config.rs +++ b/src/config.rs @@ -1,3 +1,4 @@ +use crate::markdown::TableMode; use regex::Regex; use serde::Deserialize; use std::collections::HashMap; @@ -43,6 +44,8 @@ pub struct Config { pub reactions: ReactionsConfig, #[serde(default)] pub stt: SttConfig, + #[serde(default)] + pub markdown: MarkdownConfig, } #[derive(Debug, Clone, Deserialize)] @@ -297,6 +300,20 @@ impl Default for ReactionTiming { } } +// --- markdown --- + +#[derive(Debug, Clone, Deserialize)] +pub struct MarkdownConfig { + #[serde(default)] + pub tables: TableMode, +} + +impl Default for MarkdownConfig { + fn default() -> Self { + Self { tables: TableMode::default() } + } +} + // --- loading --- /// Resolve an allow_all flag: if explicitly set, use it; otherwise infer from the list. diff --git a/src/format.rs b/src/format.rs index fbf76c67..693f6ea8 100644 --- a/src/format.rs +++ b/src/format.rs @@ -1,5 +1,9 @@ /// Split text into chunks at line boundaries, each <= limit Unicode characters (UTF-8 safe). /// Discord's message limit counts Unicode characters, not bytes. +/// +/// Fenced code blocks (``` ... ```) are handled specially: if a split falls inside a +/// code block, the current chunk is closed with ``` and the next chunk is reopened with +/// ```, so each chunk renders correctly in Discord. pub fn split_message(text: &str, limit: usize) -> Vec { if text.chars().count() <= limit { return vec![text.to_string()]; @@ -8,19 +12,37 @@ pub fn split_message(text: &str, limit: usize) -> Vec { let mut chunks = Vec::new(); let mut current = String::new(); let mut current_len: usize = 0; + let mut in_code_fence = false; for line in text.split('\n') { let line_chars = line.chars().count(); + let is_fence_marker = line.starts_with("```"); + // +1 for the newline if !current.is_empty() && current_len + line_chars + 1 > limit { + if in_code_fence && !is_fence_marker { + // Close the open code fence so this chunk renders correctly. + current.push_str("\n```"); + } chunks.push(current); current = String::new(); current_len = 0; + if in_code_fence && !is_fence_marker { + // Reopen the code fence in the new chunk. + current.push_str("```"); + current_len = 3; + } } + if !current.is_empty() { current.push('\n'); current_len += 1; } + + if is_fence_marker { + in_code_fence = !in_code_fence; + } + // If a single line exceeds limit, hard-split on char boundaries if line_chars > limit { for ch in line.chars() { diff --git a/src/main.rs b/src/main.rs index b8163360..df7668f3 100644 --- a/src/main.rs +++ b/src/main.rs @@ -5,6 +5,7 @@ mod config; mod discord; mod error_display; mod format; +mod markdown; mod media; mod reactions; mod setup; @@ -107,7 +108,7 @@ async fn main() -> anyhow::Result<()> { info!(model = %cfg.stt.model, base_url = %cfg.stt.base_url, "STT enabled"); } - let router = Arc::new(AdapterRouter::new(pool.clone(), cfg.reactions)); + let router = Arc::new(AdapterRouter::new(pool.clone(), cfg.reactions, cfg.markdown.tables)); // Shutdown signal for Slack adapter let (shutdown_tx, shutdown_rx) = tokio::sync::watch::channel(false); diff --git a/src/markdown.rs b/src/markdown.rs new file mode 100644 index 00000000..29d97132 --- /dev/null +++ b/src/markdown.rs @@ -0,0 +1,352 @@ +use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd}; +use serde::Deserialize; +use std::fmt; +use unicode_width::UnicodeWidthStr; + +/// How to render markdown tables for a given channel. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Deserialize)] +#[serde(rename_all = "lowercase")] +pub enum TableMode { + /// Wrap the table in a fenced code block (default). + Code, + /// Convert each row into bullet points. + Bullets, + /// Pass through unchanged. + Off, +} + +impl Default for TableMode { + fn default() -> Self { + Self::Code + } +} + +impl fmt::Display for TableMode { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Code => write!(f, "code"), + Self::Bullets => write!(f, "bullets"), + Self::Off => write!(f, "off"), + } + } +} + +// ── IR types ──────────────────────────────────────────────────────── + +/// A parsed table: header row + data rows, each cell is plain text. +struct Table { + headers: Vec, + rows: Vec>, +} + +/// Segment of the document — either verbatim text or a parsed table. +enum Segment { + Text(String), + Table(Table), +} + +// ── Public API ────────────────────────────────────────────────────── + +/// Parse markdown, detect tables via pulldown-cmark, and render them +/// according to `mode`. Non-table content passes through unchanged. +pub fn convert_tables(markdown: &str, mode: TableMode) -> String { + if mode == TableMode::Off || markdown.is_empty() { + return markdown.to_string(); + } + + let segments = parse_segments(markdown, mode); + + let mut out = String::with_capacity(markdown.len()); + for seg in segments { + match seg { + Segment::Text(t) => out.push_str(&t), + Segment::Table(table) => match mode { + TableMode::Code => render_table_code(&table, &mut out), + TableMode::Bullets => render_table_bullets(&table, &mut out), + TableMode::Off => unreachable!(), + }, + } + } + out +} + +// ── Parser ────────────────────────────────────────────────────────── + +/// Walk the markdown source with pulldown-cmark and split it into +/// text segments and parsed Table segments. +fn parse_segments(markdown: &str, mode: TableMode) -> Vec { + let mut opts = Options::empty(); + opts.insert(Options::ENABLE_TABLES); + + let mut segments: Vec = Vec::new(); + let mut in_table = false; + let mut in_head = false; + let mut headers: Vec = Vec::new(); + let mut rows: Vec> = Vec::new(); + let mut current_row: Vec = Vec::new(); + let mut cell_buf = String::new(); + let mut last_table_end: usize = 0; + + // We need byte offsets to grab non-table text verbatim. + let parser_with_offsets = Parser::new_ext(markdown, opts).into_offset_iter(); + + for (event, range) in parser_with_offsets { + match event { + Event::Start(Tag::Table(_)) => { + // Flush text before this table + let before = &markdown[last_table_end..range.start]; + if !before.is_empty() { + push_text(&mut segments, before); + } + in_table = true; + headers.clear(); + rows.clear(); + } + Event::End(TagEnd::Table) => { + let table = Table { + headers: std::mem::take(&mut headers), + rows: std::mem::take(&mut rows), + }; + segments.push(Segment::Table(table)); + in_table = false; + last_table_end = range.end; + } + Event::Start(Tag::TableHead) => { + in_head = true; + current_row.clear(); + } + Event::End(TagEnd::TableHead) => { + headers = std::mem::take(&mut current_row); + in_head = false; + } + Event::Start(Tag::TableRow) => { + current_row.clear(); + } + Event::End(TagEnd::TableRow) => { + if !in_head { + rows.push(std::mem::take(&mut current_row)); + } + } + Event::Start(Tag::TableCell) => { + cell_buf.clear(); + } + Event::End(TagEnd::TableCell) => { + current_row.push(cell_buf.trim().to_string()); + cell_buf.clear(); + } + Event::Text(t) if in_table => { + cell_buf.push_str(&t); + } + Event::Code(t) if in_table => { + // In Code mode the table is already inside a fenced code block, + // so backticks would render as literal characters. Strip them. + if mode != TableMode::Code { + cell_buf.push('`'); + } + cell_buf.push_str(&t); + if mode != TableMode::Code { + cell_buf.push('`'); + } + } + // Inline markup inside cells: collect text, ignore tags + Event::SoftBreak if in_table => { + cell_buf.push(' '); + } + Event::HardBreak if in_table => { + cell_buf.push(' '); + } + // Start/End of inline tags (bold, italic, link, etc.) — skip the + // tag markers but keep processing their child text events above. + Event::Start(Tag::Emphasis) + | Event::Start(Tag::Strong) + | Event::Start(Tag::Strikethrough) + | Event::Start(Tag::Link { .. }) + | Event::End(TagEnd::Emphasis) + | Event::End(TagEnd::Strong) + | Event::End(TagEnd::Strikethrough) + | Event::End(TagEnd::Link) + if in_table => {} + _ => {} + } + } + + // Remaining text after last table + if last_table_end < markdown.len() { + let tail = &markdown[last_table_end..]; + if !tail.is_empty() { + push_text(&mut segments, tail); + } + } + + segments +} + +fn push_text(segments: &mut Vec, text: &str) { + if let Some(Segment::Text(ref mut prev)) = segments.last_mut() { + prev.push_str(text); + } else { + segments.push(Segment::Text(text.to_string())); + } +} + +// ── Renderers ─────────────────────────────────────────────────────── + +/// Render table as a fenced code block with aligned columns. +fn render_table_code(table: &Table, out: &mut String) { + let col_count = table + .headers + .len() + .max(table.rows.iter().map(|r| r.len()).max().unwrap_or(0)); + if col_count == 0 { + return; + } + + // Compute column widths (using display width for CJK/emoji) + let mut widths = vec![0usize; col_count]; + for (i, h) in table.headers.iter().enumerate() { + widths[i] = widths[i].max(UnicodeWidthStr::width(h.as_str())); + } + for row in &table.rows { + for (i, cell) in row.iter().enumerate() { + if i < col_count { + widths[i] = widths[i].max(UnicodeWidthStr::width(cell.as_str())); + } + } + } + // Minimum width 3 for the divider + for w in &mut widths { + *w = (*w).max(3); + } + + out.push_str("```\n"); + + // Header row + write_row(out, &table.headers, &widths, col_count); + // Divider + out.push('|'); + for w in &widths { + out.push(' '); + for _ in 0..*w { + out.push('-'); + } + out.push_str(" |"); + } + out.push('\n'); + // Data rows + for row in &table.rows { + write_row(out, row, &widths, col_count); + } + + out.push_str("```\n"); +} + +fn write_row(out: &mut String, cells: &[String], widths: &[usize], col_count: usize) { + out.push('|'); + for i in 0..col_count { + out.push(' '); + let cell = cells.get(i).map(|s| s.as_str()).unwrap_or(""); + out.push_str(cell); + let display_width = UnicodeWidthStr::width(cell); + let pad = widths[i].saturating_sub(display_width); + for _ in 0..pad { + out.push(' '); + } + out.push_str(" |"); + } + out.push('\n'); +} + +/// Render table as bullet points: `• header: value` per cell. +fn render_table_bullets(table: &Table, out: &mut String) { + for (row_idx, row) in table.rows.iter().enumerate() { + for (i, cell) in row.iter().enumerate() { + if cell.is_empty() { + continue; + } + out.push_str("• "); + if let Some(h) = table.headers.get(i) { + if !h.is_empty() { + out.push_str(h); + out.push_str(": "); + } + } + out.push_str(cell); + out.push('\n'); + } + // Blank line between rows, but not after the last one + if row_idx + 1 < table.rows.len() { + out.push('\n'); + } + } +} + +// ── Tests ─────────────────────────────────────────────────────────── + +#[cfg(test)] +mod tests { + use super::*; + + const TABLE_MD: &str = "\ +Some text before. + +| Name | Age | +|-------|-----| +| Alice | 30 | +| Bob | 25 | + +Some text after. +"; + + #[test] + fn off_mode_passes_through() { + let result = convert_tables(TABLE_MD, TableMode::Off); + assert_eq!(result, TABLE_MD); + } + + #[test] + fn code_mode_wraps_in_codeblock() { + let result = convert_tables(TABLE_MD, TableMode::Code); + assert!(result.contains("```\n")); + assert!(result.contains("| Alice")); + assert!(result.contains("Some text before.")); + assert!(result.contains("Some text after.")); + } + + #[test] + fn bullets_mode_converts_to_bullets() { + let result = convert_tables(TABLE_MD, TableMode::Bullets); + assert!(result.contains("• Name: Alice")); + assert!(result.contains("• Age: 30")); + assert!(!result.contains("```")); + } + + #[test] + fn no_table_passes_through() { + let plain = "Hello world\nNo tables here."; + let result = convert_tables(plain, TableMode::Code); + assert_eq!(result, plain); + } + + #[test] + fn code_mode_strips_backticks_from_code_cells() { + let md = "| col |\n|-----|\n| `value` |\n"; + let result = convert_tables(md, TableMode::Code); + // The table is inside a ``` block — backtick wrapping must be stripped. + assert!(result.contains("value"), "cell content should be present"); + // Only the fence markers themselves should contain backticks. + let inner = result + .trim_start_matches("```\n") + .trim_end_matches("```\n"); + assert!( + !inner.contains('`'), + "no backticks should appear inside the code fence: {result:?}" + ); + } + + #[test] + fn bullets_mode_keeps_backticks_in_code_cells() { + let md = "| col |\n|-----|\n| `value` |\n"; + let result = convert_tables(md, TableMode::Bullets); + assert!(result.contains("`value`"), "backticks should be kept in bullets mode"); + } +} From cc2182e752d3490d9e62440192ed931f234751f5 Mon Sep 17 00:00:00 2001 From: JARVIS-coding-Agent Date: Tue, 21 Apr 2026 13:23:06 +0000 Subject: [PATCH 2/6] =?UTF-8?q?fix:=20address=20clippy=20lints=20=E2=80=94?= =?UTF-8?q?=20derive=20Default,=20match=20guard,=20iterator?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - TableMode: use #[derive(Default)] + #[default] attribute instead of manual impl Default - MarkdownConfig: use #[derive(Default)] instead of manual impl - parse_segments: collapse if-inside-match into match guard - write_row: use widths.iter().enumerate() instead of range loop --- src/config.rs | 8 +------- src/markdown.rs | 19 ++++++------------- 2 files changed, 7 insertions(+), 20 deletions(-) diff --git a/src/config.rs b/src/config.rs index 352b68b3..0732e576 100644 --- a/src/config.rs +++ b/src/config.rs @@ -302,18 +302,12 @@ impl Default for ReactionTiming { // --- markdown --- -#[derive(Debug, Clone, Deserialize)] +#[derive(Debug, Clone, Default, Deserialize)] pub struct MarkdownConfig { #[serde(default)] pub tables: TableMode, } -impl Default for MarkdownConfig { - fn default() -> Self { - Self { tables: TableMode::default() } - } -} - // --- loading --- /// Resolve an allow_all flag: if explicitly set, use it; otherwise infer from the list. diff --git a/src/markdown.rs b/src/markdown.rs index 29d97132..7ed4bbf4 100644 --- a/src/markdown.rs +++ b/src/markdown.rs @@ -4,10 +4,11 @@ use std::fmt; use unicode_width::UnicodeWidthStr; /// How to render markdown tables for a given channel. -#[derive(Debug, Clone, Copy, PartialEq, Eq, Deserialize)] +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Deserialize)] #[serde(rename_all = "lowercase")] pub enum TableMode { /// Wrap the table in a fenced code block (default). + #[default] Code, /// Convert each row into bullet points. Bullets, @@ -15,12 +16,6 @@ pub enum TableMode { Off, } -impl Default for TableMode { - fn default() -> Self { - Self::Code - } -} - impl fmt::Display for TableMode { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { @@ -122,10 +117,8 @@ fn parse_segments(markdown: &str, mode: TableMode) -> Vec { Event::Start(Tag::TableRow) => { current_row.clear(); } - Event::End(TagEnd::TableRow) => { - if !in_head { - rows.push(std::mem::take(&mut current_row)); - } + Event::End(TagEnd::TableRow) if !in_head => { + rows.push(std::mem::take(&mut current_row)); } Event::Start(Tag::TableCell) => { cell_buf.clear(); @@ -242,12 +235,12 @@ fn render_table_code(table: &Table, out: &mut String) { fn write_row(out: &mut String, cells: &[String], widths: &[usize], col_count: usize) { out.push('|'); - for i in 0..col_count { + for (i, w) in widths.iter().enumerate().take(col_count) { out.push(' '); let cell = cells.get(i).map(|s| s.as_str()).unwrap_or(""); out.push_str(cell); let display_width = UnicodeWidthStr::width(cell); - let pad = widths[i].saturating_sub(display_width); + let pad = w.saturating_sub(display_width); for _ in 0..pad { out.push(' '); } From 6f9afc6fb6e4f1d6ffc8d022f69e5b922bc93e19 Mon Sep 17 00:00:00 2001 From: JARVIS-coding-Agent Date: Fri, 24 Apr 2026 14:51:43 +0000 Subject: [PATCH 3/6] fix(format): enforce chunk <= limit invariant in fence-aware split_message - Reserve CLOSE_COST (4 chars) in overflow check when inside a fence, so the \n``` close marker never pushes a chunk past the limit - Use fence_opener: Option instead of bool to preserve the full opener line (e.g. ```rust) on reopen across chunk boundaries - Reopen path pushes line directly + continues, avoiding the double newline that caused reopen-path overflow - Hard-split path is now fence-aware: close/reopen with proper capacity = limit - opener_len - 5, with fallback for tiny limits - Close any trailing open fence in the final chunk - Add 10 unit tests asserting chunk.chars().count() <= limit for all boundary cases: fenced splits, hard-splits, language tag preservation, close-overhead budgeting, fence balance across chunks Addresses reviewer feedback from shaun-agent on #513. --- src/format.rs | 308 +++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 280 insertions(+), 28 deletions(-) diff --git a/src/format.rs b/src/format.rs index 693f6ea8..e9865f2e 100644 --- a/src/format.rs +++ b/src/format.rs @@ -3,7 +3,9 @@ /// /// Fenced code blocks (``` ... ```) are handled specially: if a split falls inside a /// code block, the current chunk is closed with ``` and the next chunk is reopened with -/// ```, so each chunk renders correctly in Discord. +/// the original opener (preserving language tag), so each chunk renders correctly. +/// +/// Invariant: every returned chunk satisfies `chunk.chars().count() <= limit`. pub fn split_message(text: &str, limit: usize) -> Vec { if text.chars().count() <= limit { return vec![text.to_string()]; @@ -12,54 +14,153 @@ pub fn split_message(text: &str, limit: usize) -> Vec { let mut chunks = Vec::new(); let mut current = String::new(); let mut current_len: usize = 0; - let mut in_code_fence = false; + // When inside a fenced code block, holds the full opener line (e.g. "```rust"). + let mut fence_opener: Option = None; + + // Cost of appending "\n```" to close a fence before emitting a chunk. + const CLOSE_COST: usize = 4; // '\n' + '`' + '`' + '`' for line in text.split('\n') { let line_chars = line.chars().count(); - let is_fence_marker = line.starts_with("```"); + let is_fence_line = line.starts_with("```"); - // +1 for the newline - if !current.is_empty() && current_len + line_chars + 1 > limit { - if in_code_fence && !is_fence_marker { - // Close the open code fence so this chunk renders correctly. - current.push_str("\n```"); - } - chunks.push(current); - current = String::new(); - current_len = 0; - if in_code_fence && !is_fence_marker { - // Reopen the code fence in the new chunk. - current.push_str("```"); - current_len = 3; + // Determine overhead that must be reserved when inside a fence. + let close_reserve = if fence_opener.is_some() && !is_fence_line { + CLOSE_COST + } else { + 0 + }; + + // Check whether appending this line (+ newline separator + close reserve) overflows. + if !current.is_empty() && current_len + 1 + line_chars + close_reserve > limit { + // Emit current chunk, closing fence if needed. + if let Some(ref opener) = fence_opener { + if !is_fence_line { + current.push_str("\n```"); + } + chunks.push(std::mem::take(&mut current)); + // Reopen fence in next chunk with full opener (preserves language tag). + current.push_str(opener); + current_len = opener.chars().count(); + + if is_fence_line { + // The closing fence marker itself triggers the split. + fence_opener = None; + current.push('\n'); + current_len += 1; + current.push_str(line); + current_len += line_chars; + continue; + } else if current_len + 1 + line_chars + CLOSE_COST <= limit { + // Line fits in the reopened chunk (with room for \n + line + close marker). + current.push('\n'); + current_len += 1; + current.push_str(line); + current_len += line_chars; + continue; + } + // Otherwise: line doesn't fit even in a fresh reopened chunk. + // Fall through to the normal line-processing logic below, + // which will hit the hard-split path if line_chars > limit, + // or the normal append path otherwise. + } else { + chunks.push(std::mem::take(&mut current)); + current_len = 0; } } + // Newline separator between lines within a chunk. if !current.is_empty() { current.push('\n'); current_len += 1; } - if is_fence_marker { - in_code_fence = !in_code_fence; + // Track fence state. + if is_fence_line { + if fence_opener.is_some() { + fence_opener = None; + } else { + fence_opener = Some(line.to_string()); + } } - // If a single line exceeds limit, hard-split on char boundaries - if line_chars > limit { - for ch in line.chars() { - if current_len + 1 > limit { - chunks.push(current); - current = String::new(); - current_len = 0; + // Hard-split: single line exceeds available space. + // This triggers when the line itself is longer than limit, OR when the + // line doesn't fit in the current chunk even after accounting for fence + // close overhead (e.g. after a reopen where opener already consumed space). + let effective_avail = if fence_opener.is_some() { + limit.saturating_sub(current_len + CLOSE_COST) + } else { + limit.saturating_sub(current_len) + }; + if line_chars > effective_avail { + let overhead = if let Some(ref opener) = fence_opener { + // opener + '\n' at start, '\n```' at end + opener.chars().count() + 1 + CLOSE_COST + } else { + 0 + }; + // If limit can't even fit overhead, fall back to unfenced hard-split. + let capacity = limit.saturating_sub(overhead); + if let Some(opener) = fence_opener.as_ref().filter(|_| capacity > 0) { + // Fenced hard-split: each mid chunk = opener\n + chars + \n``` + let opener_len = opener.chars().count(); + let mut chars = line.chars().peekable(); + + // Fill remaining space in current chunk first. + let avail_first = if current_len > 0 { + limit.saturating_sub(current_len + CLOSE_COST) + } else { + capacity + }; + for _ in 0..avail_first { + if let Some(ch) = chars.next() { + current.push(ch); + current_len += 1; + } else { + break; + } + } + + while chars.peek().is_some() { + // Close current fenced chunk. + current.push_str("\n```"); + chunks.push(std::mem::take(&mut current)); + // Reopen. + current.push_str(opener); + current.push('\n'); + current_len = opener_len + 1; + for _ in 0..capacity { + if let Some(ch) = chars.next() { + current.push(ch); + current_len += 1; + } else { + break; + } + } + } + } else { + // Plain hard-split (no fence or limit too small for fence wrapping). + for ch in line.chars() { + if current_len >= limit { + chunks.push(std::mem::take(&mut current)); + current_len = 0; + } + current.push(ch); + current_len += 1; } - current.push(ch); - current_len += 1; } } else { current.push_str(line); current_len += line_chars; } } + if !current.is_empty() { + // Close any trailing open fence. + if fence_opener.is_some() { + current.push_str("\n```"); + } chunks.push(current); } chunks @@ -82,7 +183,6 @@ pub fn shorten_thread_name(prompt: &str) -> String { } } - /// Truncate a string to at most `limit` Unicode characters, keeping the tail /// (most recent output) for better streaming UX. pub fn truncate_chars_tail(s: &str, limit: usize) -> String { @@ -92,3 +192,155 @@ pub fn truncate_chars_tail(s: &str, limit: usize) -> String { } s.chars().skip(total - limit).collect() } + +#[cfg(test)] +mod tests { + use super::*; + + /// Helper: assert every chunk respects the limit. + fn assert_length_invariant(chunks: &[String], limit: usize) { + for (i, chunk) in chunks.iter().enumerate() { + let len = chunk.chars().count(); + assert!( + len <= limit, + "chunk {i} has {len} chars, exceeds limit {limit}:\n{chunk}" + ); + } + } + + /// Helper: assert concatenating all chunks (with empty joiner) reconstructs + /// content that, when fence wrappers are stripped, equals the original. + fn assert_roundtrip(original: &str, chunks: &[String]) { + let joined = chunks.join(""); + // Strip all auto-inserted close/reopen pairs: "\n```\n```..." pattern + let cleaned = joined + .replace("\n```\n```", "\n") + .replace("\n```\n```", "\n"); + // Loose check: all original non-fence content is present + for line in original.split('\n') { + if !line.starts_with("```") { + assert!( + cleaned.contains(line), + "original line missing from output: {line}" + ); + } + } + } + + #[test] + fn no_split_under_limit() { + let text = "hello\nworld"; + let chunks = split_message(text, 100); + assert_eq!(chunks.len(), 1); + assert_eq!(chunks[0], text); + } + + #[test] + fn plain_text_split_respects_limit() { + let text = "aaaa\nbbbb\ncccc\ndddd"; + let chunks = split_message(text, 10); + assert_length_invariant(&chunks, 10); + assert!(chunks.len() > 1); + } + + #[test] + fn fenced_split_preserves_language_tag() { + // ```rust\n + 1990 chars of content + \n``` — should split + let content_line = "x".repeat(1990); + let text = format!("```rust\n{content_line}\nanother line here\n```"); + let chunks = split_message(&text, 2000); + assert_length_invariant(&chunks, 2000); + // First chunk should start with ```rust + assert!(chunks[0].starts_with("```rust")); + // If split happened, second chunk should reopen with ```rust + if chunks.len() > 1 { + assert!( + chunks[1].starts_with("```rust"), + "second chunk should reopen with language tag: {}", + &chunks[1][..chunks[1].len().min(20)] + ); + } + } + + #[test] + fn fenced_split_close_overhead_budgeted() { + // Construct a fenced block where content + close marker would overflow + // without proper budgeting. + // limit=50, opener="```" (3), close="\n```" (4) + // Available for content per chunk: 50 - 3 - 1 - 4 = 42 (with opener+newline+close) + let line1 = "a".repeat(40); + let line2 = "b".repeat(40); + let text = format!("```\n{line1}\n{line2}\n```"); + let chunks = split_message(&text, 50); + assert_length_invariant(&chunks, 50); + } + + #[test] + fn reopen_path_no_overflow() { + // Regression: limit=2000, fenced block with a 1996-char line. + // Old code would produce 2004-char chunk due to reopen + extra \n. + let content = "x".repeat(1990); + let text = format!("```rust\n{content}\nshort\n```"); + let chunks = split_message(&text, 2000); + assert_length_invariant(&chunks, 2000); + } + + #[test] + fn hard_split_fenced_respects_limit() { + // A single very long line inside a fence. + let long_line = "x".repeat(100); + let text = format!("```\n{long_line}\n```"); + let chunks = split_message(&text, 20); + assert_length_invariant(&chunks, 20); + // All content should be present + let total_x: usize = chunks + .iter() + .map(|c| c.chars().filter(|&ch| ch == 'x').count()) + .sum(); + assert_eq!(total_x, 100); + } + + #[test] + fn hard_split_plain_respects_limit() { + let long_line = "y".repeat(50); + let text = format!("before\n{long_line}\nafter"); + let chunks = split_message(&text, 10); + assert_length_invariant(&chunks, 10); + } + + #[test] + fn closing_fence_triggers_split() { + // The closing ``` itself pushes over the limit. + let content = "a".repeat(44); + // "```\n" + 44 chars + "\n```" = 3 + 1 + 44 + 1 + 3 = 52 + let text = format!("```\n{content}\n```"); + let chunks = split_message(&text, 50); + assert_length_invariant(&chunks, 50); + } + + #[test] + fn multi_fence_blocks() { + let text = "text\n```python\ncode1\ncode2\n```\nmore text\n```js\ncode3\n```"; + let chunks = split_message(text, 25); + assert_length_invariant(&chunks, 25); + } + + #[test] + fn fence_balance_across_chunks() { + // Every chunk should have balanced fences (even number of ``` lines). + let content = (0..20) + .map(|i| format!("line {i}")) + .collect::>() + .join("\n"); + let text = format!("```\n{content}\n```"); + let chunks = split_message(&text, 30); + assert_length_invariant(&chunks, 30); + for (i, chunk) in chunks.iter().enumerate() { + let fence_count = chunk.lines().filter(|l| l.starts_with("```")).count(); + assert!( + fence_count % 2 == 0, + "chunk {i} has unbalanced fences ({fence_count}):\n{chunk}" + ); + } + } +} From 124eb78319fdae172bd32bc3933f3b04716744da Mon Sep 17 00:00:00 2001 From: JARVIS-coding-Agent Date: Sat, 25 Apr 2026 07:14:00 +0000 Subject: [PATCH 4/6] fix(review): decouple parse_segments from renderer, remove dead assert_roundtrip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address FRIDAY's two Moderate review findings on #513: 1. parse_segments no longer takes mode param — IR always preserves backticks on Code tokens. render_table_code strips them at render time. Future modes only need renderer changes, not parser changes. 2. Remove assert_roundtrip from format.rs tests — never called by any #[test], and its replace pattern failed to match language-tagged fence reopens (e.g. ```rust). --- src/format.rs | 19 ------------------- src/markdown.rs | 31 +++++++++++++++++-------------- 2 files changed, 17 insertions(+), 33 deletions(-) diff --git a/src/format.rs b/src/format.rs index e9865f2e..d39410f1 100644 --- a/src/format.rs +++ b/src/format.rs @@ -208,25 +208,6 @@ mod tests { } } - /// Helper: assert concatenating all chunks (with empty joiner) reconstructs - /// content that, when fence wrappers are stripped, equals the original. - fn assert_roundtrip(original: &str, chunks: &[String]) { - let joined = chunks.join(""); - // Strip all auto-inserted close/reopen pairs: "\n```\n```..." pattern - let cleaned = joined - .replace("\n```\n```", "\n") - .replace("\n```\n```", "\n"); - // Loose check: all original non-fence content is present - for line in original.split('\n') { - if !line.starts_with("```") { - assert!( - cleaned.contains(line), - "original line missing from output: {line}" - ); - } - } - } - #[test] fn no_split_under_limit() { let text = "hello\nworld"; diff --git a/src/markdown.rs b/src/markdown.rs index 7ed4bbf4..6b0aa533 100644 --- a/src/markdown.rs +++ b/src/markdown.rs @@ -49,7 +49,7 @@ pub fn convert_tables(markdown: &str, mode: TableMode) -> String { return markdown.to_string(); } - let segments = parse_segments(markdown, mode); + let segments = parse_segments(markdown); let mut out = String::with_capacity(markdown.len()); for seg in segments { @@ -69,7 +69,7 @@ pub fn convert_tables(markdown: &str, mode: TableMode) -> String { /// Walk the markdown source with pulldown-cmark and split it into /// text segments and parsed Table segments. -fn parse_segments(markdown: &str, mode: TableMode) -> Vec { +fn parse_segments(markdown: &str) -> Vec { let mut opts = Options::empty(); opts.insert(Options::ENABLE_TABLES); @@ -131,15 +131,9 @@ fn parse_segments(markdown: &str, mode: TableMode) -> Vec { cell_buf.push_str(&t); } Event::Code(t) if in_table => { - // In Code mode the table is already inside a fenced code block, - // so backticks would render as literal characters. Strip them. - if mode != TableMode::Code { - cell_buf.push('`'); - } + cell_buf.push('`'); cell_buf.push_str(&t); - if mode != TableMode::Code { - cell_buf.push('`'); - } + cell_buf.push('`'); } // Inline markup inside cells: collect text, ignore tags Event::SoftBreak if in_table => { @@ -194,12 +188,21 @@ fn render_table_code(table: &Table, out: &mut String) { return; } + // Strip backticks from cells — inside a code fence they render as literals. + let strip = |s: &str| s.replace('`', ""); + let headers: Vec = table.headers.iter().map(|h| strip(h)).collect(); + let rows: Vec> = table + .rows + .iter() + .map(|r| r.iter().map(|c| strip(c)).collect()) + .collect(); + // Compute column widths (using display width for CJK/emoji) let mut widths = vec![0usize; col_count]; - for (i, h) in table.headers.iter().enumerate() { + for (i, h) in headers.iter().enumerate() { widths[i] = widths[i].max(UnicodeWidthStr::width(h.as_str())); } - for row in &table.rows { + for row in &rows { for (i, cell) in row.iter().enumerate() { if i < col_count { widths[i] = widths[i].max(UnicodeWidthStr::width(cell.as_str())); @@ -214,7 +217,7 @@ fn render_table_code(table: &Table, out: &mut String) { out.push_str("```\n"); // Header row - write_row(out, &table.headers, &widths, col_count); + write_row(out, &headers, &widths, col_count); // Divider out.push('|'); for w in &widths { @@ -226,7 +229,7 @@ fn render_table_code(table: &Table, out: &mut String) { } out.push('\n'); // Data rows - for row in &table.rows { + for row in &rows { write_row(out, row, &widths, col_count); } From c71e581fac53e90367741d5576966088907c625c Mon Sep 17 00:00:00 2001 From: shaun-agent Date: Sat, 25 Apr 2026 16:52:54 +0000 Subject: [PATCH 5/6] fix(deps): sync Cargo.lock for markdown deps --- Cargo.lock | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 0d9c8034..7092ac21 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -457,6 +457,15 @@ dependencies = [ "version_check", ] +[[package]] +name = "getopts" +version = "0.2.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfe4fbac503b8d1f88e6676011885f34b7174f46e59956bba534ba83abded4df" +dependencies = [ + "unicode-width", +] + [[package]] name = "getrandom" version = "0.2.17" @@ -981,6 +990,7 @@ dependencies = [ "futures-util", "image", "libc", + "pulldown-cmark", "rand 0.8.5", "regex", "reqwest", @@ -1089,6 +1099,25 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "pulldown-cmark" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c3a14896dfa883796f1cb410461aef38810ea05f2b2c33c5aded3649095fdad" +dependencies = [ + "bitflags", + "getopts", + "memchr", + "pulldown-cmark-escape", + "unicase", +] + +[[package]] +name = "pulldown-cmark-escape" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "007d8adb5ddab6f8e3f491ac63566a7d5002cc7ed73901f72057943fa71ae1ae" + [[package]] name = "pxfm" version = "0.1.28" From 000315527a0cdc42f47f3f7b888eaf53026ef22c Mon Sep 17 00:00:00 2001 From: JARVIS-coding-Agent Date: Mon, 27 Apr 2026 09:19:35 +0000 Subject: [PATCH 6/6] fix(deps): disable pulldown-cmark default features Only the core parser API is used (Event, Options, Parser, Tag, TagEnd). Default features include getopts (CLI binary) and html (HTML rendering) which are not needed, adding unnecessary transitive dependencies. --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index a8b2f48a..87c1f6db 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,7 +24,7 @@ reqwest = { version = "0.12", default-features = false, features = ["rustls-tls" base64 = "0.22" image = { version = "0.25", default-features = false, features = ["jpeg", "png", "gif", "webp"] } unicode-width = "0.2" -pulldown-cmark = "0.13" +pulldown-cmark = { version = "0.13", default-features = false } tokio-tungstenite = { version = "0.21", features = ["rustls-tls-webpki-roots"] } [target.'cfg(unix)'.dependencies]