diff --git a/.github/workflows/fuzz.yml b/.github/workflows/fuzz.yml index 49758ac260e..d7d97198023 100644 --- a/.github/workflows/fuzz.yml +++ b/.github/workflows/fuzz.yml @@ -99,6 +99,42 @@ jobs: gh_token: ${{ secrets.GITHUB_TOKEN }} incident_io_alert_token: ${{ secrets.INCIDENT_IO_ALERT_TOKEN }} + # ============================================================================ + # FSST LIKE Fuzzer + # ============================================================================ + fsst_like_fuzz: + name: "FSST LIKE Fuzz" + uses: ./.github/workflows/run-fuzzer.yml + with: + fuzz_target: fsst_like + jobs: 16 + secrets: + R2_FUZZ_ACCESS_KEY_ID: ${{ secrets.R2_FUZZ_ACCESS_KEY_ID }} + R2_FUZZ_SECRET_ACCESS_KEY: ${{ secrets.R2_FUZZ_SECRET_ACCESS_KEY }} + + report-fsst-like-fuzz-failures: + name: "Report FSST LIKE Fuzz Failures" + needs: fsst_like_fuzz + if: always() && needs.fsst_like_fuzz.outputs.crashes_found == 'true' + permissions: + issues: write + contents: read + id-token: write + pull-requests: read + uses: ./.github/workflows/report-fuzz-crash.yml + with: + fuzz_target: fsst_like + crash_file: ${{ needs.fsst_like_fuzz.outputs.first_crash_name }} + artifact_url: ${{ needs.fsst_like_fuzz.outputs.artifact_url }} + artifact_name: fsst_like-crash-artifacts + logs_artifact_name: fsst_like-logs + branch: ${{ github.ref_name }} + commit: ${{ github.sha }} + secrets: + claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }} + gh_token: ${{ secrets.GITHUB_TOKEN }} + incident_io_alert_token: ${{ secrets.INCIDENT_IO_ALERT_TOKEN }} + # ============================================================================ # Compress Roundtrip Fuzzer # ============================================================================ diff --git a/Cargo.lock b/Cargo.lock index f3d1d80a2da..bc33ddb3d75 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10219,6 +10219,7 @@ dependencies = [ "vortex-cuda", "vortex-error", "vortex-file", + "vortex-fsst", "vortex-io", "vortex-mask", 
"vortex-runend", diff --git a/encodings/fsst/Cargo.toml b/encodings/fsst/Cargo.toml index eb08bbda959..b95eeb1f444 100644 --- a/encodings/fsst/Cargo.toml +++ b/encodings/fsst/Cargo.toml @@ -41,7 +41,7 @@ harness = false required-features = ["_test-harness"] [[bench]] -name = "fsst_contains" +name = "fsst_like" harness = false required-features = ["_test-harness"] diff --git a/encodings/fsst/README.md b/encodings/fsst/README.md index 0e08c6e7fc8..7cc53ba07f9 100644 --- a/encodings/fsst/README.md +++ b/encodings/fsst/README.md @@ -2,3 +2,23 @@ A Vortex Encoding for Binary and Utf8 data that utilizes the [Fast Static Symbol Table](https://github.com/spiraldb/fsst) compression algorithm. + +## LIKE Pushdown + +The FSST encoding has a specialized LIKE fast path for a narrow subset of +patterns: + +- `prefix%` +- `%needle%` + +Unsupported shapes, including `_`, `%suffix`, or patterns with interior +wildcards, fall back to ordinary decompression-based LIKE evaluation. + +There are also two implementation limits on the pushdown path, both measured in +pattern bytes: + +- `prefix%` supports up to 253 bytes. +- `%needle%` supports up to 254 bytes. + +Patterns beyond those limits are still evaluated correctly, but they do so via +the fallback path instead of the DFA matcher. 
diff --git a/encodings/fsst/benches/fsst_contains.rs b/encodings/fsst/benches/fsst_like.rs similarity index 79% rename from encodings/fsst/benches/fsst_contains.rs rename to encodings/fsst/benches/fsst_like.rs index 6885ad0543e..12e78e2d7fb 100644 --- a/encodings/fsst/benches/fsst_contains.rs +++ b/encodings/fsst/benches/fsst_like.rs @@ -80,7 +80,19 @@ impl Dataset { } } - fn pattern(&self) -> &'static str { + fn prefix_pattern(&self) -> &'static str { + match self { + Self::Urls => "https%", + Self::Cb => "https://www.%", + Self::Log => "192.168%", + Self::Json => r#"{"id%"#, + Self::Path => "/home%", + Self::Email => "john%", + Self::Rare => "xyz%", + } + } + + fn contains_pattern(&self) -> &'static str { match self { Self::Urls => "%google%", Self::Cb => "%yandex%", @@ -93,15 +105,10 @@ impl Dataset { } } -#[divan::bench(args = [ - Dataset::Urls, Dataset::Cb, Dataset::Log, Dataset::Json, - Dataset::Path, Dataset::Email, Dataset::Rare, -])] -fn fsst_like(bencher: Bencher, dataset: &Dataset) { - let fsst = dataset.fsst_array(); +fn bench_like(bencher: Bencher, fsst: &FSSTArray, pattern: &str) { let len = fsst.len(); let arr = fsst.clone().into_array(); - let pattern = ConstantArray::new(dataset.pattern(), len).into_array(); + let pattern = ConstantArray::new(pattern, len).into_array(); bencher.bench_local(|| { Like.try_new_array(len, LikeOptions::default(), [arr.clone(), pattern.clone()]) .unwrap() @@ -110,3 +117,19 @@ fn fsst_like(bencher: Bencher, dataset: &Dataset) { .unwrap() }); } + +#[divan::bench(args = [ + Dataset::Urls, Dataset::Cb, Dataset::Log, Dataset::Json, + Dataset::Path, Dataset::Email, Dataset::Rare, +])] +fn fsst_prefix(bencher: Bencher, dataset: &Dataset) { + bench_like(bencher, dataset.fsst_array(), dataset.prefix_pattern()); +} + +#[divan::bench(args = [ + Dataset::Urls, Dataset::Cb, Dataset::Log, Dataset::Json, + Dataset::Path, Dataset::Email, Dataset::Rare, +])] +fn fsst_contains(bencher: Bencher, dataset: &Dataset) { + bench_like(bencher, 
dataset.fsst_array(), dataset.contains_pattern()); +} diff --git a/encodings/fsst/public-api.lock b/encodings/fsst/public-api.lock index c25ba6b44f2..c7f958d609c 100644 --- a/encodings/fsst/public-api.lock +++ b/encodings/fsst/public-api.lock @@ -30,6 +30,10 @@ impl vortex_array::scalar_fn::fns::cast::kernel::CastReduce for vortex_fsst::FSS pub fn vortex_fsst::FSST::cast(array: &vortex_fsst::FSSTArray, dtype: &vortex_array::dtype::DType) -> vortex_error::VortexResult> +impl vortex_array::scalar_fn::fns::like::kernel::LikeKernel for vortex_fsst::FSST + +pub fn vortex_fsst::FSST::like(array: &vortex_fsst::FSSTArray, pattern: &vortex_array::array::ArrayRef, options: vortex_array::scalar_fn::fns::like::LikeOptions, _ctx: &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> + impl vortex_array::vtable::VTable for vortex_fsst::FSST pub type vortex_fsst::FSST::Array = vortex_fsst::FSSTArray diff --git a/encodings/fsst/src/compute/like.rs b/encodings/fsst/src/compute/like.rs new file mode 100644 index 00000000000..732708a64c1 --- /dev/null +++ b/encodings/fsst/src/compute/like.rs @@ -0,0 +1,474 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +#![allow(clippy::cast_possible_truncation)] + +use vortex_array::ArrayRef; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::ToCanonical; +use vortex_array::arrays::BoolArray; +use vortex_array::match_each_integer_ptype; +use vortex_array::scalar_fn::fns::like::LikeKernel; +use vortex_array::scalar_fn::fns::like::LikeOptions; +use vortex_error::VortexResult; + +use crate::FSST; +use crate::FSSTArray; +use crate::dfa::FsstMatcher; +use crate::dfa::dfa_scan_to_bitbuf; + +impl LikeKernel for FSST { + fn like( + array: &FSSTArray, + pattern: &ArrayRef, + options: LikeOptions, + _ctx: &mut ExecutionCtx, + ) -> VortexResult> { + let Some(pattern_scalar) = pattern.as_constant() else { + return Ok(None); + }; + + if 
options.case_insensitive { + return Ok(None); + } + + let Some(pattern_str) = pattern_scalar.as_utf8().value() else { + return Ok(None); + }; + + let symbols = array.symbols(); + let symbol_lengths = array.symbol_lengths(); + + let Some(matcher) = + FsstMatcher::try_new(symbols.as_slice(), symbol_lengths.as_slice(), pattern_str)? + else { + return Ok(None); + }; + + let negated = options.negated; + let codes = array.codes(); + let offsets = codes.offsets().to_primitive(); + let all_bytes = codes.bytes(); + let all_bytes = all_bytes.as_slice(); + let n = codes.len(); + + let result = match_each_integer_ptype!(offsets.ptype(), |T| { + let off = offsets.as_slice::(); + dfa_scan_to_bitbuf(n, off, all_bytes, negated, |codes| matcher.matches(codes)) + }); + + // FSST delegates validity to its codes array, so we can read it + // directly without cloning the entire FSSTArray into an ArrayRef. + let validity = array + .codes() + .validity()? + .union_nullability(pattern_scalar.dtype().nullability()); + + Ok(Some(BoolArray::new(result, validity).into_array())) + } +} + +#[cfg(test)] +mod tests { + use std::sync::LazyLock; + + use rand::Rng; + use rand::SeedableRng; + use rand::rngs::StdRng; + use vortex_array::Canonical; + use vortex_array::IntoArray; + use vortex_array::VortexSessionExecute; + use vortex_array::arrays::BoolArray; + use vortex_array::arrays::ConstantArray; + use vortex_array::arrays::VarBinArray; + use vortex_array::arrays::scalar_fn::ScalarFnArrayExt; + use vortex_array::assert_arrays_eq; + use vortex_array::dtype::DType; + use vortex_array::dtype::Nullability; + use vortex_array::scalar_fn::fns::like::Like; + use vortex_array::scalar_fn::fns::like::LikeKernel; + use vortex_array::scalar_fn::fns::like::LikeOptions; + use vortex_array::session::ArraySession; + use vortex_error::VortexResult; + use vortex_session::VortexSession; + + use crate::FSST; + use crate::FSSTArray; + use crate::fsst_compress; + use crate::fsst_train_compressor; + + static SESSION: 
LazyLock = + LazyLock::new(|| VortexSession::empty().with::()); + + fn make_fsst(strings: &[Option<&str>], nullability: Nullability) -> FSSTArray { + let varbin = VarBinArray::from_iter(strings.iter().copied(), DType::Utf8(nullability)); + let compressor = fsst_train_compressor(&varbin); + fsst_compress(varbin, &compressor) + } + + fn run_like(array: FSSTArray, pattern: &str, opts: LikeOptions) -> VortexResult { + let len = array.len(); + let arr = array.into_array(); + let pattern = ConstantArray::new(pattern, len).into_array(); + let result = Like + .try_new_array(len, opts, [arr, pattern])? + .into_array() + .execute::(&mut SESSION.create_execution_ctx())?; + Ok(result.into_bool()) + } + + fn like(array: FSSTArray, pattern: &str) -> VortexResult { + run_like(array, pattern, LikeOptions::default()) + } + + #[test] + fn test_like_prefix() -> VortexResult<()> { + let fsst = make_fsst( + &[ + Some("http://example.com"), + Some("http://test.org"), + Some("ftp://files.net"), + Some("http://vortex.dev"), + Some("ssh://server.io"), + ], + Nullability::NonNullable, + ); + let result = like(fsst, "http%")?; + assert_arrays_eq!( + &result, + &BoolArray::from_iter([true, true, false, true, false]) + ); + Ok(()) + } + + #[test] + fn test_like_prefix_with_nulls() -> VortexResult<()> { + let fsst = make_fsst( + &[Some("hello"), None, Some("help"), None, Some("goodbye")], + Nullability::Nullable, + ); + let result = like(fsst, "hel%")?; // spellchecker:disable-line + assert_arrays_eq!( + &result, + &BoolArray::from_iter([Some(true), None, Some(true), None, Some(false)]) + ); + Ok(()) + } + + #[test] + fn test_like_contains() -> VortexResult<()> { + let fsst = make_fsst( + &[ + Some("hello world"), + Some("say hello"), + Some("goodbye"), + Some("hellooo"), + ], + Nullability::NonNullable, + ); + let result = like(fsst, "%hello%")?; + assert_arrays_eq!(&result, &BoolArray::from_iter([true, true, false, true])); + Ok(()) + } + + #[test] + fn test_like_contains_cross_symbol() -> 
VortexResult<()> { + let fsst = make_fsst( + &[ + Some("the quick brown fox jumps over the lazy dog"), + Some("a short string"), + Some("the lazy dog sleeps"), + Some("no match"), + ], + Nullability::NonNullable, + ); + let result = like(fsst, "%lazy dog%")?; + assert_arrays_eq!(&result, &BoolArray::from_iter([true, false, true, false])); + Ok(()) + } + + #[test] + fn test_not_like_contains() -> VortexResult<()> { + let fsst = make_fsst( + &[Some("foobar_sdf"), Some("sdf_start"), Some("nothing")], + Nullability::NonNullable, + ); + let opts = LikeOptions { + negated: true, + case_insensitive: false, + }; + let result = run_like(fsst, "%sdf%", opts)?; + assert_arrays_eq!(&result, &BoolArray::from_iter([false, false, true])); + Ok(()) + } + + #[test] + fn test_like_match_all() -> VortexResult<()> { + let fsst = make_fsst( + &[Some("abc"), Some(""), Some("xyz")], + Nullability::NonNullable, + ); + let result = like(fsst, "%")?; + assert_arrays_eq!(&result, &BoolArray::from_iter([true, true, true])); + Ok(()) + } + + /// Call `LikeKernel::like` directly on the FSSTArray and verify it + /// returns `Some(...)` (i.e. the kernel handles it, rather than + /// returning `None` which would mean "fall back to decompress"). + #[test] + fn test_like_prefix_kernel_handles() -> VortexResult<()> { + let fsst = make_fsst( + &[Some("http://a.com"), Some("ftp://b.com")], + Nullability::NonNullable, + ); + let pattern = ConstantArray::new("http%", fsst.len()).into_array(); + let mut ctx = SESSION.create_execution_ctx(); + + let result = ::like(&fsst, &pattern, LikeOptions::default(), &mut ctx)?; + assert!(result.is_some(), "FSST LikeKernel should handle prefix%"); + assert_arrays_eq!(result.unwrap(), BoolArray::from_iter([true, false])); + Ok(()) + } + + /// Same direct-call check for the contains pattern `%needle%`. 
+ #[test] + fn test_like_contains_kernel_handles() -> VortexResult<()> { + let fsst = make_fsst( + &[Some("hello world"), Some("goodbye")], + Nullability::NonNullable, + ); + let pattern = ConstantArray::new("%world%", fsst.len()).into_array(); + let mut ctx = SESSION.create_execution_ctx(); + + let result = ::like(&fsst, &pattern, LikeOptions::default(), &mut ctx)?; + assert!(result.is_some(), "FSST LikeKernel should handle %needle%"); + assert_arrays_eq!(result.unwrap(), BoolArray::from_iter([true, false])); + Ok(()) + } + + /// Patterns we can't handle should return `None` (fall back). + #[test] + fn test_like_kernel_falls_back_for_complex_pattern() -> VortexResult<()> { + let fsst = make_fsst(&[Some("abc"), Some("def")], Nullability::NonNullable); + let mut ctx = SESSION.create_execution_ctx(); + + // Underscore wildcard -- not handled. + let pattern = ConstantArray::new("a_c", fsst.len()).into_array(); + let result = ::like(&fsst, &pattern, LikeOptions::default(), &mut ctx)?; + assert!(result.is_none(), "underscore pattern should fall back"); + + // Case-insensitive -- not handled. 
+ let pattern = ConstantArray::new("abc%", fsst.len()).into_array(); + let opts = LikeOptions { + negated: false, + case_insensitive: true, + }; + let result = ::like(&fsst, &pattern, opts, &mut ctx)?; + assert!(result.is_none(), "ilike should fall back"); + + Ok(()) + } + + #[test] + fn test_like_long_prefix_handled_by_flat_dfa() -> VortexResult<()> { + let fsst = make_fsst( + &[ + Some("abcdefghijklmn-tail"), + Some("abcdefghijklmx-tail"), + Some("abcdefghijklmn"), + ], + Nullability::NonNullable, + ); + let pattern = "abcdefghijklmn%"; + + let direct = ::like( + &fsst, + &ConstantArray::new(pattern, fsst.len()).into_array(), + LikeOptions::default(), + &mut SESSION.create_execution_ctx(), + )?; + assert!( + direct.is_some(), + "14-byte prefixes are now handled by the flat prefix DFA" + ); + assert_arrays_eq!(direct.unwrap(), BoolArray::from_iter([true, false, true])); + Ok(()) + } + + #[test] + fn test_like_long_contains_falls_back_but_still_matches() -> VortexResult<()> { + let needle = "a".repeat(255); + let matching = format!("xx{needle}yy"); + let non_matching = format!("xx{}byy", "a".repeat(254)); + let exact = needle.clone(); + let pattern = format!("%{needle}%"); + + let fsst = make_fsst( + &[Some(&matching), Some(&non_matching), Some(&exact)], + Nullability::NonNullable, + ); + + let direct = ::like( + &fsst, + &ConstantArray::new(pattern.as_str(), fsst.len()).into_array(), + LikeOptions::default(), + &mut SESSION.create_execution_ctx(), + )?; + assert!( + direct.is_none(), + "contains needles longer than 254 bytes exceed the DFA's u8 state space" + ); + + let result = like(fsst, &pattern)?; + assert_arrays_eq!(&result, &BoolArray::from_iter([true, false, true])); + Ok(()) + } + + #[test] + fn test_like_contains_len_254_kernel_handles() -> VortexResult<()> { + let needle = "a".repeat(254); + let matching = format!("xx{needle}yy"); + let non_matching = format!("xx{}byy", "a".repeat(253)); + let pattern = format!("%{needle}%"); + + let fsst = make_fsst( + 
&[Some(&matching), Some(&non_matching), Some(needle.as_str())], + Nullability::NonNullable, + ); + + let direct = ::like( + &fsst, + &ConstantArray::new(pattern.as_str(), fsst.len()).into_array(), + LikeOptions::default(), + &mut SESSION.create_execution_ctx(), + )?; + assert!( + direct.is_some(), + "254-byte contains needle should stay on the DFA path" + ); + assert_arrays_eq!(direct.unwrap(), BoolArray::from_iter([true, false, true])); + Ok(()) + } + + // ----------------------------------------------------------------------- + // Fuzz tests: compare FSST kernel against naive string matching + // ----------------------------------------------------------------------- + + fn random_string(rng: &mut StdRng, max_len: usize) -> String { + let len = rng.random_range(0..=max_len); + // Use a small alphabet to increase substring hit rate. + (0..len) + .map(|_| (b'a' + rng.random_range(0..6u8)) as char) + .collect() + } + + fn fuzz_contains(seed: u64, needle_len: usize, n_strings: usize) -> VortexResult<()> { + let mut rng = StdRng::seed_from_u64(seed); + + let needle: String = (0..needle_len) + .map(|_| (b'a' + rng.random_range(0..6u8)) as char) + .collect(); + + let owned: Vec = (0..n_strings) + .map(|_| random_string(&mut rng, 80)) + .collect(); + let strings: Vec> = owned.iter().map(|s| Some(s.as_str())).collect(); + + let expected: Vec = owned.iter().map(|s| s.contains(&needle)).collect(); + + let fsst = make_fsst(&strings, Nullability::NonNullable); + let pattern = format!("%{needle}%"); + let result = run_like(fsst, &pattern, LikeOptions::default())?; + + let got: Vec = (0..n_strings) + .map(|i| result.to_bit_buffer().value(i)) + .collect(); + + for (i, (e, g)) in expected.iter().zip(got.iter()).enumerate() { + assert_eq!( + e, g, + "mismatch at index {i}: string={:?}, needle={needle:?}, expected={e}, got={g}", + &owned[i], + ); + } + Ok(()) + } + + fn fuzz_prefix(seed: u64, prefix_len: usize, n_strings: usize) -> VortexResult<()> { + let mut rng = 
StdRng::seed_from_u64(seed); + + let prefix: String = (0..prefix_len) + .map(|_| (b'a' + rng.random_range(0..6u8)) as char) + .collect(); + + let owned: Vec = (0..n_strings) + .map(|_| random_string(&mut rng, 80)) + .collect(); + let strings: Vec> = owned.iter().map(|s| Some(s.as_str())).collect(); + + let expected: Vec = owned.iter().map(|s| s.starts_with(&prefix)).collect(); + + let fsst = make_fsst(&strings, Nullability::NonNullable); + let pattern = format!("{prefix}%"); + let result = run_like(fsst, &pattern, LikeOptions::default())?; + + let got: Vec = (0..n_strings) + .map(|i| result.to_bit_buffer().value(i)) + .collect(); + + for (i, (e, g)) in expected.iter().zip(got.iter()).enumerate() { + assert_eq!( + e, g, + "mismatch at index {i}: string={:?}, prefix={prefix:?}, expected={e}, got={g}", + &owned[i], + ); + } + Ok(()) + } + + /// Fuzz contains with short needles (1-7 bytes); FlatContainsDfa is the only contains DFA + #[test] + fn fuzz_contains_short_needle() -> VortexResult<()> { + for seed in 0..50 { + for needle_len in 1..=7 { + fuzz_contains(seed, needle_len, 200)?; + } + } + Ok(()) + } + + /// Fuzz contains with medium needles (8, 10 and 14 bytes), same FlatContainsDfa + #[test] + fn fuzz_contains_medium_needle() -> VortexResult<()> { + for seed in 0..50 { + for needle_len in [8, 10, 14] { + fuzz_contains(seed, needle_len, 200)?; + } + } + Ok(()) + } + + /// Fuzz contains with long needles (15, 20 and 30 bytes), same FlatContainsDfa + #[test] + fn fuzz_contains_long_needle() -> VortexResult<()> { + for seed in 0..30 { + for needle_len in [15, 20, 30] { + fuzz_contains(seed, needle_len, 200)?; + } + } + Ok(()) + } + + /// Fuzz prefix matching + #[test] + fn fuzz_prefix_matching() -> VortexResult<()> { + for seed in 0..50 { + for prefix_len in [1, 3, 5, 10, 13, 20, 40] { + fuzz_prefix(seed, prefix_len, 200)?; + } + } + Ok(()) + } +} diff --git a/encodings/fsst/src/compute/mod.rs b/encodings/fsst/src/compute/mod.rs index 839deb6c588..f49c2954a04 100644 --- a/encodings/fsst/src/compute/mod.rs
+++ b/encodings/fsst/src/compute/mod.rs @@ -4,6 +4,7 @@ mod cast; mod compare; mod filter; +mod like; use vortex_array::ArrayRef; use vortex_array::DynArray; diff --git a/encodings/fsst/src/dfa/flat_contains.rs b/encodings/fsst/src/dfa/flat_contains.rs new file mode 100644 index 00000000000..64ce71117fa --- /dev/null +++ b/encodings/fsst/src/dfa/flat_contains.rs @@ -0,0 +1,149 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Flat `u8` transition table DFA for contains matching (`LIKE '%needle%'`). +//! +//! Uses an escape-sentinel strategy: the FSST escape code maps to a sentinel +//! state, and the next literal byte is looked up in a separate byte-level +//! transition table. +//! +//! ## Construction (needle = `"aba"`, symbols = `[0:"ab", 1:"ba"]`) +//! +//! ### Step 1: KMP byte-level transition table +//! +//! Build a `(state × byte) → state` table using the KMP failure function. +//! States 0..2 track match progress, state 3 is accept (sticky). +//! +//! ```text +//! Input byte +//! State 'a' 'b' other +//! ───── ──── ──── ───── +//! 0 1 0 0 ← want 'a' +//! 1 1 2 0 ← matched "a", want 'b' (KMP: 'a'→stay at 1) +//! 2 3✓ 0 0 ← matched "ab", want 'a' +//! 3✓ 3✓ 3✓ 3✓ ← accept (sticky) +//! ``` +//! +//! ### Step 2: Symbol-level transitions +//! +//! For each `(state, symbol)` pair, simulate feeding the symbol's bytes +//! through the byte table: +//! +//! ```text +//! Symbol 0 = "ab" (2 bytes): +//! state 0 + 'a' → 1, + 'b' → 2 ⟹ sym_trans[0][0] = 2 +//! state 1 + 'a' → 1, + 'b' → 2 ⟹ sym_trans[1][0] = 2 +//! state 2 + 'a' → 3✓ ⟹ sym_trans[2][0] = 3✓ (accept) +//! +//! Symbol 1 = "ba" (2 bytes): +//! state 0 + 'b' → 0, + 'a' → 1 ⟹ sym_trans[0][1] = 1 +//! state 1 + 'b' → 2, + 'a' → 3✓ ⟹ sym_trans[1][1] = 3✓ (accept) +//! state 2 + 'b' → 0, + 'a' → 1 ⟹ sym_trans[2][1] = 1 +//! ``` +//! +//! ### Step 3: Fused 256-wide table with escape sentinel +//! +//! Merge symbol transitions into a 256-wide table. 
Code bytes 0–1 use symbol +//! transitions, code 255 (ESCAPE_CODE) maps to the sentinel (4), and +//! unused code bytes default to 0: +//! +//! ```text +//! Code byte +//! State 0("ab") 1("ba") 2..254 255(ESC) +//! ───── ─────── ─────── ────── ──────── +//! 0 2 1 0 4(S) +//! 1 2 3✓ 0 4(S) +//! 2 3✓ 1 0 4(S) +//! 3✓ 3✓ 3✓ 3✓ 3✓ +//! ``` +//! +//! When the scanner sees sentinel (4), it reads the next byte and looks it +//! up in the byte-level escape table (from step 1). +//! +//! TODO(joe): for short needles (≤7 bytes), a branchless escape-folded DFA +//! with hierarchical 4-byte composition is ~2x faster. For needles ≤127 bytes, +//! an escape-folded flat DFA (2N+1 states) avoids the sentinel branch. +//! See commit 7faf9f36f for those implementations. + +use fsst::Symbol; +use vortex_error::VortexExpect; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; + +use super::build_fused_table; +use super::build_symbol_transitions; +use super::kmp_byte_transitions; + +/// Flat `u8` transition table DFA for contains matching. +/// +/// The escape code maps to a sentinel state; the next literal byte is looked +/// up in a separate byte-level escape table. +pub(crate) struct FlatContainsDfa { + /// `transitions[state * 256 + byte]` -> next state. + transitions: Vec, + /// `escape_transitions[state * 256 + byte]` -> next state for escaped bytes. + escape_transitions: Vec, + accept_state: u8, + sentinel: u8, +} + +impl FlatContainsDfa { + /// Maximum needle length: need accept + sentinel to fit in u8. 
+ pub(crate) const MAX_NEEDLE_LEN: usize = u8::MAX as usize - 1; + + pub(crate) fn new( + symbols: &[Symbol], + symbol_lengths: &[u8], + needle: &[u8], + ) -> VortexResult { + if needle.len() > Self::MAX_NEEDLE_LEN { + vortex_bail!( + "needle length {} exceeds maximum {} for flat contains DFA", + needle.len(), + Self::MAX_NEEDLE_LEN + ); + } + + let accept_state = u8::try_from(needle.len()) + .vortex_expect("FlatContainsDfa: accept state must fit into u8"); + let n_states = accept_state + 1; + let sentinel = n_states; + + let byte_table = kmp_byte_transitions(needle); + let sym_trans = + build_symbol_transitions(symbols, symbol_lengths, &byte_table, n_states, accept_state); + let transitions = build_fused_table(&sym_trans, symbols.len(), n_states, |_| sentinel, 0); + + Ok(Self { + transitions, + escape_transitions: byte_table, + accept_state, + sentinel, + }) + } + + #[inline(never)] + pub(crate) fn matches(&self, codes: &[u8]) -> bool { + let mut state = 0u8; + let mut pos = 0; + while pos < codes.len() { + let code = codes[pos]; + pos += 1; + let next = self.transitions[usize::from(state) * 256 + usize::from(code)]; + if next == self.sentinel { + if pos >= codes.len() { + return false; + } + let b = codes[pos]; + pos += 1; + state = self.escape_transitions[usize::from(state) * 256 + usize::from(b)]; + } else { + state = next; + } + if state == self.accept_state { + return true; + } + } + false + } +} diff --git a/encodings/fsst/src/dfa/mod.rs b/encodings/fsst/src/dfa/mod.rs new file mode 100644 index 00000000000..f4b73959d16 --- /dev/null +++ b/encodings/fsst/src/dfa/mod.rs @@ -0,0 +1,370 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! # FSST LIKE Pushdown via DFA Construction +//! +//! This module implements DFA-based pattern matching directly on FSST-compressed +//! strings, without decompressing them. It handles two pattern shapes: +//! +//! 
- **Prefix**: `'prefix%'` — matches strings starting with a literal prefix. +//! - **Contains**: `'%needle%'` — matches strings containing a literal substring. +//! +//! Pushdown is intentionally conservative. If the pattern shape is unsupported, +//! or if the pattern exceeds the DFA's representable state space, construction +//! returns `None` and the caller must fall back to ordinary decompression-based +//! LIKE evaluation. +//! +//! TODO(joe): suffix (`'%suffix'`) pushdown. Two approaches: +//! - **Forward DFA**: use a non-sticky accept state with KMP fallback transitions, +//! check `state == accept` after processing all codes. Branchless and vectorizable. +//! - **Backward scan**: walk the compressed code stream in reverse, comparing symbol +//! bytes from the end. Simpler, no DFA construction, but requires reverse parsing +//! of the FSST escape mechanism. +//! +//! ## Background: FSST Encoding +//! +//! [FSST](https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf) compresses strings by +//! replacing frequent byte sequences with single-byte **symbol codes** (0–254). Code +//! byte 255 is reserved as the **escape code**: the next byte is a literal (uncompressed) +//! byte. So a compressed string is a stream of: +//! +//! ```text +//! [symbol_code] ... [symbol_code] [ESCAPE literal_byte] [symbol_code] ... +//! ``` +//! +//! A single symbol can expand to 1–8 bytes. Matching on compressed codes requires +//! the DFA to handle multi-byte symbol expansions and the escape mechanism. +//! +//! ## The Algorithm: KMP → Byte Table → Symbol Table → Flat DFA +//! +//! Construction proceeds through four stages: +//! +//! ### Stage 1: KMP Failure Function +//! +//! We compute the standard [KMP](https://en.wikipedia.org/wiki/Knuth%E2%80%93Morris%E2%80%93Pratt_algorithm) +//! failure function for the needle bytes. This tells us, on a mismatch at +//! position `i`, the longest proper prefix of `needle[0..i]` that is also a +//! 
suffix — i.e., where to resume matching instead of starting over. +//! +//! ```text +//! Needle: "abcabd" +//! Failure: [0, 0, 0, 1, 2, 0] +//! ^ ^ +//! At position 3 ('a'), the prefix "a" matches suffix "a" +//! At position 4 ('b'), the prefix "ab" matches suffix "ab" +//! ``` +//! +//! ### Stage 2: Byte-Level Transition Table +//! +//! From the failure function, we build a full `(state × byte) → state` transition +//! table. State `i` means "we have matched `needle[0..i]`". State `n` (= needle +//! length) is the **accept** state. +//! +//! ```text +//! Needle: "aba" (3 states + accept) +//! +//! Input byte +//! State 'a' 'b' other +//! ───── ──── ──── ───── +//! 0 1 0 0 ← looking for first 'a' +//! 1 1 2 0 ← matched "a", want 'b' +//! 2 3✓ 0 0 ← matched "ab", want 'a' +//! 3✓ 3✓ 3✓ 3✓ ← accept (sticky) +//! ``` +//! +//! For prefix matching, a mismatch at any state goes to a **fail** state (no +//! fallback). For contains matching, mismatches follow KMP fallback transitions +//! so we can find the needle anywhere in the string. +//! +//! ### Stage 3: Symbol-Level Transition Table +//! +//! FSST symbols can be multi-byte. To compute the transition for symbol code `c` +//! in state `s`, we simulate feeding each byte of the symbol through the byte +//! table: +//! +//! ```text +//! Symbol #42 = "the" (3 bytes) +//! State 0 + 't' → 0, + 'h' → 0, + 'e' → 0 ⟹ sym_trans[0][42] = 0 +//! +//! If needle = "them": +//! State 0 + 't' → 1, + 'h' → 2, + 'e' → 3 ⟹ sym_trans[0][42] = 3 +//! ``` +//! +//! We then build a **fused 256-wide table**: for code bytes 0–254, use the +//! symbol transition; for code byte 255 (ESCAPE_CODE), transition to a +//! special sentinel that tells the scanner to read the next literal byte. +//! +//! ### Stage 4: Flat `u8` Table +//! +//! The fused table is stored as a flat `Vec` indexed as +//! `transitions[state * 256 + byte]`. Both the prefix and contains DFAs use +//! 
escape-sentinel handling: when the scanner sees the sentinel value, it reads +//! the next byte from a separate byte-level escape table. +//! +//! TODO(joe): for short contains needles (≤7 bytes), a branchless escape-folded +//! DFA with hierarchical 4-byte composition is ~2x faster. For needles ≤127 +//! bytes, an escape-folded flat DFA (2N+1 states) avoids the sentinel branch. +//! See commit 7faf9f36f for those implementations. +//! +//! ## State-Space Limits +//! +//! The public behavior is shaped by two implementation limits, both measured in +//! pattern **bytes** rather than Unicode scalar values: +//! +//! - `prefix%` pushdown is limited to **253 bytes**. The flat prefix DFA uses +//! `u8` state ids and needs room for progress states, an accept state, a +//! fail state, and one escape sentinel (N+3 ≤ 256). +//! - `%needle%` pushdown is limited to **254 bytes**. The contains DFA stores +//! states in `u8`, so it needs room for every match-progress state plus both +//! the accept state and the escape sentinel. +//! +//! Patterns beyond those limits are still valid LIKE patterns; they simply do +//! not use FSST pushdown and must be evaluated through the fallback path. + +mod flat_contains; +mod prefix; +#[cfg(test)] +mod tests; + +use flat_contains::FlatContainsDfa; +use fsst::ESCAPE_CODE; +use fsst::Symbol; +use prefix::FlatPrefixDfa; +use vortex_buffer::BitBuffer; +use vortex_error::VortexExpect; +use vortex_error::VortexResult; + +// --------------------------------------------------------------------------- +// FsstMatcher — unified public API +// --------------------------------------------------------------------------- + +/// A compiled matcher for LIKE patterns on FSST-compressed strings. +/// +/// Encapsulates pattern parsing and DFA variant selection. 
Returns `None` from +/// [`try_new`](Self::try_new) for patterns that cannot be evaluated without +/// decompression (e.g., `_` wildcards, multiple `%` in non-standard positions, +/// or patterns that exceed the DFA's representable byte-length limits). +pub(crate) struct FsstMatcher { + inner: MatcherInner, +} + +enum MatcherInner { + MatchAll, // `%` / `%%`: an empty needle never reaches a DFA (it would reject empty strings) + Prefix(FlatPrefixDfa), + Contains(FlatContainsDfa), +} + +impl FsstMatcher { + /// Try to build a matcher for the given LIKE pattern (`%` and `%%` match every string). + /// + /// Returns `Ok(None)` if the pattern shape is not supported for pushdown + /// (e.g. `_` wildcards, multiple non-bookend `%`, `prefix%` longer than + /// 253 bytes, or `%needle%` longer than 254 bytes). + pub(crate) fn try_new( + symbols: &[Symbol], + symbol_lengths: &[u8], + pattern: &str, + ) -> VortexResult<Option<Self>> { + let Some(like_kind) = LikeKind::parse(pattern) else { + return Ok(None); + }; + + let inner = match like_kind { + LikeKind::Prefix("") | LikeKind::Contains("") => MatcherInner::MatchAll, + LikeKind::Prefix(prefix) => { + let prefix = prefix.as_bytes(); + if prefix.len() > FlatPrefixDfa::MAX_PREFIX_LEN { + return Ok(None); + } + MatcherInner::Prefix(FlatPrefixDfa::new(symbols, symbol_lengths, prefix)?) + } + LikeKind::Contains(needle) => { + let needle = needle.as_bytes(); + if needle.len() > FlatContainsDfa::MAX_NEEDLE_LEN { + return Ok(None); + } + MatcherInner::Contains(FlatContainsDfa::new(symbols, symbol_lengths, needle)?) + } + }; + + Ok(Some(Self { inner })) + } + + /// Run the matcher on a single FSST-compressed code sequence. + #[inline] + pub(crate) fn matches(&self, codes: &[u8]) -> bool { + match &self.inner { + MatcherInner::MatchAll => true, + MatcherInner::Prefix(dfa) => dfa.matches(codes), + MatcherInner::Contains(dfa) => dfa.matches(codes), + } + } +} + +/// The subset of LIKE patterns we can handle without decompression.
+enum LikeKind<'a> { + /// `prefix%` + Prefix(&'a str), + /// `%needle%` + Contains(&'a str), +} + +impl<'a> LikeKind<'a> { + fn parse(pattern: &'a str) -> Option { + // `prefix%` (including just `%` where prefix is empty) + if let Some(prefix) = pattern.strip_suffix('%') + && !prefix.contains(['%', '_']) + { + return Some(LikeKind::Prefix(prefix)); + } + + // `%needle%` + let inner = pattern.strip_prefix('%')?.strip_suffix('%')?; + if !inner.contains(['%', '_']) { + return Some(LikeKind::Contains(inner)); + } + + None + } +} + +// --------------------------------------------------------------------------- +// Scan helper +// --------------------------------------------------------------------------- + +// TODO: add N-way ILP overrun scan for higher throughput on short strings. +#[inline] +pub(crate) fn dfa_scan_to_bitbuf( + n: usize, + offsets: &[T], + all_bytes: &[u8], + negated: bool, + matcher: F, +) -> BitBuffer +where + T: vortex_array::dtype::IntegerPType, + F: Fn(&[u8]) -> bool, +{ + let mut start: usize = offsets[0].as_(); + BitBuffer::collect_bool(n, |i| { + let end: usize = offsets[i + 1].as_(); + let result = matcher(&all_bytes[start..end]) != negated; + start = end; + result + }) +} + +// --------------------------------------------------------------------------- +// DFA construction helpers +// --------------------------------------------------------------------------- + +/// Builds the per-symbol transition table for FSST symbols. +/// +/// For each `(state, symbol_code)` pair, simulates feeding the symbol's bytes +/// through the byte-level transition table to compute the resulting state. +/// +/// Returns a flat `Vec` indexed as `[state * n_symbols + code]`. 
+fn build_symbol_transitions( + symbols: &[Symbol], + symbol_lengths: &[u8], + byte_table: &[u8], + n_states: u8, + accept_state: u8, +) -> Vec { + let n_symbols = symbols.len(); + let mut sym_trans = vec![0u8; n_states as usize * n_symbols]; + for state in 0..n_states { + for code in 0..n_symbols { + if state == accept_state { + sym_trans[state as usize * n_symbols + code] = accept_state; + continue; + } + let sym = symbols[code].to_u64().to_le_bytes(); + let sym_len = usize::from(symbol_lengths[code]); + let mut s = state; + for &b in &sym[..sym_len] { + if s == accept_state { + break; + } + s = byte_table[s as usize * 256 + b as usize]; + } + sym_trans[state as usize * n_symbols + code] = s; + } + } + sym_trans +} + +/// Builds a fused 256-wide transition table from symbol transitions. +/// +/// For each `(state, code_byte)`: +/// - Code bytes `0..n_symbols`: use the symbol transition +/// - `ESCAPE_CODE`: maps to `escape_value` (either a sentinel or escape state) +/// - All others: use `default` (typically 0 for contains, fail_state for prefix) +/// +/// Returns a flat `Vec` indexed as `[state * 256 + code_byte]`. 
+fn build_fused_table( + sym_trans: &[u8], + n_symbols: usize, + n_states: u8, + escape_value_fn: impl Fn(u8) -> u8, + default: u8, +) -> Vec { + let mut fused = vec![default; usize::from(n_states) * 256]; + for state in 0..n_states { + let s = usize::from(state); + for code in 0..n_symbols { + fused[s * 256 + code] = sym_trans[s * n_symbols + code]; + } + fused[s * 256 + usize::from(ESCAPE_CODE)] = escape_value_fn(state); + } + fused +} + +// --------------------------------------------------------------------------- +// KMP helpers +// --------------------------------------------------------------------------- + +fn kmp_byte_transitions(needle: &[u8]) -> Vec { + let n_states = u8::try_from(needle.len() + 1) + .vortex_expect("kmp_byte_transitions: must have needle.len() ≤ 255"); + let accept = n_states - 1; + let failure = kmp_failure_table(needle); + + let mut table = vec![0u8; n_states as usize * 256]; + for state in 0..n_states { + for byte in 0..256usize { + if state == accept { + table[state as usize * 256 + byte] = accept; + continue; + } + let mut s = state; + loop { + if byte == usize::from(needle[usize::from(s)]) { + s += 1; + break; + } + if s == 0 { + break; + } + s = failure[usize::from(s) - 1]; + } + table[state as usize * 256 + byte] = s; + } + } + table +} + +fn kmp_failure_table(needle: &[u8]) -> Vec { + let mut failure = vec![0u8; needle.len()]; + let mut k = 0u8; + for i in 1..needle.len() { + while k > 0 && needle[usize::from(k)] != needle[i] { + k = failure[usize::from(k) - 1]; + } + if needle[usize::from(k)] == needle[i] { + k += 1; + } + failure[i] = k; + } + failure +} diff --git a/encodings/fsst/src/dfa/prefix.rs b/encodings/fsst/src/dfa/prefix.rs new file mode 100644 index 00000000000..20a07c2aaa3 --- /dev/null +++ b/encodings/fsst/src/dfa/prefix.rs @@ -0,0 +1,155 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! 
Flat `u8` transition table DFA for prefix matching (`LIKE 'prefix%'`). +//! +//! Supports prefixes up to 253 bytes (states: 0..N progress + accept + fail + +//! sentinel ≤ 256). +//! +//! TODO(joe): for short prefixes (≤13 bytes), a shift-packed `[u64; 256]` +//! representation would be simpler and easier to read — all state transitions +//! for one input byte fit in a single `u64`. Benchmarks showed no meaningful +//! perf difference (see `benches/BENCH_RESULTS.md`), so we use flat-only for +//! now to keep the code simple and support long prefixes. + +use fsst::Symbol; +use vortex_error::VortexExpect; +use vortex_error::VortexResult; +use vortex_error::vortex_bail; + +use super::build_fused_table; +use super::build_symbol_transitions; + +/// Flat `u8` transition table DFA for prefix matching on FSST codes. +/// +/// States 0..prefix_len track match progress, plus ACCEPT, FAIL, and an +/// escape SENTINEL. Transitions are stored in a flat `Vec` indexed as +/// `[state * 256 + byte]`. +/// +/// ```text +/// Prefix: "http" (4 progress states + accept + fail) +/// +/// Input byte +/// State 'h' 't' 'p' other +/// ───── ──── ──── ──── ───── +/// 0 1 F F F ← want 'h' +/// 1 F 2 F F ← want 't' +/// 2 F 3 F F ← want 't' +/// 3 F F 4✓ F ← want 'p' +/// 4✓ 4✓ 4✓ 4✓ 4✓ ← accept (sticky) +/// F F F F F ← fail (sticky) +/// +/// Escape handling: code 255 → sentinel → read next literal byte → byte table +/// ``` +pub(crate) struct FlatPrefixDfa { + /// `transitions[state * 256 + byte]` -> next state. + transitions: Vec, + /// `escape_transitions[state * 256 + byte]` -> next state for escaped bytes. 
+ escape_transitions: Vec, + accept_state: u8, + fail_state: u8, + sentinel: u8, +} + +impl FlatPrefixDfa { + pub(crate) const MAX_PREFIX_LEN: usize = (u8::MAX - 2) as usize; + + pub(crate) fn new( + symbols: &[Symbol], + symbol_lengths: &[u8], + prefix: &[u8], + ) -> VortexResult { + if prefix.len() > Self::MAX_PREFIX_LEN { + vortex_bail!( + "prefix length {} exceeds maximum {} for flat prefix DFA", + prefix.len(), + Self::MAX_PREFIX_LEN + ); + } + + let accept_state = u8::try_from(prefix.len()).vortex_expect("prefix fits in u8"); + let fail_state = accept_state + 1; + let n_states = fail_state + 1; + let sentinel = fail_state + 1; + + // Step 1: byte-level transitions + let byte_table = build_prefix_byte_table(prefix, accept_state, fail_state); + + // Step 2: symbol-level transitions + let sym_trans = + build_symbol_transitions(symbols, symbol_lengths, &byte_table, n_states, accept_state); + + // Step 3: fused table with escape sentinel + let transitions = build_fused_table( + &sym_trans, + symbols.len(), + n_states, + |_| sentinel, + fail_state, + ); + + Ok(Self { + transitions, + escape_transitions: byte_table, + accept_state, + fail_state, + sentinel, + }) + } + + #[inline] + pub(crate) fn matches(&self, codes: &[u8]) -> bool { + let mut state = 0u8; + let mut pos = 0; + while pos < codes.len() { + let code = codes[pos]; + pos += 1; + let next = self.transitions[usize::from(state) * 256 + usize::from(code)]; + if next == self.sentinel { + if pos >= codes.len() { + return false; + } + let b = codes[pos]; + pos += 1; + state = self.escape_transitions[usize::from(state) * 256 + usize::from(b)]; + } else { + state = next; + } + if state == self.accept_state { + return true; + } + if state == self.fail_state { + return false; + } + } + state == self.accept_state + } +} + +/// Build a byte-level transition table for prefix matching (no KMP fallback). +/// +/// For each state, only the correct next byte advances; everything else goes +/// to the fail state. 
+fn build_prefix_byte_table(prefix: &[u8], accept_state: u8, fail_state: u8) -> Vec { + let n_states = fail_state + 1; + let mut table = vec![fail_state; usize::from(n_states) * 256]; + + for state in 0..n_states { + let s = usize::from(state); + if state == accept_state { + for byte in 0..256 { + table[s * 256 + byte] = accept_state; + } + } else if state != fail_state { + // Only the correct next byte advances; everything else fails. + let next_byte = prefix[s]; + let next_state = if s + 1 >= prefix.len() { + accept_state + } else { + state + 1 + }; + table[s * 256 + usize::from(next_byte)] = next_state; + } + } + table +} diff --git a/encodings/fsst/src/dfa/tests.rs b/encodings/fsst/src/dfa/tests.rs new file mode 100644 index 00000000000..8b2a99aa4c9 --- /dev/null +++ b/encodings/fsst/src/dfa/tests.rs @@ -0,0 +1,184 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use fsst::ESCAPE_CODE; +use fsst::Symbol; +use vortex_error::VortexResult; + +use super::FsstMatcher; +use super::LikeKind; +use super::flat_contains::FlatContainsDfa; +use super::prefix::FlatPrefixDfa; + +/// Helper: make a Symbol from a byte string (up to 8 bytes, zero-padded). +fn sym(bytes: &[u8]) -> Symbol { + let mut buf = [0u8; 8]; + buf[..bytes.len()].copy_from_slice(bytes); + Symbol::from_slice(&buf) +} + +fn escaped(bytes: &[u8]) -> Vec { + let mut codes = Vec::with_capacity(bytes.len() * 2); + for &b in bytes { + codes.push(ESCAPE_CODE); + codes.push(b); + } + codes +} + +#[test] +fn test_like_kind_parse() { + assert!(matches!( + LikeKind::parse("http%"), + Some(LikeKind::Prefix("http")) + )); + assert!(matches!( + LikeKind::parse("%needle%"), + Some(LikeKind::Contains("needle")) + )); + assert!(matches!(LikeKind::parse("%"), Some(LikeKind::Prefix("")))); + // Suffix and underscore patterns are not supported. 
+ assert!(LikeKind::parse("%suffix").is_none()); + assert!(LikeKind::parse("a_c").is_none()); +} + +/// No symbols — all bytes escaped. Simplest case to see the two tables. +#[test] +fn test_prefix_dfa_no_symbols() -> VortexResult<()> { + let dfa = FlatPrefixDfa::new(&[], &[], b"ab")?; + + assert!(dfa.matches(&escaped(b"abx"))); + assert!(dfa.matches(&escaped(b"ab"))); + assert!(!dfa.matches(&escaped(b"a"))); + assert!(!dfa.matches(&escaped(b"ax"))); + assert!(!dfa.matches(&escaped(b"ba"))); + assert!(!dfa.matches(&[])); + + Ok(()) +} + +/// With symbols — shows how multi-byte symbols interact with prefix matching. +/// +/// Symbol table: code 0 = "ht", code 1 = "tp" +/// Prefix: "http" +/// +/// The string "http" can be encoded as: +/// [0, 1] — two symbols: "ht" + "tp" +/// [ESC,h, ESC,t, ESC,t, ESC,p] — all escaped +/// [0, ESC,t, ESC,p] — symbol "ht" + escaped "t" + escaped "p" +#[test] +fn test_prefix_dfa_with_symbols() -> VortexResult<()> { + let symbols = [sym(b"ht"), sym(b"tp")]; + let lengths = [2u8, 2]; + let dfa = FlatPrefixDfa::new(&symbols, &lengths, b"http")?; + + // "http" via two symbols: code 0 ("ht") + code 1 ("tp") → accept + assert!(dfa.matches(&[0, 1])); + + // "http" all escaped + assert!(dfa.matches(&escaped(b"http"))); + + // "http" mixed: symbol "ht" + escaped "tp" + assert!(dfa.matches(&[0, ESCAPE_CODE, b't', ESCAPE_CODE, b'p'])); + + // "htxx" via symbol "ht" + escaped "xx" → fail after "ht" advances to state 2, + // then 'x' doesn't match 't' + assert!(!dfa.matches(&[0, ESCAPE_CODE, b'x', ESCAPE_CODE, b'x'])); + + // "tp" alone → symbol "tp" from state 0 feeds 't','p' through byte table: + // state 0 wants 'h', sees 't' → fail + assert!(!dfa.matches(&[1])); + + Ok(()) +} + +/// Longer prefix showing more progress states. 
+#[test] +fn test_prefix_dfa_longer() -> VortexResult<()> { + // code 0 = "tp" (2 bytes), code 1 = "htt" (3 bytes), code 2 = "p:/" (3 bytes) + let symbols = [sym(b"tp"), sym(b"htt"), sym(b"p:/")]; + let lengths = [2u8, 3, 3]; + let dfa = FlatPrefixDfa::new(&symbols, &lengths, b"http://")?; + + // "http://e" via symbols: "htt"(1) + "p:/"(2) + escaped "/" + escaped "e" + // "htt" = states 0→1→2→3, "p:/" = states 3→4→5→6, "/" = state 6→accept + assert!(dfa.matches(&[1, 2, ESCAPE_CODE, b'/', ESCAPE_CODE, b'e'])); + + // "http:/" — 6 chars, missing the 7th '/' + assert!(!dfa.matches(&[1, ESCAPE_CODE, b'p', ESCAPE_CODE, b':', ESCAPE_CODE, b'/',])); + + // "http://" all escaped — 7 chars, exact match + assert!(dfa.matches(&escaped(b"http://"))); + + // "tp" alone (code 0) from state 0: feeds 't','p' → state 0 wants 'h', sees 't' → fail + assert!(!dfa.matches(&[0])); + + // "htt" + "tp" = "httpp"? No — "htt" → states 0→1→2→3, then "tp": + // state 3 wants 'p', sees 't' → fail immediately + assert!(!dfa.matches(&[1, 0])); + + Ok(()) +} + +#[test] +fn test_prefix_pushdown_len_13_with_escapes() { + let matcher = FsstMatcher::try_new(&[], &[], "abcdefghijklm%") + .unwrap() + .unwrap(); + + assert!(matcher.matches(&escaped(b"abcdefghijklm"))); + assert!(!matcher.matches(&escaped(b"abcdefghijklx"))); +} + +#[test] +fn test_prefix_pushdown_len_14_now_handled() { + // 14-byte prefix is now handled by FlatPrefixDfa (was rejected by shift-packed). 
+ assert!( + FsstMatcher::try_new(&[], &[], "abcdefghijklmn%") + .unwrap() + .is_some() + ); +} + +#[test] +fn test_prefix_pushdown_long_prefix() -> VortexResult<()> { + let prefix = "a".repeat(FlatPrefixDfa::MAX_PREFIX_LEN); + let pattern = format!("{prefix}%"); + let matcher = FsstMatcher::try_new(&[], &[], &pattern)?.unwrap(); + + assert!(matcher.matches(&escaped(prefix.as_bytes()))); + + let mut mismatch = prefix.into_bytes(); + mismatch[FlatPrefixDfa::MAX_PREFIX_LEN - 1] = b'b'; + assert!(!matcher.matches(&escaped(&mismatch))); + + Ok(()) +} + +#[test] +fn test_prefix_pushdown_rejects_len_254() { + debug_assert_eq!(FlatPrefixDfa::MAX_PREFIX_LEN, 253); + let prefix = "a".repeat(254); + let pattern = format!("{prefix}%"); + assert!(FsstMatcher::try_new(&[], &[], &pattern).unwrap().is_none()); +} + +#[test] +fn test_contains_pushdown_len_254_with_escapes() { + let needle = "a".repeat(FlatContainsDfa::MAX_NEEDLE_LEN); + let pattern = format!("%{needle}%"); + let matcher = FsstMatcher::try_new(&[], &[], &pattern).unwrap().unwrap(); + + assert!(matcher.matches(&escaped(needle.as_bytes()))); + + let mut mismatch = needle.into_bytes(); + mismatch[FlatContainsDfa::MAX_NEEDLE_LEN - 1] = b'b'; + assert!(!matcher.matches(&escaped(&mismatch))); +} + +#[test] +fn test_contains_pushdown_rejects_len_255() { + let needle = "a".repeat(FlatContainsDfa::MAX_NEEDLE_LEN + 1); + let pattern = format!("%{needle}%"); + assert!(FsstMatcher::try_new(&[], &[], &pattern).unwrap().is_none()); +} diff --git a/encodings/fsst/src/kernel.rs b/encodings/fsst/src/kernel.rs index 8d2a08fba2b..3ec36dd1b32 100644 --- a/encodings/fsst/src/kernel.rs +++ b/encodings/fsst/src/kernel.rs @@ -5,6 +5,7 @@ use vortex_array::arrays::dict::TakeExecuteAdaptor; use vortex_array::arrays::filter::FilterExecuteAdaptor; use vortex_array::kernel::ParentKernelSet; use vortex_array::scalar_fn::fns::binary::CompareExecuteAdaptor; +use vortex_array::scalar_fn::fns::like::LikeExecuteAdaptor; use crate::FSST; @@ -12,6 
+13,7 @@ pub(super) const PARENT_KERNELS: ParentKernelSet = ParentKernelSet::new(&[ ParentKernelSet::lift(&CompareExecuteAdaptor(FSST)), ParentKernelSet::lift(&FilterExecuteAdaptor(FSST)), ParentKernelSet::lift(&TakeExecuteAdaptor(FSST)), + ParentKernelSet::lift(&LikeExecuteAdaptor(FSST)), ]); #[cfg(test)] diff --git a/encodings/fsst/src/lib.rs b/encodings/fsst/src/lib.rs index 5cc75c59b2a..3305c0e66fc 100644 --- a/encodings/fsst/src/lib.rs +++ b/encodings/fsst/src/lib.rs @@ -15,6 +15,7 @@ mod array; mod canonical; mod compress; mod compute; +mod dfa; mod kernel; mod ops; mod rules; diff --git a/fuzz/Cargo.toml b/fuzz/Cargo.toml index b80a00d66fa..e2d05b706f9 100644 --- a/fuzz/Cargo.toml +++ b/fuzz/Cargo.toml @@ -36,6 +36,7 @@ vortex-array = { workspace = true, features = ["arbitrary", "_test-harness"] } vortex-btrblocks = { workspace = true } vortex-buffer = { workspace = true } vortex-error = { workspace = true } +vortex-fsst = { workspace = true } vortex-io = { workspace = true } vortex-mask = { workspace = true } vortex-runend = { workspace = true, features = ["arbitrary"] } @@ -88,6 +89,14 @@ path = "fuzz_targets/compress_roundtrip.rs" test = false required-features = ["native"] +[[bin]] +bench = false +doc = false +name = "fsst_like" +path = "fuzz_targets/fsst_like.rs" +test = false +required-features = ["native"] + [[bin]] bench = false doc = false diff --git a/fuzz/fuzz_targets/fsst_like.rs b/fuzz/fuzz_targets/fsst_like.rs new file mode 100644 index 00000000000..8e03badff00 --- /dev/null +++ b/fuzz/fuzz_targets/fsst_like.rs @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +#![no_main] +#![allow(clippy::unwrap_used, clippy::result_large_err)] + +use std::str::FromStr; + +use libfuzzer_sys::Corpus; +use libfuzzer_sys::fuzz_target; +use tracing::level_filters::LevelFilter; +use vortex_error::vortex_panic; +use vortex_fuzz::FuzzFsstLike; +use vortex_fuzz::run_fsst_like_fuzz; + 
+fuzz_target!( + init: { + let fmt = tracing_subscriber::fmt::format() + .with_ansi(false) + .without_time() + .compact(); + let level = std::env::var("RUST_LOG").map( + |v| LevelFilter::from_str(v.as_str()).unwrap()).unwrap_or(LevelFilter::INFO); + tracing_subscriber::fmt() + .event_format(fmt) + .with_max_level(level) + .init(); + }, + |fuzz_action: FuzzFsstLike| -> Corpus { + match run_fsst_like_fuzz(fuzz_action) { + Ok(true) => Corpus::Keep, + Ok(false) => Corpus::Reject, + Err(e) => vortex_panic!("{e}"), + } +}); diff --git a/fuzz/src/fsst_like.rs b/fuzz/src/fsst_like.rs new file mode 100644 index 00000000000..866b078eae0 --- /dev/null +++ b/fuzz/src/fsst_like.rs @@ -0,0 +1,164 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Fuzzer for FSST LIKE pushdown: compresses arbitrary strings with FSST, then +//! runs a LIKE pattern on both the compressed and uncompressed arrays, asserting +//! that the boolean results are identical. + +use std::sync::LazyLock; + +use arbitrary::Arbitrary; +use arbitrary::Unstructured; +use vortex_array::Canonical; +use vortex_array::IntoArray; +use vortex_array::VortexSessionExecute; +use vortex_array::arrays::BoolArray; +use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::VarBinArray; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::scalar_fn::fns::like::Like; +use vortex_array::scalar_fn::fns::like::LikeOptions; +use vortex_array::session::ArraySession; +use vortex_error::VortexResult; +use vortex_fsst::FSSTArray; +use vortex_fsst::fsst_compress; +use vortex_fsst::fsst_train_compressor; +use vortex_session::VortexSession; + +use crate::error::Backtrace; +use crate::error::VortexFuzzError; +use crate::error::VortexFuzzResult; + +static SESSION: LazyLock = + LazyLock::new(|| VortexSession::empty().with::()); + +/// Fuzz input: a set of strings and a LIKE pattern. 
+#[derive(Debug)] +pub struct FuzzFsstLike { + pub strings: Vec, + pub pattern: String, + pub negated: bool, +} + +impl<'a> Arbitrary<'a> for FuzzFsstLike { + fn arbitrary(u: &mut Unstructured<'a>) -> arbitrary::Result { + // Generate 1-200 strings, each 0-100 bytes from a small alphabet + // to increase FSST symbol reuse and substring hits. + let n_strings: usize = u.int_in_range(1..=200)?; + let mut strings = Vec::with_capacity(n_strings); + for _ in 0..n_strings { + let len: usize = u.int_in_range(0..=100)?; + let s: String = (0..len) + .map(|_| { + let b = u.int_in_range(b'a'..=b'h').unwrap_or(b'a'); + b as char + }) + .collect(); + strings.push(s); + } + + // Generate a pattern: pick a shape then fill in the literal part. + let needle_len: usize = u.int_in_range(0..=30)?; + let needle: String = (0..needle_len) + .map(|_| { + let b = u.int_in_range(b'a'..=b'h').unwrap_or(b'a'); + b as char + }) + .collect(); + + let pattern = match u.int_in_range(0..=2)? { + 0 => format!("{needle}%"), // prefix + 1 => format!("%{needle}%"), // contains + _ => format!("%{needle}"), // suffix (should fall back, still correct) + }; + + let negated: bool = u.arbitrary()?; + + Ok(FuzzFsstLike { + strings, + pattern, + negated, + }) + } +} + +/// Run the FSST LIKE fuzzer: compare LIKE on compressed vs uncompressed. +/// +/// Returns: +/// - `Ok(true)` — keep in corpus +/// - `Ok(false)` — reject (e.g. too few strings) +/// - `Err(_)` — mismatch found (bug) +#[allow(clippy::result_large_err)] +pub fn run_fsst_like_fuzz(fuzz: FuzzFsstLike) -> VortexFuzzResult { + let FuzzFsstLike { + strings, + pattern, + negated, + } = fuzz; + + if strings.is_empty() { + return Ok(false); + } + + let len = strings.len(); + + // Build uncompressed VarBinArray. + let varbin = VarBinArray::from_iter( + strings.iter().map(|s| Some(s.as_str())), + DType::Utf8(Nullability::NonNullable), + ); + + // Train FSST compressor and compress. 
+ let compressor = fsst_train_compressor(&varbin); + let fsst_array: FSSTArray = fsst_compress(varbin.clone(), &compressor); + + let opts = LikeOptions { + negated, + case_insensitive: false, + }; + + // Run LIKE on the uncompressed array. + let expected = run_like_on_array(varbin.into_array().as_ref(), &pattern, len, opts) + .map_err(|err| VortexFuzzError::VortexError(err, Backtrace::capture()))?; + + // Run LIKE on the FSST-compressed array. + let actual = run_like_on_array(fsst_array.into_array().as_ref(), &pattern, len, opts) + .map_err(|err| VortexFuzzError::VortexError(err, Backtrace::capture()))?; + + // Compare bit-for-bit. + let expected_bits = expected.to_bit_buffer(); + let actual_bits = actual.to_bit_buffer(); + for idx in 0..len { + let expected_val = expected_bits.value(idx); + let actual_val = actual_bits.value(idx); + if expected_val != actual_val { + return Err(VortexFuzzError::ScalarMismatch( + expected_val.into(), + actual_val.into(), + idx, + Backtrace::capture(), + )); + } + } + + Ok(true) +} + +fn run_like_on_array( + array: &dyn vortex_array::DynArray, + pattern: &str, + len: usize, + opts: LikeOptions, +) -> VortexResult { + use vortex_array::ArrayRef; + use vortex_array::arrays::scalar_fn::ScalarFnArrayExt; + + let arr: ArrayRef = array.to_array(); + let pattern_arr = ConstantArray::new(pattern, len).into_array(); + let result = Like + .try_new_array(len, opts, [arr, pattern_arr])? 
+ .into_array() + .execute::(&mut SESSION.create_execution_ctx())?; + Ok(result.into_bool()) +} diff --git a/fuzz/src/lib.rs b/fuzz/src/lib.rs index 1d117e6d113..910aa1bdc0c 100644 --- a/fuzz/src/lib.rs +++ b/fuzz/src/lib.rs @@ -6,6 +6,7 @@ mod array; pub mod compress; pub mod error; +pub mod fsst_like; // File module only available for native builds (requires vortex-file which uses tokio) #[cfg(not(target_arch = "wasm32"))] @@ -24,6 +25,8 @@ pub use compress::FuzzCompressRoundtrip; pub use compress::run_compress_roundtrip; #[cfg(not(target_arch = "wasm32"))] pub use file::FuzzFileAction; +pub use fsst_like::FuzzFsstLike; +pub use fsst_like::run_fsst_like_fuzz; #[cfg(feature = "cuda")] pub use gpu::FuzzCompressGpu; #[cfg(feature = "cuda")]