Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions .github/workflows/fuzz.yml
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,42 @@ jobs:
gh_token: ${{ secrets.GITHUB_TOKEN }}
incident_io_alert_token: ${{ secrets.INCIDENT_IO_ALERT_TOKEN }}

# ============================================================================
# FSST LIKE Fuzzer
# ============================================================================
fsst_like_fuzz:
name: "FSST LIKE Fuzz"
uses: ./.github/workflows/run-fuzzer.yml
with:
fuzz_target: fsst_like
jobs: 16
secrets:
R2_FUZZ_ACCESS_KEY_ID: ${{ secrets.R2_FUZZ_ACCESS_KEY_ID }}
R2_FUZZ_SECRET_ACCESS_KEY: ${{ secrets.R2_FUZZ_SECRET_ACCESS_KEY }}

report-fsst-like-fuzz-failures:
name: "Report FSST LIKE Fuzz Failures"
needs: fsst_like_fuzz
if: always() && needs.fsst_like_fuzz.outputs.crashes_found == 'true'
permissions:
issues: write
contents: read
id-token: write
pull-requests: read
uses: ./.github/workflows/report-fuzz-crash.yml
with:
fuzz_target: fsst_like
crash_file: ${{ needs.fsst_like_fuzz.outputs.first_crash_name }}
artifact_url: ${{ needs.fsst_like_fuzz.outputs.artifact_url }}
artifact_name: fsst_like-crash-artifacts
logs_artifact_name: fsst_like-logs
branch: ${{ github.ref_name }}
commit: ${{ github.sha }}
secrets:
claude_code_oauth_token: ${{ secrets.CLAUDE_CODE_OAUTH_TOKEN }}
gh_token: ${{ secrets.GITHUB_TOKEN }}
incident_io_alert_token: ${{ secrets.INCIDENT_IO_ALERT_TOKEN }}

# ============================================================================
# Compress Roundtrip Fuzzer
# ============================================================================
Expand Down
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion encodings/fsst/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ harness = false
required-features = ["_test-harness"]

[[bench]]
name = "fsst_contains"
name = "fsst_like"
harness = false
required-features = ["_test-harness"]

Expand Down
20 changes: 20 additions & 0 deletions encodings/fsst/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,23 @@

A Vortex Encoding for Binary and Utf8 data that utilizes the [Fast Static Symbol Table](https://github.com/spiraldb/fsst)
compression algorithm.

## LIKE Pushdown

The FSST encoding has a specialized LIKE fast path for a narrow subset of
patterns:

- `prefix%`
- `%needle%`

Unsupported shapes, such as patterns containing `_`, `%suffix` patterns, and
patterns with interior wildcards, fall back to ordinary decompression-based LIKE evaluation.

There are also two implementation limits on the pushdown path, both measured in
pattern bytes:

- `prefix%` supports up to 253 bytes.
- `%needle%` supports up to 254 bytes.

Patterns beyond those limits are still evaluated correctly, but through the
fallback path rather than the DFA matcher.
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,19 @@ impl Dataset {
}
}

fn pattern(&self) -> &'static str {
/// Returns the prefix-shaped LIKE pattern (a literal followed by a trailing `%`)
/// associated with this dataset variant.
fn prefix_pattern(&self) -> &'static str {
match self {
Self::Urls => "https%",
Self::Cb => "https://www.%",
Self::Log => "192.168%",
// Raw string literal so the embedded double quote needs no escaping.
Self::Json => r#"{"id%"#,
Self::Path => "/home%",
Self::Email => "john%",
Self::Rare => "xyz%",
}
}

fn contains_pattern(&self) -> &'static str {
match self {
Self::Urls => "%google%",
Self::Cb => "%yandex%",
Expand All @@ -93,15 +105,10 @@ impl Dataset {
}
}

#[divan::bench(args = [
Dataset::Urls, Dataset::Cb, Dataset::Log, Dataset::Json,
Dataset::Path, Dataset::Email, Dataset::Rare,
])]
fn fsst_like(bencher: Bencher, dataset: &Dataset) {
let fsst = dataset.fsst_array();
fn bench_like(bencher: Bencher, fsst: &FSSTArray, pattern: &str) {
let len = fsst.len();
let arr = fsst.clone().into_array();
let pattern = ConstantArray::new(dataset.pattern(), len).into_array();
let pattern = ConstantArray::new(pattern, len).into_array();
bencher.bench_local(|| {
Like.try_new_array(len, LikeOptions::default(), [arr.clone(), pattern.clone()])
.unwrap()
Expand All @@ -110,3 +117,19 @@ fn fsst_like(bencher: Bencher, dataset: &Dataset) {
.unwrap()
});
}

/// Benchmarks LIKE evaluation with the dataset's `prefix%`-shaped pattern by
/// delegating to `bench_like`; registered once per `Dataset` listed in `args`.
#[divan::bench(args = [
Dataset::Urls, Dataset::Cb, Dataset::Log, Dataset::Json,
Dataset::Path, Dataset::Email, Dataset::Rare,
])]
fn fsst_prefix(bencher: Bencher, dataset: &Dataset) {
bench_like(bencher, dataset.fsst_array(), dataset.prefix_pattern());
}

/// Benchmarks LIKE evaluation with the dataset's `%needle%`-shaped pattern by
/// delegating to `bench_like`; registered once per `Dataset` listed in `args`.
#[divan::bench(args = [
Dataset::Urls, Dataset::Cb, Dataset::Log, Dataset::Json,
Dataset::Path, Dataset::Email, Dataset::Rare,
])]
fn fsst_contains(bencher: Bencher, dataset: &Dataset) {
bench_like(bencher, dataset.fsst_array(), dataset.contains_pattern());
}
4 changes: 4 additions & 0 deletions encodings/fsst/public-api.lock
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@ impl vortex_array::scalar_fn::fns::cast::kernel::CastReduce for vortex_fsst::FSS

pub fn vortex_fsst::FSST::cast(array: &vortex_fsst::FSSTArray, dtype: &vortex_array::dtype::DType) -> vortex_error::VortexResult<core::option::Option<vortex_array::array::ArrayRef>>

impl vortex_array::scalar_fn::fns::like::kernel::LikeKernel for vortex_fsst::FSST

pub fn vortex_fsst::FSST::like(array: &vortex_fsst::FSSTArray, pattern: &vortex_array::array::ArrayRef, options: vortex_array::scalar_fn::fns::like::LikeOptions, _ctx: &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<vortex_array::array::ArrayRef>>

impl vortex_array::vtable::VTable for vortex_fsst::FSST

pub type vortex_fsst::FSST::Array = vortex_fsst::FSSTArray
Expand Down
Loading
Loading