Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions cedardb-parquet/benchmark.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,8 @@
# Thin shim — actual flow is in lib/benchmark-common.sh.
export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single"
export BENCH_DURABLE=yes
export PGHOST="/tmp"
export PGUSER=postgres
export PGDATABASE=postgres

exec ../lib/benchmark-common.sh
2 changes: 1 addition & 1 deletion cedardb-parquet/check
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/bin/bash
set -e

PGPASSWORD=test psql -h localhost -U postgres -c 'SELECT 1' >/dev/null
psql -c 'SELECT 1' >/dev/null
2 changes: 1 addition & 1 deletion cedardb-parquet/create.sql
Original file line number Diff line number Diff line change
Expand Up @@ -105,4 +105,4 @@ SELECT
"RefererHash" AS RefererHash,
"URLHash" AS URLHash,
"CLID" AS CLID
FROM '/data/hits.parquet';
FROM 'hits.parquet';
2 changes: 1 addition & 1 deletion cedardb-parquet/data-size
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
set -eu

# No ingestion — reported size is the parquet file itself.
stat -c%s data/hits.parquet
stat -c%s hits.parquet
14 changes: 10 additions & 4 deletions cedardb-parquet/install
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,15 @@
set -eu

sudo apt-get update -y
sudo apt-get install -y docker.io postgresql-client
sudo apt-get install -y postgresql-client

sudo docker pull cedardb/cedardb:latest
# Stop any running instance before reinstalling
./stop || true

mkdir -p data db
chmod -R 777 data db
# Delete potential previously created database
rm -rf ./db

curl https://get.cedardb.com | bash -s -- -y \
--install-dir "$PWD" \
--db-dir "$PWD/db" \
--with-systemd=system
10 changes: 3 additions & 7 deletions cedardb-parquet/load
Original file line number Diff line number Diff line change
@@ -1,12 +1,8 @@
#!/bin/bash
set -eu

# Stage parquet file under ./data so the docker container sees /data/hits.parquet.
mkdir -p data
mv hits.parquet data/
chmod -R 777 data

# create.sql defines a view over the parquet file — no ingestion needed.
PGPASSWORD=test psql -h localhost -U postgres -t < create.sql
# Substitute the absolute path so CedarDB can find the file regardless of
# its working directory.
sed "s|'hits\.parquet'|'$PWD/hits.parquet'|" create.sql | psql

sync
2 changes: 1 addition & 1 deletion cedardb-parquet/query
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ set -e

query=$(cat)

raw=$(PGPASSWORD=test psql -h localhost -U postgres -t -c '\timing' -c "$query" 2>&1) && exit_code=0 || exit_code=$?
raw=$(psql -t -c '\timing' -c "$query" 2>&1) && exit_code=0 || exit_code=$?

if [ "$exit_code" -ne 0 ] || printf '%s\n' "$raw" | grep -qE '^ERROR|psql: error'; then
printf '%s\n' "$raw" >&2
Expand Down
40 changes: 1 addition & 39 deletions cedardb-parquet/start
Original file line number Diff line number Diff line change
@@ -1,42 +1,4 @@
#!/bin/bash
set -eu

if PGPASSWORD=test psql -h localhost -U postgres -c 'SELECT 1' >/dev/null 2>&1; then
exit 0
fi

# After a VM snapshot+restore, dockerd's in-memory networking/cgroup state
# is out of sync with the (also-restored) kernel-side resources, and the
# next `docker run` either fails or starts a container that can't be
# reached on its mapped port. Restarting dockerd reconciles it.
sudo systemctl restart docker
for _ in $(seq 1 30); do
sudo docker info >/dev/null 2>&1 && break
sleep 1
done

sudo docker stop cedardb >/dev/null 2>&1 || true
sudo docker rm cedardb >/dev/null 2>&1 || true

if ! sudo docker run -d --rm -p 5432:5432 \
-v "$(pwd)/data:/data" \
-v "$(pwd)/db:/var/lib/cedardb/data" \
-e CEDAR_PASSWORD=test \
--name cedardb cedardb/cedardb:latest; then
echo "docker run failed; ps -a:" >&2
sudo docker ps -a >&2 || true
exit 1
fi

# First-boot initdb inside the container takes well over a minute
# (observed ~90-120 s of "Fixing permissions"/"Setting up database
# directory" before postgres actually listens). Give it 10 min —
# pg_isready exits fast once the daemon is up, so this only
# matters in the failure path.
for _ in $(seq 1 600); do
pg_isready -h localhost --dbname postgres -U postgres >/dev/null 2>&1 && exit 0
sleep 1
done
echo "cedardb did not become ready in 600 s; container logs:" >&2
sudo docker logs cedardb 2>&1 | tail -40 >&2 || true
exit 1
sudo systemctl start cedardb.service
2 changes: 1 addition & 1 deletion cedardb-parquet/stop
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
#!/bin/bash

sudo docker stop cedardb >/dev/null 2>&1 || true
sudo systemctl stop cedardb.service || true
6 changes: 5 additions & 1 deletion cedardb/benchmark.sh
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
#!/bin/bash
# Thin shim — actual flow is in lib/benchmark-common.sh.
export BENCH_DOWNLOAD_SCRIPT="download-hits-tsv"
export BENCH_DOWNLOAD_SCRIPT="download-hits-parquet-single"
export BENCH_DURABLE=yes
export PGHOST="/tmp"
export PGUSER=postgres
export PGDATABASE=postgres

exec ../lib/benchmark-common.sh
2 changes: 1 addition & 1 deletion cedardb/check
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/bin/bash
set -e

PGPASSWORD=test psql -h localhost -U postgres -c 'SELECT 1' >/dev/null
psql -c 'SELECT 1' >/dev/null
3 changes: 1 addition & 2 deletions cedardb/create.sql
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,6 @@ create table hits (
hasgclid smallint not null,
refererhash bigint not null,
urlhash bigint not null,
clid integer not null,
primary key (counterid, eventdate, userid, eventtime, watchid)
clid integer not null
);

2 changes: 1 addition & 1 deletion cedardb/data-size
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/bin/bash
set -eu

PGPASSWORD=test psql -h localhost -U postgres -q -t -A -c "SELECT pg_total_relation_size('hits');"
psql -q -t -A -c "SELECT pg_total_relation_size('hits');"
14 changes: 10 additions & 4 deletions cedardb/install
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,15 @@
set -eu

sudo apt-get update -y
sudo apt-get install -y docker.io postgresql-client gzip
sudo apt-get install -y postgresql-client

sudo docker pull cedardb/cedardb:latest
# Stop any running instance before reinstalling
./stop || true

mkdir -p data db
chmod -R 777 data db
# Delete potential previously created database
rm -rf ./db

curl https://get.cedardb.com | bash -s -- -y \
--install-dir "$PWD" \
--db-dir "$PWD/db" \
--with-systemd=system
17 changes: 8 additions & 9 deletions cedardb/load
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
#!/bin/bash
set -eu

mkdir -p data
mv hits.tsv data/
chmod -R 777 data

PGPASSWORD=test psql -h localhost -U postgres -t < create.sql

PGPASSWORD=test psql -h localhost -U postgres -q -t -c "COPY hits FROM '/data/hits.tsv';"

rm -f data/hits.tsv
psql -f create.sql
{
# CedarDB would like to have ~4 GB of memory per parallel worker for parquet import to be efficient, so set the number of workers accordingly.
echo "SET debug.parallel = $(( $(grep MemTotal /proc/meminfo | awk '{print $2}') / (4 * 1024 * 1024) ));"
sed "s|'hits\.parquet'|'$PWD/hits.parquet'|" load.sql
} | psql

rm -f hits.parquet
sync
107 changes: 107 additions & 0 deletions cedardb/load.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
INSERT INTO hits SELECT
"WatchID",
"JavaEnable",
"Title",
"GoodEvent",
to_timestamp("EventTime") as "EventTime",
('1970-01-01'::date + "EventDate"::integer) as "EventDate",
"CounterID",
"ClientIP",
"RegionID",
"UserID",
"CounterClass",
"OS",
"UserAgent",
"URL",
"Referer",
"IsRefresh",
"RefererCategoryID",
"RefererRegionID",
"URLCategoryID",
"URLRegionID",
"ResolutionWidth",
"ResolutionHeight",
"ResolutionDepth",
"FlashMajor",
"FlashMinor",
"FlashMinor2",
"NetMajor",
"NetMinor",
"UserAgentMajor",
"UserAgentMinor",
"CookieEnable",
"JavascriptEnable",
"IsMobile",
"MobilePhone",
"MobilePhoneModel",
"Params",
"IPNetworkID",
"TraficSourceID",
"SearchEngineID",
"SearchPhrase",
"AdvEngineID",
"IsArtifical",
"WindowClientWidth",
"WindowClientHeight",
"ClientTimeZone",
to_timestamp("ClientEventTime") as "ClientEventTime",
"SilverlightVersion1",
"SilverlightVersion2",
"SilverlightVersion3",
"SilverlightVersion4",
"PageCharset",
"CodeVersion",
"IsLink",
"IsDownload",
"IsNotBounce",
"FUniqID",
"OriginalURL",
"HID",
"IsOldCounter",
"IsEvent",
"IsParameter",
"DontCountHits",
"WithHash",
"HitColor",
to_timestamp("LocalEventTime") as "LocalEventTime",
"Age",
"Sex",
"Income",
"Interests",
"Robotness",
"RemoteIP",
"WindowName",
"OpenerName",
"HistoryLength",
"BrowserLanguage",
"BrowserCountry",
"SocialNetwork",
"SocialAction",
"HTTPError",
"SendTiming",
"DNSTiming",
"ConnectTiming",
"ResponseStartTiming",
"ResponseEndTiming",
"FetchTiming",
"SocialSourceNetworkID",
"SocialSourcePage",
"ParamPrice",
"ParamOrderID",
"ParamCurrency",
"ParamCurrencyID",
"OpenstatServiceName",
"OpenstatCampaignID",
"OpenstatAdID",
"OpenstatSourceID",
"UTMSource",
"UTMMedium",
"UTMCampaign",
"UTMContent",
"UTMTerm",
"FromTag",
"HasGCLID",
"RefererHash",
"URLHash",
"CLID"
FROM 'hits.parquet';
2 changes: 1 addition & 1 deletion cedardb/query
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ set -e

query=$(cat)

raw=$(PGPASSWORD=test psql -h localhost -U postgres -t -c '\timing' -c "$query" 2>&1) && exit_code=0 || exit_code=$?
raw=$(psql -t -c '\timing' -c "$query" 2>&1) && exit_code=0 || exit_code=$?

if [ "$exit_code" -ne 0 ] || printf '%s\n' "$raw" | grep -qE '^ERROR|psql: error'; then
printf '%s\n' "$raw" >&2
Expand Down
58 changes: 0 additions & 58 deletions cedardb/results/20260510/c6a.4xlarge.json

This file was deleted.

Loading