From 3a8484086f9ccd699a07128661b43273cdd413e5 Mon Sep 17 00:00:00 2001 From: anlowee Date: Sun, 27 Jul 2025 18:09:00 +0000 Subject: [PATCH 01/42] Init --- tools/deployment/presto-clp/.env | 35 ++++++++ tools/deployment/presto-clp/README.md | 76 ++++++++++++++++ .../config-template/clp.properties | 10 +++ .../config-template/config.properties | 14 +++ .../coordinator/config-template/jvm.config | 10 +++ .../config-template/log.properties | 2 + .../config-template/metadata-filter.json | 3 + .../config-template/node.properties | 3 + .../presto-clp/coordinator/scripts/$ | 19 ++++ .../coordinator/scripts/generate-configs.sh | 18 ++++ .../presto-clp/demo-assets/clp-config.yml | 39 +++++++++ .../deployment/presto-clp/demo-assets/init.sh | 50 +++++++++++ .../deployment/presto-clp/docker-compose.yaml | 39 +++++++++ .../worker/config-template/clp.properties | 2 + .../worker/config-template/config.properties | 7 ++ .../worker/config-template/node.properties | 5 ++ .../worker/config-template/velox.properties | 2 + .../worker/scripts/generate-configs.sh | 87 +++++++++++++++++++ 18 files changed, 421 insertions(+) create mode 100644 tools/deployment/presto-clp/.env create mode 100644 tools/deployment/presto-clp/README.md create mode 100644 tools/deployment/presto-clp/coordinator/config-template/clp.properties create mode 100644 tools/deployment/presto-clp/coordinator/config-template/config.properties create mode 100644 tools/deployment/presto-clp/coordinator/config-template/jvm.config create mode 100644 tools/deployment/presto-clp/coordinator/config-template/log.properties create mode 100644 tools/deployment/presto-clp/coordinator/config-template/metadata-filter.json create mode 100644 tools/deployment/presto-clp/coordinator/config-template/node.properties create mode 100644 tools/deployment/presto-clp/coordinator/scripts/$ create mode 100755 tools/deployment/presto-clp/coordinator/scripts/generate-configs.sh create mode 100644 
tools/deployment/presto-clp/demo-assets/clp-config.yml create mode 100755 tools/deployment/presto-clp/demo-assets/init.sh create mode 100644 tools/deployment/presto-clp/docker-compose.yaml create mode 100644 tools/deployment/presto-clp/worker/config-template/clp.properties create mode 100644 tools/deployment/presto-clp/worker/config-template/config.properties create mode 100644 tools/deployment/presto-clp/worker/config-template/node.properties create mode 100644 tools/deployment/presto-clp/worker/config-template/velox.properties create mode 100755 tools/deployment/presto-clp/worker/scripts/generate-configs.sh diff --git a/tools/deployment/presto-clp/.env b/tools/deployment/presto-clp/.env new file mode 100644 index 0000000000..b9a4cf785d --- /dev/null +++ b/tools/deployment/presto-clp/.env @@ -0,0 +1,35 @@ +# Coordinator common +PRESTO_COORDINATOR_HTTPPORT="8080" +PRESTO_COORDINATOR_SERVICENAME="presto-coordinator" + +# Coordinator clp.properties +PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_PROVIDERTYPE="mysql" +PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_URL="jdbc:mysql://localhost:6001" +PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_NAME="clp-db" +PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_USER="clp-user" +PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_PASSWORD="123456" +PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_TABLEPREFIX="clp_" +PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_SPLITPROVIDER="mysql" + +# Coordinator config.properties +PRESTO_COORDINATOR_CONFIG_CONFIGPROPERTIES_QUERY_MAXMEMORY="1GB" +PRESTO_COORDINATOR_CONFIG_CONFIGPROPERTIES_QUERY_MAXMEMORYPERNODE="1GB" + +# Coordinator jvm.config +PRESTO_COORDINATOR_CONFIG_JVMCONFIG_MAXHEAPSIZE="4G" +PRESTO_COORDINATOR_CONFIG_JVMCONFIG_G1HEAPREGIONSIZE="32M" + +# Coordinator log.properties +PRESTO_COORDINATOR_CONFIG_LOGPROPERTIES_LEVEL="DEBUG" + +# Coordinator node.properties +PRESTO_COORDINATOR_CONFIG_NODEPROPERTIES_ENVIRONMENT="production" + +# Worker 
common +PRESTO_WORKER_HTTPPORT="8080" + +# Worker node.properties +PRESTO_WORKER_CONFIG_NODEPROPERTIES_LOCATION="worker-location" + +# CLP package archives +CLP_PACKAGE_ARCHIVES=REPLACE_ME diff --git a/tools/deployment/presto-clp/README.md b/tools/deployment/presto-clp/README.md new file mode 100644 index 0000000000..97ccf312e9 --- /dev/null +++ b/tools/deployment/presto-clp/README.md @@ -0,0 +1,76 @@ +# Setup local docker stack for presto + clp + +## Install docker + +Follow the guide here: [docker] + +# Launch clp-package + +1. Find the clp-package for test on our official website [clp-json-v0.4.0]. Here is a sample dataset for demo testing: [postgresql dataset]. + +2. Untar the clp-package and the postgresql dataset. + +3. Replace the content of `/path/to/clp-json-package/etc/clp-config.yml` with the output of `demo-assets/init.sh generate_sample_clp_config`. + +4. Launch: + +```bash +# You probably want to run a python 3.9 or newer virtual environment +sbin/start-clp.sh +``` + +5. Compress: + +```bash +# You can also use your own dataset +sbin/compress.sh --timestamp-key 'timestamp' /path/to/postgresql.log +``` + +6. 
Use the following command to update `.env`: + +```bash +demo-assets/init.sh update_metadata_config /path/to/clp-json-package +``` + +# Create Docker Cluster + +Create a local docker stack: + +```bash +docker compose up +``` + +To create a docker stack with more than 1 worker (e.g., 3 workers): +``` +docker compose up --scale presto-worker=3 +``` + +# Use cli: + +After all containers are in "Started" states (check by `docker ps`): + +```bash +# On your host +docker exec -it compose-presto-coordinator-1 sh + +# In presto-coordinator container +/opt/presto-cli --catalog clp --schema default --server localhost:8080 +``` + +Example query: +```sql +SELECT * FROM default LIMIT 1; +``` + +# Delete docker Cluster + +```bash +docker compose down +``` + + + +[clp-json-v0.4.0]: https://github.com/y-scope/clp/releases/tag/v0.4.0 +[docker]: https://docs.docker.com/engine/install +[postgresql dataset]: https://zenodo.org/records/10516402 + diff --git a/tools/deployment/presto-clp/coordinator/config-template/clp.properties b/tools/deployment/presto-clp/coordinator/config-template/clp.properties new file mode 100644 index 0000000000..2ee47e75a2 --- /dev/null +++ b/tools/deployment/presto-clp/coordinator/config-template/clp.properties @@ -0,0 +1,10 @@ +connector.name=clp +clp.metadata-provider-type=${PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_PROVIDERTYPE} +clp.metadata-db-url=${PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_URL} +clp.metadata-db-name=${PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_NAME} +clp.metadata-db-user=${PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_USER} +clp.metadata-db-password=${PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_PASSWORD} +clp.metadata-table-prefix=${PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_TABLEPREFIX} +clp.split-provider-type=${PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_SPLITPROVIDER} +clp.metadata-filter-config=/opt/presto-server/etc/metadata-filter.json + diff --git 
a/tools/deployment/presto-clp/coordinator/config-template/config.properties b/tools/deployment/presto-clp/coordinator/config-template/config.properties new file mode 100644 index 0000000000..22f905fb0c --- /dev/null +++ b/tools/deployment/presto-clp/coordinator/config-template/config.properties @@ -0,0 +1,14 @@ +coordinator=true +node-scheduler.include-coordinator=false +http-server.http.port=${PRESTO_COORDINATOR_HTTPPORT} +query.max-memory=${PRESTO_COORDINATOR_CONFIG_CONFIGPROPERTIES_QUERY_MAXMEMORY} +query.max-memory-per-node=${PRESTO_COORDINATOR_CONFIG_CONFIGPROPERTIES_QUERY_MAXMEMORYPERNODE} +discovery-server.enabled=true +discovery.uri=http://${PRESTO_COORDINATOR_SERVICENAME}:${PRESTO_COORDINATOR_HTTPPORT} +optimizer.optimize-hash-generation=false +regex-library=RE2J +use-alternative-function-signatures=true +inline-sql-functions=false +nested-data-serialization-enabled=false +native-execution-enabled=true + diff --git a/tools/deployment/presto-clp/coordinator/config-template/jvm.config b/tools/deployment/presto-clp/coordinator/config-template/jvm.config new file mode 100644 index 0000000000..49be4c4c0b --- /dev/null +++ b/tools/deployment/presto-clp/coordinator/config-template/jvm.config @@ -0,0 +1,10 @@ +-server +-Xmx${PRESTO_COORDINATOR_CONFIG_JVMCONFIG_MAXHEAPSIZE} +-XX:+UseG1GC +-XX:G1HeapRegionSize=${PRESTO_COORDINATOR_CONFIG_JVMCONFIG_G1HEAPREGIONSIZE} +-XX:+UseGCOverheadLimit +-XX:+ExplicitGCInvokesConcurrent +-XX:+HeapDumpOnOutOfMemoryError +-XX:+ExitOnOutOfMemoryError +-Djdk.attach.allowAttachSelf=true + diff --git a/tools/deployment/presto-clp/coordinator/config-template/log.properties b/tools/deployment/presto-clp/coordinator/config-template/log.properties new file mode 100644 index 0000000000..a4ea74216a --- /dev/null +++ b/tools/deployment/presto-clp/coordinator/config-template/log.properties @@ -0,0 +1,2 @@ +com.facebook.presto=${PRESTO_COORDINATOR_CONFIG_LOGPROPERTIES_LEVEL} + diff --git 
a/tools/deployment/presto-clp/coordinator/config-template/metadata-filter.json b/tools/deployment/presto-clp/coordinator/config-template/metadata-filter.json new file mode 100644 index 0000000000..bfd870e452 --- /dev/null +++ b/tools/deployment/presto-clp/coordinator/config-template/metadata-filter.json @@ -0,0 +1,3 @@ +{ +} + diff --git a/tools/deployment/presto-clp/coordinator/config-template/node.properties b/tools/deployment/presto-clp/coordinator/config-template/node.properties new file mode 100644 index 0000000000..2803273ef9 --- /dev/null +++ b/tools/deployment/presto-clp/coordinator/config-template/node.properties @@ -0,0 +1,3 @@ +node.environment=${PRESTO_COORDINATOR_CONFIG_NODEPROPERTIES_ENVIRONMENT} +node.id=${PRESTO_COORDINATOR_SERVICENAME} + diff --git a/tools/deployment/presto-clp/coordinator/scripts/$ b/tools/deployment/presto-clp/coordinator/scripts/$ new file mode 100644 index 0000000000..6c37411d0f --- /dev/null +++ b/tools/deployment/presto-clp/coordinator/scripts/$ @@ -0,0 +1,19 @@ +#!/bin/sh + +# Exit on error +set -e + +PRESTO_CONFIG_DIR="/opt/presto-server/etc" + +# Substitute environemnt variables in config template +find /configs -type f | while read -r f; do + ( echo "cat < "${PRESTO_CONFIG_DIR}/$(basename "$f")" +done + +# Setup the config directory hierarchy +rm -rf ${PRESTO_CONFIG_DIR}/catalog +mkdir -p ${PRESTO_CONFIG_DIR}/catalog + +# Copy over files +mv ${PRESTO_CONFIG_DIR}/clp.properties ${PRESTO_CONFIG_DIR}/catalog + diff --git a/tools/deployment/presto-clp/coordinator/scripts/generate-configs.sh b/tools/deployment/presto-clp/coordinator/scripts/generate-configs.sh new file mode 100755 index 0000000000..4082e6b5d0 --- /dev/null +++ b/tools/deployment/presto-clp/coordinator/scripts/generate-configs.sh @@ -0,0 +1,18 @@ +#!/bin/sh + +# Exit on error +set -e + +PRESTO_CONFIG_DIR="/opt/presto-server/etc" + +# Substitute environemnt variables in config template +find /configs -type f | while read -r f; do + ( echo "cat < 
"${PRESTO_CONFIG_DIR}/$(basename "$f")" +done + +# Setup the config directory hierarchy +rm -f ${PRESTO_CONFIG_DIR}/catalog/* + +# Copy over files +mv ${PRESTO_CONFIG_DIR}/clp.properties ${PRESTO_CONFIG_DIR}/catalog + diff --git a/tools/deployment/presto-clp/demo-assets/clp-config.yml b/tools/deployment/presto-clp/demo-assets/clp-config.yml new file mode 100644 index 0000000000..aa8d472d9c --- /dev/null +++ b/tools/deployment/presto-clp/demo-assets/clp-config.yml @@ -0,0 +1,39 @@ +package: + storage_engine: "clp-s" +database: + type: "mariadb" + host: "${REPLACE_IP}" + port: 6001 + name: "clp-db" +query_scheduler: + host: "${REPLACE_IP}" + port: 6002 + jobs_poll_delay: 0.1 + num_archives_to_search_per_sub_job: 16 + logging_level: "INFO" +queue: + host: "${REPLACE_IP}" + port: 6003 +redis: + host: "${REPLACE_IP}" + port: 6004 + query_backend_database: 0 + compression_backend_database: 1 +reducer: + host: "${REPLACE_IP}" + base_port: 6100 + logging_level: "INFO" + upsert_interval: 100 +results_cache: + host: "${REPLACE_IP}" + port: 6005 + db_name: "clp-query-results" + stream_collection_name: "stream-files" +webui: + host: "localhost" + port: 6000 + logging_level: "INFO" +log_viewer_webui: + host: "localhost" + port: 6006 + diff --git a/tools/deployment/presto-clp/demo-assets/init.sh b/tools/deployment/presto-clp/demo-assets/init.sh new file mode 100755 index 0000000000..61a9244f41 --- /dev/null +++ b/tools/deployment/presto-clp/demo-assets/init.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash + +SCRIPT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" + +function generate_sample_clp_config { + local ip=$(hostname -i) + local file="${SCRIPT_PATH}/clp-config.yml" + cp "$file" "${file}.bak" + sed -i "s|\${REPLACE_IP}|$ip|g" "$file" + echo "Replaced \${REPLACE_IP} with $ip in $file" +} + +function update_metadata_config { + if [[ $# -ne 1 ]]; then + echo "Usage: update_metadata_config " + return 1 + fi + + local clp_pkg_home=$1 + local clp_config_path="$(readlink -f 
${clp_pkg_home})/etc/clp-config.yml" + local credential_path="$(readlink -f ${clp_pkg_home})/etc/credentials.yml" + host=$(python3 -c 'import sys, yaml; print(yaml.load(sys.stdin)["database"]["host"])' < "$clp_config_path") + port=$(python3 -c 'import sys, yaml; print(yaml.load(sys.stdin)["database"]["port"])' < "$clp_config_path") + name=$(python3 -c 'import sys, yaml; print(yaml.load(sys.stdin)["database"]["name"])' < "$clp_config_path") + user=$(python3 -c 'import sys, yaml; print(yaml.load(sys.stdin)["database"]["user"])' < "$credential_path") + password=$(python3 -c 'import sys, yaml; print(yaml.load(sys.stdin)["database"]["password"])' < "$credential_path") + echo "Metadata database host: $host" + echo "Metadata database port: $port" + echo "Metadata database name: $name" + echo "Metadata database user: $user" + echo "Metadata database password: $password" + + local env_path="${SCRIPT_PATH}/../.env" + cp "$env_path" "${env_path}.bak" + sed -i "s|^PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_URL=.*|PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_URL=\"jdbc:mysql://${host}:${port}\"|" "$env_path" + sed -i "s/^PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_NAME=.*/PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_NAME=\"${name}\"/" "$env_path" + sed -i "s/^PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_USER=.*/PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_USER=\"${user}\"/" "$env_path" + sed -i "s/^PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_PASSWORD=.*/PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_PASSWORD=\"${password}\"/" "$env_path" + sed -i "s|^CLP_PACKAGE_ARCHIVES=.*|CLP_PACKAGE_ARCHIVES=\"$(readlink -f ${clp_pkg_home})/var/data/archives/default\"|" "$env_path" +} + +if declare -f "$1" > /dev/null; then + "$@" +else + echo "Error: '$1' is not a valid function name." 
+ echo "Available functions:" + declare -F | awk '{print $3}' + exit 1 +fi + diff --git a/tools/deployment/presto-clp/docker-compose.yaml b/tools/deployment/presto-clp/docker-compose.yaml new file mode 100644 index 0000000000..33029c7d30 --- /dev/null +++ b/tools/deployment/presto-clp/docker-compose.yaml @@ -0,0 +1,39 @@ +version: "3.9" + +services: + presto-coordinator: + image: ghcr.io/y-scope/presto/coordinator:dev + entrypoint: ["/bin/bash", "-c", "/scripts/generate-configs.sh && /opt/entrypoint.sh"] + env_file: + - .env + volumes: + - ./coordinator/scripts:/scripts:ro + - coordinator-config:/opt/presto-server/etc + - ./coordinator/config-template:/configs:ro + networks: + - presto + + presto-worker: + image: ghcr.io/y-scope/presto/prestissimo-worker:dev + depends_on: + presto-coordinator: + condition: service_started + entrypoint: ["/bin/bash", "-c", "/scripts/generate-configs.sh && /opt/entrypoint.sh"] + env_file: + - .env + volumes: + - ./worker/scripts:/scripts:ro + - worker-config:/opt/presto-server/etc + - ./worker/config-template:/configs:ro + - "${CLP_PACKAGE_ARCHIVES}:${CLP_PACKAGE_ARCHIVES}" + networks: + - presto + +volumes: + coordinator-config: + worker-config: + +networks: + presto: + driver: bridge + diff --git a/tools/deployment/presto-clp/worker/config-template/clp.properties b/tools/deployment/presto-clp/worker/config-template/clp.properties new file mode 100644 index 0000000000..484cbfe2c1 --- /dev/null +++ b/tools/deployment/presto-clp/worker/config-template/clp.properties @@ -0,0 +1,2 @@ +connector.name=clp + diff --git a/tools/deployment/presto-clp/worker/config-template/config.properties b/tools/deployment/presto-clp/worker/config-template/config.properties new file mode 100644 index 0000000000..0b8758c8fd --- /dev/null +++ b/tools/deployment/presto-clp/worker/config-template/config.properties @@ -0,0 +1,7 @@ +discovery.uri=http://${PRESTO_COORDINATOR_SERVICENAME}:${PRESTO_COORDINATOR_HTTPPORT} +presto.version=REPLACE_ME 
+http-server.http.port=${PRESTO_WORKER_HTTPPORT} +shutdown-onset-sec=1 +register-test-functions=false +runtime-metrics-collection-enabled=false + diff --git a/tools/deployment/presto-clp/worker/config-template/node.properties b/tools/deployment/presto-clp/worker/config-template/node.properties new file mode 100644 index 0000000000..44f4cd70b2 --- /dev/null +++ b/tools/deployment/presto-clp/worker/config-template/node.properties @@ -0,0 +1,5 @@ +node.environment=${PRESTO_COORDINATOR_CONFIG_NODEPROPERTIES_ENVIRONMENT} +node.internal-address=REPLACE_ME +node.location=${PRESTO_WORKER_CONFIG_NODEPROPERTIES_LOCATION} +node.id=REPLACE_ME + diff --git a/tools/deployment/presto-clp/worker/config-template/velox.properties b/tools/deployment/presto-clp/worker/config-template/velox.properties new file mode 100644 index 0000000000..50de3f6962 --- /dev/null +++ b/tools/deployment/presto-clp/worker/config-template/velox.properties @@ -0,0 +1,2 @@ +mutable-config=true + diff --git a/tools/deployment/presto-clp/worker/scripts/generate-configs.sh b/tools/deployment/presto-clp/worker/scripts/generate-configs.sh new file mode 100755 index 0000000000..8df961e47d --- /dev/null +++ b/tools/deployment/presto-clp/worker/scripts/generate-configs.sh @@ -0,0 +1,87 @@ +#!/bin/sh + +# Install wget +apt-get update && apt-get install -y wget + +PRESTO_CONFIG_DIR="/opt/presto-server/etc" + +# Substitute environemnt variables in config template +find /configs -type f | while read -r f; do + ( echo "cat < "${PRESTO_CONFIG_DIR}/$(basename "$f")" +done + +# Setup the config directory hierarchy +rm -f ${PRESTO_CONFIG_DIR}/catalog/* + +mv ${PRESTO_CONFIG_DIR}/clp.properties ${PRESTO_CONFIG_DIR}/catalog + +# Update "presto.version" parameter in config.properties file using values from coordinator +CONFIG_PROPERTIES_FILE="/opt/presto-server/etc/config.properties" + +# Retry configuration +MAX_RETRIES=30 +RETRY_DELAY=10 + +echo "Init container: Waiting for Presto to be ready..." + +# 1. 
Fetch version info from Presto with retry logic +retry_count=0 +while [ $retry_count -lt $MAX_RETRIES ]; do +echo "Attempt $((retry_count + 1))/$MAX_RETRIES - Checking Presto availability..." + +# Try to fetch the response +DISCOVERY_URI=$(awk -F= '/^discovery.uri=/ { print $2 }' "${PRESTO_CONFIG_DIR}/config.properties") +if response=$(wget -qO- --timeout=10 "${DISCOVERY_URI}/v1/info" 2>/dev/null); then + # Check if response is not empty and contains version info + if [ -n "$response" ] && echo "$response" | grep -q '"version"'; then + echo "Presto is ready!" + break + fi +fi + +echo "Presto not ready yet, retrying in ${RETRY_DELAY}s..." +sleep $RETRY_DELAY +retry_count=$((retry_count + 1)) +done + +# Check if we exceeded max retries +if [ $retry_count -eq $MAX_RETRIES ]; then + echo "Error: Presto did not become ready after $MAX_RETRIES attempts" + exit 1 +fi + +# 2. Extract the version using grep and sed (busybox compatible) +version=$(echo "$response" | grep -o '"version":"[^"]*"' | sed 's/"version":"//;s/"//') + +echo "Detected Presto version: $version" + +# 3. 
Replace `presto.version=REPLACE_ME` with actual version in the config file +if grep -q '^presto.version=REPLACE_ME' "$CONFIG_PROPERTIES_FILE"; then + sed -i "s|^presto.version=REPLACE_ME|presto.version=$version|" "$CONFIG_PROPERTIES_FILE" + echo "Updated $CONFIG_PROPERTIES_FILE with version $version" +else + echo "Warning: 'presto.version=REPLACE_ME' not found in $CONFIG_PROPERTIES_FILE" + exit 1 +fi + +# Modify node.properties +NODE_PROPERTIES_FILE="/opt/presto-server/etc/node.properties" +INTERNAL_ADDRESS=$(hostname -i) +# Replace `node.internal-address=REPLACE_ME` with actual ip address in the config file +if grep -q '^node.internal-address=REPLACE_ME' "$NODE_PROPERTIES_FILE"; then + sed -i "s|^node.internal-address=REPLACE_ME|node.internal-address=${INTERNAL_ADDRESS}|" "$NODE_PROPERTIES_FILE" + echo "Updated $NODE_PROPERTIES_FILE with node.internal-address ${INTERNAL_ADDRESS}" +else + echo "Warning: 'node.internal-address=REPLACE_ME' not found in $NODE_PROPERTIES_FILE" + exit 1 +fi + +# Replace `node.id=REPLACE_ME` with actual hostname in the config file +if grep -q '^node.id=REPLACE_ME' "$NODE_PROPERTIES_FILE"; then + sed -i "s|^node.id=REPLACE_ME|node.id=$HOSTNAME|" "$NODE_PROPERTIES_FILE" + echo "Updated $NODE_PROPERTIES_FILE with node.id $HOSTNAME" +else + echo "Warning: 'node.id=REPLACE_ME' not found in $NODE_PROPERTIES_FILE" + exit 1 +fi + From 9574059d7755968c5e86fbcca293788c669bdcb0 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Sun, 27 Jul 2025 15:45:06 -0400 Subject: [PATCH 02/42] Add new files to yaml linting; Fix yamllint violations. 
--- taskfiles/lint.yaml | 3 +- .../presto-clp/demo-assets/clp-config.yml | 1 - .../deployment/presto-clp/docker-compose.yaml | 29 +++++++++---------- 3 files changed, 16 insertions(+), 17 deletions(-) diff --git a/taskfiles/lint.yaml b/taskfiles/lint.yaml index d740b610f3..16267e265d 100644 --- a/taskfiles/lint.yaml +++ b/taskfiles/lint.yaml @@ -103,7 +103,8 @@ tasks: components/package-template/src/etc \ docs \ taskfile.yaml \ - taskfiles + taskfiles \ + tools/deployment check-cpp-format: sources: &cpp_source_files diff --git a/tools/deployment/presto-clp/demo-assets/clp-config.yml b/tools/deployment/presto-clp/demo-assets/clp-config.yml index aa8d472d9c..49291183e7 100644 --- a/tools/deployment/presto-clp/demo-assets/clp-config.yml +++ b/tools/deployment/presto-clp/demo-assets/clp-config.yml @@ -36,4 +36,3 @@ webui: log_viewer_webui: host: "localhost" port: 6006 - diff --git a/tools/deployment/presto-clp/docker-compose.yaml b/tools/deployment/presto-clp/docker-compose.yaml index 33029c7d30..415aab003e 100644 --- a/tools/deployment/presto-clp/docker-compose.yaml +++ b/tools/deployment/presto-clp/docker-compose.yaml @@ -2,32 +2,32 @@ version: "3.9" services: presto-coordinator: - image: ghcr.io/y-scope/presto/coordinator:dev + image: "ghcr.io/y-scope/presto/coordinator:dev" entrypoint: ["/bin/bash", "-c", "/scripts/generate-configs.sh && /opt/entrypoint.sh"] env_file: - - .env + - ".env" volumes: - - ./coordinator/scripts:/scripts:ro - - coordinator-config:/opt/presto-server/etc - - ./coordinator/config-template:/configs:ro + - "./coordinator/scripts:/scripts:ro" + - "coordinator-config:/opt/presto-server/etc" + - "./coordinator/config-template:/configs:ro" networks: - - presto + - "presto" presto-worker: - image: ghcr.io/y-scope/presto/prestissimo-worker:dev + image: "ghcr.io/y-scope/presto/prestissimo-worker:dev" depends_on: presto-coordinator: - condition: service_started + condition: "service_started" entrypoint: ["/bin/bash", "-c", 
"/scripts/generate-configs.sh && /opt/entrypoint.sh"] env_file: - - .env + - ".env" volumes: - - ./worker/scripts:/scripts:ro - - worker-config:/opt/presto-server/etc - - ./worker/config-template:/configs:ro + - "./worker/scripts:/scripts:ro" + - "worker-config:/opt/presto-server/etc" + - "./worker/config-template:/configs:ro" - "${CLP_PACKAGE_ARCHIVES}:${CLP_PACKAGE_ARCHIVES}" networks: - - presto + - "presto" volumes: coordinator-config: @@ -35,5 +35,4 @@ volumes: networks: presto: - driver: bridge - + driver: "bridge" From 8dd8795acb9b0b235673d4935f5b2ab594ae297b Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Sun, 27 Jul 2025 15:47:44 -0400 Subject: [PATCH 03/42] Remove unnecessarily blank lines. --- tools/deployment/presto-clp/README.md | 1 - .../presto-clp/coordinator/config-template/clp.properties | 1 - .../presto-clp/coordinator/config-template/config.properties | 1 - .../deployment/presto-clp/coordinator/config-template/jvm.config | 1 - .../presto-clp/coordinator/config-template/log.properties | 1 - .../presto-clp/coordinator/config-template/metadata-filter.json | 1 - .../presto-clp/coordinator/config-template/node.properties | 1 - tools/deployment/presto-clp/coordinator/scripts/$ | 1 - .../presto-clp/coordinator/scripts/generate-configs.sh | 1 - tools/deployment/presto-clp/demo-assets/init.sh | 1 - .../deployment/presto-clp/worker/config-template/clp.properties | 1 - .../presto-clp/worker/config-template/config.properties | 1 - .../deployment/presto-clp/worker/config-template/node.properties | 1 - .../presto-clp/worker/config-template/velox.properties | 1 - tools/deployment/presto-clp/worker/scripts/generate-configs.sh | 1 - 15 files changed, 15 deletions(-) diff --git a/tools/deployment/presto-clp/README.md b/tools/deployment/presto-clp/README.md index 97ccf312e9..6e34dd3d76 100644 --- a/tools/deployment/presto-clp/README.md +++ b/tools/deployment/presto-clp/README.md @@ -73,4 +73,3 @@ docker compose 
down [clp-json-v0.4.0]: https://github.com/y-scope/clp/releases/tag/v0.4.0 [docker]: https://docs.docker.com/engine/install [postgresql dataset]: https://zenodo.org/records/10516402 - diff --git a/tools/deployment/presto-clp/coordinator/config-template/clp.properties b/tools/deployment/presto-clp/coordinator/config-template/clp.properties index 2ee47e75a2..cdf06e716a 100644 --- a/tools/deployment/presto-clp/coordinator/config-template/clp.properties +++ b/tools/deployment/presto-clp/coordinator/config-template/clp.properties @@ -7,4 +7,3 @@ clp.metadata-db-password=${PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATA clp.metadata-table-prefix=${PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_TABLEPREFIX} clp.split-provider-type=${PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_SPLITPROVIDER} clp.metadata-filter-config=/opt/presto-server/etc/metadata-filter.json - diff --git a/tools/deployment/presto-clp/coordinator/config-template/config.properties b/tools/deployment/presto-clp/coordinator/config-template/config.properties index 22f905fb0c..5207ac480e 100644 --- a/tools/deployment/presto-clp/coordinator/config-template/config.properties +++ b/tools/deployment/presto-clp/coordinator/config-template/config.properties @@ -11,4 +11,3 @@ use-alternative-function-signatures=true inline-sql-functions=false nested-data-serialization-enabled=false native-execution-enabled=true - diff --git a/tools/deployment/presto-clp/coordinator/config-template/jvm.config b/tools/deployment/presto-clp/coordinator/config-template/jvm.config index 49be4c4c0b..0b05b439ab 100644 --- a/tools/deployment/presto-clp/coordinator/config-template/jvm.config +++ b/tools/deployment/presto-clp/coordinator/config-template/jvm.config @@ -7,4 +7,3 @@ -XX:+HeapDumpOnOutOfMemoryError -XX:+ExitOnOutOfMemoryError -Djdk.attach.allowAttachSelf=true - diff --git a/tools/deployment/presto-clp/coordinator/config-template/log.properties b/tools/deployment/presto-clp/coordinator/config-template/log.properties index 
a4ea74216a..6578ca10f2 100644 --- a/tools/deployment/presto-clp/coordinator/config-template/log.properties +++ b/tools/deployment/presto-clp/coordinator/config-template/log.properties @@ -1,2 +1 @@ com.facebook.presto=${PRESTO_COORDINATOR_CONFIG_LOGPROPERTIES_LEVEL} - diff --git a/tools/deployment/presto-clp/coordinator/config-template/metadata-filter.json b/tools/deployment/presto-clp/coordinator/config-template/metadata-filter.json index bfd870e452..2c63c08510 100644 --- a/tools/deployment/presto-clp/coordinator/config-template/metadata-filter.json +++ b/tools/deployment/presto-clp/coordinator/config-template/metadata-filter.json @@ -1,3 +1,2 @@ { } - diff --git a/tools/deployment/presto-clp/coordinator/config-template/node.properties b/tools/deployment/presto-clp/coordinator/config-template/node.properties index 2803273ef9..3397f6e9ec 100644 --- a/tools/deployment/presto-clp/coordinator/config-template/node.properties +++ b/tools/deployment/presto-clp/coordinator/config-template/node.properties @@ -1,3 +1,2 @@ node.environment=${PRESTO_COORDINATOR_CONFIG_NODEPROPERTIES_ENVIRONMENT} node.id=${PRESTO_COORDINATOR_SERVICENAME} - diff --git a/tools/deployment/presto-clp/coordinator/scripts/$ b/tools/deployment/presto-clp/coordinator/scripts/$ index 6c37411d0f..8b38fa5bee 100644 --- a/tools/deployment/presto-clp/coordinator/scripts/$ +++ b/tools/deployment/presto-clp/coordinator/scripts/$ @@ -16,4 +16,3 @@ mkdir -p ${PRESTO_CONFIG_DIR}/catalog # Copy over files mv ${PRESTO_CONFIG_DIR}/clp.properties ${PRESTO_CONFIG_DIR}/catalog - diff --git a/tools/deployment/presto-clp/coordinator/scripts/generate-configs.sh b/tools/deployment/presto-clp/coordinator/scripts/generate-configs.sh index 4082e6b5d0..326993ba09 100755 --- a/tools/deployment/presto-clp/coordinator/scripts/generate-configs.sh +++ b/tools/deployment/presto-clp/coordinator/scripts/generate-configs.sh @@ -15,4 +15,3 @@ rm -f ${PRESTO_CONFIG_DIR}/catalog/* # Copy over files mv ${PRESTO_CONFIG_DIR}/clp.properties 
${PRESTO_CONFIG_DIR}/catalog - diff --git a/tools/deployment/presto-clp/demo-assets/init.sh b/tools/deployment/presto-clp/demo-assets/init.sh index 61a9244f41..3f9f450390 100755 --- a/tools/deployment/presto-clp/demo-assets/init.sh +++ b/tools/deployment/presto-clp/demo-assets/init.sh @@ -47,4 +47,3 @@ else declare -F | awk '{print $3}' exit 1 fi - diff --git a/tools/deployment/presto-clp/worker/config-template/clp.properties b/tools/deployment/presto-clp/worker/config-template/clp.properties index 484cbfe2c1..4ec6f9a4c7 100644 --- a/tools/deployment/presto-clp/worker/config-template/clp.properties +++ b/tools/deployment/presto-clp/worker/config-template/clp.properties @@ -1,2 +1 @@ connector.name=clp - diff --git a/tools/deployment/presto-clp/worker/config-template/config.properties b/tools/deployment/presto-clp/worker/config-template/config.properties index 0b8758c8fd..fdbf89d734 100644 --- a/tools/deployment/presto-clp/worker/config-template/config.properties +++ b/tools/deployment/presto-clp/worker/config-template/config.properties @@ -4,4 +4,3 @@ http-server.http.port=${PRESTO_WORKER_HTTPPORT} shutdown-onset-sec=1 register-test-functions=false runtime-metrics-collection-enabled=false - diff --git a/tools/deployment/presto-clp/worker/config-template/node.properties b/tools/deployment/presto-clp/worker/config-template/node.properties index 44f4cd70b2..7e0e1e6b5d 100644 --- a/tools/deployment/presto-clp/worker/config-template/node.properties +++ b/tools/deployment/presto-clp/worker/config-template/node.properties @@ -2,4 +2,3 @@ node.environment=${PRESTO_COORDINATOR_CONFIG_NODEPROPERTIES_ENVIRONMENT} node.internal-address=REPLACE_ME node.location=${PRESTO_WORKER_CONFIG_NODEPROPERTIES_LOCATION} node.id=REPLACE_ME - diff --git a/tools/deployment/presto-clp/worker/config-template/velox.properties b/tools/deployment/presto-clp/worker/config-template/velox.properties index 50de3f6962..8298bf6790 100644 --- 
a/tools/deployment/presto-clp/worker/config-template/velox.properties +++ b/tools/deployment/presto-clp/worker/config-template/velox.properties @@ -1,2 +1 @@ mutable-config=true - diff --git a/tools/deployment/presto-clp/worker/scripts/generate-configs.sh b/tools/deployment/presto-clp/worker/scripts/generate-configs.sh index 8df961e47d..c42261e58a 100755 --- a/tools/deployment/presto-clp/worker/scripts/generate-configs.sh +++ b/tools/deployment/presto-clp/worker/scripts/generate-configs.sh @@ -84,4 +84,3 @@ else echo "Warning: 'node.id=REPLACE_ME' not found in $NODE_PROPERTIES_FILE" exit 1 fi - From a8af40c471bf1f6da2fc17de5cdfdd92cd2c4262 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Sun, 27 Jul 2025 15:48:00 -0400 Subject: [PATCH 04/42] Remove unnecessary script. --- .../presto-clp/coordinator/scripts/$ | 18 ------------------ 1 file changed, 18 deletions(-) delete mode 100644 tools/deployment/presto-clp/coordinator/scripts/$ diff --git a/tools/deployment/presto-clp/coordinator/scripts/$ b/tools/deployment/presto-clp/coordinator/scripts/$ deleted file mode 100644 index 8b38fa5bee..0000000000 --- a/tools/deployment/presto-clp/coordinator/scripts/$ +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/sh - -# Exit on error -set -e - -PRESTO_CONFIG_DIR="/opt/presto-server/etc" - -# Substitute environemnt variables in config template -find /configs -type f | while read -r f; do - ( echo "cat < "${PRESTO_CONFIG_DIR}/$(basename "$f")" -done - -# Setup the config directory hierarchy -rm -rf ${PRESTO_CONFIG_DIR}/catalog -mkdir -p ${PRESTO_CONFIG_DIR}/catalog - -# Copy over files -mv ${PRESTO_CONFIG_DIR}/clp.properties ${PRESTO_CONFIG_DIR}/catalog From 2015a4a06a11bf73601839b35bfec282ef48ce70 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Sun, 27 Jul 2025 16:22:34 -0400 Subject: [PATCH 05/42] Replace demo-assets/init.sh and demo CLP config file with more robust Python script; 
Split environment variable file into multiple files. --- taskfiles/lint.yaml | 1 + tools/deployment/presto-clp/.env | 35 ----- tools/deployment/presto-clp/README.md | 6 +- .../presto-clp/coordinator-common.env | 5 + tools/deployment/presto-clp/coordinator.env | 14 ++ .../presto-clp/demo-assets/clp-config.yml | 38 ------ .../deployment/presto-clp/demo-assets/init.sh | 49 ------- .../deployment/presto-clp/docker-compose.yaml | 4 + .../deployment/presto-clp/scripts/.gitignore | 1 + .../scripts/generate-user-env-vars-file.py | 120 ++++++++++++++++++ .../presto-clp/scripts/requirements.txt | 1 + .../presto-clp/scripts/set-up-config.sh | 28 ++++ tools/deployment/presto-clp/worker.env | 4 + 13 files changed, 180 insertions(+), 126 deletions(-) delete mode 100644 tools/deployment/presto-clp/.env create mode 100644 tools/deployment/presto-clp/coordinator-common.env create mode 100644 tools/deployment/presto-clp/coordinator.env delete mode 100644 tools/deployment/presto-clp/demo-assets/clp-config.yml delete mode 100755 tools/deployment/presto-clp/demo-assets/init.sh create mode 100644 tools/deployment/presto-clp/scripts/.gitignore create mode 100644 tools/deployment/presto-clp/scripts/generate-user-env-vars-file.py create mode 100644 tools/deployment/presto-clp/scripts/requirements.txt create mode 100755 tools/deployment/presto-clp/scripts/set-up-config.sh create mode 100644 tools/deployment/presto-clp/worker.env diff --git a/taskfiles/lint.yaml b/taskfiles/lint.yaml index 16267e265d..61c1371169 100644 --- a/taskfiles/lint.yaml +++ b/taskfiles/lint.yaml @@ -773,6 +773,7 @@ tasks: - "components/clp-py-utils/clp_py_utils" - "components/core/tools/scripts/utils" - "components/job-orchestration/job_orchestration" + - "tools/deployment" - "tools/scripts" - "docs/conf" cmd: |- diff --git a/tools/deployment/presto-clp/.env b/tools/deployment/presto-clp/.env deleted file mode 100644 index b9a4cf785d..0000000000 --- a/tools/deployment/presto-clp/.env +++ /dev/null @@ -1,35 +0,0 @@ -# 
Coordinator common -PRESTO_COORDINATOR_HTTPPORT="8080" -PRESTO_COORDINATOR_SERVICENAME="presto-coordinator" - -# Coordinator clp.properties -PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_PROVIDERTYPE="mysql" -PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_URL="jdbc:mysql://localhost:6001" -PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_NAME="clp-db" -PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_USER="clp-user" -PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_PASSWORD="123456" -PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_TABLEPREFIX="clp_" -PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_SPLITPROVIDER="mysql" - -# Coordinator config.properties -PRESTO_COORDINATOR_CONFIG_CONFIGPROPERTIES_QUERY_MAXMEMORY="1GB" -PRESTO_COORDINATOR_CONFIG_CONFIGPROPERTIES_QUERY_MAXMEMORYPERNODE="1GB" - -# Coordinator jvm.config -PRESTO_COORDINATOR_CONFIG_JVMCONFIG_MAXHEAPSIZE="4G" -PRESTO_COORDINATOR_CONFIG_JVMCONFIG_G1HEAPREGIONSIZE="32M" - -# Coordinator log.properties -PRESTO_COORDINATOR_CONFIG_LOGPROPERTIES_LEVEL="DEBUG" - -# Coordinator node.properties -PRESTO_COORDINATOR_CONFIG_NODEPROPERTIES_ENVIRONMENT="production" - -# Worker common -PRESTO_WORKER_HTTPPORT="8080" - -# Worker node.properties -PRESTO_WORKER_CONFIG_NODEPROPERTIES_LOCATION="worker-location" - -# CLP package archives -CLP_PACKAGE_ARCHIVES=REPLACE_ME diff --git a/tools/deployment/presto-clp/README.md b/tools/deployment/presto-clp/README.md index 6e34dd3d76..e35599ee4a 100644 --- a/tools/deployment/presto-clp/README.md +++ b/tools/deployment/presto-clp/README.md @@ -10,9 +10,7 @@ Follow the guide here: [docker] 2. Untar the clp-package and the postgresql dataset. -3. Replace the content of `/path/to/clp-json-package/etc/clp-config.yml` with the output of `demo-assets/init.sh generate_sample_clp_config`. - -4. Launch: +3. 
Launch: ```bash # You probably want to run a python 3.9 or newer virtual environment @@ -29,7 +27,7 @@ sbin/compress.sh --timestamp-key 'timestamp' /path/to/postgresql.log 6. Use the following command to update `.env`: ```bash -demo-assets/init.sh update_metadata_config /path/to/clp-json-package +scripts/set-up-config.sh /path/to/clp-json-package ``` # Create Docker Cluster diff --git a/tools/deployment/presto-clp/coordinator-common.env b/tools/deployment/presto-clp/coordinator-common.env new file mode 100644 index 0000000000..e746f6d5df --- /dev/null +++ b/tools/deployment/presto-clp/coordinator-common.env @@ -0,0 +1,5 @@ +PRESTO_COORDINATOR_HTTPPORT="8080" +PRESTO_COORDINATOR_SERVICENAME="presto-coordinator" + +# node.properties +PRESTO_COORDINATOR_CONFIG_NODEPROPERTIES_ENVIRONMENT="production" diff --git a/tools/deployment/presto-clp/coordinator.env b/tools/deployment/presto-clp/coordinator.env new file mode 100644 index 0000000000..521a80feab --- /dev/null +++ b/tools/deployment/presto-clp/coordinator.env @@ -0,0 +1,14 @@ +# clp.properties +PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_PROVIDERTYPE="mysql" +PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_SPLITPROVIDER="mysql" + +# config.properties +PRESTO_COORDINATOR_CONFIG_CONFIGPROPERTIES_QUERY_MAXMEMORY="1GB" +PRESTO_COORDINATOR_CONFIG_CONFIGPROPERTIES_QUERY_MAXMEMORYPERNODE="1GB" + +# jvm.config +PRESTO_COORDINATOR_CONFIG_JVMCONFIG_MAXHEAPSIZE="4G" +PRESTO_COORDINATOR_CONFIG_JVMCONFIG_G1HEAPREGIONSIZE="32M" + +# log.properties +PRESTO_COORDINATOR_CONFIG_LOGPROPERTIES_LEVEL="DEBUG" diff --git a/tools/deployment/presto-clp/demo-assets/clp-config.yml b/tools/deployment/presto-clp/demo-assets/clp-config.yml deleted file mode 100644 index 49291183e7..0000000000 --- a/tools/deployment/presto-clp/demo-assets/clp-config.yml +++ /dev/null @@ -1,38 +0,0 @@ -package: - storage_engine: "clp-s" -database: - type: "mariadb" - host: "${REPLACE_IP}" - port: 6001 - name: "clp-db" -query_scheduler: - host: "${REPLACE_IP}" - 
port: 6002 - jobs_poll_delay: 0.1 - num_archives_to_search_per_sub_job: 16 - logging_level: "INFO" -queue: - host: "${REPLACE_IP}" - port: 6003 -redis: - host: "${REPLACE_IP}" - port: 6004 - query_backend_database: 0 - compression_backend_database: 1 -reducer: - host: "${REPLACE_IP}" - base_port: 6100 - logging_level: "INFO" - upsert_interval: 100 -results_cache: - host: "${REPLACE_IP}" - port: 6005 - db_name: "clp-query-results" - stream_collection_name: "stream-files" -webui: - host: "localhost" - port: 6000 - logging_level: "INFO" -log_viewer_webui: - host: "localhost" - port: 6006 diff --git a/tools/deployment/presto-clp/demo-assets/init.sh b/tools/deployment/presto-clp/demo-assets/init.sh deleted file mode 100755 index 3f9f450390..0000000000 --- a/tools/deployment/presto-clp/demo-assets/init.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env bash - -SCRIPT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - -function generate_sample_clp_config { - local ip=$(hostname -i) - local file="${SCRIPT_PATH}/clp-config.yml" - cp "$file" "${file}.bak" - sed -i "s|\${REPLACE_IP}|$ip|g" "$file" - echo "Replaced \${REPLACE_IP} with $ip in $file" -} - -function update_metadata_config { - if [[ $# -ne 1 ]]; then - echo "Usage: update_metadata_config " - return 1 - fi - - local clp_pkg_home=$1 - local clp_config_path="$(readlink -f ${clp_pkg_home})/etc/clp-config.yml" - local credential_path="$(readlink -f ${clp_pkg_home})/etc/credentials.yml" - host=$(python3 -c 'import sys, yaml; print(yaml.load(sys.stdin)["database"]["host"])' < "$clp_config_path") - port=$(python3 -c 'import sys, yaml; print(yaml.load(sys.stdin)["database"]["port"])' < "$clp_config_path") - name=$(python3 -c 'import sys, yaml; print(yaml.load(sys.stdin)["database"]["name"])' < "$clp_config_path") - user=$(python3 -c 'import sys, yaml; print(yaml.load(sys.stdin)["database"]["user"])' < "$credential_path") - password=$(python3 -c 'import sys, yaml; print(yaml.load(sys.stdin)["database"]["password"])' < 
"$credential_path") - echo "Metadata database host: $host" - echo "Metadata database port: $port" - echo "Metadata database name: $name" - echo "Metadata database user: $user" - echo "Metadata database password: $password" - - local env_path="${SCRIPT_PATH}/../.env" - cp "$env_path" "${env_path}.bak" - sed -i "s|^PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_URL=.*|PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_URL=\"jdbc:mysql://${host}:${port}\"|" "$env_path" - sed -i "s/^PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_NAME=.*/PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_NAME=\"${name}\"/" "$env_path" - sed -i "s/^PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_USER=.*/PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_USER=\"${user}\"/" "$env_path" - sed -i "s/^PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_PASSWORD=.*/PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_PASSWORD=\"${password}\"/" "$env_path" - sed -i "s|^CLP_PACKAGE_ARCHIVES=.*|CLP_PACKAGE_ARCHIVES=\"$(readlink -f ${clp_pkg_home})/var/data/archives/default\"|" "$env_path" -} - -if declare -f "$1" > /dev/null; then - "$@" -else - echo "Error: '$1' is not a valid function name." 
- echo "Available functions:" - declare -F | awk '{print $3}' - exit 1 -fi diff --git a/tools/deployment/presto-clp/docker-compose.yaml b/tools/deployment/presto-clp/docker-compose.yaml index 415aab003e..de56cd8578 100644 --- a/tools/deployment/presto-clp/docker-compose.yaml +++ b/tools/deployment/presto-clp/docker-compose.yaml @@ -6,6 +6,8 @@ services: entrypoint: ["/bin/bash", "-c", "/scripts/generate-configs.sh && /opt/entrypoint.sh"] env_file: - ".env" + - "coordinator-common.env" + - "coordinator.env" volumes: - "./coordinator/scripts:/scripts:ro" - "coordinator-config:/opt/presto-server/etc" @@ -21,6 +23,8 @@ services: entrypoint: ["/bin/bash", "-c", "/scripts/generate-configs.sh && /opt/entrypoint.sh"] env_file: - ".env" + - "coordinator-common.env" + - "worker.env" volumes: - "./worker/scripts:/scripts:ro" - "worker-config:/opt/presto-server/etc" diff --git a/tools/deployment/presto-clp/scripts/.gitignore b/tools/deployment/presto-clp/scripts/.gitignore new file mode 100644 index 0000000000..ef81b1e243 --- /dev/null +++ b/tools/deployment/presto-clp/scripts/.gitignore @@ -0,0 +1 @@ +/.venv/ diff --git a/tools/deployment/presto-clp/scripts/generate-user-env-vars-file.py b/tools/deployment/presto-clp/scripts/generate-user-env-vars-file.py new file mode 100644 index 0000000000..cfcb3c3466 --- /dev/null +++ b/tools/deployment/presto-clp/scripts/generate-user-env-vars-file.py @@ -0,0 +1,120 @@ +import argparse +import logging +import sys +from pathlib import Path +from typing import Optional + +import yaml + +# Set up console logging +logging_console_handler = logging.StreamHandler() +logging_formatter = logging.Formatter( + "%(asctime)s.%(msecs)03d %(levelname)s [%(module)s] %(message)s", datefmt="%Y-%m-%dT%H:%M:%S" +) +logging_console_handler.setFormatter(logging_formatter) + +# Set up root logger +root_logger = logging.getLogger() +root_logger.setLevel(logging.INFO) +root_logger.addHandler(logging_console_handler) + +# Create logger +logger = 
logging.getLogger(__name__) + + +def main(argv=None) -> int: + if argv is None: + argv = sys.argv + + args_parser = argparse.ArgumentParser( + description="Generates an environment variables file for any user-configured properties." + ) + args_parser.add_argument( + "--clp-package-dir", help="CLP package directory.", required=True, type=Path + ) + args_parser.add_argument( + "--output-file", help="Path for the environment variables file.", required=True, type=Path + ) + + parsed_args = args_parser.parse_args(argv[1:]) + clp_package_dir: Path = parsed_args.clp_package_dir.resolve() + output_file: Path = parsed_args.output_file + + clp_config_file_path = clp_package_dir / "etc" / "clp-config.yml" + with open(clp_config_file_path, "r") as clp_config_file: + clp_config = yaml.safe_load(clp_config_file) + + database_host = _get_config_value(clp_config, "database.host", "localhost") + database_port = _get_config_value(clp_config, "database.port", 3306) + database_name = _get_config_value(clp_config, "database.name", "clp-db") + + clp_archive_output_storage_type = _get_config_value( + clp_config, "archive_output.storage.type", "fs" + ) + if "fs" != clp_archive_output_storage_type: + logger.error( + "Expected CLP's archive_output.storage.type to be fs but found '%s'. 
Presto currently only supports" + " reading archives from the fs storage type.", + clp_archive_output_storage_type, + ) + + clp_archives_dir = _get_config_value( + clp_config, + "archive_output.storage.directory", + str(clp_package_dir / "var" / "data" / "archives"), + ) + + credentials_file_path = clp_package_dir / "etc" / "credentials.yml" + with open(credentials_file_path, "r") as credentials_file: + credentials = yaml.safe_load(credentials_file) + + database_user = _get_config_value(credentials, "database.user") + database_password = _get_config_value(credentials, "database.password") + if not database_user or not database_password: + logger.error( + "database.user and database.password must be specified in '%s'.", credentials_file_path + ) + return 1 + + with open(output_file, "w") as env_file: + env_file.write( + "PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_URL" + f"=jdbc:mysql://{database_host}:{database_port}\n" + ) + env_file.write( + f"PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_NAME={database_name}\n" + ) + env_file.write( + f"PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_USER={database_user}\n" + ) + env_file.write( + f"PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_PASSWORD={database_password}\n" + ) + env_file.write(f"PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_TABLEPREFIX=clp_\n") + env_file.write(f"CLP_PACKAGE_ARCHIVES={clp_archives_dir}\n") + + return 0 + + +def _get_config_value(config: dict, key: str, default_value: Optional[str] = None) -> str: + """ + Gets the value corresponding to `key` from `config` if it exists. + + :param config: The config. + :param key: The key to look for in the config, in dot notation (e.g., "database.host"). + :param default_value: The value to return if `key` doesn't exist in `config`. + :return: The value corresponding to `key` if it exists, otherwise `default_value`. 
+ """ + + keys = key.split(".") + value = config + for k in keys: + if isinstance(value, dict) and k in value: + value = value[k] + else: + return default_value + return value + + +if "__main__" == __name__: + sys.exit(main(sys.argv)) diff --git a/tools/deployment/presto-clp/scripts/requirements.txt b/tools/deployment/presto-clp/scripts/requirements.txt new file mode 100644 index 0000000000..5500f007d0 --- /dev/null +++ b/tools/deployment/presto-clp/scripts/requirements.txt @@ -0,0 +1 @@ +PyYAML diff --git a/tools/deployment/presto-clp/scripts/set-up-config.sh b/tools/deployment/presto-clp/scripts/set-up-config.sh new file mode 100755 index 0000000000..7512001f42 --- /dev/null +++ b/tools/deployment/presto-clp/scripts/set-up-config.sh @@ -0,0 +1,28 @@ +#!/usr/bin/env bash + +set -eu +set -o pipefail + +script_dir=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) + +cUsage="Usage: ${BASH_SOURCE[0]} " +if [ "$#" -lt 1 ] ; then + echo "$cUsage" + exit +fi +clp_package_dir=$1 + +venv_dir=${script_dir}/.venv +if [ ! -d "${venv_dir}" ]; then + echo "Setting up Python venv in '${venv_dir}'..." + python3 -m venv "${script_dir}/.venv" +fi +source "${script_dir}/.venv/bin/activate" + +echo "Installing required Python packages..." +pip3 install -r "${script_dir}/requirements.txt" + +echo "Generating environment variables file for user-configured properties..." 
+python3 "${script_dir}/generate-user-env-vars-file.py" \ + --clp-package-dir "${clp_package_dir}" \ + --output-file "${script_dir}/../.env" diff --git a/tools/deployment/presto-clp/worker.env b/tools/deployment/presto-clp/worker.env new file mode 100644 index 0000000000..19ae8c58cd --- /dev/null +++ b/tools/deployment/presto-clp/worker.env @@ -0,0 +1,4 @@ +PRESTO_WORKER_HTTPPORT="8080" + +# node.properties +PRESTO_WORKER_CONFIG_NODEPROPERTIES_LOCATION="worker-location" From b78682fddf88921b7ad51c9c17aa82daacf5b615 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Sun, 27 Jul 2025 16:48:39 -0400 Subject: [PATCH 06/42] Add missing return on error. --- .../deployment/presto-clp/scripts/generate-user-env-vars-file.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/deployment/presto-clp/scripts/generate-user-env-vars-file.py b/tools/deployment/presto-clp/scripts/generate-user-env-vars-file.py index cfcb3c3466..a10f2fcff6 100644 --- a/tools/deployment/presto-clp/scripts/generate-user-env-vars-file.py +++ b/tools/deployment/presto-clp/scripts/generate-user-env-vars-file.py @@ -57,6 +57,7 @@ def main(argv=None) -> int: " reading archives from the fs storage type.", clp_archive_output_storage_type, ) + return 1 clp_archives_dir = _get_config_value( clp_config, From 1247bba40b1388ad5bc2f0f400784f001e5c83d9 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 28 Jul 2025 00:36:03 -0400 Subject: [PATCH 07/42] Apply shell linters to worker/scripts/generate-configs.sh --- .../worker/scripts/generate-configs.sh | 40 +++++++++++-------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/tools/deployment/presto-clp/worker/scripts/generate-configs.sh b/tools/deployment/presto-clp/worker/scripts/generate-configs.sh index c42261e58a..f1b28ae179 100755 --- a/tools/deployment/presto-clp/worker/scripts/generate-configs.sh +++ 
b/tools/deployment/presto-clp/worker/scripts/generate-configs.sh @@ -7,7 +7,11 @@ PRESTO_CONFIG_DIR="/opt/presto-server/etc" # Substitute environemnt variables in config template find /configs -type f | while read -r f; do - ( echo "cat < "${PRESTO_CONFIG_DIR}/$(basename "$f")" + ( + echo "cat <"${PRESTO_CONFIG_DIR}/$(basename "$f")" done # Setup the config directory hierarchy @@ -27,21 +31,21 @@ echo "Init container: Waiting for Presto to be ready..." # 1. Fetch version info from Presto with retry logic retry_count=0 while [ $retry_count -lt $MAX_RETRIES ]; do -echo "Attempt $((retry_count + 1))/$MAX_RETRIES - Checking Presto availability..." - -# Try to fetch the response -DISCOVERY_URI=$(awk -F= '/^discovery.uri=/ { print $2 }' "${PRESTO_CONFIG_DIR}/config.properties") -if response=$(wget -qO- --timeout=10 "${DISCOVERY_URI}/v1/info" 2>/dev/null); then - # Check if response is not empty and contains version info - if [ -n "$response" ] && echo "$response" | grep -q '"version"'; then - echo "Presto is ready!" - break + echo "Attempt $((retry_count + 1))/$MAX_RETRIES - Checking Presto availability..." + + # Try to fetch the response + DISCOVERY_URI=$(awk -F= '/^discovery.uri=/ {print $2}' "${PRESTO_CONFIG_DIR}/config.properties") + if response=$(wget -qO- --timeout=10 "${DISCOVERY_URI}/v1/info" 2>/dev/null); then + # Check if response is not empty and contains version info + if [ -n "$response" ] && echo "$response" | grep -q '"version"'; then + echo "Presto is ready!" + break + fi fi -fi -echo "Presto not ready yet, retrying in ${RETRY_DELAY}s..." -sleep $RETRY_DELAY -retry_count=$((retry_count + 1)) + echo "Presto not ready yet, retrying in ${RETRY_DELAY}s..." 
+ sleep $RETRY_DELAY + retry_count=$((retry_count + 1)) done # Check if we exceeded max retries @@ -69,7 +73,9 @@ NODE_PROPERTIES_FILE="/opt/presto-server/etc/node.properties" INTERNAL_ADDRESS=$(hostname -i) # Replace `node.internal-address=REPLACE_ME` with actual ip address in the config file if grep -q '^node.internal-address=REPLACE_ME' "$NODE_PROPERTIES_FILE"; then - sed -i "s|^node.internal-address=REPLACE_ME|node.internal-address=${INTERNAL_ADDRESS}|" "$NODE_PROPERTIES_FILE" + sed -i \ + "s|^node.internal-address=REPLACE_ME|node.internal-address=${INTERNAL_ADDRESS}|" \ + "$NODE_PROPERTIES_FILE" echo "Updated $NODE_PROPERTIES_FILE with node.internal-address ${INTERNAL_ADDRESS}" else echo "Warning: 'node.internal-address=REPLACE_ME' not found in $NODE_PROPERTIES_FILE" @@ -78,8 +84,8 @@ fi # Replace `node.id=REPLACE_ME` with actual hostname in the config file if grep -q '^node.id=REPLACE_ME' "$NODE_PROPERTIES_FILE"; then - sed -i "s|^node.id=REPLACE_ME|node.id=$HOSTNAME|" "$NODE_PROPERTIES_FILE" - echo "Updated $NODE_PROPERTIES_FILE with node.id $HOSTNAME" + sed -i "s|^node.id=REPLACE_ME|node.id=$(hostname)|" "$NODE_PROPERTIES_FILE" + echo "Updated $NODE_PROPERTIES_FILE with node.id $(hostname)" else echo "Warning: 'node.id=REPLACE_ME' not found in $NODE_PROPERTIES_FILE" exit 1 From 1400a9022013de944876eb8b3b15171acfa9844c Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 28 Jul 2025 00:46:11 -0400 Subject: [PATCH 08/42] Use Docker compose to wait for the coordinator to be ready. 
--- .../config-template/config.properties | 2 +- .../deployment/presto-clp/docker-compose.yaml | 10 ++- .../scripts/generate-user-env-vars-file.py | 86 ++++++++++++++----- .../presto-clp/scripts/requirements.txt | 1 + .../worker/config-template/config.properties | 2 +- .../worker/scripts/generate-configs.sh | 39 +++------ 6 files changed, 88 insertions(+), 52 deletions(-) diff --git a/tools/deployment/presto-clp/coordinator/config-template/config.properties b/tools/deployment/presto-clp/coordinator/config-template/config.properties index 5207ac480e..eb65c2488f 100644 --- a/tools/deployment/presto-clp/coordinator/config-template/config.properties +++ b/tools/deployment/presto-clp/coordinator/config-template/config.properties @@ -4,7 +4,7 @@ http-server.http.port=${PRESTO_COORDINATOR_HTTPPORT} query.max-memory=${PRESTO_COORDINATOR_CONFIG_CONFIGPROPERTIES_QUERY_MAXMEMORY} query.max-memory-per-node=${PRESTO_COORDINATOR_CONFIG_CONFIGPROPERTIES_QUERY_MAXMEMORYPERNODE} discovery-server.enabled=true -discovery.uri=http://${PRESTO_COORDINATOR_SERVICENAME}:${PRESTO_COORDINATOR_HTTPPORT} +discovery.uri=${PRESTO_COORDINATOR_CONFIG_CONFIGPROPERTIES_DISCOVERY_URI} optimizer.optimize-hash-generation=false regex-library=RE2J use-alternative-function-signatures=true diff --git a/tools/deployment/presto-clp/docker-compose.yaml b/tools/deployment/presto-clp/docker-compose.yaml index de56cd8578..5d37b21e7f 100644 --- a/tools/deployment/presto-clp/docker-compose.yaml +++ b/tools/deployment/presto-clp/docker-compose.yaml @@ -14,12 +14,20 @@ services: - "./coordinator/config-template:/configs:ro" networks: - "presto" + healthcheck: + test: + - "CMD" + - "curl" + - "-f" + - "${PRESTO_COORDINATOR_CONFIG_CONFIGPROPERTIES_DISCOVERY_URI}/v1/info" + interval: "10s" + retries: 30 presto-worker: image: "ghcr.io/y-scope/presto/prestissimo-worker:dev" depends_on: presto-coordinator: - condition: "service_started" + condition: "service_healthy" entrypoint: ["/bin/bash", "-c", 
"/scripts/generate-configs.sh && /opt/entrypoint.sh"] env_file: - ".env" diff --git a/tools/deployment/presto-clp/scripts/generate-user-env-vars-file.py b/tools/deployment/presto-clp/scripts/generate-user-env-vars-file.py index a10f2fcff6..c3ba339545 100644 --- a/tools/deployment/presto-clp/scripts/generate-user-env-vars-file.py +++ b/tools/deployment/presto-clp/scripts/generate-user-env-vars-file.py @@ -2,9 +2,10 @@ import logging import sys from pathlib import Path -from typing import Optional +from typing import Dict, Optional import yaml +from dotenv import dotenv_values # Set up console logging logging_console_handler = logging.StreamHandler() @@ -40,30 +41,60 @@ def main(argv=None) -> int: clp_package_dir: Path = parsed_args.clp_package_dir.resolve() output_file: Path = parsed_args.output_file + env_vars: Dict[str, str] = {} + if not _add_clp_env_vars(clp_package_dir, env_vars): + return 1 + + script_dir = Path(__file__).parent.resolve() + if not _add_worker_env_vars(script_dir.parent / "coordinator-common.env", env_vars): + return 1 + + with open(output_file, "w") as output_file_handle: + for key, value in env_vars.items(): + output_file_handle.write(f"{key}={value}\n") + + return 0 + + +def _add_clp_env_vars(clp_package_dir: Path, env_vars: Dict[str, str]) -> bool: + """ + Adds environment variables for CLP config values to `env_vars`. + + :param clp_package_dir: + :param env_vars: + :return: Whether the environment variables were successfully added. 
+ """ + env_vars["PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_TABLEPREFIX"] = "clp_" + clp_config_file_path = clp_package_dir / "etc" / "clp-config.yml" with open(clp_config_file_path, "r") as clp_config_file: clp_config = yaml.safe_load(clp_config_file) database_host = _get_config_value(clp_config, "database.host", "localhost") - database_port = _get_config_value(clp_config, "database.port", 3306) + database_port = _get_config_value(clp_config, "database.port", str(3306)) database_name = _get_config_value(clp_config, "database.name", "clp-db") + env_vars["PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_URL"] = ( + f"=jdbc:mysql://{database_host}:{database_port}" + ) + env_vars["PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_NAME"] = database_name clp_archive_output_storage_type = _get_config_value( clp_config, "archive_output.storage.type", "fs" ) if "fs" != clp_archive_output_storage_type: logger.error( - "Expected CLP's archive_output.storage.type to be fs but found '%s'. Presto currently only supports" - " reading archives from the fs storage type.", + "Expected CLP's archive_output.storage.type to be fs but found '%s'. 
Presto" + " currently only supports reading archives from the fs storage type.", clp_archive_output_storage_type, ) - return 1 + return False clp_archives_dir = _get_config_value( clp_config, "archive_output.storage.directory", str(clp_package_dir / "var" / "data" / "archives"), ) + env_vars["CLP_PACKAGE_ARCHIVES"] = clp_archives_dir credentials_file_path = clp_package_dir / "etc" / "credentials.yml" with open(credentials_file_path, "r") as credentials_file: @@ -75,26 +106,39 @@ def main(argv=None) -> int: logger.error( "database.user and database.password must be specified in '%s'.", credentials_file_path ) - return 1 + return False + env_vars["PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_USER"] = database_user + env_vars["PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_PASSWORD"] = ( + database_password + ) - with open(output_file, "w") as env_file: - env_file.write( - "PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_URL" - f"=jdbc:mysql://{database_host}:{database_port}\n" - ) - env_file.write( - f"PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_NAME={database_name}\n" - ) - env_file.write( - f"PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_USER={database_user}\n" + return True + + +def _add_worker_env_vars(coordinator_common_env_file_path: Path, env_vars: Dict[str, str]) -> bool: + """ + Adds environment variables for worker config values to `env_vars`. + + :param coordinator_common_env_file_path: + :param env_vars: + :return: Whether the environment variables were successfully added. 
+ """ + config = dotenv_values(coordinator_common_env_file_path) + + try: + env_vars["PRESTO_COORDINATOR_CONFIG_CONFIGPROPERTIES_DISCOVERY_URI"] = ( + f'http://{config["PRESTO_COORDINATOR_SERVICENAME"]}' + f':{config["PRESTO_COORDINATOR_HTTPPORT"]}' ) - env_file.write( - f"PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_PASSWORD={database_password}\n" + except KeyError as e: + logger.error( + "Missing required key '%s' in '%s'", + e, + coordinator_common_env_file_path, ) - env_file.write(f"PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_TABLEPREFIX=clp_\n") - env_file.write(f"CLP_PACKAGE_ARCHIVES={clp_archives_dir}\n") + return False - return 0 + return True def _get_config_value(config: dict, key: str, default_value: Optional[str] = None) -> str: diff --git a/tools/deployment/presto-clp/scripts/requirements.txt b/tools/deployment/presto-clp/scripts/requirements.txt index 5500f007d0..09eeec3cc9 100644 --- a/tools/deployment/presto-clp/scripts/requirements.txt +++ b/tools/deployment/presto-clp/scripts/requirements.txt @@ -1 +1,2 @@ +python-dotenv PyYAML diff --git a/tools/deployment/presto-clp/worker/config-template/config.properties b/tools/deployment/presto-clp/worker/config-template/config.properties index fdbf89d734..f183ec85bb 100644 --- a/tools/deployment/presto-clp/worker/config-template/config.properties +++ b/tools/deployment/presto-clp/worker/config-template/config.properties @@ -1,4 +1,4 @@ -discovery.uri=http://${PRESTO_COORDINATOR_SERVICENAME}:${PRESTO_COORDINATOR_HTTPPORT} +discovery.uri=${PRESTO_COORDINATOR_CONFIG_CONFIGPROPERTIES_DISCOVERY_URI} presto.version=REPLACE_ME http-server.http.port=${PRESTO_WORKER_HTTPPORT} shutdown-onset-sec=1 diff --git a/tools/deployment/presto-clp/worker/scripts/generate-configs.sh b/tools/deployment/presto-clp/worker/scripts/generate-configs.sh index f1b28ae179..b8a0ef28b6 100755 --- a/tools/deployment/presto-clp/worker/scripts/generate-configs.sh +++ 
b/tools/deployment/presto-clp/worker/scripts/generate-configs.sh @@ -22,35 +22,18 @@ mv ${PRESTO_CONFIG_DIR}/clp.properties ${PRESTO_CONFIG_DIR}/catalog # Update "presto.version" parameter in config.properties file using values from coordinator CONFIG_PROPERTIES_FILE="/opt/presto-server/etc/config.properties" -# Retry configuration -MAX_RETRIES=30 -RETRY_DELAY=10 - -echo "Init container: Waiting for Presto to be ready..." - -# 1. Fetch version info from Presto with retry logic -retry_count=0 -while [ $retry_count -lt $MAX_RETRIES ]; do - echo "Attempt $((retry_count + 1))/$MAX_RETRIES - Checking Presto availability..." - - # Try to fetch the response - DISCOVERY_URI=$(awk -F= '/^discovery.uri=/ {print $2}' "${PRESTO_CONFIG_DIR}/config.properties") - if response=$(wget -qO- --timeout=10 "${DISCOVERY_URI}/v1/info" 2>/dev/null); then - # Check if response is not empty and contains version info - if [ -n "$response" ] && echo "$response" | grep -q '"version"'; then - echo "Presto is ready!" - break - fi +# 1. Fetch version info from Presto +DISCOVERY_URI=$(awk -F= '/^discovery.uri=/ {print $2}' "${PRESTO_CONFIG_DIR}/config.properties") +if response=$(wget -qO- --timeout=10 "${DISCOVERY_URI}/v1/info" 2>/dev/null); then + # Check if response is not empty and contains version info + if [ -n "$response" ] && echo "$response" | grep -q '"version"'; then + echo "Presto is ready!" + else + echo "Error: Presto response is empty or doesn't contain version info." + exit 1 fi - - echo "Presto not ready yet, retrying in ${RETRY_DELAY}s..." - sleep $RETRY_DELAY - retry_count=$((retry_count + 1)) -done - -# Check if we exceeded max retries -if [ $retry_count -eq $MAX_RETRIES ]; then - echo "Error: Presto did not become ready after $MAX_RETRIES attempts" +else + echo "Error: Couldn't get Presto version info." 
exit 1 fi From 91040d1e2a4be94c9342ea29b7f81fc297da84f2 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 28 Jul 2025 00:51:29 -0400 Subject: [PATCH 09/42] Use jq to parse Presto version info. --- .../presto-clp/worker/scripts/generate-configs.sh | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tools/deployment/presto-clp/worker/scripts/generate-configs.sh b/tools/deployment/presto-clp/worker/scripts/generate-configs.sh index b8a0ef28b6..72e0110a65 100755 --- a/tools/deployment/presto-clp/worker/scripts/generate-configs.sh +++ b/tools/deployment/presto-clp/worker/scripts/generate-configs.sh @@ -1,7 +1,6 @@ #!/bin/sh -# Install wget -apt-get update && apt-get install -y wget +apt-get update && apt-get install --assume-yes --no-install-recommends jq wget PRESTO_CONFIG_DIR="/opt/presto-server/etc" @@ -25,8 +24,8 @@ CONFIG_PROPERTIES_FILE="/opt/presto-server/etc/config.properties" # 1. Fetch version info from Presto DISCOVERY_URI=$(awk -F= '/^discovery.uri=/ {print $2}' "${PRESTO_CONFIG_DIR}/config.properties") if response=$(wget -qO- --timeout=10 "${DISCOVERY_URI}/v1/info" 2>/dev/null); then - # Check if response is not empty and contains version info - if [ -n "$response" ] && echo "$response" | grep -q '"version"'; then + version=$(echo "$response" | jq --raw-output '.nodeVersion.version') + if [ "$version" != "null" ]; then echo "Presto is ready!" else echo "Error: Presto response is empty or doesn't contain version info." From 9ec648cc3ce97fe76b5614418421d9d20c0284d9 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 28 Jul 2025 01:25:05 -0400 Subject: [PATCH 10/42] Clean-up wget command. 
--- .../deployment/presto-clp/worker/scripts/generate-configs.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/deployment/presto-clp/worker/scripts/generate-configs.sh b/tools/deployment/presto-clp/worker/scripts/generate-configs.sh index 72e0110a65..9a4ce9f472 100755 --- a/tools/deployment/presto-clp/worker/scripts/generate-configs.sh +++ b/tools/deployment/presto-clp/worker/scripts/generate-configs.sh @@ -23,7 +23,9 @@ CONFIG_PROPERTIES_FILE="/opt/presto-server/etc/config.properties" # 1. Fetch version info from Presto DISCOVERY_URI=$(awk -F= '/^discovery.uri=/ {print $2}' "${PRESTO_CONFIG_DIR}/config.properties") -if response=$(wget -qO- --timeout=10 "${DISCOVERY_URI}/v1/info" 2>/dev/null); then +if response=$( + wget --quiet --output-document - --timeout=10 "${DISCOVERY_URI}/v1/info" 2>/dev/null +); then version=$(echo "$response" | jq --raw-output '.nodeVersion.version') if [ "$version" != "null" ]; then echo "Presto is ready!" From 40974785641d8ef825b977583d54ca471e74a146 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 28 Jul 2025 01:26:42 -0400 Subject: [PATCH 11/42] Use /usr/bin/env. 
--- tools/deployment/presto-clp/worker/scripts/generate-configs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/deployment/presto-clp/worker/scripts/generate-configs.sh b/tools/deployment/presto-clp/worker/scripts/generate-configs.sh index 9a4ce9f472..656a1b05e5 100755 --- a/tools/deployment/presto-clp/worker/scripts/generate-configs.sh +++ b/tools/deployment/presto-clp/worker/scripts/generate-configs.sh @@ -1,4 +1,4 @@ -#!/bin/sh +#!/usr/bin/env sh apt-get update && apt-get install --assume-yes --no-install-recommends jq wget From b1ce1353cbed15981f719c686b461d4823c24760 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 28 Jul 2025 05:36:52 -0400 Subject: [PATCH 12/42] Use function to update kv-pairs in config file. Set kv-pairs if they don't exist. Remove placeholder properties from property files. --- .../worker/config-template/config.properties | 1 - .../worker/config-template/node.properties | 2 - .../worker/scripts/generate-configs.sh | 53 ++++++++----------- 3 files changed, 23 insertions(+), 33 deletions(-) diff --git a/tools/deployment/presto-clp/worker/config-template/config.properties b/tools/deployment/presto-clp/worker/config-template/config.properties index f183ec85bb..5acabc5c47 100644 --- a/tools/deployment/presto-clp/worker/config-template/config.properties +++ b/tools/deployment/presto-clp/worker/config-template/config.properties @@ -1,5 +1,4 @@ discovery.uri=${PRESTO_COORDINATOR_CONFIG_CONFIGPROPERTIES_DISCOVERY_URI} -presto.version=REPLACE_ME http-server.http.port=${PRESTO_WORKER_HTTPPORT} shutdown-onset-sec=1 register-test-functions=false diff --git a/tools/deployment/presto-clp/worker/config-template/node.properties b/tools/deployment/presto-clp/worker/config-template/node.properties index 7e0e1e6b5d..ab9f51ba65 100644 --- a/tools/deployment/presto-clp/worker/config-template/node.properties +++ b/tools/deployment/presto-clp/worker/config-template/node.properties 
@@ -1,4 +1,2 @@ node.environment=${PRESTO_COORDINATOR_CONFIG_NODEPROPERTIES_ENVIRONMENT} -node.internal-address=REPLACE_ME node.location=${PRESTO_WORKER_CONFIG_NODEPROPERTIES_LOCATION} -node.id=REPLACE_ME diff --git a/tools/deployment/presto-clp/worker/scripts/generate-configs.sh b/tools/deployment/presto-clp/worker/scripts/generate-configs.sh index 656a1b05e5..7d68bebb76 100755 --- a/tools/deployment/presto-clp/worker/scripts/generate-configs.sh +++ b/tools/deployment/presto-clp/worker/scripts/generate-configs.sh @@ -1,4 +1,22 @@ -#!/usr/bin/env sh +#!/usr/bin/env bash + +# Sets/updates the given kv-pair in the given properties file. +# +# @param $1 The properties file. +# @param $2 The key to set. +# @param $3 The value to set. +update_config_file() { + local file_path=$1 + local key=$2 + local value=$3 + + if grep --quiet "^${key}=.*$" "$file_path"; then + sed --in-place "s|^${key}=.*|${key}=${value}|" "$file_path" + else + echo "${key}=${value}" >> "$file_path" + fi + echo "Set ${key}=${value} in ${file_path}" +} apt-get update && apt-get install --assume-yes --no-install-recommends jq wget @@ -43,34 +61,9 @@ version=$(echo "$response" | grep -o '"version":"[^"]*"' | sed 's/"version":"//; echo "Detected Presto version: $version" -# 3. 
Replace `presto.version=REPLACE_ME` with actual version in the config file -if grep -q '^presto.version=REPLACE_ME' "$CONFIG_PROPERTIES_FILE"; then - sed -i "s|^presto.version=REPLACE_ME|presto.version=$version|" "$CONFIG_PROPERTIES_FILE" - echo "Updated $CONFIG_PROPERTIES_FILE with version $version" -else - echo "Warning: 'presto.version=REPLACE_ME' not found in $CONFIG_PROPERTIES_FILE" - exit 1 -fi +update_config_file "$CONFIG_PROPERTIES_FILE" "presto.version" "$version" -# Modify node.properties +# Update node.properties NODE_PROPERTIES_FILE="/opt/presto-server/etc/node.properties" -INTERNAL_ADDRESS=$(hostname -i) -# Replace `node.internal-address=REPLACE_ME` with actual ip address in the config file -if grep -q '^node.internal-address=REPLACE_ME' "$NODE_PROPERTIES_FILE"; then - sed -i \ - "s|^node.internal-address=REPLACE_ME|node.internal-address=${INTERNAL_ADDRESS}|" \ - "$NODE_PROPERTIES_FILE" - echo "Updated $NODE_PROPERTIES_FILE with node.internal-address ${INTERNAL_ADDRESS}" -else - echo "Warning: 'node.internal-address=REPLACE_ME' not found in $NODE_PROPERTIES_FILE" - exit 1 -fi - -# Replace `node.id=REPLACE_ME` with actual hostname in the config file -if grep -q '^node.id=REPLACE_ME' "$NODE_PROPERTIES_FILE"; then - sed -i "s|^node.id=REPLACE_ME|node.id=$(hostname)|" "$NODE_PROPERTIES_FILE" - echo "Updated $NODE_PROPERTIES_FILE with node.id $(hostname)" -else - echo "Warning: 'node.id=REPLACE_ME' not found in $NODE_PROPERTIES_FILE" - exit 1 -fi +update_config_file "$NODE_PROPERTIES_FILE" "node.internal-address" "$(hostname -i)" +update_config_file "$NODE_PROPERTIES_FILE" "node.id" "$(hostname)" From 70a56d7b9e520c92213af2caed449b91ca4a4991 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 28 Jul 2025 05:51:33 -0400 Subject: [PATCH 13/42] Move getting Presto coordinator version into a function. 
--- .../worker/scripts/generate-configs.sh | 54 ++++++++++--------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/tools/deployment/presto-clp/worker/scripts/generate-configs.sh b/tools/deployment/presto-clp/worker/scripts/generate-configs.sh index 7d68bebb76..e39cdee4aa 100755 --- a/tools/deployment/presto-clp/worker/scripts/generate-configs.sh +++ b/tools/deployment/presto-clp/worker/scripts/generate-configs.sh @@ -2,7 +2,7 @@ # Sets/updates the given kv-pair in the given properties file. # -# @param $1 The properties file. +# @param $1 Path to the properties file. # @param $2 The key to set. # @param $3 The value to set. update_config_file() { @@ -13,11 +13,36 @@ update_config_file() { if grep --quiet "^${key}=.*$" "$file_path"; then sed --in-place "s|^${key}=.*|${key}=${value}|" "$file_path" else - echo "${key}=${value}" >> "$file_path" + echo "${key}=${value}" >>"$file_path" fi echo "Set ${key}=${value} in ${file_path}" } +# Gets the Presto coordinator's version or exits on failure. +# +# @param $1 Path to the config.properties file. +# @return The Presto version. +get_coordinator_version() { + local config_properties_file=$1 + + local discovery_uri + discovery_uri=$(awk -F= '/^discovery.uri=/ {print $2}' "$config_properties_file") + if response=$( + wget --quiet --output-document - --timeout=10 "${discovery_uri}/v1/info" 2>/dev/null + ); then + version=$(echo "$response" | jq --raw-output '.nodeVersion.version') + if [[ "$version" = "null" ]]; then + echo "Error: Presto response is empty or doesn't contain version info." + exit 1 + fi + else + echo "Error: Couldn't get Presto version info." 
+ exit 1 + fi + + echo "$version" +} + apt-get update && apt-get install --assume-yes --no-install-recommends jq wget PRESTO_CONFIG_DIR="/opt/presto-server/etc" @@ -36,31 +61,10 @@ rm -f ${PRESTO_CONFIG_DIR}/catalog/* mv ${PRESTO_CONFIG_DIR}/clp.properties ${PRESTO_CONFIG_DIR}/catalog -# Update "presto.version" parameter in config.properties file using values from coordinator +# Update config.properties CONFIG_PROPERTIES_FILE="/opt/presto-server/etc/config.properties" - -# 1. Fetch version info from Presto -DISCOVERY_URI=$(awk -F= '/^discovery.uri=/ {print $2}' "${PRESTO_CONFIG_DIR}/config.properties") -if response=$( - wget --quiet --output-document - --timeout=10 "${DISCOVERY_URI}/v1/info" 2>/dev/null -); then - version=$(echo "$response" | jq --raw-output '.nodeVersion.version') - if [ "$version" != "null" ]; then - echo "Presto is ready!" - else - echo "Error: Presto response is empty or doesn't contain version info." - exit 1 - fi -else - echo "Error: Couldn't get Presto version info." - exit 1 -fi - -# 2. Extract the version using grep and sed (busybox compatible) -version=$(echo "$response" | grep -o '"version":"[^"]*"' | sed 's/"version":"//;s/"//') - +version=$(get_coordinator_version "$CONFIG_PROPERTIES_FILE") echo "Detected Presto version: $version" - update_config_file "$CONFIG_PROPERTIES_FILE" "presto.version" "$version" # Update node.properties From 99ac4e13336aff266916ac369542c73dc40cb93c Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 28 Jul 2025 06:00:17 -0400 Subject: [PATCH 14/42] Minor edits for consistency. 
--- .../deployment/presto-clp/worker/scripts/generate-configs.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/deployment/presto-clp/worker/scripts/generate-configs.sh b/tools/deployment/presto-clp/worker/scripts/generate-configs.sh index e39cdee4aa..d669a2ad9a 100755 --- a/tools/deployment/presto-clp/worker/scripts/generate-configs.sh +++ b/tools/deployment/presto-clp/worker/scripts/generate-configs.sh @@ -26,9 +26,9 @@ get_coordinator_version() { local config_properties_file=$1 local discovery_uri - discovery_uri=$(awk -F= '/^discovery.uri=/ {print $2}' "$config_properties_file") + discovery_uri=$(awk -F "=" '/^discovery.uri=/ {print $2}' "$config_properties_file") if response=$( - wget --quiet --output-document - --timeout=10 "${discovery_uri}/v1/info" 2>/dev/null + wget --quiet --output-document - --timeout 10 "${discovery_uri}/v1/info" 2>/dev/null ); then version=$(echo "$response" | jq --raw-output '.nodeVersion.version') if [[ "$version" = "null" ]]; then From 6dde297cf49f4ea4403aad6d85796c280a47c166 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 28 Jul 2025 06:00:41 -0400 Subject: [PATCH 15/42] fix: Set error policies. --- tools/deployment/presto-clp/worker/scripts/generate-configs.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/deployment/presto-clp/worker/scripts/generate-configs.sh b/tools/deployment/presto-clp/worker/scripts/generate-configs.sh index d669a2ad9a..1b7c676a0d 100755 --- a/tools/deployment/presto-clp/worker/scripts/generate-configs.sh +++ b/tools/deployment/presto-clp/worker/scripts/generate-configs.sh @@ -1,5 +1,8 @@ #!/usr/bin/env bash +set -eu +set -o pipefail + # Sets/updates the given kv-pair in the given properties file. # # @param $1 Path to the properties file. 
From b22746dbd1d0332eb3cc1aa127a16f7f18a80dc6 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 28 Jul 2025 06:01:58 -0400 Subject: [PATCH 16/42] Mark constants readonly. --- .../presto-clp/worker/scripts/generate-configs.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/deployment/presto-clp/worker/scripts/generate-configs.sh b/tools/deployment/presto-clp/worker/scripts/generate-configs.sh index 1b7c676a0d..1aab8c6279 100755 --- a/tools/deployment/presto-clp/worker/scripts/generate-configs.sh +++ b/tools/deployment/presto-clp/worker/scripts/generate-configs.sh @@ -48,7 +48,7 @@ get_coordinator_version() { apt-get update && apt-get install --assume-yes --no-install-recommends jq wget -PRESTO_CONFIG_DIR="/opt/presto-server/etc" +readonly PRESTO_CONFIG_DIR="/opt/presto-server/etc" # Substitute environemnt variables in config template find /configs -type f | while read -r f; do @@ -65,12 +65,12 @@ rm -f ${PRESTO_CONFIG_DIR}/catalog/* mv ${PRESTO_CONFIG_DIR}/clp.properties ${PRESTO_CONFIG_DIR}/catalog # Update config.properties -CONFIG_PROPERTIES_FILE="/opt/presto-server/etc/config.properties" +readonly CONFIG_PROPERTIES_FILE="/opt/presto-server/etc/config.properties" version=$(get_coordinator_version "$CONFIG_PROPERTIES_FILE") echo "Detected Presto version: $version" update_config_file "$CONFIG_PROPERTIES_FILE" "presto.version" "$version" # Update node.properties -NODE_PROPERTIES_FILE="/opt/presto-server/etc/node.properties" +readonly NODE_PROPERTIES_FILE="/opt/presto-server/etc/node.properties" update_config_file "$NODE_PROPERTIES_FILE" "node.internal-address" "$(hostname -i)" update_config_file "$NODE_PROPERTIES_FILE" "node.id" "$(hostname)" From 708756d2a3cca04a724922c323212e2263030730 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 28 Jul 2025 06:03:46 -0400 Subject: [PATCH 17/42] Clean-up comments. 
--- .../deployment/presto-clp/worker/scripts/generate-configs.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/deployment/presto-clp/worker/scripts/generate-configs.sh b/tools/deployment/presto-clp/worker/scripts/generate-configs.sh index 1aab8c6279..3f796a15f3 100755 --- a/tools/deployment/presto-clp/worker/scripts/generate-configs.sh +++ b/tools/deployment/presto-clp/worker/scripts/generate-configs.sh @@ -50,7 +50,7 @@ apt-get update && apt-get install --assume-yes --no-install-recommends jq wget readonly PRESTO_CONFIG_DIR="/opt/presto-server/etc" -# Substitute environemnt variables in config template +# Substitute environment variables in config template find /configs -type f | while read -r f; do ( echo "cat <"${PRESTO_CONFIG_DIR}/$(basename "$f")" done -# Setup the config directory hierarchy +# Remove existing catalog files that exist in the image and add the CLP catalog rm -f ${PRESTO_CONFIG_DIR}/catalog/* - mv ${PRESTO_CONFIG_DIR}/clp.properties ${PRESTO_CONFIG_DIR}/catalog # Update config.properties From 2eeda5c662ab1b869bbe8ba6e2e3583447d5b887 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 28 Jul 2025 06:05:26 -0400 Subject: [PATCH 18/42] Quote paths. 
--- .../deployment/presto-clp/worker/scripts/generate-configs.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/deployment/presto-clp/worker/scripts/generate-configs.sh b/tools/deployment/presto-clp/worker/scripts/generate-configs.sh index 3f796a15f3..e97442c80a 100755 --- a/tools/deployment/presto-clp/worker/scripts/generate-configs.sh +++ b/tools/deployment/presto-clp/worker/scripts/generate-configs.sh @@ -60,8 +60,8 @@ find /configs -type f | while read -r f; do done # Remove existing catalog files that exist in the image and add the CLP catalog -rm -f ${PRESTO_CONFIG_DIR}/catalog/* -mv ${PRESTO_CONFIG_DIR}/clp.properties ${PRESTO_CONFIG_DIR}/catalog +rm -f "${PRESTO_CONFIG_DIR}/catalog/"* +mv "${PRESTO_CONFIG_DIR}/clp.properties" "${PRESTO_CONFIG_DIR}/catalog" # Update config.properties readonly CONFIG_PROPERTIES_FILE="/opt/presto-server/etc/config.properties" From 8bb98f870013286fb10b2b03dc9fc553699e6085 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 28 Jul 2025 06:06:40 -0400 Subject: [PATCH 19/42] Clean-up presto-clp/coordinator/scripts/generate-configs.sh. 
--- .../coordinator/scripts/generate-configs.sh | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/tools/deployment/presto-clp/coordinator/scripts/generate-configs.sh b/tools/deployment/presto-clp/coordinator/scripts/generate-configs.sh index 326993ba09..511881a22a 100755 --- a/tools/deployment/presto-clp/coordinator/scripts/generate-configs.sh +++ b/tools/deployment/presto-clp/coordinator/scripts/generate-configs.sh @@ -1,17 +1,19 @@ -#!/bin/sh +#!/usr/bin/env bash -# Exit on error -set -e +set -eu +set -o pipefail -PRESTO_CONFIG_DIR="/opt/presto-server/etc" +readonly PRESTO_CONFIG_DIR="/opt/presto-server/etc" -# Substitute environemnt variables in config template +# Substitute environment variables in config template find /configs -type f | while read -r f; do - ( echo "cat < "${PRESTO_CONFIG_DIR}/$(basename "$f")" + ( + echo "cat <"${PRESTO_CONFIG_DIR}/$(basename "$f")" done -# Setup the config directory hierarchy -rm -f ${PRESTO_CONFIG_DIR}/catalog/* - -# Copy over files -mv ${PRESTO_CONFIG_DIR}/clp.properties ${PRESTO_CONFIG_DIR}/catalog +# Remove existing catalog files that exist in the image and add the CLP catalog +rm -f "${PRESTO_CONFIG_DIR}/catalog/"* +mv "${PRESTO_CONFIG_DIR}/clp.properties" "${PRESTO_CONFIG_DIR}/catalog" From 9fc487d4625e9dc3a5d55e449e6ebfa22d08abc3 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 28 Jul 2025 06:07:21 -0400 Subject: [PATCH 20/42] Remove deprecated version property. 
--- tools/deployment/presto-clp/docker-compose.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/deployment/presto-clp/docker-compose.yaml b/tools/deployment/presto-clp/docker-compose.yaml index 5d37b21e7f..5c084272f2 100644 --- a/tools/deployment/presto-clp/docker-compose.yaml +++ b/tools/deployment/presto-clp/docker-compose.yaml @@ -1,5 +1,3 @@ -version: "3.9" - services: presto-coordinator: image: "ghcr.io/y-scope/presto/coordinator:dev" From 64e8edffc9c9ddcf0f0fca4c74f63a5b1293734a Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 28 Jul 2025 06:10:45 -0400 Subject: [PATCH 21/42] Alphabetize mounts. --- tools/deployment/presto-clp/docker-compose.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/deployment/presto-clp/docker-compose.yaml b/tools/deployment/presto-clp/docker-compose.yaml index 5c084272f2..66d30bdc35 100644 --- a/tools/deployment/presto-clp/docker-compose.yaml +++ b/tools/deployment/presto-clp/docker-compose.yaml @@ -7,9 +7,9 @@ services: - "coordinator-common.env" - "coordinator.env" volumes: + - "./coordinator/config-template:/configs:ro" - "./coordinator/scripts:/scripts:ro" - "coordinator-config:/opt/presto-server/etc" - - "./coordinator/config-template:/configs:ro" networks: - "presto" healthcheck: @@ -32,10 +32,10 @@ services: - "coordinator-common.env" - "worker.env" volumes: - - "./worker/scripts:/scripts:ro" - - "worker-config:/opt/presto-server/etc" - "./worker/config-template:/configs:ro" + - "./worker/scripts:/scripts:ro" - "${CLP_PACKAGE_ARCHIVES}:${CLP_PACKAGE_ARCHIVES}" + - "worker-config:/opt/presto-server/etc" networks: - "presto" From 00bd3f1754a68a6633ed6e5491e83cd688b6d8a5 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 28 Jul 2025 06:24:28 -0400 Subject: [PATCH 22/42] Rename environment variables for clarity. 
--- .../deployment/presto-clp/coordinator-common.env | 2 +- tools/deployment/presto-clp/coordinator.env | 12 ++++++------ .../coordinator/config-template/clp.properties | 14 +++++++------- .../config-template/config.properties | 6 +++--- .../coordinator/config-template/jvm.config | 2 +- .../coordinator/config-template/log.properties | 2 +- .../coordinator/config-template/node.properties | 2 +- tools/deployment/presto-clp/docker-compose.yaml | 4 ++-- .../scripts/generate-user-env-vars-file.py | 16 +++++++--------- tools/deployment/presto-clp/worker.env | 2 +- .../worker/config-template/config.properties | 2 +- .../worker/config-template/node.properties | 4 ++-- 12 files changed, 33 insertions(+), 35 deletions(-) diff --git a/tools/deployment/presto-clp/coordinator-common.env b/tools/deployment/presto-clp/coordinator-common.env index e746f6d5df..7b98bbdb7b 100644 --- a/tools/deployment/presto-clp/coordinator-common.env +++ b/tools/deployment/presto-clp/coordinator-common.env @@ -2,4 +2,4 @@ PRESTO_COORDINATOR_HTTPPORT="8080" PRESTO_COORDINATOR_SERVICENAME="presto-coordinator" # node.properties -PRESTO_COORDINATOR_CONFIG_NODEPROPERTIES_ENVIRONMENT="production" +PRESTO_COORDINATOR_NODEPROPERTIES_ENVIRONMENT="production" diff --git a/tools/deployment/presto-clp/coordinator.env b/tools/deployment/presto-clp/coordinator.env index 521a80feab..fd6edb07d1 100644 --- a/tools/deployment/presto-clp/coordinator.env +++ b/tools/deployment/presto-clp/coordinator.env @@ -1,14 +1,14 @@ # clp.properties -PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_PROVIDERTYPE="mysql" -PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_SPLITPROVIDER="mysql" +PRESTO_COORDINATOR_CLPPROPERTIES_METADATA_PROVIDER_TYPE="mysql" +PRESTO_COORDINATOR_CLPPROPERTIES_SPLIT_PROVIDER="mysql" # config.properties -PRESTO_COORDINATOR_CONFIG_CONFIGPROPERTIES_QUERY_MAXMEMORY="1GB" -PRESTO_COORDINATOR_CONFIG_CONFIGPROPERTIES_QUERY_MAXMEMORYPERNODE="1GB" +PRESTO_COORDINATOR_CONFIGPROPERTIES_QUERY_MAX_MEMORY="1GB" 
+PRESTO_COORDINATOR_CONFIGPROPERTIES_QUERY_MAX_MEMORY_PER_NODE="1GB" # jvm.config PRESTO_COORDINATOR_CONFIG_JVMCONFIG_MAXHEAPSIZE="4G" -PRESTO_COORDINATOR_CONFIG_JVMCONFIG_G1HEAPREGIONSIZE="32M" +PRESTO_COORDINATOR_JVMCONFIG_G1HEAPREGIONSIZE="32M" # log.properties -PRESTO_COORDINATOR_CONFIG_LOGPROPERTIES_LEVEL="DEBUG" +PRESTO_COORDINATOR_LOGPROPERTIES_LEVEL="DEBUG" diff --git a/tools/deployment/presto-clp/coordinator/config-template/clp.properties b/tools/deployment/presto-clp/coordinator/config-template/clp.properties index cdf06e716a..cefee52d39 100644 --- a/tools/deployment/presto-clp/coordinator/config-template/clp.properties +++ b/tools/deployment/presto-clp/coordinator/config-template/clp.properties @@ -1,9 +1,9 @@ connector.name=clp -clp.metadata-provider-type=${PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_PROVIDERTYPE} -clp.metadata-db-url=${PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_URL} -clp.metadata-db-name=${PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_NAME} -clp.metadata-db-user=${PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_USER} -clp.metadata-db-password=${PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_PASSWORD} -clp.metadata-table-prefix=${PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_TABLEPREFIX} -clp.split-provider-type=${PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_SPLITPROVIDER} +clp.metadata-provider-type=${PRESTO_COORDINATOR_CLPPROPERTIES_METADATA_PROVIDER_TYPE} +clp.metadata-db-url=${PRESTO_COORDINATOR_CLPPROPERTIES_METADATA_DATABASE_URL} +clp.metadata-db-name=${PRESTO_COORDINATOR_CLPPROPERTIES_METADATA_DATABASE_NAME} +clp.metadata-db-user=${PRESTO_COORDINATOR_CLPPROPERTIES_METADATA_DATABASE_USER} +clp.metadata-db-password=${PRESTO_COORDINATOR_CLPPROPERTIES_METADATA_DATABASE_PASSWORD} +clp.metadata-table-prefix=${PRESTO_COORDINATOR_CLPPROPERTIES_METADATA_TABLE_PREFIX} +clp.split-provider-type=${PRESTO_COORDINATOR_CLPPROPERTIES_SPLIT_PROVIDER} 
clp.metadata-filter-config=/opt/presto-server/etc/metadata-filter.json diff --git a/tools/deployment/presto-clp/coordinator/config-template/config.properties b/tools/deployment/presto-clp/coordinator/config-template/config.properties index eb65c2488f..b9da2234f4 100644 --- a/tools/deployment/presto-clp/coordinator/config-template/config.properties +++ b/tools/deployment/presto-clp/coordinator/config-template/config.properties @@ -1,10 +1,10 @@ coordinator=true node-scheduler.include-coordinator=false http-server.http.port=${PRESTO_COORDINATOR_HTTPPORT} -query.max-memory=${PRESTO_COORDINATOR_CONFIG_CONFIGPROPERTIES_QUERY_MAXMEMORY} -query.max-memory-per-node=${PRESTO_COORDINATOR_CONFIG_CONFIGPROPERTIES_QUERY_MAXMEMORYPERNODE} +query.max-memory=${PRESTO_COORDINATOR_CONFIGPROPERTIES_QUERY_MAX_MEMORY} +query.max-memory-per-node=${PRESTO_COORDINATOR_CONFIGPROPERTIES_QUERY_MAX_MEMORY_PER_NODE} discovery-server.enabled=true -discovery.uri=${PRESTO_COORDINATOR_CONFIG_CONFIGPROPERTIES_DISCOVERY_URI} +discovery.uri=${PRESTO_COORDINATOR_CONFIGPROPERTIES_DISCOVERY_URI} optimizer.optimize-hash-generation=false regex-library=RE2J use-alternative-function-signatures=true diff --git a/tools/deployment/presto-clp/coordinator/config-template/jvm.config b/tools/deployment/presto-clp/coordinator/config-template/jvm.config index 0b05b439ab..7a18a0a951 100644 --- a/tools/deployment/presto-clp/coordinator/config-template/jvm.config +++ b/tools/deployment/presto-clp/coordinator/config-template/jvm.config @@ -1,7 +1,7 @@ -server -Xmx${PRESTO_COORDINATOR_CONFIG_JVMCONFIG_MAXHEAPSIZE} -XX:+UseG1GC --XX:G1HeapRegionSize=${PRESTO_COORDINATOR_CONFIG_JVMCONFIG_G1HEAPREGIONSIZE} +-XX:G1HeapRegionSize=${PRESTO_COORDINATOR_JVMCONFIG_G1HEAPREGIONSIZE} -XX:+UseGCOverheadLimit -XX:+ExplicitGCInvokesConcurrent -XX:+HeapDumpOnOutOfMemoryError diff --git a/tools/deployment/presto-clp/coordinator/config-template/log.properties b/tools/deployment/presto-clp/coordinator/config-template/log.properties index 
6578ca10f2..7e79c774f0 100644 --- a/tools/deployment/presto-clp/coordinator/config-template/log.properties +++ b/tools/deployment/presto-clp/coordinator/config-template/log.properties @@ -1 +1 @@ -com.facebook.presto=${PRESTO_COORDINATOR_CONFIG_LOGPROPERTIES_LEVEL} +com.facebook.presto=${PRESTO_COORDINATOR_LOGPROPERTIES_LEVEL} diff --git a/tools/deployment/presto-clp/coordinator/config-template/node.properties b/tools/deployment/presto-clp/coordinator/config-template/node.properties index 3397f6e9ec..dfde76b128 100644 --- a/tools/deployment/presto-clp/coordinator/config-template/node.properties +++ b/tools/deployment/presto-clp/coordinator/config-template/node.properties @@ -1,2 +1,2 @@ -node.environment=${PRESTO_COORDINATOR_CONFIG_NODEPROPERTIES_ENVIRONMENT} +node.environment=${PRESTO_COORDINATOR_NODEPROPERTIES_ENVIRONMENT} node.id=${PRESTO_COORDINATOR_SERVICENAME} diff --git a/tools/deployment/presto-clp/docker-compose.yaml b/tools/deployment/presto-clp/docker-compose.yaml index 66d30bdc35..0051ce4fbf 100644 --- a/tools/deployment/presto-clp/docker-compose.yaml +++ b/tools/deployment/presto-clp/docker-compose.yaml @@ -17,7 +17,7 @@ services: - "CMD" - "curl" - "-f" - - "${PRESTO_COORDINATOR_CONFIG_CONFIGPROPERTIES_DISCOVERY_URI}/v1/info" + - "${PRESTO_COORDINATOR_CONFIGPROPERTIES_DISCOVERY_URI}/v1/info" interval: "10s" retries: 30 @@ -34,7 +34,7 @@ services: volumes: - "./worker/config-template:/configs:ro" - "./worker/scripts:/scripts:ro" - - "${CLP_PACKAGE_ARCHIVES}:${CLP_PACKAGE_ARCHIVES}" + - "${CLP_ARCHIVES_DIR}:${CLP_ARCHIVES_DIR}" - "worker-config:/opt/presto-server/etc" networks: - "presto" diff --git a/tools/deployment/presto-clp/scripts/generate-user-env-vars-file.py b/tools/deployment/presto-clp/scripts/generate-user-env-vars-file.py index c3ba339545..453c7bdc72 100644 --- a/tools/deployment/presto-clp/scripts/generate-user-env-vars-file.py +++ b/tools/deployment/presto-clp/scripts/generate-user-env-vars-file.py @@ -64,7 +64,7 @@ def 
_add_clp_env_vars(clp_package_dir: Path, env_vars: Dict[str, str]) -> bool: :param env_vars: :return: Whether the environment variables were successfully added. """ - env_vars["PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_TABLEPREFIX"] = "clp_" + env_vars["PRESTO_COORDINATOR_CLPPROPERTIES_METADATA_TABLE_PREFIX"] = "clp_" clp_config_file_path = clp_package_dir / "etc" / "clp-config.yml" with open(clp_config_file_path, "r") as clp_config_file: @@ -73,10 +73,10 @@ def _add_clp_env_vars(clp_package_dir: Path, env_vars: Dict[str, str]) -> bool: database_host = _get_config_value(clp_config, "database.host", "localhost") database_port = _get_config_value(clp_config, "database.port", str(3306)) database_name = _get_config_value(clp_config, "database.name", "clp-db") - env_vars["PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_URL"] = ( + env_vars["PRESTO_COORDINATOR_CLPPROPERTIES_METADATA_DATABASE_URL"] = ( f"=jdbc:mysql://{database_host}:{database_port}" ) - env_vars["PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_NAME"] = database_name + env_vars["PRESTO_COORDINATOR_CLPPROPERTIES_METADATA_DATABASE_NAME"] = database_name clp_archive_output_storage_type = _get_config_value( clp_config, "archive_output.storage.type", "fs" @@ -94,7 +94,7 @@ def _add_clp_env_vars(clp_package_dir: Path, env_vars: Dict[str, str]) -> bool: "archive_output.storage.directory", str(clp_package_dir / "var" / "data" / "archives"), ) - env_vars["CLP_PACKAGE_ARCHIVES"] = clp_archives_dir + env_vars["CLP_ARCHIVES_DIR"] = clp_archives_dir credentials_file_path = clp_package_dir / "etc" / "credentials.yml" with open(credentials_file_path, "r") as credentials_file: @@ -107,10 +107,8 @@ def _add_clp_env_vars(clp_package_dir: Path, env_vars: Dict[str, str]) -> bool: "database.user and database.password must be specified in '%s'.", credentials_file_path ) return False - env_vars["PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_USER"] = database_user - 
env_vars["PRESTO_COORDINATOR_CONFIG_CLPPROPERTIES_METADATA_DATABASE_PASSWORD"] = ( - database_password - ) + env_vars["PRESTO_COORDINATOR_CLPPROPERTIES_METADATA_DATABASE_USER"] = database_user + env_vars["PRESTO_COORDINATOR_CLPPROPERTIES_METADATA_DATABASE_PASSWORD"] = database_password return True @@ -126,7 +124,7 @@ def _add_worker_env_vars(coordinator_common_env_file_path: Path, env_vars: Dict[ config = dotenv_values(coordinator_common_env_file_path) try: - env_vars["PRESTO_COORDINATOR_CONFIG_CONFIGPROPERTIES_DISCOVERY_URI"] = ( + env_vars["PRESTO_COORDINATOR_CONFIGPROPERTIES_DISCOVERY_URI"] = ( f'http://{config["PRESTO_COORDINATOR_SERVICENAME"]}' f':{config["PRESTO_COORDINATOR_HTTPPORT"]}' ) diff --git a/tools/deployment/presto-clp/worker.env b/tools/deployment/presto-clp/worker.env index 19ae8c58cd..fac2d97f76 100644 --- a/tools/deployment/presto-clp/worker.env +++ b/tools/deployment/presto-clp/worker.env @@ -1,4 +1,4 @@ PRESTO_WORKER_HTTPPORT="8080" # node.properties -PRESTO_WORKER_CONFIG_NODEPROPERTIES_LOCATION="worker-location" +PRESTO_WORKER_NODEPROPERTIES_LOCATION="worker-location" diff --git a/tools/deployment/presto-clp/worker/config-template/config.properties b/tools/deployment/presto-clp/worker/config-template/config.properties index 5acabc5c47..daa69ed366 100644 --- a/tools/deployment/presto-clp/worker/config-template/config.properties +++ b/tools/deployment/presto-clp/worker/config-template/config.properties @@ -1,4 +1,4 @@ -discovery.uri=${PRESTO_COORDINATOR_CONFIG_CONFIGPROPERTIES_DISCOVERY_URI} +discovery.uri=${PRESTO_COORDINATOR_CONFIGPROPERTIES_DISCOVERY_URI} http-server.http.port=${PRESTO_WORKER_HTTPPORT} shutdown-onset-sec=1 register-test-functions=false diff --git a/tools/deployment/presto-clp/worker/config-template/node.properties b/tools/deployment/presto-clp/worker/config-template/node.properties index ab9f51ba65..8b0fef4466 100644 --- a/tools/deployment/presto-clp/worker/config-template/node.properties +++ 
b/tools/deployment/presto-clp/worker/config-template/node.properties @@ -1,2 +1,2 @@ -node.environment=${PRESTO_COORDINATOR_CONFIG_NODEPROPERTIES_ENVIRONMENT} -node.location=${PRESTO_WORKER_CONFIG_NODEPROPERTIES_LOCATION} +node.environment=${PRESTO_COORDINATOR_NODEPROPERTIES_ENVIRONMENT} +node.location=${PRESTO_WORKER_NODEPROPERTIES_LOCATION} From 28f5b70f7835b5cd9f7d2be3b62f98f5e88be4df Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 28 Jul 2025 06:28:24 -0400 Subject: [PATCH 23/42] fix: Remove spurious equals sign. --- .../presto-clp/scripts/generate-user-env-vars-file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/deployment/presto-clp/scripts/generate-user-env-vars-file.py b/tools/deployment/presto-clp/scripts/generate-user-env-vars-file.py index 453c7bdc72..44e85f256b 100644 --- a/tools/deployment/presto-clp/scripts/generate-user-env-vars-file.py +++ b/tools/deployment/presto-clp/scripts/generate-user-env-vars-file.py @@ -74,7 +74,7 @@ def _add_clp_env_vars(clp_package_dir: Path, env_vars: Dict[str, str]) -> bool: database_port = _get_config_value(clp_config, "database.port", str(3306)) database_name = _get_config_value(clp_config, "database.name", "clp-db") env_vars["PRESTO_COORDINATOR_CLPPROPERTIES_METADATA_DATABASE_URL"] = ( - f"=jdbc:mysql://{database_host}:{database_port}" + f"jdbc:mysql://{database_host}:{database_port}" ) env_vars["PRESTO_COORDINATOR_CLPPROPERTIES_METADATA_DATABASE_NAME"] = database_name From cccf9fe9412dd23cb4ad5ca2b3be21f5252540d3 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 28 Jul 2025 06:54:33 -0400 Subject: [PATCH 24/42] Lint set-up-config.sh. 
--- tools/deployment/presto-clp/scripts/set-up-config.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/deployment/presto-clp/scripts/set-up-config.sh b/tools/deployment/presto-clp/scripts/set-up-config.sh index 7512001f42..6c6fb8cced 100755 --- a/tools/deployment/presto-clp/scripts/set-up-config.sh +++ b/tools/deployment/presto-clp/scripts/set-up-config.sh @@ -3,10 +3,10 @@ set -eu set -o pipefail -script_dir=$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +script_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd) cUsage="Usage: ${BASH_SOURCE[0]} " -if [ "$#" -lt 1 ] ; then +if [ "$#" -lt 1 ]; then echo "$cUsage" exit fi From 04f3d2a931ce07f62cf0aa5f75029f8e33b02ae1 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 28 Jul 2025 07:15:01 -0400 Subject: [PATCH 25/42] Add docs and remove README. --- docs/src/user-guide/guides-using-presto.md | 115 +++++++++++++++++++++ tools/deployment/presto-clp/README.md | 73 ------------- 2 files changed, 115 insertions(+), 73 deletions(-) create mode 100644 docs/src/user-guide/guides-using-presto.md delete mode 100644 tools/deployment/presto-clp/README.md diff --git a/docs/src/user-guide/guides-using-presto.md b/docs/src/user-guide/guides-using-presto.md new file mode 100644 index 0000000000..32b74a74a5 --- /dev/null +++ b/docs/src/user-guide/guides-using-presto.md @@ -0,0 +1,115 @@ +# Using Presto with CLP + +[Presto] is a distributed SQL query engine that can be used to query data stored in CLP (using SQL). +This guide describes how to set up and use Presto with CLP. + +:::{warning} +Currently, only the [clp-json](quick-start/clp-json.md) flavor of CLP supports queries through +Presto. +::: + +:::{note} +Currently, this integration with Presto is under development and may change in the future. It is +also being maintained in a [fork][yscope-presto] of the Presto project. 
We are working on merging +these changes into the main Presto repository so that you can use official Presto releases with CLP. +::: + +## Requirements + +* [Docker] v28 or higher +* [Docker Compose][docker-compose] v2.20.2 or higher +* Python +* python3-venv (for the version of Python installed) + +## Set up + +Using Presto with CLP requires: + +* [Setting up CLP](#setting-up-clp) and compressing some logs. +* [Setting up Presto](#setting-up-presto) to query CLP's metadata database and archives. + +### Setting up CLP + +Follow the [quick-start](./quick-start/index.md) guide to set up CLP and compress your logs. A +sample dataset that works well with Presto is the [postgresql] dataset. + +### Setting up Presto + +1. Navigate to the `tools/deployment/presto-clp` directory in your terminal. +2. Run the following script to generate the necessary config for Presto to work with CLP: + + ```bash + scripts/set-up-config.sh <clp-json-dir> + ``` + + * `<clp-json-dir>` is the location of the clp-json package you set up in the previous section. + +3. Start a Presto cluster by running: + + ```bash + docker compose up + ``` + + * To use more than one Presto worker, you can use the `--scale` option as follows: + + ```bash + docker compose up --scale presto-worker=<num-workers> + ``` + + * `<num-workers>` is the number of Presto worker nodes you want to run. + +### Stopping the Presto cluster + +To stop the Presto cluster, use CTRL + C. + +If you want to clean up the Presto cluster entirely: + +```bash +docker compose rm +``` + +## Querying your logs through Presto + +To query your logs through Presto, you can use the Presto CLI: + +```bash +docker compose exec presto-coordinator \ + presto-cli \ + --catalog clp \ + --schema default +``` + +Each dataset in CLP shows up as a table in Presto. To show all available datasets: + +```sql +SHOW TABLES; +``` + +If you didn't specify a dataset when compressing your logs in CLP, your logs will have been stored +in the `default` dataset.
To query the logs in this dataset: + +```sql +SELECT * FROM default LIMIT 1; +``` + +All kv-pairs in each log event can be queried directly using dot-notation. For example, if your logs +contain the field `foo.bar`, you can query it using: + +```sql +SELECT foo.bar FROM default LIMIT 1; +``` + +## Limitations + +The Presto CLP integration has one notable limitation at present. Nested-fields that contain special +characters (e.g., `t.$date`, where `$` is considered a special character by Presto) cannot be +queried (see [y-scope/presto#8]). To get around this limitation, you will need to preprocess your +logs to remove such special characters. This limitation will be addressed in a future release of the +Presto integration. + +[docker-compose]: https://docs.docker.com/compose/install/ +[Docker]: https://docs.docker.com/engine/install/ +[postgresql]: https://zenodo.org/records/10516401 +[Presto]: https://prestodb.io/ +[y-scope/presto#8]: https://github.com/y-scope/presto/issues/8 +[yscope-presto]: https://github.com/y-scope/presto diff --git a/tools/deployment/presto-clp/README.md b/tools/deployment/presto-clp/README.md deleted file mode 100644 index e35599ee4a..0000000000 --- a/tools/deployment/presto-clp/README.md +++ /dev/null @@ -1,73 +0,0 @@ -# Setup local docker stack for presto + clp - -## Install docker - -Follow the guide here: [docker] - -# Launch clp-package - -1. Find the clp-package for test on our official website [clp-json-v0.4.0]. Here is a sample dataset for demo testing: [postgresql dataset]. - -2. Untar the clp-package and the postgresql dataset. - -3. Launch: - -```bash -# You probably want to run a python 3.9 or newer virtual environment -sbin/start-clp.sh -``` - -5. Compress: - -```bash -# You can also use your own dataset -sbin/compress.sh --timestamp-key 'timestamp' /path/to/postgresql.log -``` - -6. 
Use the following command to update `.env`: - -```bash -scripts/set-up-config.sh /path/to/clp-json-package -``` - -# Create Docker Cluster - -Create a local docker stack: - -```bash -docker compose up -``` - -To create a docker stack with more than 1 worker (e.g., 3 workers): -``` -docker compose up --scale presto-worker=3 -``` - -# Use cli: - -After all containers are in "Started" states (check by `docker ps`): - -```bash -# On your host -docker exec -it compose-presto-coordinator-1 sh - -# In presto-coordinator container -/opt/presto-cli --catalog clp --schema default --server localhost:8080 -``` - -Example query: -```sql -SELECT * FROM default LIMIT 1; -``` - -# Delete docker Cluster - -```bash -docker compose down -``` - - - -[clp-json-v0.4.0]: https://github.com/y-scope/clp/releases/tag/v0.4.0 -[docker]: https://docs.docker.com/engine/install -[postgresql dataset]: https://zenodo.org/records/10516402 From 3381eb8465befdab7e7ba486eb1fc0335d0b76b3 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 28 Jul 2025 07:15:11 -0400 Subject: [PATCH 26/42] Set coordinator log level to INFO. --- tools/deployment/presto-clp/coordinator.env | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/deployment/presto-clp/coordinator.env b/tools/deployment/presto-clp/coordinator.env index fd6edb07d1..e169cacca2 100644 --- a/tools/deployment/presto-clp/coordinator.env +++ b/tools/deployment/presto-clp/coordinator.env @@ -11,4 +11,4 @@ PRESTO_COORDINATOR_CONFIG_JVMCONFIG_MAXHEAPSIZE="4G" PRESTO_COORDINATOR_JVMCONFIG_G1HEAPREGIONSIZE="32M" # log.properties -PRESTO_COORDINATOR_LOGPROPERTIES_LEVEL="DEBUG" +PRESTO_COORDINATOR_LOGPROPERTIES_LEVEL="INFO" From bea5bb6276b3faab15bd0f52785abc61d5132d1c Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 28 Jul 2025 07:19:04 -0400 Subject: [PATCH 27/42] Validate CLP metadata database type. 
--- .../presto-clp/scripts/generate-user-env-vars-file.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/deployment/presto-clp/scripts/generate-user-env-vars-file.py b/tools/deployment/presto-clp/scripts/generate-user-env-vars-file.py index 44e85f256b..c2efa6dcd4 100644 --- a/tools/deployment/presto-clp/scripts/generate-user-env-vars-file.py +++ b/tools/deployment/presto-clp/scripts/generate-user-env-vars-file.py @@ -70,6 +70,15 @@ def _add_clp_env_vars(clp_package_dir: Path, env_vars: Dict[str, str]) -> bool: with open(clp_config_file_path, "r") as clp_config_file: clp_config = yaml.safe_load(clp_config_file) + database_type = _get_config_value(clp_config, "database.type", "mariadb") + if "mariadb" != database_type and "mysql" != database_type: + logger.error( + "CLP's database.type must be either mariadb or mysql but found '%s'. Presto" + " currently only supports reading metadata from a mariadb or mysql database.", + database_type, + ) + return False + database_host = _get_config_value(clp_config, "database.host", "localhost") database_port = _get_config_value(clp_config, "database.port", str(3306)) database_name = _get_config_value(clp_config, "database.name", "clp-db") From 1f6896569d916ff9b970f0088f799686cc0f372c Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 28 Jul 2025 07:22:08 -0400 Subject: [PATCH 28/42] Use logging function rather than echos. 
--- .../worker/scripts/generate-configs.sh | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/tools/deployment/presto-clp/worker/scripts/generate-configs.sh b/tools/deployment/presto-clp/worker/scripts/generate-configs.sh index e97442c80a..e97b9e4db9 100755 --- a/tools/deployment/presto-clp/worker/scripts/generate-configs.sh +++ b/tools/deployment/presto-clp/worker/scripts/generate-configs.sh @@ -3,6 +3,17 @@ set -eu set -o pipefail +# Emits a log event to stderr with an auto-generated ISO timestamp as well as the given level +# and message. +# +# @param $1: Level string +# @param $2: Message to be logged +log() { + local -r LEVEL=$1 + local -r MESSAGE=$2 + echo "$(date --utc --date="now" +"%Y-%m-%dT%H:%M:%SZ") [${LEVEL}] ${MESSAGE}" >&2 +} + # Sets/updates the given kv-pair in the given properties file. # # @param $1 Path to the properties file. @@ -18,7 +29,7 @@ update_config_file() { else echo "${key}=${value}" >>"$file_path" fi - echo "Set ${key}=${value} in ${file_path}" + log "INFO" "Set ${key}=${value} in ${file_path}" } # Gets the Presto coordinator's version or exits on failure. @@ -35,11 +46,11 @@ get_coordinator_version() { ); then version=$(echo "$response" | jq --raw-output '.nodeVersion.version') if [[ "$version" = "null" ]]; then - echo "Error: Presto response is empty or doesn't contain version info." + log "ERROR" "Presto response is empty or doesn't contain version info." exit 1 fi else - echo "Error: Couldn't get Presto version info." + log "ERROR" "Couldn't get Presto version info." 
exit 1 fi @@ -66,7 +77,7 @@ mv "${PRESTO_CONFIG_DIR}/clp.properties" "${PRESTO_CONFIG_DIR}/catalog" # Update config.properties readonly CONFIG_PROPERTIES_FILE="/opt/presto-server/etc/config.properties" version=$(get_coordinator_version "$CONFIG_PROPERTIES_FILE") -echo "Detected Presto version: $version" +log "INFO" "Detected Presto version: $version" update_config_file "$CONFIG_PROPERTIES_FILE" "presto.version" "$version" # Update node.properties From 94b0210feedfcac0a6c34bfe4d309fef3af21c25 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 28 Jul 2025 07:23:21 -0400 Subject: [PATCH 29/42] Reorder functions. --- .../worker/scripts/generate-configs.sh | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/tools/deployment/presto-clp/worker/scripts/generate-configs.sh b/tools/deployment/presto-clp/worker/scripts/generate-configs.sh index e97b9e4db9..c04ae86182 100755 --- a/tools/deployment/presto-clp/worker/scripts/generate-configs.sh +++ b/tools/deployment/presto-clp/worker/scripts/generate-configs.sh @@ -14,24 +14,6 @@ log() { echo "$(date --utc --date="now" +"%Y-%m-%dT%H:%M:%SZ") [${LEVEL}] ${MESSAGE}" >&2 } -# Sets/updates the given kv-pair in the given properties file. -# -# @param $1 Path to the properties file. -# @param $2 The key to set. -# @param $3 The value to set. -update_config_file() { - local file_path=$1 - local key=$2 - local value=$3 - - if grep --quiet "^${key}=.*$" "$file_path"; then - sed --in-place "s|^${key}=.*|${key}=${value}|" "$file_path" - else - echo "${key}=${value}" >>"$file_path" - fi - log "INFO" "Set ${key}=${value} in ${file_path}" -} - # Gets the Presto coordinator's version or exits on failure. # # @param $1 Path to the config.properties file. @@ -57,6 +39,24 @@ get_coordinator_version() { echo "$version" } +# Sets/updates the given kv-pair in the given properties file. +# +# @param $1 Path to the properties file. +# @param $2 The key to set. 
+# @param $3 The value to set. +update_config_file() { + local file_path=$1 + local key=$2 + local value=$3 + + if grep --quiet "^${key}=.*$" "$file_path"; then + sed --in-place "s|^${key}=.*|${key}=${value}|" "$file_path" + else + echo "${key}=${value}" >>"$file_path" + fi + log "INFO" "Set ${key}=${value} in ${file_path}" +} + apt-get update && apt-get install --assume-yes --no-install-recommends jq wget readonly PRESTO_CONFIG_DIR="/opt/presto-server/etc" From 22c53e05a9e2892ba405fcc5e4798492a60d1ca3 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 28 Jul 2025 07:26:56 -0400 Subject: [PATCH 30/42] Add new docs to index. --- docs/src/user-guide/guides-overview.md | 7 +++++++ docs/src/user-guide/index.md | 1 + 2 files changed, 8 insertions(+) diff --git a/docs/src/user-guide/guides-overview.md b/docs/src/user-guide/guides-overview.md index 02faefe2a1..20d26e0ab0 100644 --- a/docs/src/user-guide/guides-overview.md +++ b/docs/src/user-guide/guides-overview.md @@ -12,6 +12,13 @@ Using object storage Using CLP to ingest logs from object storage and store archives on object storage. ::: +:::{grid-item-card} +:link: guides-using-presto +Using Presto with CLP +^^^ +How to use Presto to query compressed logs in CLP. +::: + :::{grid-item-card} :link: guides-multi-node Multi-node deployment diff --git a/docs/src/user-guide/index.md b/docs/src/user-guide/index.md index a637c8a650..4e4efb7479 100644 --- a/docs/src/user-guide/index.md +++ b/docs/src/user-guide/index.md @@ -62,6 +62,7 @@ quick-start/clp-text guides-overview guides-using-object-storage/index guides-multi-node +guides-using-presto ::: :::{toctree} From 2f0de288e96ac46bedf2e268ec597007aacb6d10 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 28 Jul 2025 07:27:05 -0400 Subject: [PATCH 31/42] Add S3 limitation. 
--- docs/src/user-guide/guides-using-presto.md | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/docs/src/user-guide/guides-using-presto.md b/docs/src/user-guide/guides-using-presto.md index 32b74a74a5..8791f2cbde 100644 --- a/docs/src/user-guide/guides-using-presto.md +++ b/docs/src/user-guide/guides-using-presto.md @@ -101,11 +101,14 @@ SELECT foo.bar FROM default LIMIT 1; ## Limitations -The Presto CLP integration has one notable limitation at present. Nested-fields that contain special -characters (e.g., `t.$date`, where `$` is considered a special character by Presto) cannot be -queried (see [y-scope/presto#8]). To get around this limitation, you will need to preprocess your -logs to remove such special characters. This limitation will be addressed in a future release of the -Presto integration. +The Presto CLP integration the following limitations at present: + +* Nested-fields that contain special characters (e.g., `t.$date`, where `$` is considered a special + character by Presto) cannot be queried (see [y-scope/presto#8]). To get around this limitation, + you will need to preprocess your logs to remove such special characters. +* Only logs stored on the filesystem, rather than S3, can be queried through Presto. + +These limitations will be addressed in a future release of the Presto integration. [docker-compose]: https://docs.docker.com/compose/install/ [Docker]: https://docs.docker.com/engine/install/ From d621bf39c0066bf9102aaf2676fc8fe60ad29941 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 28 Jul 2025 07:53:18 -0400 Subject: [PATCH 32/42] Add clone step to docs. 
--- docs/src/user-guide/guides-using-presto.md | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/docs/src/user-guide/guides-using-presto.md b/docs/src/user-guide/guides-using-presto.md index 8791f2cbde..68ab73ec67 100644 --- a/docs/src/user-guide/guides-using-presto.md +++ b/docs/src/user-guide/guides-using-presto.md @@ -35,8 +35,14 @@ sample dataset that works well with Presto is the [postgresql] dataset. ### Setting up Presto -1. Navigate to the `tools/deployment/presto-clp` directory in your terminal. -2. Run the following script to generate the necessary config for Presto to work with CLP: +1. Clone the CLP repository: + + ```bash + git clone https://github.com/y-scope/clp.git + ``` + +2. Navigate to the `tools/deployment/presto-clp` directory in your terminal. +3. Run the following script to generate the necessary config for Presto to work with CLP: ```bash scripts/set-up-config.sh @@ -44,7 +50,7 @@ sample dataset that works well with Presto is the [postgresql] dataset. * `` is the location of the clp-json package you set up in the previous section. -3. Start a Presto cluster by running: +4. Start a Presto cluster by running: ```bash docker compose up From 7300d202a43631d9e229626cbf267db7cd95aee9 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 28 Jul 2025 07:56:36 -0400 Subject: [PATCH 33/42] Add required CLP version to docs. 
--- docs/src/user-guide/guides-using-presto.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/src/user-guide/guides-using-presto.md b/docs/src/user-guide/guides-using-presto.md index 68ab73ec67..8ec8e9284a 100644 --- a/docs/src/user-guide/guides-using-presto.md +++ b/docs/src/user-guide/guides-using-presto.md @@ -16,6 +16,7 @@ these changes into the main Presto repository so that you can use official Prest ## Requirements +* [CLP][clp-releases] (clp-json) v0.4.0 or higher * [Docker] v28 or higher * [Docker Compose][docker-compose] v2.20.2 or higher * Python @@ -116,6 +117,7 @@ The Presto CLP integration the following limitations at present: These limitations will be addressed in a future release of the Presto integration. +[clp-releases]: https://github.com/y-scope/clp/releases [docker-compose]: https://docs.docker.com/compose/install/ [Docker]: https://docs.docker.com/engine/install/ [postgresql]: https://zenodo.org/records/10516401 From 4bc2f5891a59c85e3c0eb58d15f44198ab74c9c9 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 28 Jul 2025 07:56:48 -0400 Subject: [PATCH 34/42] Rename PRESTO_WORKER_HTTPPORT. 
--- tools/deployment/presto-clp/worker.env | 2 +- .../presto-clp/worker/config-template/config.properties | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/deployment/presto-clp/worker.env b/tools/deployment/presto-clp/worker.env index fac2d97f76..67f3e135b0 100644 --- a/tools/deployment/presto-clp/worker.env +++ b/tools/deployment/presto-clp/worker.env @@ -1,4 +1,4 @@ -PRESTO_WORKER_HTTPPORT="8080" +PRESTO_WORKER_HTTP_PORT="8080" # node.properties PRESTO_WORKER_NODEPROPERTIES_LOCATION="worker-location" diff --git a/tools/deployment/presto-clp/worker/config-template/config.properties b/tools/deployment/presto-clp/worker/config-template/config.properties index daa69ed366..ca89892d4e 100644 --- a/tools/deployment/presto-clp/worker/config-template/config.properties +++ b/tools/deployment/presto-clp/worker/config-template/config.properties @@ -1,5 +1,5 @@ discovery.uri=${PRESTO_COORDINATOR_CONFIGPROPERTIES_DISCOVERY_URI} -http-server.http.port=${PRESTO_WORKER_HTTPPORT} +http-server.http.port=${PRESTO_WORKER_HTTP_PORT} shutdown-onset-sec=1 register-test-functions=false runtime-metrics-collection-enabled=false From 80c41d41a3c878f8077bafe2c03fc74608577850 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 28 Jul 2025 08:04:21 -0400 Subject: [PATCH 35/42] Remove unnecessary quotes from env var files. 
--- tools/deployment/presto-clp/coordinator-common.env | 6 +++--- tools/deployment/presto-clp/coordinator.env | 14 +++++++------- tools/deployment/presto-clp/worker.env | 4 ++-- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/tools/deployment/presto-clp/coordinator-common.env b/tools/deployment/presto-clp/coordinator-common.env index 7b98bbdb7b..db2499da1f 100644 --- a/tools/deployment/presto-clp/coordinator-common.env +++ b/tools/deployment/presto-clp/coordinator-common.env @@ -1,5 +1,5 @@ -PRESTO_COORDINATOR_HTTPPORT="8080" -PRESTO_COORDINATOR_SERVICENAME="presto-coordinator" +PRESTO_COORDINATOR_HTTPPORT=8080 +PRESTO_COORDINATOR_SERVICENAME=presto-coordinator # node.properties -PRESTO_COORDINATOR_NODEPROPERTIES_ENVIRONMENT="production" +PRESTO_COORDINATOR_NODEPROPERTIES_ENVIRONMENT=production diff --git a/tools/deployment/presto-clp/coordinator.env b/tools/deployment/presto-clp/coordinator.env index e169cacca2..c246fc7003 100644 --- a/tools/deployment/presto-clp/coordinator.env +++ b/tools/deployment/presto-clp/coordinator.env @@ -1,14 +1,14 @@ # clp.properties -PRESTO_COORDINATOR_CLPPROPERTIES_METADATA_PROVIDER_TYPE="mysql" -PRESTO_COORDINATOR_CLPPROPERTIES_SPLIT_PROVIDER="mysql" +PRESTO_COORDINATOR_CLPPROPERTIES_METADATA_PROVIDER_TYPE=mysql +PRESTO_COORDINATOR_CLPPROPERTIES_SPLIT_PROVIDER=mysql # config.properties -PRESTO_COORDINATOR_CONFIGPROPERTIES_QUERY_MAX_MEMORY="1GB" -PRESTO_COORDINATOR_CONFIGPROPERTIES_QUERY_MAX_MEMORY_PER_NODE="1GB" +PRESTO_COORDINATOR_CONFIGPROPERTIES_QUERY_MAX_MEMORY=1GB +PRESTO_COORDINATOR_CONFIGPROPERTIES_QUERY_MAX_MEMORY_PER_NODE=1GB # jvm.config -PRESTO_COORDINATOR_CONFIG_JVMCONFIG_MAXHEAPSIZE="4G" -PRESTO_COORDINATOR_JVMCONFIG_G1HEAPREGIONSIZE="32M" +PRESTO_COORDINATOR_CONFIG_JVMCONFIG_MAXHEAPSIZE=4G +PRESTO_COORDINATOR_JVMCONFIG_G1HEAPREGIONSIZE=32M # log.properties -PRESTO_COORDINATOR_LOGPROPERTIES_LEVEL="INFO" +PRESTO_COORDINATOR_LOGPROPERTIES_LEVEL=INFO diff --git a/tools/deployment/presto-clp/worker.env 
b/tools/deployment/presto-clp/worker.env index 67f3e135b0..33e4e178f9 100644 --- a/tools/deployment/presto-clp/worker.env +++ b/tools/deployment/presto-clp/worker.env @@ -1,4 +1,4 @@ -PRESTO_WORKER_HTTP_PORT="8080" +PRESTO_WORKER_HTTP_PORT=8080 # node.properties -PRESTO_WORKER_NODEPROPERTIES_LOCATION="worker-location" +PRESTO_WORKER_NODEPROPERTIES_LOCATION=worker-location From 9d2146be7236d93a399c079d753231f82f158b37 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 28 Jul 2025 08:04:36 -0400 Subject: [PATCH 36/42] Address some rabbit feedback. --- docs/src/user-guide/guides-using-presto.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/src/user-guide/guides-using-presto.md b/docs/src/user-guide/guides-using-presto.md index 8ec8e9284a..0bd6ff83aa 100644 --- a/docs/src/user-guide/guides-using-presto.md +++ b/docs/src/user-guide/guides-using-presto.md @@ -32,7 +32,7 @@ Using Presto with CLP requires: ### Setting up CLP Follow the [quick-start](./quick-start/index.md) guide to set up CLP and compress your logs. A -sample dataset that works well with Presto is the [postgresql] dataset. +sample dataset that works well with Presto is [postgresql]. ### Setting up Presto @@ -69,7 +69,7 @@ sample dataset that works well with Presto is the [postgresql] dataset. To stop the Presto cluster, use CTRL + C. -If you want to clean up the Presto cluster entirely: +To clean up the Presto cluster entirely: ```bash docker compose rm @@ -108,7 +108,7 @@ SELECT foo.bar FROM default LIMIT 1; ## Limitations -The Presto CLP integration the following limitations at present: +The Presto CLP integration has the following limitations at present: * Nested-fields that contain special characters (e.g., `t.$date`, where `$` is considered a special character by Presto) cannot be queried (see [y-scope/presto#8]). 
To get around this limitation, From ff06c622b718a0f4e89b1522491585954b64140f Mon Sep 17 00:00:00 2001 From: anlowee Date: Mon, 28 Jul 2025 13:07:42 +0000 Subject: [PATCH 37/42] Address the nested field limitation comment --- docs/src/user-guide/guides-using-presto.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/docs/src/user-guide/guides-using-presto.md b/docs/src/user-guide/guides-using-presto.md index 0bd6ff83aa..c7fc6b428f 100644 --- a/docs/src/user-guide/guides-using-presto.md +++ b/docs/src/user-guide/guides-using-presto.md @@ -110,9 +110,8 @@ SELECT foo.bar FROM default LIMIT 1; The Presto CLP integration has the following limitations at present: -* Nested-fields that contain special characters (e.g., `t.$date`, where `$` is considered a special - character by Presto) cannot be queried (see [y-scope/presto#8]). To get around this limitation, - you will need to preprocess your logs to remove such special characters. +* Nested-fields that contain special characters except `_` (see [y-scope/presto#8]). To get around +this limitation,you will need to preprocess your logs to remove such special characters. * Only logs stored on the filesystem, rather than S3, can be queried through Presto. These limitations will be addressed in a future release of the Presto integration. From cf116946624741baac0a7d6647bb34f54fa694fe Mon Sep 17 00:00:00 2001 From: anlowee Date: Mon, 28 Jul 2025 14:24:07 +0000 Subject: [PATCH 38/42] Add metadata filter config --- docs/src/user-guide/guides-using-presto.md | 59 ++++++++++++++++++++-- 1 file changed, 56 insertions(+), 3 deletions(-) diff --git a/docs/src/user-guide/guides-using-presto.md b/docs/src/user-guide/guides-using-presto.md index c7fc6b428f..549ddcda2f 100644 --- a/docs/src/user-guide/guides-using-presto.md +++ b/docs/src/user-guide/guides-using-presto.md @@ -51,6 +51,57 @@ sample dataset that works well with Presto is [postgresql]. 
* `` is the location of the clp-json package you set up in the previous section. + Note that for the metadata filter config (i.e., + `tools/deployment/presto-clp/coordinator/config-template/metadata-filter.json`), it is a config + to indicate which columns are used for filtering splits that will be processed by Presto. Here + is an example: + + ```json + { + "clp.default.default": [ + { + "columnName": "timestamp", + "rangeMapping": { + "lowerBound": "begin_timestamp", + "upperBound": "end_timestamp" + }, + "required": false + } + ] + } + ``` + + * `"clp.default.default"` is the filter's scope. A scope can be one of the following: + + * A catalog name + * A fully-qualified schema name + * A fully-qualified table name + + Filter configs under a particular scope will apply to all child scopes. For example, filter + configs at the schema level will apply to all tables within that schema. In this example, + the filter will only apply to the `default` table under the `default` schema of the `clp` + catalog. + + * `"columnName"` is the data column's name. You can use the column used as `--timestamp-key` + when compressing if you want to filter splits by timestamp. + + * `"rangeMapping"` is an optional object with the following properties: + + * `"lowerBound"` is the metadata column that represents the lower bound of values in a split + for the data column. + * `"upperBound"` is the metadata column that represents the upper bound of values in a split + for the data column. + + In this example, since in CLP's metadata database, for each split (i.e., archive) there are + two fields `begin_timestamp` and `end_timestamp` to store the earilest and latest timestamps + of the log messages compressed in that split, we have to remap the original data column's + name to these two fields so that it can query the metadata database to retrieve filtered + splits. 
+ + * `"required"` is an optional field (defaults to false) which indicates whether the filter must + be present in the translated metadata filter SQL query. If a required filter is missing or + cannot be pushed down, the query will be rejected. + 4. Start a Presto cluster by running: ```bash @@ -93,7 +144,8 @@ SHOW TABLES; ``` If you didn't specify a dataset when compressing your logs in CLP, your logs will have been stored -in the `default` dataset. To query the logs in this dataset: +in the `default` dataset. If you also didn't specify any metadata filters, you can query the logs +in this dataset: ```sql SELECT * FROM default LIMIT 1; @@ -110,8 +162,9 @@ SELECT foo.bar FROM default LIMIT 1; The Presto CLP integration has the following limitations at present: -* Nested-fields that contain special characters except `_` (see [y-scope/presto#8]). To get around -this limitation,you will need to preprocess your logs to remove such special characters. +* Nested fields containing special characters (i.e., any non-alphanumeric characters except `_`; +see [y-scope/presto#8]). To get around this limitation,you will need to preprocess your logs to +remove such special characters. * Only logs stored on the filesystem, rather than S3, can be queried through Presto. These limitations will be addressed in a future release of the Presto integration. From 782b87d0bc63eedef0dee72999955c348515a3b2 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 28 Jul 2025 15:56:20 -0400 Subject: [PATCH 39/42] Docs edits. --- docs/src/user-guide/guides-using-presto.md | 93 ++++++++-------------- 1 file changed, 34 insertions(+), 59 deletions(-) diff --git a/docs/src/user-guide/guides-using-presto.md b/docs/src/user-guide/guides-using-presto.md index 549ddcda2f..7ade53244b 100644 --- a/docs/src/user-guide/guides-using-presto.md +++ b/docs/src/user-guide/guides-using-presto.md @@ -9,9 +9,9 @@ Presto. 
::: :::{note} -Currently, this integration with Presto is under development and may change in the future. It is -also being maintained in a [fork][yscope-presto] of the Presto project. We are working on merging -these changes into the main Presto repository so that you can use official Presto releases with CLP. +This integration with Presto is under development and may change in the future. It is also being +maintained in a [fork][yscope-presto] of the Presto project. At some point, these changes will have +been merged into the main Presto repository so that you can use official Presto releases with CLP. ::: ## Requirements @@ -51,58 +51,33 @@ sample dataset that works well with Presto is [postgresql]. * `` is the location of the clp-json package you set up in the previous section. - Note that for the metadata filter config (i.e., - `tools/deployment/presto-clp/coordinator/config-template/metadata-filter.json`), it is a config - to indicate which columns are used for filtering splits that will be processed by Presto. Here - is an example: - - ```json - { - "clp.default.default": [ - { - "columnName": "timestamp", - "rangeMapping": { - "lowerBound": "begin_timestamp", - "upperBound": "end_timestamp" - }, - "required": false - } - ] - } - ``` - - * `"clp.default.default"` is the filter's scope. A scope can be one of the following: - - * A catalog name - * A fully-qualified schema name - * A fully-qualified table name - - Filter configs under a particular scope will apply to all child scopes. For example, filter - configs at the schema level will apply to all tables within that schema. In this example, - the filter will only apply to the `default` table under the `default` schema of the `clp` - catalog. - - * `"columnName"` is the data column's name. You can use the column used as `--timestamp-key` - when compressing if you want to filter splits by timestamp. 
- - * `"rangeMapping"` is an optional object with the following properties: - - * `"lowerBound"` is the metadata column that represents the lower bound of values in a split - for the data column. - * `"upperBound"` is the metadata column that represents the upper bound of values in a split - for the data column. - - In this example, since in CLP's metadata database, for each split (i.e., archive) there are - two fields `begin_timestamp` and `end_timestamp` to store the earilest and latest timestamps - of the log messages compressed in that split, we have to remap the original data column's - name to these two fields so that it can query the metadata database to retrieve filtered - splits. - - * `"required"` is an optional field (defaults to false) which indicates whether the filter must - be present in the translated metadata filter SQL query. If a required filter is missing or - cannot be pushed down, the query will be rejected. +4. Configure Presto to use CLP's metadata database as follows: + + * Open and edit `coordinator/config-template/metadata-filter.json`. + * For each dataset you want to query, add a filter of the form: + + ```json + { + "clp.default.": [ + { + "columnName": "", + "rangeMapping": { + "lowerBound": "begin_timestamp", + "upperBound": "end_timestamp" + }, + "required": false + } + ] + } + ``` + + * `` is the name of the dataset you want to query. (If you didn't specify a dataset + when compressing your logs, they would be compressed into the `default` dataset.) + * `` is the timestamp key you specified when compressing logs for this + particular dataset. + * The complete syntax for this file is [here][clp-connector-docs]. -4. Start a Presto cluster by running: +5. Start a Presto cluster by running: ```bash docker compose up @@ -144,8 +119,7 @@ SHOW TABLES; ``` If you didn't specify a dataset when compressing your logs in CLP, your logs will have been stored -in the `default` dataset. 
If you also didn't specify any metadata filters, you can query the logs -in this dataset: +in the `default` dataset. To query the logs in this dataset: ```sql SELECT * FROM default LIMIT 1; @@ -162,13 +136,14 @@ SELECT foo.bar FROM default LIMIT 1; The Presto CLP integration has the following limitations at present: -* Nested fields containing special characters (i.e., any non-alphanumeric characters except `_`; -see [y-scope/presto#8]). To get around this limitation,you will need to preprocess your logs to -remove such special characters. +* Nested fields containing special characters cannot be queried (see [y-scope/presto#8]). Allowed + characters are alphanumeric characters and underscores. To get around this limitation, you'll + need to preprocess your logs to remove any special characters. * Only logs stored on the filesystem, rather than S3, can be queried through Presto. These limitations will be addressed in a future release of the Presto integration. +[clp-connector-docs]: https://docs.yscope.com/presto/connector/clp.html#metadata-filter-config-file [clp-releases]: https://github.com/y-scope/clp/releases [docker-compose]: https://docs.docker.com/compose/install/ [Docker]: https://docs.docker.com/engine/install/ From 8107518da42e6ace1f7cdd4bf5c876d1dcd5b98a Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 28 Jul 2025 15:56:32 -0400 Subject: [PATCH 40/42] Add error checking for config files not existing. 
--- .../scripts/generate-user-env-vars-file.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tools/deployment/presto-clp/scripts/generate-user-env-vars-file.py b/tools/deployment/presto-clp/scripts/generate-user-env-vars-file.py index c2efa6dcd4..4e4a8ac046 100644 --- a/tools/deployment/presto-clp/scripts/generate-user-env-vars-file.py +++ b/tools/deployment/presto-clp/scripts/generate-user-env-vars-file.py @@ -67,6 +67,14 @@ def _add_clp_env_vars(clp_package_dir: Path, env_vars: Dict[str, str]) -> bool: env_vars["PRESTO_COORDINATOR_CLPPROPERTIES_METADATA_TABLE_PREFIX"] = "clp_" clp_config_file_path = clp_package_dir / "etc" / "clp-config.yml" + if not clp_config_file_path.exists(): + logger.error( + "'%s' doesn't exist. Is '%s' the location of the CLP package?", + clp_config_file_path, + clp_package_dir.resolve(), + ) + return False + with open(clp_config_file_path, "r") as clp_config_file: clp_config = yaml.safe_load(clp_config_file) @@ -106,6 +114,10 @@ def _add_clp_env_vars(clp_package_dir: Path, env_vars: Dict[str, str]) -> bool: env_vars["CLP_ARCHIVES_DIR"] = clp_archives_dir credentials_file_path = clp_package_dir / "etc" / "credentials.yml" + if not credentials_file_path.exists(): + logger.error("'%s' doesn't exist. Did you start CLP?", credentials_file_path) + return False + with open(credentials_file_path, "r") as credentials_file: credentials = yaml.safe_load(credentials_file) From 4d896e593a76bc7e132b754a0d5ef7700523c4fa Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 28 Jul 2025 15:56:40 -0400 Subject: [PATCH 41/42] More docs edits. 
--- docs/src/user-guide/guides-using-presto.md | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/docs/src/user-guide/guides-using-presto.md b/docs/src/user-guide/guides-using-presto.md index 7ade53244b..8b8ed73454 100644 --- a/docs/src/user-guide/guides-using-presto.md +++ b/docs/src/user-guide/guides-using-presto.md @@ -43,18 +43,19 @@ sample dataset that works well with Presto is [postgresql]. ``` 2. Navigate to the `tools/deployment/presto-clp` directory in your terminal. -3. Run the following script to generate the necessary config for Presto to work with CLP: +3. Generate the necessary config for Presto to work with CLP: ```bash scripts/set-up-config.sh <clp-json-dir> ``` - * `<clp-json-dir>` is the location of the clp-json package you set up in the previous section. + * Replace `<clp-json-dir>` with the location of the clp-json package you set up in the previous + section. 4. Configure Presto to use CLP's metadata database as follows: * Open and edit `coordinator/config-template/metadata-filter.json`. - * For each dataset you want to query, add a filter of the form: + * For each dataset you want to query, add a filter config of the form: ```json { @@ -71,10 +72,10 @@ sample dataset that works well with Presto is [postgresql]. } ``` - * `<dataset>` is the name of the dataset you want to query. (If you didn't specify a dataset - when compressing your logs, they would be compressed into the `default` dataset.) - * `<timestamp-key>` is the timestamp key you specified when compressing logs for this - particular dataset. + * Replace `<dataset>` with the name of the dataset you want to query. (If you didn't specify a + dataset when compressing your logs, they would be compressed into the `default` dataset.) + * Replace `<timestamp-key>` with the timestamp key you specified when compressing logs for + this particular dataset. * The complete syntax for this file is [here][clp-connector-docs]. 5. Start a Presto cluster by running: @@ -89,7 +90,7 @@ sample dataset that works well with Presto is [postgresql]. 
docker compose up --scale presto-worker=<num-workers> ``` - * `<num-workers>` is the number of Presto worker nodes you want to run. + * Replace `<num-workers>` with the number of Presto worker nodes you want to run. ### Stopping the Presto cluster From 713670a8bf4f4ba662701832d8900c9aae46a396 Mon Sep 17 00:00:00 2001 From: Kirk Rodrigues <2454684+kirkrodrigues@users.noreply.github.com> Date: Mon, 28 Jul 2025 16:17:54 -0400 Subject: [PATCH 42/42] Remove extra spaces. --- docs/src/user-guide/guides-using-presto.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/user-guide/guides-using-presto.md b/docs/src/user-guide/guides-using-presto.md index 8b8ed73454..59d464bc04 100644 --- a/docs/src/user-guide/guides-using-presto.md +++ b/docs/src/user-guide/guides-using-presto.md @@ -71,7 +71,7 @@ sample dataset that works well with Presto is [postgresql]. ] } ``` - + * Replace `<dataset>` with the name of the dataset you want to query. (If you didn't specify a dataset when compressing your logs, they would be compressed into the `default` dataset.) * Replace `<timestamp-key>` with the timestamp key you specified when compressing logs for