#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Builds and pushes release Docker images when a version tag is pushed:
#   * apache/tika       (minimal and full) -- built from the tika-server Dockerfiles
#   * apache/tika-grpc                     -- built from a locally assembled context
name: Docker release - tika-server and tika-grpc

on:
  push:
    tags:
      # NOTE(review): the trailing '*' also matches suffixed tags (for example
      # 1.2.3-rc1); those would move the `latest` tags too -- confirm intended.
      - '[0-9]+.[0-9]+.[0-9]+*'

# The jobs only check out the repository; image pushes authenticate with the
# Docker Hub secrets, so the GITHUB_TOKEN needs nothing beyond read access.
permissions:
  contents: read

jobs:
  release-tika-server:
    # The images are pushed to the apache/* Docker Hub namespace, so skip the
    # job in forks instead of failing at the Docker Hub login.
    if: github.repository == 'apache/tika'
    runs-on: ubuntu-latest
    timeout-minutes: 60

    steps:
      - uses: actions/checkout@v4

      - name: Extract version from tag
        id: version
        run: |
          TAG_NAME="${GITHUB_REF#refs/tags/}"
          echo "tag=${TAG_NAME}" >> "$GITHUB_OUTPUT"

      # QEMU is registered before the Buildx builder is created, which is the
      # order used in the Docker multi-platform documentation.
      - name: Set up QEMU for multi-arch
        uses: docker/setup-qemu-action@v3

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}

      - name: Build and push tika-server minimal
        uses: docker/build-push-action@v6
        with:
          file: tika-server/docker-build/minimal/Dockerfile
          platforms: linux/amd64,linux/arm64,linux/arm/v7,linux/s390x
          push: true
          build-args: |
            TIKA_VERSION=${{ steps.version.outputs.tag }}
          tags: |
            apache/tika:${{ steps.version.outputs.tag }}
            apache/tika:latest

      - name: Build and push tika-server full
        uses: docker/build-push-action@v6
        with:
          file: tika-server/docker-build/full/Dockerfile
          platforms: linux/amd64,linux/arm64,linux/arm/v7,linux/s390x
          push: true
          build-args: |
            TIKA_VERSION=${{ steps.version.outputs.tag }}
          tags: |
            apache/tika:${{ steps.version.outputs.tag }}-full
            apache/tika:latest-full

  release-tika-grpc:
    if: github.repository == 'apache/tika'
    runs-on: ubuntu-latest
    timeout-minutes: 120

    steps:
      - uses: actions/checkout@v4

      - name: Extract version from tag
        id: version
        run: |
          TAG_NAME="${GITHUB_REF#refs/tags/}"
          echo "tag=${TAG_NAME}" >> "$GITHUB_OUTPUT"

      - name: Set up JDK 17
        uses: actions/setup-java@v4
        with:
          distribution: 'temurin'
          java-version: '17'
          cache: 'maven'

      - name: Build with Maven (skip tests)
        run: mvn clean install -DskipTests -B "-Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=warn"

      - name: Set up QEMU for multi-arch
        uses: docker/setup-qemu-action@v3

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}

      - name: Prepare tika-grpc Docker build context
        # The version reaches the script through the environment rather than
        # being spliced into the script text, so an unusual tag name cannot
        # change the shell code that runs.
        env:
          TIKA_VERSION: ${{ steps.version.outputs.tag }}
        run: |
          OUT_DIR=target/tika-grpc-docker

          mkdir -p "${OUT_DIR}/libs" "${OUT_DIR}/plugins" "${OUT_DIR}/config" "${OUT_DIR}/bin"

          cp "tika-grpc/target/tika-grpc-${TIKA_VERSION}.jar" "${OUT_DIR}/libs/"

          # Copy tika-pipes plugin zip files
          for dir in tika-pipes/tika-pipes-plugins/*/; do
            plugin_name=$(basename "$dir")
            zip_file="${dir}target/${plugin_name}-${TIKA_VERSION}.zip"
            if [ -f "$zip_file" ]; then
              cp "$zip_file" "${OUT_DIR}/plugins/"
            else
              # Surface skipped artifacts in the run summary instead of
              # silently publishing an image without them.
              echo "::warning::Plugin file ${zip_file} does not exist, skipping."
            fi
          done

          # Copy parser packages
          for parser_package in \
            "tika-parsers/tika-parsers-standard/tika-parsers-standard-package" \
            "tika-parsers/tika-parsers-extended/tika-parser-scientific-package" \
            "tika-parsers/tika-parsers-extended/tika-parser-sqlite3-package" \
            "tika-parsers/tika-parsers-ml/tika-parser-nlp-package"; do
            package_name=$(basename "$parser_package")
            jar_file="${parser_package}/target/${package_name}-${TIKA_VERSION}.jar"
            if [ -f "$jar_file" ]; then
              cp "$jar_file" "${OUT_DIR}/plugins/"
            else
              echo "::warning::Parser package file ${jar_file} does not exist, skipping."
            fi
          done

          cp "tika-grpc/docker-build/start-tika-grpc.sh" "${OUT_DIR}/bin/"
          cp "tika-grpc/docker-build/Dockerfile" "${OUT_DIR}/Dockerfile"

      - name: Build and push tika-grpc
        uses: docker/build-push-action@v6
        with:
          context: target/tika-grpc-docker
          platforms: linux/amd64,linux/arm64
          push: true
          build-args: |
            VERSION=${{ steps.version.outputs.tag }}
          tags: |
            apache/tika-grpc:${{ steps.version.outputs.tag }}
            apache/tika-grpc:latest
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Builds the project on every push to main and pushes snapshot images
# (tika-server minimal/full and tika-grpc) tagged with the Maven project version.
name: Docker snapshot - tika-server and tika-grpc

on:
  push:
    branches: [main]
    paths-ignore:
      - 'docs/**'
      - '*.md'

# The job only checks out the repository; image pushes authenticate with the
# Docker Hub secrets, so the GITHUB_TOKEN needs nothing beyond read access.
permissions:
  contents: read

jobs:
  build:
    # The images are pushed to the apache/* Docker Hub namespace, so skip the
    # job in forks instead of failing at the Docker Hub login.
    if: github.repository == 'apache/tika'
    runs-on: ubuntu-latest
    timeout-minutes: 120

    steps:
      - uses: actions/checkout@v4

      - name: Set up JDK 17
        uses: actions/setup-java@v4
        with:
          distribution: 'temurin'
          java-version: '17'
          cache: 'maven'

      - name: Extract version from pom
        id: version
        run: |
          TIKA_VERSION=$(mvn help:evaluate -Dexpression=project.version -q -DforceStdout)
          # Every later step builds file names and image tags from this value,
          # so stop here if Maven printed nothing.
          if [ -z "${TIKA_VERSION}" ]; then
            echo "::error::Could not determine project.version from the pom"
            exit 1
          fi
          echo "tika_version=${TIKA_VERSION}" >> "$GITHUB_OUTPUT"

      - name: Build with Maven (skip tests)
        run: mvn clean install -DskipTests -B "-Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=warn"

      # QEMU is registered before the Buildx builder is created, which is the
      # order used in the Docker multi-platform documentation.
      - name: Set up QEMU for multi-arch
        uses: docker/setup-qemu-action@v3

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}

      # --- tika-server (minimal) ---
      - name: Prepare tika-server minimal build context
        # The version is passed through the environment rather than spliced
        # into the script text.
        env:
          TIKA_VERSION: ${{ steps.version.outputs.tika_version }}
        run: |
          OUT_DIR=target/tika-server-minimal-docker
          mkdir -p "${OUT_DIR}/tika-server"
          tar xzf "tika-server/tika-server-standard/target/tika-server-standard-${TIKA_VERSION}-bin.tgz" -C "${OUT_DIR}/tika-server"
          cp "tika-server/docker-build/minimal/Dockerfile.snapshot" "${OUT_DIR}/Dockerfile"

      - name: Build and push tika-server minimal snapshot
        uses: docker/build-push-action@v6
        with:
          context: target/tika-server-minimal-docker
          platforms: linux/amd64,linux/arm64,linux/arm/v7,linux/s390x
          push: true
          build-args: |
            TIKA_VERSION=${{ steps.version.outputs.tika_version }}
          tags: |
            apache/tika:${{ steps.version.outputs.tika_version }}

      # --- tika-server (full) ---
      - name: Prepare tika-server full build context
        env:
          TIKA_VERSION: ${{ steps.version.outputs.tika_version }}
        run: |
          OUT_DIR=target/tika-server-full-docker
          mkdir -p "${OUT_DIR}/tika-server"
          tar xzf "tika-server/tika-server-standard/target/tika-server-standard-${TIKA_VERSION}-bin.tgz" -C "${OUT_DIR}/tika-server"
          cp "tika-server/docker-build/full/Dockerfile.snapshot" "${OUT_DIR}/Dockerfile"

      - name: Build and push tika-server full snapshot
        uses: docker/build-push-action@v6
        with:
          context: target/tika-server-full-docker
          platforms: linux/amd64,linux/arm64,linux/arm/v7,linux/s390x
          push: true
          build-args: |
            TIKA_VERSION=${{ steps.version.outputs.tika_version }}
          tags: |
            apache/tika:${{ steps.version.outputs.tika_version }}-full

      # --- tika-grpc ---
      - name: Prepare tika-grpc Docker build context
        env:
          TIKA_VERSION: ${{ steps.version.outputs.tika_version }}
        run: |
          OUT_DIR=target/tika-grpc-docker

          mkdir -p "${OUT_DIR}/libs" "${OUT_DIR}/plugins" "${OUT_DIR}/config" "${OUT_DIR}/bin"

          cp "tika-grpc/target/tika-grpc-${TIKA_VERSION}.jar" "${OUT_DIR}/libs/"

          # Copy tika-pipes plugin zip files
          for dir in tika-pipes/tika-pipes-plugins/*/; do
            plugin_name=$(basename "$dir")
            zip_file="${dir}target/${plugin_name}-${TIKA_VERSION}.zip"
            if [ -f "$zip_file" ]; then
              cp "$zip_file" "${OUT_DIR}/plugins/"
            else
              # Surface skipped artifacts in the run summary instead of
              # silently publishing an image without them.
              echo "::warning::Plugin file ${zip_file} does not exist, skipping."
            fi
          done

          # Copy parser packages
          for parser_package in \
            "tika-parsers/tika-parsers-standard/tika-parsers-standard-package" \
            "tika-parsers/tika-parsers-extended/tika-parser-scientific-package" \
            "tika-parsers/tika-parsers-extended/tika-parser-sqlite3-package" \
            "tika-parsers/tika-parsers-ml/tika-parser-nlp-package"; do
            package_name=$(basename "$parser_package")
            jar_file="${parser_package}/target/${package_name}-${TIKA_VERSION}.jar"
            if [ -f "$jar_file" ]; then
              cp "$jar_file" "${OUT_DIR}/plugins/"
            else
              echo "::warning::Parser package file ${jar_file} does not exist, skipping."
            fi
          done

          cp "tika-grpc/docker-build/start-tika-grpc.sh" "${OUT_DIR}/bin/"
          cp "tika-grpc/docker-build/Dockerfile" "${OUT_DIR}/Dockerfile"

      - name: Build and push tika-grpc snapshot
        uses: docker/build-push-action@v6
        with:
          context: target/tika-grpc-docker
          platforms: linux/amd64,linux/arm64
          push: true
          build-args: |
            VERSION=${{ steps.version.outputs.tika_version }}
          tags: |
            apache/tika-grpc:${{ steps.version.outputs.tika_version }}
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.

# Runtime image for the tika-grpc server. The build context is expected to
# contain libs/, plugins/, config/ and bin/ directories (these are the paths
# COPYed below).

# "random" uid/gid hopefully not used anywhere else
# This needs to be set globally and then referenced in
# the subsequent stages -- see TIKA-3912
ARG UID_GID="35002:35002"

FROM ubuntu:noble

ARG UID_GID
ARG JRE='openjdk-21-jre-headless'

# Install the JRE, OCR/GDAL tooling and fonts first: this layer does not depend
# on the Tika artifacts, so keeping it ahead of the COPY instructions lets it be
# reused from cache when only the jars change.
RUN set -eux \
    && apt-get update \
    && apt-get install --yes --no-install-recommends gnupg2 software-properties-common \
    && DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends $JRE \
        gdal-bin \
        tesseract-ocr \
        tesseract-ocr-eng \
        tesseract-ocr-ita \
        tesseract-ocr-fra \
        tesseract-ocr-spa \
        tesseract-ocr-deu \
    && echo ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true | debconf-set-selections \
    && DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends \
        xfonts-utils \
        fonts-freefont-ttf \
        fonts-liberation \
        ttf-mscorefonts-installer \
        wget \
        cabextract \
    && apt-get clean -y \
    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

COPY libs/ /tika/libs/
COPY plugins/ /tika/plugins/
COPY config/ /tika/config/
COPY bin/ /tika/bin

# COPY creates root-owned files, so the chmod has to run while the build is
# still root. After "USER $UID_GID" the unprivileged user is not the file owner
# and chmod fails with "Operation not permitted".
RUN chmod +x "/tika/bin/start-tika-grpc.sh"

# Declared after the package layer so that a new VERSION value does not
# invalidate it.
ARG VERSION
ARG TIKA_GRPC_MAX_INBOUND_MESSAGE_SIZE=104857600
ARG TIKA_GRPC_MAX_OUTBOUND_MESSAGE_SIZE=104857600
ARG TIKA_GRPC_NUM_THREADS=4

USER $UID_GID

EXPOSE 9090
# TIKA_VERSION is read by start-tika-grpc.sh to locate the server jar.
ENV TIKA_VERSION=$VERSION
ENV TIKA_GRPC_MAX_INBOUND_MESSAGE_SIZE=$TIKA_GRPC_MAX_INBOUND_MESSAGE_SIZE
ENV TIKA_GRPC_MAX_OUTBOUND_MESSAGE_SIZE=$TIKA_GRPC_MAX_OUTBOUND_MESSAGE_SIZE
ENV TIKA_GRPC_NUM_THREADS=$TIKA_GRPC_NUM_THREADS
ENTRYPOINT ["/tika/bin/start-tika-grpc.sh"]

LABEL maintainer="Apache Tika Developers dev@tika.apache.org"
#!/bin/bash

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# This script assembles the Docker build context for tika-grpc and builds the image.
# It is intended to be run from the root of the tika repository after a Maven build.
#
# Environment variables:
#   TIKA_VERSION       (required) Maven project version; used in artifact file names.
#   DOCKER_ID          image namespace; when empty the context is assembled but
#                      no image is built.
#   PROJECT_NAME       image name, defaults to tika-grpc.
#   RELEASE_IMAGE_TAG  image tag, defaults to TIKA_VERSION without "-SNAPSHOT".
#   MULTI_ARCH         "true" builds linux/amd64 + linux/arm64 with buildx and pushes.

set -euo pipefail

if [ -z "${TIKA_VERSION:-}" ]; then
  echo "Environment variable TIKA_VERSION is required, and should match the maven project version of Tika"
  exit 1
fi

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
REPO_ROOT="${SCRIPT_DIR}/../../"

cd "${REPO_ROOT}" || exit

OUT_DIR=target/tika-grpc-docker
BUILDER_NAME=tikabuilder

MULTI_ARCH=${MULTI_ARCH:-false}
DOCKER_ID=${DOCKER_ID:-}
PROJECT_NAME=${PROJECT_NAME:-tika-grpc}

# If RELEASE_IMAGE_TAG not specified, use TIKA_VERSION
if [[ -z "${RELEASE_IMAGE_TAG:-}" ]]; then
  RELEASE_IMAGE_TAG="${TIKA_VERSION}"
  # Remove '-SNAPSHOT' from the version string
  RELEASE_IMAGE_TAG="${RELEASE_IMAGE_TAG//-SNAPSHOT/}"
fi

# Stops and removes the temporary buildx builder. Never fails, so it is safe to
# call from an EXIT trap.
cleanup_builder() {
  docker buildx stop "${BUILDER_NAME}" || true
  docker buildx rm "${BUILDER_NAME}" || true
}

mkdir -p "${OUT_DIR}/libs"
mkdir -p "${OUT_DIR}/plugins"
mkdir -p "${OUT_DIR}/config"
mkdir -p "${OUT_DIR}/bin"
cp -v -r "tika-grpc/target/tika-grpc-${TIKA_VERSION}.jar" "${OUT_DIR}/libs"

# Copy all tika-pipes plugin zip files
for dir in tika-pipes/tika-pipes-plugins/*/; do
  plugin_name=$(basename "$dir")
  zip_file="${dir}target/${plugin_name}-${TIKA_VERSION}.zip"
  if [ -f "$zip_file" ]; then
    cp -v -r "$zip_file" "${OUT_DIR}/plugins"
  else
    echo "WARNING: Plugin file $zip_file does not exist, skipping."
  fi
done

# Copy parser package jars as plugins
parser_packages=(
  "tika-parsers/tika-parsers-standard/tika-parsers-standard-package"
  "tika-parsers/tika-parsers-extended/tika-parser-scientific-package"
  "tika-parsers/tika-parsers-extended/tika-parser-sqlite3-package"
  "tika-parsers/tika-parsers-ml/tika-parser-nlp-package"
)

for parser_package in "${parser_packages[@]}"; do
  package_name=$(basename "$parser_package")
  jar_file="${parser_package}/target/${package_name}-${TIKA_VERSION}.jar"
  if [ -f "$jar_file" ]; then
    cp -v -r "$jar_file" "${OUT_DIR}/plugins"
  else
    echo "WARNING: Parser package file $jar_file does not exist, skipping."
  fi
done

cp -v -r "tika-grpc/docker-build/start-tika-grpc.sh" "${OUT_DIR}/bin"
cp -v "tika-grpc/docker-build/Dockerfile" "${OUT_DIR}/Dockerfile"

cd "${OUT_DIR}" || exit

echo "Running docker build from directory: $(pwd)"

IMAGE_TAGS=()
if [[ -n "${DOCKER_ID}" ]]; then
  IMAGE_TAGS+=("-t" "${DOCKER_ID}/${PROJECT_NAME}:${RELEASE_IMAGE_TAG}")
fi

if [ ${#IMAGE_TAGS[@]} -eq 0 ]; then
  echo "No image tags specified. Set DOCKER_ID environment variable to enable Docker build."
  exit 0
fi

if [ "${MULTI_ARCH}" == "true" ]; then
  echo "Building multi-arch image"
  docker buildx create --name "${BUILDER_NAME}" --use || true
  # With "set -e" a failed build exits the script immediately; the trap makes
  # sure the temporary builder is still stopped and removed in that case.
  trap cleanup_builder EXIT
  docker buildx build \
    --builder="${BUILDER_NAME}" . \
    "${IMAGE_TAGS[@]}" \
    --build-arg VERSION="${TIKA_VERSION}" \
    --platform linux/amd64,linux/arm64 \
    --push
  trap - EXIT
  cleanup_builder
else
  echo "Building single-arch image"
  docker build . "${IMAGE_TAGS[@]}" --build-arg VERSION="${TIKA_VERSION}"
fi

echo "==================================================================================================="
echo "Done running docker build with tags: ${IMAGE_TAGS[*]}"
echo "==================================================================================================="
#!/bin/bash

# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.

# Container entrypoint: prints the effective settings, then replaces this shell
# with the tika-grpc JVM. Reads TIKA_VERSION, TIKA_GRPC_PORT (default 9090) and
# the optional TIKA_CONFIG; extra script arguments are appended to the java
# command line.

plugin_root="/tika/plugins"
grpc_port="${TIKA_GRPC_PORT:-9090}"

# module/package pairs that are opened to ALL-UNNAMED
opened_packages=(
  "jdk.management/com.sun.management.internal"
  "java.base/jdk.internal.misc"
  "java.base/sun.nio.ch"
  "java.management/com.sun.jmx.mbeanserver"
  "jdk.internal.jvmstat/sun.jvmstat.monitor"
  "java.base/sun.reflect.generics.reflectiveObjects"
  "java.base/java.io"
  "java.base/java.nio"
  "java.base/java.util"
  "java.base/java.lang"
)

echo "Tika Version: ${TIKA_VERSION}"
echo "Tika Plugins:"
ls "${plugin_root}"
echo "Tika gRPC Port: ${grpc_port}"

# Assemble the command line piece by piece: JVM flags, jar, optional config,
# then the fixed server options.
launch_cmd=(java)
for opened in "${opened_packages[@]}"; do
  launch_cmd+=("--add-opens=${opened}=ALL-UNNAMED")
done
launch_cmd+=("-Djava.net.preferIPv4Stack=true")
launch_cmd+=("-jar" "/tika/libs/tika-grpc-${TIKA_VERSION}.jar")

# The config flag is only passed when TIKA_CONFIG is set and non-empty.
if [ -n "${TIKA_CONFIG:-}" ]; then
  echo "Tika Config: ${TIKA_CONFIG}"
  launch_cmd+=("-c" "${TIKA_CONFIG}")
fi

launch_cmd+=("-p" "${grpc_port}")
launch_cmd+=("--plugin-roots" "${plugin_root}")

exec "${launch_cmd[@]}" "$@"
extractDefaultConfig(); + LOGGER.info("No config file specified, using bundled default-tika-config.json"); } File tikaConfigFile = new File(tikaConfig.getAbsolutePath()); healthStatusManager.setStatus(TikaGrpcServer.class.getSimpleName(), ServingStatus.SERVING); @@ -160,6 +166,22 @@ public static void main(String[] args) throws Exception { server.blockUntilShutdown(); } + private static File extractDefaultConfig() { + try (InputStream is = TikaGrpcServer.class.getResourceAsStream("/default-tika-config.json")) { + if (is == null) { + throw new IllegalArgumentException( + "Tika config file is required. Use -c to specify a config file."); + } + Path tempConfig = Files.createTempFile("tika-config-", ".json"); + tempConfig.toFile().deleteOnExit(); + Files.copy(is, tempConfig, StandardCopyOption.REPLACE_EXISTING); + return tempConfig.toFile(); + } catch (IOException e) { + throw new IllegalArgumentException( + "Tika config file is required. Use -c to specify a config file.", e); + } + } + public TikaGrpcServer setTikaConfig(File tikaConfig) { this.tikaConfig = tikaConfig; return this; diff --git a/tika-grpc/src/main/resources/default-tika-config.json b/tika-grpc/src/main/resources/default-tika-config.json new file mode 100644 index 00000000000..2c63c085104 --- /dev/null +++ b/tika-grpc/src/main/resources/default-tika-config.json @@ -0,0 +1,2 @@ +{ +} diff --git a/tika-server/docker-build/docker-tool.sh b/tika-server/docker-build/docker-tool.sh new file mode 100755 index 00000000000..2a82b5fa349 --- /dev/null +++ b/tika-server/docker-build/docker-tool.sh @@ -0,0 +1,131 @@ +#!/usr/bin/env bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
#!/usr/bin/env bash

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

# Builds, tests and publishes the tika-server Docker images. The Dockerfiles and
# build contexts are addressed relative to the current directory ("minimal" and
# "full"), so run the script from the directory that contains them.

image_name=apache/tika

# Prints the command-line synopsis.
usage() {
  echo "Usage:"
  echo "    docker-tool.sh -h                                               Display this help message."
  echo "    docker-tool.sh build <TIKA_DOCKER_VERSION> <TIKA_VERSION>       Builds <TIKA_DOCKER_VERSION> images for <TIKA_VERSION>."
  echo "    docker-tool.sh test <TIKA_DOCKER_VERSION>                       Tests images for <TIKA_DOCKER_VERSION>."
  echo "    docker-tool.sh publish <TIKA_DOCKER_VERSION> <TIKA_VERSION>     Builds multi-arch images for <TIKA_DOCKER_VERSION> and pushes to Docker Hub."
}

# Prints the message to stderr and exits with status 1.
die() {
  echo "$*" >&2
  exit 1
}

# Removes the buildx builder created by "publish", then exits via die.
stop_and_die() {
  docker buildx rm tika-builder || die "couldn't stop builder -- make sure to stop the builder manually! "
  die "$*"
}

# Exits with the usage text when a required positional argument is empty.
require_args() {
  local value
  for value in "$@"; do
    if [[ -z "$value" ]]; then
      usage >&2
      exit 1
    fi
  done
}

while getopts ":h" opt; do
  case ${opt} in
    h )
      usage
      exit 0
      ;;
    \? )
      echo "Invalid Option: -$OPTARG" 1>&2
      exit 1
      ;;
  esac
done

# Kills and removes the named test container.
stop_test_container() {
  container_name=$1
  docker kill "$container_name"
  docker rm "$container_name"
}

# Starts the image tagged $1, then checks that the server answers with HTTP 200,
# that the configured user is 35002:35002 and, when $2 is "true", that
# ImageMagick's convert is runnable in the container. Exits 1 on the first
# failed check; the container is always removed.
test_docker_image() {
  container_name=$1
  image=$image_name:$1
  full=$2

  docker run -d --name "$container_name" -p 127.0.0.1:9998:9998 "$image"
  # Fixed wait for the server to come up before probing it.
  sleep 10
  url=http://localhost:9998/
  status=$(curl --head --location --connect-timeout 5 --write-out '%{http_code}' --silent --output /dev/null "${url}")
  user=$(docker inspect "$container_name" --format '{{.Config.User}}')

  if [[ $status == '200' ]]
  then
    echo "$(tput setaf 2)Image: $image - Basic test passed$(tput sgr0)"
  else
    echo "$(tput setaf 1)Image: $image - Basic test failed$(tput sgr0)"
    stop_test_container "$container_name"
    exit 1
  fi

  # now test that the user is correctly set
  if [[ $user == '35002:35002' ]]
  then
    echo "$(tput setaf 2)Image: $image - User passed$(tput sgr0)"
  else
    echo "$(tput setaf 1)Image: $image - User failed$(tput sgr0)"
    stop_test_container "$container_name"
    exit 1
  fi

  if [ "$full" == true ]
  then
    # Test ImageMagick is installed and runnable
    if docker exec "$container_name" /usr/bin/convert -version >/dev/null
    then
      echo "$(tput setaf 2)Image: $image - ImageMagick passed$(tput sgr0)"
    else
      echo "$(tput setaf 1)Image: $image - ImageMagick failed$(tput sgr0)"
      stop_test_container "$container_name"
      exit 1
    fi
  fi

  stop_test_container "$container_name"
}

shift $((OPTIND -1))
# Positional arguments; absent ones become empty strings and are validated per
# subcommand below.
subcommand=${1:-}
tika_docker_version=${2:-}
tika_version=${3:-}


case "$subcommand" in
  build)
    require_args "$tika_docker_version" "$tika_version"
    # Build slim image with minimal dependencies
    docker build -t "${image_name}:${tika_docker_version}" --build-arg TIKA_VERSION="${tika_version}" - < minimal/Dockerfile --no-cache || die "couldn't build minimal"
    # Build full image with OCR, Fonts and GDAL
    docker build -t "${image_name}:${tika_docker_version}-full" --build-arg TIKA_VERSION="${tika_version}" - < full/Dockerfile --no-cache || die "couldn't build full"
    ;;

  test)
    require_args "$tika_docker_version"
    # Test the images
    test_docker_image "${tika_docker_version}" false
    test_docker_image "${tika_docker_version}-full" true
    ;;

  publish)
    require_args "$tika_docker_version" "$tika_version"
    docker buildx create --use --name tika-builder || die "couldn't create builder"
    # Build multi-arch with buildx and push
    docker buildx build --platform linux/arm/v7,linux/arm64/v8,linux/amd64,linux/s390x --output "type=image,push=true" \
      --tag "${image_name}:latest" --tag "${image_name}:${tika_docker_version}" --build-arg TIKA_VERSION="${tika_version}" --no-cache --builder tika-builder minimal || stop_and_die "couldn't build multi-arch minimal"
    docker buildx build --platform linux/arm/v7,linux/arm64/v8,linux/amd64,linux/s390x --output "type=image,push=true" \
      --tag "${image_name}:latest-full" --tag "${image_name}:${tika_docker_version}-full" --build-arg TIKA_VERSION="${tika_version}" --no-cache --builder tika-builder full || stop_and_die "couldn't build multi-arch full"
    docker buildx rm tika-builder || die "couldn't stop builder -- make sure to stop the builder manually! "
    ;;

  *)
    # A missing or unknown subcommand used to fall through silently.
    usage >&2
    exit 1
    ;;

esac
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.

# "Full" tika-server image: the released server jar plus a JRE, GDAL,
# ImageMagick, Tesseract language packs and fonts.

# "random" uid/gid hopefully not used anywhere else
# This needs to be set globally and then referenced in
# the subsequent stages -- see TIKA-3912
ARG UID_GID="35002:35002"

FROM ubuntu:plucky AS base

# Stage 1: download the released jar and its signature, then verify it.
FROM base AS fetch_tika

ARG TIKA_VERSION
# NOTE(review): CHECK_SIG is only referenced by the commented-out RUN below; the
# signature is currently always verified.
ARG CHECK_SIG=true

ENV NEAREST_TIKA_SERVER_URL="https://dlcdn.apache.org/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}.jar" \
    ARCHIVE_TIKA_SERVER_URL="https://archive.apache.org/dist/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}.jar" \
    BACKUP_TIKA_SERVER_URL="https://downloads.apache.org/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}.jar" \
    DEFAULT_TIKA_SERVER_ASC_URL="https://downloads.apache.org/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}.jar.asc" \
    ARCHIVE_TIKA_SERVER_ASC_URL="https://archive.apache.org/dist/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}.jar.asc" \
    TIKA_VERSION=$TIKA_VERSION

# The package installation uses the same form as the minimal Dockerfile:
# DEBIAN_FRONTEND is set on the install command (it previously prefixed only
# "apt-get update") and recommended packages are not pulled in.
# Each download tries the next URL only when the file is still missing after the
# previous attempt; a failed wget removes its partial output so that the
# "[ -f ... ]" test reflects a successful download.
RUN set -eux \
    && apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends \
        gnupg2 \
        wget \
        ca-certificates \
    && wget -t 10 --max-redirect 1 --retry-connrefused -qO- https://downloads.apache.org/tika/KEYS | gpg --import \
    && wget -t 10 --max-redirect 1 --retry-connrefused $NEAREST_TIKA_SERVER_URL -O /tika-server-standard-${TIKA_VERSION}.jar || rm /tika-server-standard-${TIKA_VERSION}.jar \
    && sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar ]" || wget $ARCHIVE_TIKA_SERVER_URL -O /tika-server-standard-${TIKA_VERSION}.jar || rm /tika-server-standard-${TIKA_VERSION}.jar \
    && sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar ]" || wget $BACKUP_TIKA_SERVER_URL -O /tika-server-standard-${TIKA_VERSION}.jar || rm /tika-server-standard-${TIKA_VERSION}.jar \
    && sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar ]" || exit 1 \
    && wget -t 10 --max-redirect 1 --retry-connrefused $DEFAULT_TIKA_SERVER_ASC_URL -O /tika-server-standard-${TIKA_VERSION}.jar.asc || rm /tika-server-standard-${TIKA_VERSION}.jar.asc \
    && sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar.asc ]" || wget $ARCHIVE_TIKA_SERVER_ASC_URL -O /tika-server-standard-${TIKA_VERSION}.jar.asc || rm /tika-server-standard-${TIKA_VERSION}.jar.asc \
    && sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar.asc ]" || exit 1 \
    && gpg --verify /tika-server-standard-${TIKA_VERSION}.jar.asc /tika-server-standard-${TIKA_VERSION}.jar

#RUN if [ "$CHECK_SIG" = "true" ] ; then gpg --verify /tika-server-standard-${TIKA_VERSION}.jar.asc /tika-server-standard-${TIKA_VERSION}.jar; fi

# Stage 2: runtime image with the JRE and native tooling.
FROM base AS runtime
ARG UID_GID
ARG JRE='openjdk-21-jre-headless'
RUN set -eux \
    && apt-get update \
    && apt-get install --yes --no-install-recommends gnupg2 software-properties-common \
    && apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends $JRE \
        gdal-bin \
        imagemagick \
        tesseract-ocr \
        tesseract-ocr-eng \
        tesseract-ocr-ita \
        tesseract-ocr-fra \
        tesseract-ocr-spa \
        tesseract-ocr-deu \
        tesseract-ocr-jpn \
    && echo ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true | debconf-set-selections \
    && DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends \
        xfonts-utils \
        fonts-freefont-ttf \
        fonts-liberation \
        ttf-mscorefonts-installer \
        wget \
        cabextract \
    && apt-get clean -y \
    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
# Declared after the package layer so a new TIKA_VERSION does not invalidate it.
ARG TIKA_VERSION
ENV TIKA_VERSION=$TIKA_VERSION

COPY --from=fetch_tika /tika-server-standard-${TIKA_VERSION}.jar /tika-server-standard-${TIKA_VERSION}.jar
USER $UID_GID

EXPOSE 9998
# "sh -c" takes the first extra argument as $0, so "$0 $@" forwards every
# argument given to "docker run" on to TikaServerCli.
ENTRYPOINT [ "/bin/sh", "-c", "exec java -cp \"/tika-server-standard-${TIKA_VERSION}.jar:/tika-extras/*\" org.apache.tika.server.core.TikaServerCli -h 0.0.0.0 $0 $@"]

LABEL maintainer="Apache Tika Developers dev@tika.apache.org"
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.

# Snapshot variant: copies the assembly from the Maven build output rather than
# downloading from Apache mirrors. Used for nightly/snapshot Docker builds.
# The build context must contain a tika-server/ directory (it is COPYed below).

# uid:gid the server runs as; declared globally and re-declared inside the stage.
ARG UID_GID="35002:35002"

FROM ubuntu:plucky AS runtime
ARG UID_GID
ARG JRE='openjdk-21-jre-headless'
RUN set -eux \
    && apt-get update \
    && apt-get install --yes --no-install-recommends gnupg2 software-properties-common \
    && apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends $JRE \
        gdal-bin \
        imagemagick \
        tesseract-ocr \
        tesseract-ocr-eng \
        tesseract-ocr-ita \
        tesseract-ocr-fra \
        tesseract-ocr-spa \
        tesseract-ocr-deu \
        tesseract-ocr-jpn \
    && echo ttf-mscorefonts-installer msttcorefonts/accepted-mscorefonts-eula select true | debconf-set-selections \
    && DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends \
        xfonts-utils \
        fonts-freefont-ttf \
        fonts-liberation \
        ttf-mscorefonts-installer \
        wget \
        cabextract \
    && apt-get clean -y \
    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
# Declared after the package layer (as in the release Dockerfile): an ARG in
# scope of a RUN is part of that layer's cache key, so declaring it earlier
# would rebuild the whole package layer for every new snapshot version.
ARG TIKA_VERSION
ENV TIKA_VERSION=$TIKA_VERSION
COPY tika-server/ /tika-server/
USER $UID_GID
EXPOSE 9998
# "sh -c" takes the first extra argument as $0, so "$0 $@" forwards every
# argument given to "docker run" on to TikaServerCli.
ENTRYPOINT [ "/bin/sh", "-c", "exec java -cp \"/tika-server/tika-server.jar:/tika-server/lib/*:/tika-extras/*\" org.apache.tika.server.core.TikaServerCli -h 0.0.0.0 $0 $@"]

LABEL maintainer="Apache Tika Developers dev@tika.apache.org"
See the +# License for the specific language governing permissions and limitations under +# the License. + +# "random" uid/gid hopefully not used anywhere else +# This needs to be set globally and then referenced in +# the subsequent stages -- see TIKA-3912 +ARG UID_GID="35002:35002" + +FROM ubuntu:plucky AS base + +FROM base AS fetch_tika + +ARG TIKA_VERSION +ARG CHECK_SIG=true + +ENV NEAREST_TIKA_SERVER_URL="https://dlcdn.apache.org/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}.jar" \ + ARCHIVE_TIKA_SERVER_URL="https://archive.apache.org/dist/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}.jar" \ + BACKUP_TIKA_SERVER_URL="https://downloads.apache.org/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}.jar" \ + DEFAULT_TIKA_SERVER_ASC_URL="https://downloads.apache.org/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}.jar.asc" \ + ARCHIVE_TIKA_SERVER_ASC_URL="https://archive.apache.org/dist/tika/${TIKA_VERSION}/tika-server-standard-${TIKA_VERSION}.jar.asc" \ + TIKA_VERSION=$TIKA_VERSION + +RUN set -eux \ + && apt-get update \ + && DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends \ + gnupg2 \ + wget \ + ca-certificates \ + && wget -t 10 --max-redirect 1 --retry-connrefused -qO- https://downloads.apache.org/tika/KEYS | gpg --import \ + && wget -t 10 --max-redirect 1 --retry-connrefused $NEAREST_TIKA_SERVER_URL -O /tika-server-standard-${TIKA_VERSION}.jar || rm /tika-server-standard-${TIKA_VERSION}.jar \ + && sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar ]" || wget $ARCHIVE_TIKA_SERVER_URL -O /tika-server-standard-${TIKA_VERSION}.jar || rm /tika-server-standard-${TIKA_VERSION}.jar \ + && sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar ]" || wget $BACKUP_TIKA_SERVER_URL -O /tika-server-standard-${TIKA_VERSION}.jar || rm /tika-server-standard-${TIKA_VERSION}.jar \ + && sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar ]" || exit 1 \ + && wget -t 10 --max-redirect 1 --retry-connrefused 
$DEFAULT_TIKA_SERVER_ASC_URL -O /tika-server-standard-${TIKA_VERSION}.jar.asc || rm /tika-server-standard-${TIKA_VERSION}.jar.asc \ + && sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar.asc ]" || wget $ARCHIVE_TIKA_SERVER_ASC_URL -O /tika-server-standard-${TIKA_VERSION}.jar.asc || rm /tika-server-standard-${TIKA_VERSION}.jar.asc \ + && sh -c "[ -f /tika-server-standard-${TIKA_VERSION}.jar.asc ]" || exit 1 \ + && gpg --verify /tika-server-standard-${TIKA_VERSION}.jar.asc /tika-server-standard-${TIKA_VERSION}.jar + +# this used to work, but I'm getting "ERROR: failed to solve: failed to prepare $data as $data2: invalid argument" +# when trying to build 2.9.2.0. As a result CHECK_SIG is currently unused and the signature is always verified by the RUN above. +#RUN if [ "$CHECK_SIG" = "true" ] ; then gpg --verify /tika-server-standard-${TIKA_VERSION}.jar.asc /tika-server-standard-${TIKA_VERSION}.jar; fi + +FROM base AS runtime +# UID_GID is declared before the first FROM, so it must be re-declared here to be visible in this stage (see TIKA-3912) +ARG UID_GID +ARG JRE='openjdk-21-jre-headless' +RUN set -eux \ + && apt-get update \ + && DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends \ + ${JRE} \ + ca-certificates \ + && apt-get clean -y && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* +ARG TIKA_VERSION +ENV TIKA_VERSION=$TIKA_VERSION +COPY --from=fetch_tika /tika-server-standard-${TIKA_VERSION}.jar /tika-server-standard-${TIKA_VERSION}.jar +USER $UID_GID +EXPOSE 9998 +ENTRYPOINT [ "/bin/sh", "-c", "exec java -cp \"/tika-server-standard-${TIKA_VERSION}.jar:/tika-extras/*\" org.apache.tika.server.core.TikaServerCli -h 0.0.0.0 $0 $@"] + +LABEL maintainer="Apache Tika Developers dev@tika.apache.org" diff --git a/tika-server/docker-build/minimal/Dockerfile.snapshot b/tika-server/docker-build/minimal/Dockerfile.snapshot new file mode 100644 index 00000000000..d701dfee68b --- /dev/null +++ b/tika-server/docker-build/minimal/Dockerfile.snapshot @@ -0,0 +1,34 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. 
You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. + +# Snapshot variant: copies the assembly from the Maven build output rather than +# downloading from Apache mirrors. Used for nightly/snapshot Docker builds. The build context must contain a tika-server/ directory holding tika-server.jar and lib/, which the COPY and ENTRYPOINT classpath rely on. + +ARG UID_GID="35002:35002" + +FROM ubuntu:plucky AS runtime +ARG UID_GID +ARG TIKA_VERSION +ARG JRE='openjdk-21-jre-headless' +RUN set -eux \ + && apt-get update \ + && DEBIAN_FRONTEND=noninteractive apt-get install --yes --no-install-recommends \ + ${JRE} \ + ca-certificates \ + && apt-get clean -y && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* +ENV TIKA_VERSION=$TIKA_VERSION +COPY tika-server/ /tika-server/ +USER $UID_GID +EXPOSE 9998 +ENTRYPOINT [ "/bin/sh", "-c", "exec java -cp \"/tika-server/tika-server.jar:/tika-server/lib/*:/tika-extras/*\" org.apache.tika.server.core.TikaServerCli -h 0.0.0.0 $0 $@"] + +LABEL maintainer="Apache Tika Developers dev@tika.apache.org" diff --git a/tika-server/docker-build/sample-configs/customocr/org/apache/tika/parser/ocr/TesseractOCRConfig.properties b/tika-server/docker-build/sample-configs/customocr/org/apache/tika/parser/ocr/TesseractOCRConfig.properties new file mode 100644 index 00000000000..b4b787ffc65 --- /dev/null +++ b/tika-server/docker-build/sample-configs/customocr/org/apache/tika/parser/ocr/TesseractOCRConfig.properties @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# You customise or add the settings you want here +language=eng+spa+fra+deu+ita +timeout=240 +minFileSizeToOcr=1 +enableImageProcessing=0 +density=200 +depth=8 +filter=box +resize=300 +applyRotation=true \ No newline at end of file diff --git a/tika-server/docker-build/sample-configs/customocr/tika-config-inline.xml b/tika-server/docker-build/sample-configs/customocr/tika-config-inline.xml new file mode 100644 index 00000000000..1c9b613033a --- /dev/null +++ b/tika-server/docker-build/sample-configs/customocr/tika-config-inline.xml @@ -0,0 +1,31 @@ + + + + + + + + + + + true + + + + + diff --git a/tika-server/docker-build/sample-configs/customocr/tika-config-rendered.xml b/tika-server/docker-build/sample-configs/customocr/tika-config-rendered.xml new file mode 100644 index 00000000000..bcd86669963 --- /dev/null +++ b/tika-server/docker-build/sample-configs/customocr/tika-config-rendered.xml @@ -0,0 +1,38 @@ + + + + + + + + + + + + ocr_only + rgb + 100 + + + + + diff --git a/tika-server/docker-build/sample-configs/grobid/org/apache/tika/parser/journal/GrobidExtractor.properties b/tika-server/docker-build/sample-configs/grobid/org/apache/tika/parser/journal/GrobidExtractor.properties new file mode 100644 index 00000000000..44689a2bb3a --- /dev/null +++ b/tika-server/docker-build/sample-configs/grobid/org/apache/tika/parser/journal/GrobidExtractor.properties @@ -0,0 +1,16 @@ +# Licensed to the Apache 
Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +grobid.server.url=http://grobid:8070 \ No newline at end of file diff --git a/tika-server/docker-build/sample-configs/grobid/tika-config.xml b/tika-server/docker-build/sample-configs/grobid/tika-config.xml new file mode 100644 index 00000000000..5b4aad9c725 --- /dev/null +++ b/tika-server/docker-build/sample-configs/grobid/tika-config.xml @@ -0,0 +1,24 @@ + + + + + + application/pdf + + + diff --git a/tika-server/docker-build/sample-configs/ner/run_tika_server.sh b/tika-server/docker-build/sample-configs/ner/run_tika_server.sh new file mode 100755 index 00000000000..fb447be4cfe --- /dev/null +++ b/tika-server/docker-build/sample-configs/ner/run_tika_server.sh @@ -0,0 +1,62 @@ +#!/bin/bash + +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +############################################################################# +# See https://cwiki.apache.org/confluence/display/TIKA/TikaAndNER for details +# on how to configure additional NER libraries +############################################################################# + +# ------------------------------------ +# Download OpenNLP models to the classpath; a failed download is removed so no empty or truncated model file is left behind +# ------------------------------------ + +OPENNLP_LOCATION="/ner/org/apache/tika/parser/ner/opennlp" +URL="http://opennlp.sourceforge.net/models-1.5" + +mkdir -p "$OPENNLP_LOCATION" +if [ "$(ls -A "$OPENNLP_LOCATION"/*.bin 2>/dev/null)" ]; then + echo "OpenNLP models directory has files, so skipping fetch" +else + echo "No OpenNLP models found, so fetching them" + wget "$URL/en-ner-person.bin" -O "$OPENNLP_LOCATION/ner-person.bin" || rm -f "$OPENNLP_LOCATION/ner-person.bin" + wget "$URL/en-ner-location.bin" -O "$OPENNLP_LOCATION/ner-location.bin" || rm -f "$OPENNLP_LOCATION/ner-location.bin" + wget "$URL/en-ner-organization.bin" -O "$OPENNLP_LOCATION/ner-organization.bin" || rm -f "$OPENNLP_LOCATION/ner-organization.bin" + wget "$URL/en-ner-date.bin" -O "$OPENNLP_LOCATION/ner-date.bin" || rm -f "$OPENNLP_LOCATION/ner-date.bin" + wget "$URL/en-ner-time.bin" -O "$OPENNLP_LOCATION/ner-time.bin" || rm -f "$OPENNLP_LOCATION/ner-time.bin" + wget "$URL/en-ner-percentage.bin" -O "$OPENNLP_LOCATION/ner-percentage.bin" || rm -f "$OPENNLP_LOCATION/ner-percentage.bin" + wget "$URL/en-ner-money.bin" -O "$OPENNLP_LOCATION/ner-money.bin" || rm -f "$OPENNLP_LOCATION/ner-money.bin" +fi + +# -------------------------------------------- +# Create RegExp Example for Email on classpath +# -------------------------------------------- +REGEXP_LOCATION="/ner/org/apache/tika/parser/ner/regex" +mkdir -p "$REGEXP_LOCATION" +echo 
"EMAIL=(?:[a-z0-9!#$%&'*+/=?^_\`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_\`{|}~-]+)*|\"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*\")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])" > "$REGEXP_LOCATION/ner-regex.txt" + + +# ------------------- +# Now run Tika Server +# ------------------- + +# Can be a single implementation or comma separated list for multiple for "ner.impl.class" property +RECOGNISERS=org.apache.tika.parser.ner.opennlp.OpenNLPNERecogniser,org.apache.tika.parser.ner.regex.RegexNERecogniser +# Set classpath to the Tika Server JAR and the /ner folder so it has the configuration and models from above +CLASSPATH="/ner:/tika-server-standard-${TIKA_VERSION}.jar:/tika-extras/*" +# Run the server with the custom configuration ner.impl.class property and custom /ner/tika-config.xml; expansions are quoted so the classpath wildcard reaches java unexpanded +exec java "-Dner.impl.class=$RECOGNISERS" -cp "$CLASSPATH" org.apache.tika.server.core.TikaServerCli -h 0.0.0.0 -c /ner/tika-config.xml \ No newline at end of file diff --git a/tika-server/docker-build/sample-configs/ner/tika-config.xml b/tika-server/docker-build/sample-configs/ner/tika-config.xml new file mode 100644 index 00000000000..65d5774c22f --- /dev/null +++ b/tika-server/docker-build/sample-configs/ner/tika-config.xml @@ -0,0 +1,28 @@ + + + + + + application/pdf + text/plain + text/html + application/xhtml+xml + + + + diff --git a/tika-server/docker-build/sample-configs/vision/inception-rest-caption.xml b/tika-server/docker-build/sample-configs/vision/inception-rest-caption.xml new file mode 100644 index 00000000000..c70c207b281 --- /dev/null +++ b/tika-server/docker-build/sample-configs/vision/inception-rest-caption.xml @@ -0,0 +1,32 @@ + + + + + + image/jpeg + image/png + image/gif + + 
http://inception-caption:8764/inception/v3 + 5 + 15 + org.apache.tika.parser.captioning.tf.TensorflowRESTCaptioner + + + + \ No newline at end of file diff --git a/tika-server/docker-build/sample-configs/vision/inception-rest-video.xml b/tika-server/docker-build/sample-configs/vision/inception-rest-video.xml new file mode 100644 index 00000000000..f6a4e6a938c --- /dev/null +++ b/tika-server/docker-build/sample-configs/vision/inception-rest-video.xml @@ -0,0 +1,32 @@ + + + + + + video/mp4 + video/quicktime + + http://inception-video:8764/inception/v4 + 4 + 0.015 + fixed + org.apache.tika.parser.recognition.tf.TensorflowRESTVideoRecogniser + + + + \ No newline at end of file diff --git a/tika-server/docker-build/sample-configs/vision/inception-rest.xml b/tika-server/docker-build/sample-configs/vision/inception-rest.xml new file mode 100644 index 00000000000..caa64685952 --- /dev/null +++ b/tika-server/docker-build/sample-configs/vision/inception-rest.xml @@ -0,0 +1,32 @@ + + + + + + image/jpeg + image/png + image/gif + + http://inception-rest:8764/inception/v4 + 2 + 0.015 + org.apache.tika.parser.recognition.tf.TensorflowRESTRecogniser + + + +