diff --git a/tika-grpc/pom.xml b/tika-grpc/pom.xml
index 679cd5ffd88..d6ff82a18d8 100644
--- a/tika-grpc/pom.xml
+++ b/tika-grpc/pom.xml
@@ -222,6 +222,11 @@
tika-fetcher-http
${project.version}
+
+ org.apache.tika
+ tika-fetcher-google
+ ${project.version}
+
com.fasterxml.jackson.module
jackson-module-jsonSchema
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 7ebdba7db8e..1e0b098988f 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -643,6 +643,16 @@
com.google.guava
guava
${guava.version}
+
+
+ com.google.errorprone
+ error_prone_annotations
+
+
+ com.google.j2objc
+ j2objc-annotations
+
+
com.googlecode.json-simple
diff --git a/tika-pipes/tika-fetchers/pom.xml b/tika-pipes/tika-fetchers/pom.xml
index 999d269fcf6..2fe67146452 100644
--- a/tika-pipes/tika-fetchers/pom.xml
+++ b/tika-pipes/tika-fetchers/pom.xml
@@ -37,6 +37,7 @@
tika-fetcher-gcs
tika-fetcher-az-blob
tika-fetcher-microsoft-graph
+ tika-fetcher-google
@@ -45,4 +46,4 @@
3.0.0-rc1
-
\ No newline at end of file
+
diff --git a/tika-pipes/tika-fetchers/tika-fetcher-google/pom.xml b/tika-pipes/tika-fetchers/tika-fetcher-google/pom.xml
new file mode 100644
index 00000000000..2f5e97ead6c
--- /dev/null
+++ b/tika-pipes/tika-fetchers/tika-fetcher-google/pom.xml
@@ -0,0 +1,121 @@
+
+
+
+ 4.0.0
+
+
+ tika-fetchers
+ org.apache.tika
+ 4.0.0-SNAPSHOT
+
+
+ tika-fetcher-google
+ Google Tika Pipes Fetcher
+
+
+ 2.7.0
+
+
+
+
+
+ ${project.groupId}
+ tika-core
+ ${project.version}
+
+
+
+
+ com.google.api-client
+ google-api-client
+ ${google.api.client.version}
+
+
+ com.google.auth
+ google-auth-library-oauth2-http
+
+
+
+
+
+ com.google.auth
+ google-auth-library-oauth2-http
+ 1.30.0
+
+
+ com.google.errorprone
+ error_prone_annotations
+
+
+ com.google.j2objc
+ j2objc-annotations
+
+
+
+
+
+
+ com.google.apis
+ google-api-services-drive
+ v3-rev20241027-2.0.0
+
+
+
+
+ org.slf4j
+ slf4j-api
+
+
+
+
+ commons-io
+ commons-io
+
+
+
+
+ org.junit.jupiter
+ junit-jupiter
+ test
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-compiler-plugin
+
+
+
+ org.apache.tika.pipes.fetchers.google
+
+
+
+
+
+
+
+
+ 3.0.0-BETA-rc1
+
+
diff --git a/tika-pipes/tika-fetchers/tika-fetcher-google/src/main/java/org/apache/tika/pipes/fetchers/google/GoogleDriveFetcher.java b/tika-pipes/tika-fetchers/tika-fetcher-google/src/main/java/org/apache/tika/pipes/fetchers/google/GoogleDriveFetcher.java
new file mode 100644
index 00000000000..94a21740ee4
--- /dev/null
+++ b/tika-pipes/tika-fetchers/tika-fetcher-google/src/main/java/org/apache/tika/pipes/fetchers/google/GoogleDriveFetcher.java
@@ -0,0 +1,200 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.fetchers.google;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.Base64;
+import java.util.List;
+import java.util.Map;
+
+import com.google.api.client.googleapis.javanet.GoogleNetHttpTransport;
+import com.google.api.client.http.HttpRequestInitializer;
+import com.google.api.client.json.JsonFactory;
+import com.google.api.client.json.gson.GsonFactory;
+import com.google.api.services.drive.Drive;
+import com.google.api.services.drive.DriveScopes;
+import com.google.auth.http.HttpCredentialsAdapter;
+import com.google.auth.oauth2.GoogleCredentials;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.tika.config.Field;
+import org.apache.tika.config.Initializable;
+import org.apache.tika.config.InitializableProblemHandler;
+import org.apache.tika.config.Param;
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.pipes.fetcher.AbstractFetcher;
+import org.apache.tika.pipes.fetchers.google.config.GoogleDriveFetcherConfig;
+
+
+/**
+ * Fetches files from Google Drive using a service account key.
+ * <p>
+ * Fetch keys have the form {@code ${fileId},${subjectUser}}, where the subject user is the
+ * organizer of the file. The user is part of the key because the service account must act on
+ * behalf of that user (through delegated credentials) when requesting the file.
+ * <p>
+ * All settings live in a {@link GoogleDriveFetcherConfig}; the {@code @Field} setters only
+ * forward to it. {@link #initialize(Map)} must run before the first call to
+ * {@link #fetch(String, Metadata, ParseContext)}.
+ */
+public class GoogleDriveFetcher extends AbstractFetcher implements Initializable {
+    private static final Logger LOGGER = LoggerFactory.getLogger(GoogleDriveFetcher.class);
+    private static final JsonFactory JSON_FACTORY = GsonFactory.getDefaultInstance();
+    private static final String APPLICATION_NAME = "tika-fetcher-google";
+
+    /** Scoped (but not yet delegated) credentials, built once in {@link #initialize(Map)}. */
+    private GoogleCredentials baseCredentials;
+
+    /** Single source of truth for every setting; no setting is mirrored in a field here. */
+    private GoogleDriveFetcherConfig config = new GoogleDriveFetcherConfig();
+
+    /**
+     * Creates a fetcher with default settings; configure it through the {@code @Field} setters.
+     */
+    public GoogleDriveFetcher() {
+    }
+
+    /**
+     * Creates a fetcher backed by the given configuration.
+     *
+     * @param config the settings to use
+     */
+    public GoogleDriveFetcher(GoogleDriveFetcherConfig config) {
+        this.config = config;
+    }
+
+    /**
+     * Sets the waits between fetch attempts from a comma-delimited list such as {@code "5,10,15"}.
+     *
+     * @param commaDelimitedLongs comma-delimited whole numbers of seconds
+     * @throws TikaConfigException if the value is null, blank, or contains a non-numeric entry
+     */
+    @Field
+    public void setThrottleSeconds(String commaDelimitedLongs) throws TikaConfigException {
+        if (commaDelimitedLongs == null || commaDelimitedLongs.trim().isEmpty()) {
+            throw new TikaConfigException(
+                    "throttleSeconds must be a comma-delimited list of longs, but was empty");
+        }
+        String[] tokens = commaDelimitedLongs.split(",");
+        long[] seconds = new long[tokens.length];
+        for (int i = 0; i < tokens.length; i++) {
+            try {
+                seconds[i] = Long.parseLong(tokens[i].trim());
+            } catch (NumberFormatException e) {
+                // keep the cause so the offending token is visible in the stack trace
+                throw new TikaConfigException(
+                        "Invalid throttleSeconds entry '" + tokens[i] + "' in: " + commaDelimitedLongs, e);
+            }
+        }
+        setThrottleSeconds(seconds);
+    }
+
+    /**
+     * Sets the waits, in seconds, between fetch attempts. The array length is also the number
+     * of attempts {@link #fetch(String, Metadata, ParseContext)} makes (at least one).
+     *
+     * @param throttleSeconds waits in seconds
+     */
+    public void setThrottleSeconds(long[] throttleSeconds) {
+        config.setThrottleSeconds(throttleSeconds);
+    }
+
+    /**
+     * @param spoolToTemp if true, the file is copied to a local temp file before it is returned
+     */
+    @Field
+    public void setSpoolToTemp(boolean spoolToTemp) {
+        config.setSpoolToTemp(spoolToTemp);
+    }
+
+    /**
+     * @param serviceAccountKeyBase64 the service account key JSON, Base64 encoded
+     */
+    @Field
+    public void setServiceAccountKeyBase64(String serviceAccountKeyBase64) {
+        config.setServiceAccountKeyBase64(serviceAccountKeyBase64);
+    }
+
+    /**
+     * Stores a subject user in the configuration. Note that
+     * {@link #fetch(String, Metadata, ParseContext)} takes the subject user from the fetch key,
+     * not from this setting.
+     *
+     * @param subjectUser the user to store
+     */
+    @Field
+    public void setSubjectUser(String subjectUser) {
+        config.setSubjectUser(subjectUser);
+    }
+
+    /**
+     * Sets the OAuth scopes requested for the service account. A null or empty list falls back
+     * to {@link DriveScopes#DRIVE_READONLY}.
+     *
+     * @param scopes the scopes to request
+     */
+    @Field
+    public void setScopes(List<String> scopes) {
+        List<String> copy = new ArrayList<>();
+        if (scopes != null) {
+            copy.addAll(scopes);
+        }
+        if (copy.isEmpty()) {
+            copy.add(DriveScopes.DRIVE_READONLY);
+        }
+        config.setScopes(copy);
+    }
+
+    /**
+     * Decodes the configured service account key and builds the scoped base credentials.
+     *
+     * @param map initialization parameters; not used by this fetcher
+     * @throws TikaConfigException if the key is missing, is not valid Base64, or cannot be
+     *                             read as credentials
+     */
+    @Override
+    public void initialize(Map<String, Param> map) throws TikaConfigException {
+        String encodedKey = config.getServiceAccountKeyBase64();
+        if (encodedKey == null || encodedKey.trim().isEmpty()) {
+            throw new TikaConfigException("serviceAccountKeyBase64 must be set");
+        }
+
+        // Read the scopes from the config (where setScopes stores them), defaulting to read-only.
+        List<String> scopes = new ArrayList<>();
+        if (config.getScopes() != null) {
+            scopes.addAll(config.getScopes());
+        }
+        if (scopes.isEmpty()) {
+            scopes.add(DriveScopes.DRIVE_READONLY);
+        }
+
+        try {
+            byte[] keyJson = Base64.getDecoder().decode(encodedKey.trim());
+            baseCredentials = GoogleCredentials
+                    .fromStream(new ByteArrayInputStream(keyJson))
+                    .createScoped(scopes);
+        } catch (IllegalArgumentException e) {
+            throw new TikaConfigException("serviceAccountKeyBase64 is not valid Base64", e);
+        } catch (IOException e) {
+            throw new TikaConfigException("Failed to initialize Google Drive service", e);
+        }
+    }
+
+    /**
+     * No additional checks; {@link #initialize(Map)} already validates the service account key.
+     */
+    @Override
+    public void checkInitialization(InitializableProblemHandler initializableProblemHandler) throws TikaConfigException {
+    }
+
+    /**
+     * Fetches a Drive file, retrying on failure.
+     * <p>
+     * One attempt is made per configured throttle entry (at least one attempt overall), with
+     * the matching wait between attempts. No wait follows the final attempt.
+     *
+     * @param fetchKey     {@code ${fileId},${subjectUser}}
+     * @param metadata     not used by this fetcher
+     * @param parseContext not used by this fetcher
+     * @return a stream over the file content
+     * @throws TikaException if the fetcher is not initialized, the key is malformed, the thread
+     *                       is interrupted while waiting, or every attempt fails
+     * @throws IOException   declared by the fetcher contract
+     */
+    @Override
+    public InputStream fetch(String fetchKey, Metadata metadata, ParseContext parseContext) throws TikaException, IOException {
+        if (baseCredentials == null) {
+            throw new TikaException("GoogleDriveFetcher has not been initialized");
+        }
+
+        // Validate before the retry loop: a malformed key can never succeed on a retry.
+        String[] fetchKeySplit = fetchKey == null ? new String[0] : fetchKey.split(",");
+        if (fetchKeySplit.length != 2) {
+            throw new TikaException("Invalid fetch key, expected format ${fileId},${subjectUser}: " + fetchKey);
+        }
+        String fileId = fetchKeySplit[0].trim();
+        String subjectUser = fetchKeySplit[1].trim();
+        if (fileId.isEmpty() || subjectUser.isEmpty()) {
+            throw new TikaException("Invalid fetch key, expected format ${fileId},${subjectUser}: " + fetchKey);
+        }
+
+        long[] throttleSeconds = config.getThrottleSeconds();
+        int waits = throttleSeconds == null ? 0 : throttleSeconds.length;
+        int maxTries = Math.max(1, waits);
+        Exception lastFailure = null;
+
+        for (int tries = 0; tries < maxTries; tries++) {
+            long start = System.currentTimeMillis();
+            try {
+                return fetchOnce(fetchKey, fileId, subjectUser);
+            } catch (Exception e) {
+                LOGGER.warn("Exception fetching on retry={}", tries, e);
+                lastFailure = e;
+            } finally {
+                LOGGER.debug("Total to fetch {}", System.currentTimeMillis() - start);
+            }
+
+            // Only wait when another attempt will actually follow.
+            if (tries + 1 < maxTries) {
+                long seconds = Math.max(0L, throttleSeconds[tries]);
+                LOGGER.warn("Sleeping for {} seconds before retry", seconds);
+                try {
+                    Thread.sleep(seconds * 1000);
+                } catch (InterruptedException e) {
+                    // Restore the flag and stop retrying instead of looping while interrupted.
+                    Thread.currentThread().interrupt();
+                    throw new TikaException("Interrupted while waiting to retry fetch of " + fetchKey, e);
+                }
+            }
+        }
+
+        throw new TikaException("Could not fetch " + fetchKey, lastFailure);
+    }
+
+    /**
+     * Performs a single download attempt on behalf of {@code subjectUser}.
+     */
+    private InputStream fetchOnce(String fetchKey, String fileId, String subjectUser)
+            throws IOException, java.security.GeneralSecurityException {
+        GoogleCredentials delegatedCredentials = baseCredentials.createDelegated(subjectUser);
+        HttpRequestInitializer requestInitializer = new HttpCredentialsAdapter(delegatedCredentials);
+
+        // Kept local: a shared field would be overwritten by concurrent fetches.
+        Drive drive = new Drive.Builder(
+                GoogleNetHttpTransport.newTrustedTransport(),
+                JSON_FACTORY,
+                requestInitializer).setApplicationName(APPLICATION_NAME).build();
+
+        InputStream is = drive.files()
+                .get(fileId)
+                .executeMediaAsInputStream();
+
+        if (is == null) {
+            throw new IOException("Empty input stream when we tried to parse " + fetchKey);
+        }
+
+        if (!config.isSpoolToTemp()) {
+            return TikaInputStream.get(is);
+        }
+        return spoolToTempFile(is, fileId);
+    }
+
+    /**
+     * Copies {@code source} to a temp file, closes {@code source}, and returns a stream over
+     * the temp file. The temp resources are handed to the returned stream for cleanup.
+     */
+    private InputStream spoolToTempFile(InputStream source, String fileId) throws IOException {
+        TemporaryResources tmp = new TemporaryResources();
+        try (InputStream in = source) {
+            Path tmpPath = tmp.createTempFile(fileId + ".dat");
+            // createTempFile has already created the file, so the copy must be allowed to replace it.
+            Files.copy(in, tmpPath, java.nio.file.StandardCopyOption.REPLACE_EXISTING);
+            // A fresh Metadata keeps the caller's metadata untouched.
+            return TikaInputStream.get(tmpPath, new Metadata(), tmp);
+        } catch (IOException | RuntimeException e) {
+            try {
+                tmp.close();
+            } catch (IOException closeFailure) {
+                e.addSuppressed(closeFailure);
+            }
+            throw e;
+        }
+    }
+}
diff --git a/tika-pipes/tika-fetchers/tika-fetcher-google/src/main/java/org/apache/tika/pipes/fetchers/google/config/GoogleDriveFetcherConfig.java b/tika-pipes/tika-fetchers/tika-fetcher-google/src/main/java/org/apache/tika/pipes/fetchers/google/config/GoogleDriveFetcherConfig.java
new file mode 100644
index 00000000000..f03db469559
--- /dev/null
+++ b/tika-pipes/tika-fetchers/tika-fetcher-google/src/main/java/org/apache/tika/pipes/fetchers/google/config/GoogleDriveFetcherConfig.java
@@ -0,0 +1,78 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.pipes.fetchers.google.config;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.tika.pipes.fetcher.config.AbstractConfig;
+
+/**
+ * Settings for the Google Drive fetcher. Setters return {@code this} so calls can be chained.
+ */
+public class GoogleDriveFetcherConfig extends AbstractConfig {
+    /** Waits, in seconds, between fetch attempts; null means "use the defaults". */
+    private long[] throttleSeconds;
+    /** Whether the fetched file is copied to a local temp file first. */
+    private boolean spoolToTemp;
+    /** Service account key JSON, Base64 encoded. */
+    protected String serviceAccountKeyBase64;
+    /** Subject user stored alongside the other settings. */
+    protected String subjectUser;
+    /** OAuth scopes; never null. */
+    private List<String> scopes = new ArrayList<>();
+
+    /**
+     * @return true if the fetched file should be spooled to a temp file
+     */
+    public boolean isSpoolToTemp() {
+        return spoolToTemp;
+    }
+
+    /**
+     * @param spoolToTemp whether to spool the fetched file to a temp file
+     * @return this config
+     */
+    public GoogleDriveFetcherConfig setSpoolToTemp(boolean spoolToTemp) {
+        this.spoolToTemp = spoolToTemp;
+        return this;
+    }
+
+    /**
+     * @return the configured waits in seconds, or a fresh {@code {5, 10, 15}} array when none
+     *         have been set
+     */
+    public long[] getThrottleSeconds() {
+        if (throttleSeconds == null) {
+            return new long[]{5, 10, 15}; // Default retry intervals
+        }
+        return throttleSeconds;
+    }
+
+    /**
+     * @param throttleSeconds waits in seconds; null restores the defaults
+     * @return this config
+     */
+    public GoogleDriveFetcherConfig setThrottleSeconds(long[] throttleSeconds) {
+        this.throttleSeconds = throttleSeconds;
+        return this;
+    }
+
+    /**
+     * @return the Base64-encoded service account key, or null if not set
+     */
+    public String getServiceAccountKeyBase64() {
+        return serviceAccountKeyBase64;
+    }
+
+    /**
+     * @param serviceAccountKeyBase64 the service account key JSON, Base64 encoded
+     * @return this config
+     */
+    public GoogleDriveFetcherConfig setServiceAccountKeyBase64(String serviceAccountKeyBase64) {
+        this.serviceAccountKeyBase64 = serviceAccountKeyBase64;
+        return this;
+    }
+
+    /**
+     * @return the stored subject user, or null if not set
+     */
+    public String getSubjectUser() {
+        return subjectUser;
+    }
+
+    /**
+     * @param subjectUser the subject user to store
+     * @return this config
+     */
+    public GoogleDriveFetcherConfig setSubjectUser(String subjectUser) {
+        this.subjectUser = subjectUser;
+        return this;
+    }
+
+    /**
+     * @return the configured scopes; never null, possibly empty
+     */
+    public List<String> getScopes() {
+        return scopes;
+    }
+
+    /**
+     * @param scopes the scopes to store; null is treated as an empty list so that
+     *               {@link #getScopes()} never returns null
+     * @return this config
+     */
+    public GoogleDriveFetcherConfig setScopes(List<String> scopes) {
+        if (scopes == null) {
+            this.scopes = new ArrayList<>();
+        } else {
+            this.scopes = scopes;
+        }
+        return this;
+    }
+}