diff --git a/tika-grpc/pom.xml b/tika-grpc/pom.xml index 679cd5ffd88..d6ff82a18d8 100644 --- a/tika-grpc/pom.xml +++ b/tika-grpc/pom.xml @@ -222,6 +222,11 @@ tika-fetcher-http ${project.version} + + org.apache.tika + tika-fetcher-google + ${project.version} + com.fasterxml.jackson.module jackson-module-jsonSchema diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml index 7ebdba7db8e..1e0b098988f 100644 --- a/tika-parent/pom.xml +++ b/tika-parent/pom.xml @@ -643,6 +643,16 @@ com.google.guava guava ${guava.version} + + + com.google.errorprone + error_prone_annotations + + + com.google.j2objc + j2objc-annotations + + com.googlecode.json-simple diff --git a/tika-pipes/tika-fetchers/pom.xml b/tika-pipes/tika-fetchers/pom.xml index 999d269fcf6..2fe67146452 100644 --- a/tika-pipes/tika-fetchers/pom.xml +++ b/tika-pipes/tika-fetchers/pom.xml @@ -37,6 +37,7 @@ tika-fetcher-gcs tika-fetcher-az-blob tika-fetcher-microsoft-graph + tika-fetcher-google @@ -45,4 +46,4 @@ 3.0.0-rc1 - \ No newline at end of file + diff --git a/tika-pipes/tika-fetchers/tika-fetcher-google/pom.xml b/tika-pipes/tika-fetchers/tika-fetcher-google/pom.xml new file mode 100644 index 00000000000..2f5e97ead6c --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-google/pom.xml @@ -0,0 +1,121 @@ + + + + 4.0.0 + + + tika-fetchers + org.apache.tika + 4.0.0-SNAPSHOT + + + tika-fetcher-google + Google Tika Pipes Fetcher + + + 2.7.0 + + + + + + ${project.groupId} + tika-core + ${project.version} + + + + + com.google.api-client + google-api-client + ${google.api.client.version} + + + com.google.auth + google-auth-library-oauth2-http + + + + + + com.google.auth + google-auth-library-oauth2-http + 1.30.0 + + + com.google.errorprone + error_prone_annotations + + + com.google.j2objc + j2objc-annotations + + + + + + + com.google.apis + google-api-services-drive + v3-rev20241027-2.0.0 + + + + + org.slf4j + slf4j-api + + + + + commons-io + commons-io + + + + + org.junit.jupiter + junit-jupiter + test + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + + + + org.apache.tika.pipes.fetcher.s3 + + + + + + + + + 3.0.0-BETA-rc1 + + diff --git a/tika-pipes/tika-fetchers/tika-fetcher-google/src/main/java/org/apache/tika/pipes/fetchers/google/GoogleDriveFetcher.java b/tika-pipes/tika-fetchers/tika-fetcher-google/src/main/java/org/apache/tika/pipes/fetchers/google/GoogleDriveFetcher.java new file mode 100644 index 00000000000..94a21740ee4 --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-google/src/main/java/org/apache/tika/pipes/fetchers/google/GoogleDriveFetcher.java @@ -0,0 +1,200 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes.fetchers.google; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Base64; +import java.util.List; +import java.util.Map; + +import com.google.api.client.googleapis.javanet.GoogleNetHttpTransport; +import com.google.api.client.http.HttpRequestInitializer; +import com.google.api.client.json.JsonFactory; +import com.google.api.client.json.gson.GsonFactory; +import com.google.api.services.drive.Drive; +import com.google.api.services.drive.DriveScopes; +import com.google.auth.http.HttpCredentialsAdapter; +import com.google.auth.oauth2.GoogleCredentials; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.tika.config.Field; +import org.apache.tika.config.Initializable; +import org.apache.tika.config.InitializableProblemHandler; +import org.apache.tika.config.Param; +import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.TemporaryResources; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.pipes.fetcher.AbstractFetcher; +import org.apache.tika.pipes.fetchers.google.config.GoogleDriveFetcherConfig; + + +/** + * GoogleDrive Fetcher allows the fetching of files from a Google Drive, using a + * service account key. + * + * Fetch Keys are ${fileId},${subjectUser}, where the subject user is the + * organizer of the file. This user is necessary as part of the key as the + * service account must act on behalf of the user when querying for the file. + */ +public class GoogleDriveFetcher extends AbstractFetcher implements Initializable { + private static final Logger LOGGER = LoggerFactory.getLogger(GoogleDriveFetcher.class); + private static final JsonFactory JSON_FACTORY = GsonFactory.getDefaultInstance(); + + private GoogleCredentials baseCredentials; + + private Drive driveService; + private boolean spoolToTemp; + private List scopes; + + private GoogleDriveFetcherConfig config = new GoogleDriveFetcherConfig(); + + public GoogleDriveFetcher() { + scopes = new ArrayList<>(); + scopes.add(DriveScopes.DRIVE_READONLY); + } + + public GoogleDriveFetcher(GoogleDriveFetcherConfig config) { + this.config = config; + } + + @Field + public void setThrottleSeconds(String commaDelimitedLongs) throws TikaConfigException { + String[] longStrings = (commaDelimitedLongs == null ? "" : commaDelimitedLongs).split(","); + long[] seconds = new long[longStrings.length]; + for (int i = 0; i < longStrings.length; i++) { + try { + seconds[i] = Long.parseLong(longStrings[i]); + } catch (NumberFormatException e) { + throw new TikaConfigException(e.getMessage()); + } + } + setThrottleSeconds(seconds); + } + + public void setThrottleSeconds(long[] throttleSeconds) { + config.setThrottleSeconds(throttleSeconds); + } + + @Field + public void setSpoolToTemp(boolean spoolToTemp) { + config.setSpoolToTemp(spoolToTemp); + } + + @Field + public void setServiceAccountKeyBase64(String serviceAccountKeyBase64) { + config.setServiceAccountKeyBase64(serviceAccountKeyBase64); + } + + @Field + public void setSubjectUser(String subjectUser) { + config.setSubjectUser(subjectUser); + } + + @Field + public void setScopes(List scopes) { + config.setScopes(new ArrayList<>(scopes)); + if (config.getScopes().isEmpty()) { + config.getScopes().add(DriveScopes.DRIVE_READONLY); + } + } + + @Override + public void initialize(Map map) throws TikaConfigException { + try { + baseCredentials = GoogleCredentials + .fromStream(new ByteArrayInputStream(Base64.getDecoder().decode(config.getServiceAccountKeyBase64()))) + .createScoped(scopes); + } catch (IOException e) { + throw new TikaConfigException("Failed to initialize Google Drive service", e); + } + } + + @Override + public void checkInitialization(InitializableProblemHandler initializableProblemHandler) throws TikaConfigException { + } + + @Override + public InputStream fetch(String fetchKey, Metadata metadata, ParseContext parseContext) throws TikaException, IOException { + int tries = 0; + Exception ex = null; + TemporaryResources tmp = null; + + do { + long start = System.currentTimeMillis(); + try { + String[] fetchKeySplit = fetchKey.split(","); + if (fetchKeySplit.length != 2) { + throw new TikaException("Invalid fetch key, expected format ${fileId},${subjectUser}: " + fetchKey); + } + + String fileId = fetchKeySplit[0]; + String subjectUser = fetchKeySplit[1]; + + GoogleCredentials delegatedCredentials = baseCredentials.createDelegated(subjectUser); + final HttpRequestInitializer requestInitializer = new HttpCredentialsAdapter(delegatedCredentials); + + driveService = new Drive.Builder( + GoogleNetHttpTransport.newTrustedTransport(), + JSON_FACTORY, + requestInitializer).setApplicationName("tika-fetcher-google").build(); + + InputStream is = driveService.files() + .get(fileId) + .executeMediaAsInputStream(); + + if (is == null) { + throw new IOException("Empty input stream when we tried to parse " + fetchKey); + } + + if (spoolToTemp) { + tmp = new TemporaryResources(); + Path tmpPath = tmp.createTempFile(fileId + ".dat"); + Files.copy(is, tmpPath); + return TikaInputStream.get(tmpPath); + } + return TikaInputStream.get(is); + + } catch (Exception e) { + LOGGER.warn("Exception fetching on retry=" + tries, e); + ex = e; + } finally { + long elapsed = System.currentTimeMillis() - start; + LOGGER.debug("Total to fetch {}", elapsed); + } + + long[] throttleSeconds = config.getThrottleSeconds(); + + LOGGER.warn("Sleeping for {} seconds before retry", throttleSeconds[tries]); + try { + Thread.sleep(throttleSeconds[tries] * 1000); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + } while (++tries < config.getThrottleSeconds().length); + + throw new TikaException("Could not fetch " + fetchKey, ex); + } +} diff --git a/tika-pipes/tika-fetchers/tika-fetcher-google/src/main/java/org/apache/tika/pipes/fetchers/google/config/GoogleDriveFetcherConfig.java b/tika-pipes/tika-fetchers/tika-fetcher-google/src/main/java/org/apache/tika/pipes/fetchers/google/config/GoogleDriveFetcherConfig.java new file mode 100644 index 00000000000..f03db469559 --- /dev/null +++ b/tika-pipes/tika-fetchers/tika-fetcher-google/src/main/java/org/apache/tika/pipes/fetchers/google/config/GoogleDriveFetcherConfig.java @@ -0,0 +1,78 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.pipes.fetchers.google.config; + +import java.util.ArrayList; +import java.util.List; + +import org.apache.tika.pipes.fetcher.config.AbstractConfig; + +public class GoogleDriveFetcherConfig extends AbstractConfig { + private long[] throttleSeconds; + private boolean spoolToTemp; + protected String serviceAccountKeyBase64; + protected String subjectUser; + private List scopes = new ArrayList<>(); + + public boolean isSpoolToTemp() { + return spoolToTemp; + } + + public GoogleDriveFetcherConfig setSpoolToTemp(boolean spoolToTemp) { + this.spoolToTemp = spoolToTemp; + return this; + } + + public long[] getThrottleSeconds() { + if (throttleSeconds == null) { + return new long[]{5, 10, 15}; // Default retry intervals + } + return throttleSeconds; + } + + public GoogleDriveFetcherConfig setThrottleSeconds(long[] throttleSeconds) { + this.throttleSeconds = throttleSeconds; + return this; + } + + public String getServiceAccountKeyBase64() { + return serviceAccountKeyBase64; + } + + public GoogleDriveFetcherConfig setServiceAccountKeyBase64(String serviceAccountKeyBase64) { + this.serviceAccountKeyBase64 = serviceAccountKeyBase64; + return this; + } + + public String getSubjectUser() { + return subjectUser; + } + + public GoogleDriveFetcherConfig setSubjectUser(String subjectUser) { + this.subjectUser = subjectUser; + return this; + } + + public List getScopes() { + return scopes; + } + + public GoogleDriveFetcherConfig setScopes(List scopes) { + this.scopes = scopes; + return this; + } +}