-
Notifications
You must be signed in to change notification settings - Fork 214
Expand file tree
/
Copy pathpdf_extractor.rb
More file actions
166 lines (147 loc) · 6.1 KB
/
pdf_extractor.rb
File metadata and controls
166 lines (147 loc) · 6.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
require 'rbconfig'
module Docsplit
class PdfExtractor
# Class-level caches: the office executable path resolved by
# office_executable, and the banner line memoized by version_string.
@@executable = nil
@@version_string = nil
# Provide a set of helper functions to determine the OS.
# HOST_OS is the 'host_os' entry of the interpreter's build configuration.
# RbConfig is used when that constant exists; otherwise fall back to the
# legacy Config constant.
HOST_OS = (defined?(RbConfig) ? RbConfig : Config)::CONFIG['host_os']
# True when HOST_OS names a Windows platform. Besides mswin/windows/cygwin
# this also recognises the MinGW and MSYS toolchains ("mingw32", "msys"),
# which are Windows hosts as well.
def windows?
  !!HOST_OS.match(/mswin|mingw|msys|windows|cygwin/i)
end
# True when HOST_OS contains "darwin" (case-insensitive), false otherwise.
def osx?
  (HOST_OS =~ /darwin/i) ? true : false
end
# True when HOST_OS contains "linux" (case-insensitive), false otherwise.
def linux?
  !HOST_OS.match(/linux/i).nil?
end
# The first line of the help output holds the name and version number
# of the office software to be used for extraction.
#
# The executable is first asked for its help text (`-h`, stderr discarded).
# Only when the first line of that output carries no digit at all (so no
# version number) is `--version` consulted instead. The result is memoized
# in @@version_string.
#
# Returns the first output line as a String, or nil when the chosen
# command printed nothing.
def version_string
  unless @@version_string
    null = windows? ? "NUL" : "/dev/null"
    banner = `#{office_executable} -h 2>#{null}`.split("\n").first
    # A pattern such as /[0-9]*/ also matches the empty string and would
    # make this fallback unconditional; require at least one real digit.
    if banner.to_s !~ /[0-9]/
      banner = `#{office_executable} --version`.split("\n").first
    end
    @@version_string = banner
  end
  @@version_string
end
# True when the version banner starts with "LibreOffice".
# version_string may be nil (no output from the office binary); that is
# treated as "not LibreOffice" instead of raising NoMethodError.
def libre_office?
  !!version_string.to_s.match(/\ALibreOffice/)
end
# True when the version banner starts with the literal text "OpenOffice.org".
# The dot is escaped so it no longer matches an arbitrary character, and a
# nil version_string yields false instead of raising NoMethodError.
def open_office?
  !!version_string.to_s.match(/\AOpenOffice\.org/)
end
# A set of default locations to search for office software
# These have been extracted from JODConverter. Each listed
# path should contain a directory "program" which in turn
# contains the "soffice" executable.
# see: https://github.com/mirkonasato/jodconverter/blob/master/jodconverter-core/src/main/java/org/artofsolving/jodconverter/office/OfficeUtils.java#L63-L91
#
# Returns a new, mutable Array of candidate paths on every call (callers
# such as office_executable prepend to it).
def office_search_paths
  if windows?
    office_names = ["LibreOffice 3", "LibreOffice 4", "OpenOffice.org 3"]
    # Look under the Program Files directories (the locations JODConverter
    # derives its defaults from). CommonProgramFiles is kept last so every
    # previously searched path is still searched. Unset variables are
    # skipped rather than passed to File.join, which would raise on nil.
    program_dirs = ["ProgramFiles", "ProgramFiles(x86)", "CommonProgramFiles"].map { |key| ENV[key] }.compact.uniq
    search_paths = program_dirs.map { |dir| office_names.map { |name| File.join(dir, name) } }.flatten
  elsif osx?
    search_paths = %w(
      /Applications/LibreOffice.app/Contents
      /Applications/OpenOffice.org.app/Contents
    )
  else # probably linux/unix
    # heroku libreoffice buildpack: https://github.com/rishihahs/heroku-buildpack-libreoffice
    search_paths = %w(
      /usr/lib/libreoffice
      /usr/lib64/libreoffice
      /opt/libreoffice
      /usr/lib/openoffice
      /usr/lib64/openoffice
      /opt/openoffice.org3
      /app/vendor/libreoffice
      /usr/bin/libreoffice
      /usr/local/bin
      /usr/lib64/openoffice.org3
    )
  end
  search_paths
end
# Identify the path to a working office executable.
#
# Candidates are the default search paths, preceded by ENV['OFFICE_PATH']
# when that variable is set. The first hit is cached in @@executable and
# returned.
#
# Raises ArgumentError when OFFICE_PATH is set but does not exist, and
# OfficeNotFound when no candidate yields an executable.
def office_executable
  candidates = office_search_paths

  # An explicitly configured OFFICE_PATH must exist; it then takes
  # priority over every default location.
  override = ENV['OFFICE_PATH']
  if override
    unless File.exist? override
      raise ArgumentError, "No such file or directory #{override}"
    end
    candidates.unshift(override)
  end

  # Where the executable sits below a candidate directory depends on the OS.
  os_suffixes =
    if windows?
      [["program", "soffice.bin"]]
    elsif osx?
      [["MacOS", "soffice"], ["Contents", "MacOS", "soffice"]]
    else
      [["program", "soffice"]]
    end
  suffixes = ["soffice"] + os_suffixes

  # Walk the candidates in order and stop once an executable is known.
  candidates.each do |candidate|
    if File.exist? candidate
      hits = []
      # A candidate that is not a directory is taken to be the executable itself.
      hits << candidate unless File.directory? candidate
      suffixes.each do |suffix|
        located = File.join(candidate, suffix)
        hits << located if File.exist? located
      end
      @@executable ||= hits.first
    end
    break if @@executable
  end

  raise OfficeNotFound, "No office software found" unless @@executable
  @@executable
end
# Used to specify the office location for JODConverter.
# Returns the directory two levels above the office executable.
def office_path
  program_dir = File.dirname(office_executable)
  File.dirname(program_dir)
end
# Convert documents to PDF.
#
# docs - a single document path or a (possibly nested) Array of paths.
# opts - Hash of options; opts[:output] is the destination directory
#        (defaults to '.') and is created when missing.
#
# Documents whose MIME type (as reported by `file`) is listed in GM_FORMATS
# are converted with `gm convert`. Everything else is handed to LibreOffice
# directly, or to JODConverter when the office software is not LibreOffice.
#
# Raises ExtractionFailed when the LibreOffice command exits unsuccessfully
# (run_jod raises the same for the JODConverter path).
def extract(docs, opts)
  out = opts[:output] || '.'
  FileUtils.mkdir_p out unless File.exist?(out)
  [docs].flatten.each do |doc|
    ext = File.extname(doc)
    basename = File.basename(doc, ext)
    # ESCAPE is defined elsewhere; presumably it shell-escapes its argument,
    # so escaped_* values are only suitable for command lines.
    escaped_doc, escaped_out, escaped_basename = [doc, out, basename].map(&ESCAPE)
    if GM_FORMATS.include?(`file -b --mime #{escaped_doc}`.strip.split(/[:;]\s+/)[0])
      `gm convert #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf`
    elsif libre_office?
      # Set the LibreOffice user profile, so that parallel uses of cloudcrowd don't trip over each other.
      # The environment value is not parsed by a shell, so it is built from
      # the raw output path rather than the escaped one.
      ENV['SYSUSERCONFIG'] = "file://#{File.expand_path(out)}"
      options = "--headless --invisible --norestore --nolockcheck --convert-to pdf --outdir #{escaped_out} #{escaped_doc}"
      cmd = "#{office_executable} #{options} 2>&1"
      result = `#{cmd}`.chomp
      raise ExtractionFailed, result unless $?.success?
    else # open office presumably, rely on JODConverter to figure it out.
      options = "-jar #{ESCAPED_ROOT}/vendor/jodconverter/jodconverter-core-3.0-beta-4.jar -r #{ESCAPED_ROOT}/vendor/conf/document-formats.js"
      run_jod "#{options} #{escaped_doc} #{escaped_out}/#{escaped_basename}.pdf", [], {}
    end
  end
end
# Java classpath entries (the build directory and everything under vendor),
# joined with the platform's path separator.
CLASSPATH = ["#{ESCAPED_ROOT}/build", "#{ESCAPED_ROOT}/vendor/'*'"].join(File::PATH_SEPARATOR)
# Points java.util.logging at the logging.properties file under vendor.
LOGGING = "-Djava.util.logging.config.file=" + "#{ESCAPED_ROOT}/vendor/logging.properties"
# Sets the java.awt.headless system property for the JVM.
HEADLESS = '-Djava.awt.headless=true'
private
# Runs a Java command, with quieted logging, and the classpath set properly.
#
# command       - String of java arguments placed after the classpath.
# pdfs          - a path or Array of paths; each is wrapped in double quotes
#                 and appended to the command line.
# opts          - not used by this method; kept so callers need not change.
# return_output - when true, return the command output (nil when empty)
#                 instead of true.
#
# The office location is always passed as the office.home system property.
# A bare path placed before -cp would be read by java as the main class.
#
# Raises ExtractionFailed, carrying the combined stdout/stderr output, when
# the java process does not exit successfully.
def run_jod(command, pdfs, opts, return_output=false)
  pdfs = [pdfs].flatten.map{|pdf| "\"#{pdf}\""}.join(' ')
  # Quoted like the pdf arguments so an office path containing spaces
  # stays a single argument.
  office = "-Doffice.home=\"#{office_path}\""
  cmd = "java #{HEADLESS} #{LOGGING} #{office} -cp #{CLASSPATH} #{command} #{pdfs} 2>&1"
  result = `#{cmd}`.chomp
  raise ExtractionFailed, result unless $?.success?
  return true unless return_output
  result.empty? ? nil : result
end
# Raised by office_executable when no office software can be located.
class OfficeNotFound < StandardError
end
end
end