Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
139 changes: 70 additions & 69 deletions babeldoc/format/pdf/high_level.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,31 +125,31 @@ def add_metadata(
processed.append(path)

temp_path = translate_config.get_working_file_path(f"{path.stem}.cmap.pdf")
pdf = pymupdf.open(path)
meta = pdf.metadata
if not meta:
meta = {}
creator = meta.get("creator", None)
producer = meta.get("producer", None)
if producer:
if not creator:
creator = producer
else:
creator += f", {producer}"
with pymupdf.open(path) as pdf:
meta = pdf.metadata
if not meta:
meta = {}
creator = meta.get("creator", None)
producer = meta.get("producer", None)
if producer:
if not creator:
creator = producer
else:
creator += f", {producer}"

translated_by = f"BabelDOC{WATERMARK_VERSION}_{time.time()}_Translation_generated_by_AI,please_carefully_discern"
if translate_config.metadata_extra_data:
translated_by += f"_{translate_config.metadata_extra_data}"
meta["producer"] = translated_by
meta["creator"] = creator
translated_by = f"BabelDOC{WATERMARK_VERSION}_{time.time()}_Translation_generated_by_AI,please_carefully_discern"
if translate_config.metadata_extra_data:
translated_by += f"_{translate_config.metadata_extra_data}"
meta["producer"] = translated_by
meta["creator"] = creator

for k, v in meta.items():
if v:
# 使用正则替换掉 surrogate 范围内的字符
meta[k] = re.sub(r"[\uD800-\uDFFF]", "", v)
for k, v in meta.items():
if v:
# 使用正则替换掉 surrogate 范围内的字符
meta[k] = re.sub(r"[\uD800-\uDFFF]", "", v)

pdf.set_metadata(meta)
safe_save(pdf, temp_path)
pdf.set_metadata(meta)
safe_save(pdf, temp_path)
shutil.move(temp_path, path)


Expand All @@ -167,9 +167,9 @@ def fix_cmap(translate_result: TranslateResult, translate_config: TranslationCon
processed.append(path)

temp_path = translate_config.get_working_file_path(f"{path.stem}.cmap.pdf")
pdf = pymupdf.open(path)
reproduce_cmap(pdf)
safe_save(pdf, temp_path)
with pymupdf.open(path) as pdf:
reproduce_cmap(pdf)
safe_save(pdf, temp_path)
shutil.move(temp_path, path)


Expand Down Expand Up @@ -472,7 +472,8 @@ def do_translate(
original_pdf_path = translation_config.input_file
logger.info(f"start to translate: {original_pdf_path}")
try:
check_metadata(Document(original_pdf_path))
with Document(original_pdf_path) as _doc:
check_metadata(_doc)
except InputFileGeneratedByBabelDOCError as e:
logger.error(
f"input file {original_pdf_path} is generated by BabelDOC, Cannot translate files that have already been translated."
Expand Down Expand Up @@ -679,56 +680,56 @@ def migrate_toc(
if translation_config.use_alternating_pages_dual:
logger.info('skipping TOC migration for "use_alternating_pages_dual" mode')
return
old_doc = Document(translation_config.input_file)
if not old_doc:
return
try:
fix_filter(old_doc)
fix_null_xref(old_doc)
except Exception:
logger.exception("auto fix failed, please check the pdf file")

toc_data = old_doc.get_toc()
with Document(translation_config.input_file) as old_doc:
if not old_doc:
return
try:
fix_filter(old_doc)
fix_null_xref(old_doc)
except Exception:
logger.exception("auto fix failed, please check the pdf file")

if not toc_data:
logger.info("No TOC found in the original PDF, skipping migration.")
return
toc_data = old_doc.get_toc()

if translation_config.only_include_translated_page:
total_page = set(range(0, len(old_doc)))
if not toc_data:
logger.info("No TOC found in the original PDF, skipping migration.")
return

pages_to_translate = {
i for i in len(old_doc) if translation_config.should_translate_page(i + 1)
}
if translation_config.only_include_translated_page:
total_page = set(range(0, len(old_doc)))

should_removed_page = list(total_page - pages_to_translate)
pages_to_translate = {
i for i in range(len(old_doc)) if translation_config.should_translate_page(i + 1)
}

files = {
translate_result.dual_pdf_path,
# translate_result.mono_pdf_path,
translate_result.no_watermark_dual_pdf_path,
# translate_result.no_watermark_mono_pdf_path
}
should_removed_page = list(total_page - pages_to_translate)

for f in files:
if not f:
continue
mig_toc_temp_input = translation_config.get_working_file_path(
"mig_toc_temp.pdf"
)
shutil.copy(f, mig_toc_temp_input)
new_doc = Document(mig_toc_temp_input.as_posix())
if not new_doc:
continue
files = {
translate_result.dual_pdf_path,
# translate_result.mono_pdf_path,
translate_result.no_watermark_dual_pdf_path,
# translate_result.no_watermark_mono_pdf_path
}

new_doc.set_toc(toc_data)
PDFCreater.save_pdf_with_timeout(
new_doc,
f.as_posix(),
translation_config=translation_config,
clean=not translation_config.skip_clean,
tag="mig_toc",
)
for f in files:
if not f:
continue
mig_toc_temp_input = translation_config.get_working_file_path(
"mig_toc_temp.pdf"
)
shutil.copy(f, mig_toc_temp_input)
with Document(mig_toc_temp_input.as_posix()) as new_doc:
if not new_doc:
continue

new_doc.set_toc(toc_data)
PDFCreater.save_pdf_with_timeout(
new_doc,
f.as_posix(),
translation_config=translation_config,
clean=not translation_config.skip_clean,
tag="mig_toc",
)


# mediabox -> '[0 nul 792]'
Expand Down