From 2dfb61a900d8848a4fa6b0a47f4f1fd3c21127cd Mon Sep 17 00:00:00 2001 From: David Calvo Date: Thu, 19 Feb 2026 19:59:53 -0500 Subject: [PATCH] Add tagged content support inside transparency group XObjects MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per ISO 32000-2 §14.7.4.4.2, Form XObjects created by push_opacity, push_mask, and push_isolated now participate in the structure tree. Each sub-builder gets its own MCID counter and XObjectTagId. Structure elements reference XObject content via MCR dicts with /Stm pointing to the XObject stream. XObjects with tagged content get /StructParents entries in the parent tree. --- crates/krilla/src/content.rs | 63 +++++++++- crates/krilla/src/graphics/graphic.rs | 2 +- crates/krilla/src/graphics/mask.rs | 11 +- crates/krilla/src/graphics/xobject.rs | 26 ++++ crates/krilla/src/interchange/tagging/fmt.rs | 9 ++ crates/krilla/src/interchange/tagging/mod.rs | 57 +++++++++ crates/krilla/src/serialize.rs | 72 ++++++++++- crates/krilla/src/surface.rs | 119 ++++++++++++++++--- crates/krilla/src/text/type3.rs | 2 +- 9 files changed, 331 insertions(+), 30 deletions(-) diff --git a/crates/krilla/src/content.rs b/crates/krilla/src/content.rs index 27b5ff82..0d48296b 100644 --- a/crates/krilla/src/content.rs +++ b/crates/krilla/src/content.rs @@ -27,7 +27,7 @@ use crate::graphics::shading_function::{ use crate::graphics::shading_pattern::ShadingPattern; use crate::graphics::tiling_pattern::TilingPattern; use crate::graphics::xobject::XObject; -use crate::interchange::tagging::ContentTag; +use crate::interchange::tagging::{ContentTag, XObjectTagId}; use crate::num::NormalizedF32; use crate::resource; use crate::resource::{Resource, ResourceDictionaryBuilder}; @@ -55,6 +55,12 @@ pub(crate) struct ContentBuilder { /// A temporary buffer that's reused across the builder. 
scratch: Vec, pub(crate) active_marked_content: bool, + /// If set, this sub-builder is tagged and will produce an XObject with StructParents. + pub(crate) xobject_tag_id: Option, + /// The page index inherited from the parent surface (for MCR /Pg entries). + pub(crate) page_index: Option, + /// Per-XObject MCID counter, incremented by start_tagged in sub-builder context. + pub(crate) mcid_counter: i32, } /// Stores either a device-specific color space, @@ -78,6 +84,9 @@ impl ContentBuilder { bbox: None, scratch: Vec::new(), active_marked_content: false, + xobject_tag_id: None, + page_index: None, + mcid_counter: 0, } } @@ -857,10 +866,26 @@ impl ContentBuilder { ); } - pub(crate) fn draw_masked(&mut self, sc: &mut SerializeContext, mask: Mask, stream: Stream) { + pub(crate) fn draw_masked( + &mut self, + sc: &mut SerializeContext, + mask: Mask, + stream: Stream, + xobject_tag_id: Option, + page_index: Option, + num_mcids: i32, + ) { let state = ExtGState::new().mask(mask, sc); self.uses_mask = true; - let x_object = XObject::new(stream, false, true, None); + let x_object = XObject::new( + stream, + false, + true, + None, + xobject_tag_id, + page_index, + num_mcids, + ); self.draw_xobject(sc, x_object, &state); } @@ -869,17 +894,43 @@ impl ContentBuilder { sc: &mut SerializeContext, opacity: NormalizedF32, stream: Stream, + xobject_tag_id: Option, + page_index: Option, + num_mcids: i32, ) { let state = ExtGState::new() .stroking_alpha(opacity) .non_stroking_alpha(opacity); - let x_object = XObject::new(stream, true, false, None); + let x_object = XObject::new( + stream, + true, + false, + None, + xobject_tag_id, + page_index, + num_mcids, + ); self.draw_xobject(sc, x_object, &state); } - pub(crate) fn draw_isolated(&mut self, sc: &mut SerializeContext, stream: Stream) { + pub(crate) fn draw_isolated( + &mut self, + sc: &mut SerializeContext, + stream: Stream, + xobject_tag_id: Option, + page_index: Option, + num_mcids: i32, + ) { let state = ExtGState::new(); - let 
x_object = XObject::new(stream, true, false, None); + let x_object = XObject::new( + stream, + true, + false, + None, + xobject_tag_id, + page_index, + num_mcids, + ); self.draw_xobject(sc, x_object, &state); } diff --git a/crates/krilla/src/graphics/graphic.rs b/crates/krilla/src/graphics/graphic.rs index d8c38e82..e6f866a4 100644 --- a/crates/krilla/src/graphics/graphic.rs +++ b/crates/krilla/src/graphics/graphic.rs @@ -24,7 +24,7 @@ impl Graphic { /// from wherever the graphic is invoked. pub fn new(stream: Stream, isolated: bool) -> Self { Self { - x_object: XObject::new(stream, isolated, false, None), + x_object: XObject::new(stream, isolated, false, None, None, None, 0), } } } diff --git a/crates/krilla/src/graphics/mask.rs b/crates/krilla/src/graphics/mask.rs index c3ae15f6..da0a02a0 100644 --- a/crates/krilla/src/graphics/mask.rs +++ b/crates/krilla/src/graphics/mask.rs @@ -109,8 +109,15 @@ impl Cacheable for Mask { fn serialize(self, sc: &mut SerializeContext, root_ref: Ref) -> Deferred { let mut chunk = Chunk::new(); - let x_object = - sc.register_cacheable(XObject::new(self.stream, false, true, self.custom_bbox)); + let x_object = sc.register_cacheable(XObject::new( + self.stream, + false, + true, + self.custom_bbox, + None, + None, + 0, + )); let mut dict = chunk.indirect(root_ref).dict(); dict.pair(Name(b"Type"), Name(b"Mask")); diff --git a/crates/krilla/src/graphics/xobject.rs b/crates/krilla/src/graphics/xobject.rs index 1a9c7553..5dcaabf1 100644 --- a/crates/krilla/src/graphics/xobject.rs +++ b/crates/krilla/src/graphics/xobject.rs @@ -7,6 +7,7 @@ use crate::chunk_container::ChunkContainerFn; use crate::configure::ValidationError; use crate::geom::Rect; use crate::graphics::color::{rgb, DEVICE_RGB}; +use crate::interchange::tagging::XObjectTagId; use crate::resource; use crate::resource::{Resource, Resourceable}; use crate::serialize::{Cacheable, MaybeDeviceColorSpace, SerializeContext}; @@ -19,6 +20,12 @@ struct Repr { isolated: bool, 
transparency_group_color_space: bool, custom_bbox: Option, + /// If set, this XObject contains tagged content and needs StructParents. + xobject_tag_id: Option, + /// The page index where this XObject is drawn (for parent tree lookup). + page_index: Option, + /// Number of MCIDs allocated in this XObject's content stream. + num_mcids: i32, } #[derive(Debug, Hash, Clone, Eq, PartialEq)] @@ -30,6 +37,9 @@ impl XObject { isolated: bool, mut transparency_group_color_space: bool, custom_bbox: Option, + xobject_tag_id: Option, + page_index: Option, + num_mcids: i32, ) -> Self { // In case a mask was invoked in the content stream, we _always_ create // a new transparency group. Please see . @@ -48,6 +58,9 @@ impl XObject { isolated, transparency_group_color_space, custom_bbox, + xobject_tag_id, + page_index, + num_mcids, }))) } @@ -78,6 +91,15 @@ impl Cacheable for XObject { sc.register_validation_error(ValidationError::Transparency(sc.location)); } + // Register the XObject's tag ref and struct parent before the deferred closure. 
+ let struct_parents_key = if let Some(tag_id) = self.0.xobject_tag_id { + let page_index = self.0.page_index.unwrap(); + sc.register_xobject_tag_ref(tag_id, root_ref); + sc.register_xobject_struct_parent(tag_id, page_index, self.0.num_mcids) + } else { + None + }; + let serialize_settings = sc.serialize_settings(); // "Ordinarily, the CS entry may be present only for isolated transparency groups (those @@ -113,6 +135,10 @@ impl Cacheable for XObject { .to_pdf_rect(), ); + if let Some(key) = struct_parents_key { + x_object.struct_parents(key); + } + if use_transparency_group { let mut group = x_object.group(); let transparency = group.transparency(); diff --git a/crates/krilla/src/interchange/tagging/fmt.rs b/crates/krilla/src/interchange/tagging/fmt.rs index 66c9bdbd..49b3ae51 100644 --- a/crates/krilla/src/interchange/tagging/fmt.rs +++ b/crates/krilla/src/interchange/tagging/fmt.rs @@ -95,6 +95,15 @@ impl Output for Node { ai.page_index, ai.annot_index ) } + Node::Leaf(Identifier(IdentifierInner::Real(IdentifierType::XObjectIdentifier( + xi, + )))) => { + writeln!( + f, + "{indent}- XObject: page={} tag_id={} mcid={}", + xi.page_index, xi.xobject_tag_id.0, xi.mcid + ) + } Node::Leaf(Identifier(IdentifierInner::Dummy)) => writeln!(f, "{indent}- Artifact"), } } diff --git a/crates/krilla/src/interchange/tagging/mod.rs b/crates/krilla/src/interchange/tagging/mod.rs index 9fcba3f8..2856c125 100644 --- a/crates/krilla/src/interchange/tagging/mod.rs +++ b/crates/krilla/src/interchange/tagging/mod.rs @@ -322,6 +322,34 @@ impl<'a> SpanTag<'a> { } } +/// A unique identifier for a tagged sub-builder (XObject). +/// Assigned when creating a sub-builder for transparency groups. +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub(crate) struct XObjectTagId(pub(crate) usize); + +/// Tracks MCIDs within a specific XObject content stream. 
+#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub(crate) struct XObjectTagIdentifier { + /// The page where the XObject is drawn (for MCR /Pg reference). + pub(crate) page_index: usize, + /// The unique XObject tag ID. + pub(crate) xobject_tag_id: XObjectTagId, + /// The marked content identifier within this XObject. + pub(crate) mcid: i32, +} + +impl From for IdentifierType { + fn from(value: XObjectTagIdentifier) -> Self { + IdentifierType::XObjectIdentifier(value) + } +} + +impl From for Identifier { + fn from(value: XObjectTagIdentifier) -> Self { + Identifier(IdentifierInner::Real(value.into())) + } +} + #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] pub(crate) struct PageTagIdentifier { pub(crate) page_index: usize, @@ -382,9 +410,11 @@ impl AnnotationIdentifier { } #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +#[allow(clippy::enum_variant_names)] pub(crate) enum IdentifierType { PageIdentifier(PageTagIdentifier), AnnotationIdentifier(AnnotationIdentifier), + XObjectIdentifier(XObjectTagIdentifier), } #[derive(Copy, Clone, Debug, Eq, PartialEq)] @@ -1184,6 +1214,33 @@ fn serialize_children( .page(page_ref) .object(*annotation_ref); } + IdentifierType::XObjectIdentifier(xi) => { + let page_ref = sc + .page_infos() + .get(xi.page_index) + .unwrap_or_else(|| panic!( + "tag tree contains xobject identifier from page {}, but document only has {} pages", + xi.page_index + 1, + sc.page_infos().len() + )) + .ref_(); + + let xobj_ref = sc.xobject_tag_ref(xi.xobject_tag_id).unwrap_or_else(|| { + panic!("xobject tag ref not found for {:?}", xi.xobject_tag_id) + }); + + if parent_tree_map.contains_key(&xi.into()) { + panic!("identifier {xi:?} appears twice in the tag tree"); + } + + parent_tree_map.insert(xi.into(), parent_ref); + + struct_children + .marked_content_ref() + .marked_content_id(xi.mcid) + .page(page_ref) + .stream(xobj_ref); + } }, } } diff --git a/crates/krilla/src/serialize.rs b/crates/krilla/src/serialize.rs index 6853e5f5..c3a55e0a 
100644 --- a/crates/krilla/src/serialize.rs +++ b/crates/krilla/src/serialize.rs @@ -25,7 +25,9 @@ use crate::interactive::destination::{NamedDestination, XyzDestination}; use crate::interchange::embed::EmbeddedFile; use crate::interchange::metadata::Metadata; use crate::interchange::outline::Outline; -use crate::interchange::tagging::{AnnotationIdentifier, PageTagIdentifier, TagTree}; +use crate::interchange::tagging::{ + AnnotationIdentifier, PageTagIdentifier, TagTree, XObjectTagId, XObjectTagIdentifier, +}; use crate::page::{InternalPage, PageLabel, PageLabelContainer}; #[cfg(feature = "pdf")] use crate::pdf::{PdfDocument, PdfSerializerContext}; @@ -192,6 +194,8 @@ enum StructParentElement { /// The index of the page where the annotation is present, as well as the index of the /// annotation within that one page. Annotation(AnnotationIdentifier), + /// A tagged XObject with its tag ID, page index, and number of MCIDs. + XObject(XObjectTagId, usize, i32), } #[derive(Debug)] @@ -242,6 +246,10 @@ pub(crate) struct SerializeContext { validation_store: ValidationStore, /// The current location, if set. pub(crate) location: Option, + /// Counter for allocating unique XObject tag IDs. + xobject_tag_counter: usize, + /// Maps XObject tag IDs to their PDF object references. + xobject_tag_refs: HashMap, } impl SerializeContext { @@ -270,6 +278,8 @@ impl SerializeContext { validation_errors: vec![], serialize_settings: Arc::new(serialize_settings), limits: Limits::new(), + xobject_tag_counter: 0, + xobject_tag_refs: HashMap::new(), validation_store: ValidationStore::new(), } } @@ -484,6 +494,41 @@ impl SerializeContext { } } + /// Allocate a new unique XObject tag ID. + pub(crate) fn new_xobject_tag_id(&mut self) -> XObjectTagId { + let id = XObjectTagId(self.xobject_tag_counter); + self.xobject_tag_counter += 1; + id + } + + /// Register the PDF object reference for a tagged XObject. 
+ pub(crate) fn register_xobject_tag_ref(&mut self, tag_id: XObjectTagId, obj_ref: Ref) { + self.xobject_tag_refs.insert(tag_id, obj_ref); + } + + /// Look up the PDF object reference for a tagged XObject. + pub(crate) fn xobject_tag_ref(&self, tag_id: XObjectTagId) -> Option { + self.xobject_tag_refs.get(&tag_id).copied() + } + + /// Register a tagged XObject in the parent tree. + pub(crate) fn register_xobject_struct_parent( + &mut self, + tag_id: XObjectTagId, + page_index: usize, + num_mcids: i32, + ) -> Option { + if self.serialize_settings.enable_tagging && num_mcids > 0 { + let id = self.global_objects.struct_parents.len(); + self.global_objects + .struct_parents + .push(StructParentElement::XObject(tag_id, page_index, num_mcids)); + Some(i32::try_from(id).unwrap()) + } else { + None + } + } + pub(crate) fn register_named_destination(&mut self, nd: NamedDestination) { let dest_ref = self.register_xyz_destination((*nd.xyz_dest).clone()); self.global_objects.named_destinations.insert(nd, dest_ref); @@ -818,6 +863,31 @@ impl SerializeContext { }); tree_nums.insert(index as i32, parent_ref); } + StructParentElement::XObject(tag_id, page_index, num_mcids) => { + // XObjects work like pages: map MCIDs to struct element refs. 
+ let mut list_chunk = Chunk::new(); + let list_ref = self.new_ref(); + + let mut refs = list_chunk.indirect(list_ref).array(); + + for mcid in 0..num_mcids { + let xi = XObjectTagIdentifier { + page_index, + xobject_tag_id: tag_id, + mcid, + }; + refs.item(parent_tree_map.get(&xi.into()).unwrap_or_else(|| { + panic!( + "xobject tag identifier {xi:?} doesn't appear in the tag tree" + ) + })); + } + + refs.finish(); + + sub_chunks.push(list_chunk); + tree_nums.insert(index as i32, list_ref); + } } } diff --git a/crates/krilla/src/surface.rs b/crates/krilla/src/surface.rs index e92a701d..81024e4c 100644 --- a/crates/krilla/src/surface.rs +++ b/crates/krilla/src/surface.rs @@ -20,7 +20,10 @@ use crate::graphics::image::Image; use crate::graphics::mask::Mask; use crate::graphics::paint::{Fill, FillRule, Stroke}; use crate::graphics::shading_function::ShadingFunction; -use crate::interchange::tagging::{ContentTag, Identifier, PageTagIdentifier}; +use crate::interchange::tagging::{ + ContentTag, Identifier, IdentifierInner, IdentifierType, PageTagIdentifier, + XObjectTagIdentifier, +}; use crate::num::NormalizedF32; use crate::paint::{InnerPaint, Paint}; #[cfg(feature = "pdf")] @@ -156,7 +159,48 @@ impl<'a> Surface<'a> { /// # Panics /// Panics if a tagged section has already been started. pub fn start_tagged(&mut self, tag: ContentTag) -> Identifier { - if let Some(id) = &mut self.page_identifier { + // Check if we're in a tagged sub-builder (transparency group) first, + // since page_identifier stays set on the Surface even inside sub-builders. 
+ let tag_info = { + let builder = self.bd.get(); + builder + .xobject_tag_id + .map(|tag_id| (tag_id, builder.page_index.unwrap())) + }; + + if let Some((tag_id, page_index)) = tag_info { + match tag { + ContentTag::Artifact(at) => { + if at.requires_properties() { + self.bd + .get_mut() + .start_marked_content_with_properties(self.sc, None, tag); + } else { + self.bd.get_mut().start_marked_content(tag.name()); + } + Identifier::dummy() + } + ContentTag::Span(_) | ContentTag::Other => { + let mcid = { + let builder = self.bd.get_mut(); + let mcid = builder.mcid_counter; + builder.mcid_counter += 1; + mcid + }; + self.bd.get_mut().start_marked_content_with_properties( + self.sc, + Some(mcid), + tag, + ); + let xi = XObjectTagIdentifier { + page_index, + xobject_tag_id: tag_id, + mcid, + }; + Identifier(IdentifierInner::Real(IdentifierType::XObjectIdentifier(xi))) + } + } + } else if let Some(id) = &mut self.page_identifier { match tag { // An artifact is actually not really part of tagged PDF and doesn't have // a marked content identifier, so we need to return a dummy one here. It's just @@ -214,7 +258,7 @@ impl<'a> Surface<'a> { /// # Panics /// Panics if no tagged section has been started. 
pub fn end_tagged(&mut self) { - if self.page_identifier.is_some() { + if self.page_identifier.is_some() || self.bd.get().xobject_tag_id.is_some() { self.bd.get_mut().end_marked_content(); } } @@ -410,9 +454,9 @@ impl<'a> Surface<'a> { pub fn push_mask(&mut self, mask: Mask) { self.push_instructions .push(PushInstruction::Mask(Box::new(mask))); - self.bd - .sub_builders - .push(ContentBuilder::new(Transform::identity(), true)); + let mut builder = ContentBuilder::new(Transform::identity(), true); + self.setup_sub_builder_tagging(&mut builder); + self.bd.sub_builders.push(builder); } #[cfg(feature = "pdf")] @@ -455,18 +499,18 @@ impl<'a> Surface<'a> { .push(PushInstruction::Opacity(opacity)); if opacity != NormalizedF32::ONE { - self.bd - .sub_builders - .push(ContentBuilder::new(Transform::identity(), true)); + let mut builder = ContentBuilder::new(Transform::identity(), true); + self.setup_sub_builder_tagging(&mut builder); + self.bd.sub_builders.push(builder); } } /// Push a new isolated layer. pub fn push_isolated(&mut self) { self.push_instructions.push(PushInstruction::Isolated); - self.bd - .sub_builders - .push(ContentBuilder::new(Transform::identity(), true)); + let mut builder = ContentBuilder::new(Transform::identity(), true); + self.setup_sub_builder_tagging(&mut builder); + self.bd.sub_builders.push(builder); } /// Pop the last `push` instruction. 
@@ -478,19 +522,54 @@ impl<'a> Surface<'a> { PushInstruction::Transform => self.bd.get_mut().restore_graphics_state(), PushInstruction::Opacity(o) => { if o != NormalizedF32::ONE { - let stream = self.bd.sub_builders.pop().unwrap().finish(self.sc); - self.bd.get_mut().draw_opacified(self.sc, o, stream); + let sub = self.bd.sub_builders.pop().unwrap(); + let tag_id = sub.xobject_tag_id; + let page_index = sub.page_index; + let num_mcids = sub.mcid_counter; + let stream = sub.finish(self.sc); + self.bd + .get_mut() + .draw_opacified(self.sc, o, stream, tag_id, page_index, num_mcids); } } PushInstruction::ClipPath => self.bd.get_mut().pop_clip_path(), PushInstruction::BlendMode => self.bd.get_mut().restore_graphics_state(), PushInstruction::Mask(mask) => { - let stream = self.bd.sub_builders.pop().unwrap().finish(self.sc); - self.bd.get_mut().draw_masked(self.sc, *mask, stream) + let sub = self.bd.sub_builders.pop().unwrap(); + let tag_id = sub.xobject_tag_id; + let page_index = sub.page_index; + let num_mcids = sub.mcid_counter; + let stream = sub.finish(self.sc); + self.bd + .get_mut() + .draw_masked(self.sc, *mask, stream, tag_id, page_index, num_mcids) } PushInstruction::Isolated => { - let stream = self.bd.sub_builders.pop().unwrap().finish(self.sc); - self.bd.get_mut().draw_isolated(self.sc, stream); + let sub = self.bd.sub_builders.pop().unwrap(); + let tag_id = sub.xobject_tag_id; + let page_index = sub.page_index; + let num_mcids = sub.mcid_counter; + let stream = sub.finish(self.sc); + self.bd + .get_mut() + .draw_isolated(self.sc, stream, tag_id, page_index, num_mcids); + } + } + } + + /// Set up tagging on a new sub-builder by assigning an XObject tag ID + /// and propagating the page index from the parent context. + fn setup_sub_builder_tagging(&mut self, builder: &mut ContentBuilder) { + if self.sc.serialize_settings().enable_tagging { + // Get page_index from page-level identifier or from parent sub-builder. 
+ let page_index = self + .page_identifier + .map(|pi| pi.page_index) + .or_else(|| self.bd.get().page_index); + + if let Some(page_index) = page_index { + builder.xobject_tag_id = Some(self.sc.new_xobject_tag_id()); + builder.page_index = Some(page_index); } } } @@ -522,7 +601,9 @@ impl<'a> Surface<'a> { pub fn finish(self) {} pub(crate) fn draw_opacified_stream(&mut self, opacity: NormalizedF32, stream: Stream) { - self.bd.get_mut().draw_opacified(self.sc, opacity, stream) + self.bd + .get_mut() + .draw_opacified(self.sc, opacity, stream, None, None, 0) } /// Return the current transformation matrix of the surface. diff --git a/crates/krilla/src/text/type3.rs b/crates/krilla/src/text/type3.rs index 3f6d9556..7e0debae 100644 --- a/crates/krilla/src/text/type3.rs +++ b/crates/krilla/src/text/type3.rs @@ -181,7 +181,7 @@ impl Type3Font { // and showing that, but it seems like many viewers don't like that, and emojis // look messed up. Using XObjects seems like the best choice here. content.start_color_glyph(self.widths[index]); - let x_object = XObject::new(stream, false, false, None); + let x_object = XObject::new(stream, false, false, None, None, None, 0); if !x_object.is_empty() { font_bbox.expand(&x_object.bbox()); let x_name = rd_builder.register_resource(sc.register_resourceable(x_object));