diff --git a/docs-old/contributing/NEW_MODEL_INTEGRATION.md b/docs-old/contributing/NEW_MODEL_INTEGRATION.md index 54fc52fcd77..ca5372c0337 100644 --- a/docs-old/contributing/NEW_MODEL_INTEGRATION.md +++ b/docs-old/contributing/NEW_MODEL_INTEGRATION.md @@ -442,6 +442,7 @@ class NewModelTextEncoderInvocation(BaseInvocation): class NewModelDenoiseInvocation(BaseInvocation): # Standard Fields latents: LatentsField | None = InputField(default=None) + noise: LatentsField | None = InputField(default=None) positive_conditioning: ConditioningField = InputField() negative_conditioning: ConditioningField | None = InputField(default=None) @@ -453,6 +454,7 @@ class NewModelDenoiseInvocation(BaseInvocation): denoising_end: float = InputField(default=1.0, ge=0, le=1) steps: int = InputField(default=20, ge=1) cfg_scale: float = InputField(default=7.0) + add_noise: bool = InputField(default=True) # Image-to-Image / Inpainting denoise_mask: DenoiseMaskField | None = InputField(default=None) @@ -461,16 +463,27 @@ class NewModelDenoiseInvocation(BaseInvocation): scheduler: Literal["euler", "heun", "lcm"] = InputField(default="euler") def invoke(self, context: InvocationContext) -> LatentsOutput: - # 1. Generate noise - noise = get_noise_newmodel(seed, height, width, ...) - - # 2. Pack latents (if needed) - x = pack_newmodel(latents) + # 1. Load or generate noise + if self.noise is not None: + noise = self._load_and_validate_noise(context) + else: + noise = get_noise_newmodel(seed, height, width, ...) - # 3. Compute schedule + # 2. Compute schedule timesteps = get_schedule_newmodel(num_steps, denoising_start, denoising_end) - # 4. Denoising loop + # 3. Prepare init latents and img2img preblend + if latents is not None and self.add_noise: + x = noise * timesteps[0] + latents * (1.0 - timesteps[0]) + elif latents is not None: + x = latents + else: + x = noise + + # 4. Pack latents (if needed) + x = pack_newmodel(x) + + # 5. Denoising loop x = denoise( model=transformer, x=x, @@ -480,12 +493,19 @@ class NewModelDenoiseInvocation(BaseInvocation): inpaint_extension=inpaint_extension, # For inpainting ) - # 5. Unpack latents + # 6. Unpack latents latents = unpack_newmodel(x) return LatentsOutput(latents=latents) ``` +If the architecture supports external noise, the denoise invocation should +accept an optional `noise: LatentsField` input and preserve the existing +seed-driven path when it is not connected. Validate external noise against +the architecture's expected rank, channel count, and spatial shape before +using it. Existing workflows must continue to work unchanged when `noise` is +left disconnected. + ### 4.4 VAE Encode Invocation **File:** `invokeai/app/invocations/[newmodel]_vae_encode.py` @@ -536,6 +556,9 @@ class NewModelVaeDecodeInvocation(BaseInvocation): - [ ] Model loader invocation (`[newmodel]_model_loader.py`) - [ ] Text encoder invocation (`[newmodel]_text_encoder.py`) - [ ] Denoise invocation (`[newmodel]_denoise.py`) +- [ ] Add optional `noise: LatentsField` when the architecture supports + external noise +- [ ] Preserve the seed-driven fallback path when `noise` is not connected - [ ] VAE encode invocation (`[newmodel]_vae_encode.py`) - [ ] VAE decode invocation (`[newmodel]_vae_decode.py`) - [ ] Define output classes (e.g., `NewModelLoaderOutput`) @@ -574,6 +597,11 @@ def get_noise_newmodel( dtype=dtype, ) +# If the architecture supports external noise, extend the standard +# invokeai/app/invocations/noise.py node when the tensor contract can be +# represented there. Only create a dedicated noise invocation when the +# standard noise node cannot express the architecture cleanly. + def pack_newmodel(x: torch.Tensor) -> torch.Tensor: """Pack latents for transformer input. @@ -670,6 +698,13 @@ def denoise( return img ``` +If the architecture supports external noise, the denoise path should accept +validated external noise without changing the legacy seed-driven behavior. +Review img2img and inpaint preblend logic carefully when adding scheduler +support. If the initial latent/noise mix is computed before +`scheduler.set_timesteps()`, confirm that the preblend matches the +scheduler's true first effective sigma or timestep. + ### 5.3 Scheduler (if model-specific) **File:** `invokeai/backend/[newmodel]/schedulers.py` or use existing @@ -690,11 +725,16 @@ NEWMODEL_SCHEDULER_MAP = { ### Backend Sampling and Denoise Checklist - [ ] Noise generation (`get_noise_newmodel()`) +- [ ] Extend `invokeai/app/invocations/noise.py` when the architecture's + noise tensor contract fits the standard architecture selector - [ ] Pack/unpack functions (if transformer-based) - [ ] Schedule generation (`get_schedule_newmodel()`) - [ ] Position ID generation (if needed) - [ ] Implement denoise loop +- [ ] Validate external noise shape and rank if the architecture supports it - [ ] Scheduler integration +- [ ] Verify img2img and inpaint preblend parity with the scheduler's first + effective timestep or sigma - [ ] Inpaint extension integration - [ ] Progress callbacks @@ -847,6 +887,11 @@ if ( } ``` +If the architecture supports external noise, do not require generated +workflows to connect it. Keep the denoise node backward compatible by +leaving `noise` disconnected unless the workflow explicitly needs external +noise. + ### Frontend Graph Building Checklist - [ ] Create graph builder (`buildNewModelGraph.ts`) @@ -1209,6 +1254,25 @@ export const NewModelSchedulerSelect = () => { - [ ] Frontend UI component - [ ] State management +**External Noise:** +- [ ] Add optional `noise: LatentsField` input to the denoise invocation +- [ ] Validate external noise shape against the architecture's expected + latent shape +- [ ] Preserve existing behavior when `noise` is not connected +- [ ] Extend the standard `noise` invocation when the architecture's latent + noise contract can be represented there +- [ ] Add a dedicated architecture-compatible noise invocation only when + the standard `noise` invocation cannot support the architecture cleanly + +If your model supports external noise, the denoise invocation should accept +it as an optional input rather than replacing the existing seed-driven path. +When possible, wire the architecture into the standard `noise` invocation's +architecture selector instead of creating a separate noise node. Only create +a dedicated noise invocation if the architecture has a noise tensor contract +that the standard `noise` invocation cannot express cleanly. When external +noise is connected, validate rank, channel count, and spatial shape before +blending it with init latents or using it as the initial latent state. + --- ## Summary: Minimal Integration @@ -1240,6 +1304,11 @@ For a **minimal txt2img integration**, the following files are required: 3. `src/features/nodes/util/graph/generation/addInpaint.ts` 4. `src/features/nodes/util/graph/generation/addOutpaint.ts` +If the architecture supports external noise, also extend +`invokeai/app/invocations/noise.py` when possible and keep the denoise +invocation's `noise` input optional so existing generated workflows continue +to work without modification. + --- ## Reference: Existing Implementations diff --git a/docs-old/nodes/NODES.md b/docs-old/nodes/NODES.md index e25ef7aa043..93a9b69c9d0 100644 --- a/docs-old/nodes/NODES.md +++ b/docs-old/nodes/NODES.md @@ -43,7 +43,11 @@ There are several node grouping concepts that can be examined with a narrow focu ### Create Latent Noise -An initial noise tensor is necessary for the latent diffusion process. As a result, the Denoising node requires a noise node input. +An initial noise tensor is necessary for the latent diffusion process. As a result, the Denoising node requires a noise node input. + +The standard Create Latent Noise node includes a Noise Type selector for architecture-specific latent shapes. Leave it +at SD for classic 4-channel Stable Diffusion workflows, or switch it to match the downstream denoiser when using +architectures like FLUX, FLUX.2, SD3, CogView4, Z-Image, or Anima. ![groupsnoise](../assets/nodes/groupsnoise.png) @@ -94,4 +98,3 @@ Iteration is a common concept in any processing, and means to repeat a process w Batch or multiple image generation in the workflow editor is done using the RandomRange node. In this case, the 'Size' field represents the number of images to generate, meaning this example will generate 4 images. As RandomRange produces a collection of integers, we need to add the Iterate node to iterate through the collection. This noise can then be fed to the Denoise Latents node for it to iterate through the denoising process with the different seeds provided. ![groupsmultigenseeding](../assets/nodes/groupsmultigenseeding.png) - diff --git a/docs/src/content/docs/development/Guides/models.mdx b/docs/src/content/docs/development/Guides/models.mdx index 8ae8a9f7477..8657cc97818 100644 --- a/docs/src/content/docs/development/Guides/models.mdx +++ b/docs/src/content/docs/development/Guides/models.mdx @@ -224,6 +224,12 @@ This is where the actual mathematical implementation of the model lives. return rearrange(x, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2) ``` + If the architecture supports external noise, prefer extending the standard + `invokeai/app/invocations/noise.py` node's `noise_type` selector instead of + adding a brand new noise node. Only add a dedicated noise invocation when the + architecture's noise tensor rank or layout cannot be expressed by the + standard node. + 2. **The Denoising Loop** Implement the core sampling loop. This interacts with schedulers and handles classifier-free guidance (CFG). @@ -311,6 +317,7 @@ Invocations expose your PyTorch functions as isolated execution nodes in InvokeA @invocation("newmodel_denoise", title="NewModel Denoise", category="latents") class NewModelDenoiseInvocation(BaseInvocation): latents: LatentsField | None = InputField(default=None) + noise: LatentsField | None = InputField(default=None) positive_conditioning: ConditioningField = InputField() transformer: TransformerField = InputField() steps: int = InputField(default=20) @@ -321,6 +328,10 @@ Invocations expose your PyTorch functions as isolated execution nodes in InvokeA pass ``` + If you add external noise support, keep it optional so seed-driven workflows + continue to work. Validate connected noise against the architecture's + expected shape before using it. + 4. **VAE Encode / Decode Invocations** Create nodes to transition between pixel space (images) and latent space. @@ -331,6 +342,7 @@ Invocations expose your PyTorch functions as isolated execution nodes in InvokeA - [ ] Model loader invocation (`[newmodel]_model_loader.py`) - [ ] Text encoder invocation (`[newmodel]_text_encoder.py`) - [ ] Denoise invocation (`[newmodel]_denoise.py`) + - [ ] Extend the standard `noise` invocation if the architecture supports external noise - [ ] VAE encode/decode invocations (`[newmodel]_vae_encode.py`, `[newmodel]_vae_decode.py`) ::: diff --git a/docs/src/content/docs/workflows/editor-interface.mdx b/docs/src/content/docs/workflows/editor-interface.mdx index b4b1d93db45..b4f4cf2fb38 100644 --- a/docs/src/content/docs/workflows/editor-interface.mdx +++ b/docs/src/content/docs/workflows/editor-interface.mdx @@ -73,6 +73,10 @@ The screenshots below aren't examples of complete functioning node graphs, but r ### Create Latent Noise An initial noise tensor is necessary for the latent diffusion process. As a result, the Denoising node requires a noise node input. + The standard **Create Latent Noise** node now includes a **Noise Type** selector for architecture-specific latent + shapes. Leave it at **SD** for classic 4-channel Stable Diffusion workflows, or switch it to the architecture that + matches the downstream denoiser when working with models like FLUX, FLUX.2, SD3, CogView4, Z-Image, or Anima. + ![Create Latent Noise](./assets/groupsnoise.png) ### Text Prompt Conditioning diff --git a/invokeai/app/invocations/anima_denoise.py b/invokeai/app/invocations/anima_denoise.py index 0016a4fd261..2a32f9cd828 100644 --- a/invokeai/app/invocations/anima_denoise.py +++ b/invokeai/app/invocations/anima_denoise.py @@ -34,6 +34,7 @@ InputField, LatentsField, ) +from invokeai.app.invocations.latent_noise import validate_noise_tensor_shape from invokeai.app.invocations.model import TransformerField from invokeai.app.invocations.primitives import LatentsOutput from invokeai.app.services.shared.invocation_context import InvocationContext @@ -166,7 +167,7 @@ def merge_intermediate_latents_with_init_latents( title="Denoise - Anima", tags=["image", "anima"], category="image", - version="1.5.0", + version="1.6.0", classification=Classification.Prototype, ) class AnimaDenoiseInvocation(BaseInvocation): @@ -182,6 +183,9 @@ class AnimaDenoiseInvocation(BaseInvocation): latents: Optional[LatentsField] = InputField( default=None, description=FieldDescriptions.latents, input=Input.Connection ) + noise: Optional[LatentsField] = InputField( + default=None, description=FieldDescriptions.noise, input=Input.Connection + ) # denoise_mask is used for inpainting. Only the masked region is modified. denoise_mask: Optional[DenoiseMaskField] = InputField( default=None, description=FieldDescriptions.denoise_mask, input=Input.Connection @@ -459,12 +463,27 @@ def _run_diffusion(self, context: InvocationContext) -> torch.Tensor: if init_latents.ndim == 4: init_latents = init_latents.unsqueeze(2) # [B, C, H, W] -> [B, C, 1, H, W] - # Generate initial noise (3D latent: [B, C, T, H, W]) - noise = self._get_noise(self.height, self.width, inference_dtype, device, self.seed) + # Generate initial noise (3D latent: [B, C, T, H, W]). + # If noise will never be consumed, avoid validating/loading it. + should_ignore_noise = init_latents is not None and not self.add_noise and self.denoise_mask is None + noise: torch.Tensor | None + if should_ignore_noise: + noise = None + else: + noise = self._prepare_noise_tensor(context, inference_dtype, device) # Prepare input latents if init_latents is not None: if self.add_noise: + assert noise is not None + # Noise the init latents using the first sigma from the clipped + # InvokeAI schedule. + # + # Known limitation: if the selected scheduler later starts from a + # different first effective sigma/timestep than sigmas[0], the + # img2img preblend below may not match that scheduler exactly. + # This is an existing pipeline limitation and affects both + # internally generated noise and externally supplied noise. s_0 = sigmas[0] latents = s_0 * noise + (1.0 - s_0) * init_latents else: @@ -472,6 +491,7 @@ def _run_diffusion(self, context: InvocationContext) -> torch.Tensor: else: if self.denoising_start > 1e-5: raise ValueError("denoising_start should be 0 when initial latents are not provided.") + assert noise is not None latents = noise if total_steps <= 0: @@ -483,6 +503,7 @@ def _run_diffusion(self, context: InvocationContext) -> torch.Tensor: if inpaint_mask is not None: if init_latents is None: raise ValueError("Initial latents are required when using an inpaint mask (image-to-image inpainting)") + assert noise is not None inpaint_extension = AnimaInpaintExtension( init_latents=init_latents.squeeze(2), inpaint_mask=inpaint_mask, @@ -669,6 +690,16 @@ def _run_transformer(ctx: torch.Tensor, x: torch.Tensor, t: torch.Tensor) -> tor # Remove temporal dimension for output: [B, C, 1, H, W] -> [B, C, H, W] return latents.squeeze(2) + def _prepare_noise_tensor( + self, context: InvocationContext, inference_dtype: torch.dtype, device: torch.device + ) -> torch.Tensor: + if self.noise is not None: + noise = context.tensors.load(self.noise.latents_name).to(device=device, dtype=inference_dtype) + validate_noise_tensor_shape(noise, "Anima", self.width, self.height) + return noise + + return self._get_noise(self.height, self.width, inference_dtype, device, self.seed) + def _build_step_callback(self, context: InvocationContext) -> Callable[[PipelineIntermediateState], None]: def step_callback(state: PipelineIntermediateState) -> None: context.util.sd_step_callback(state, BaseModelType.Anima) diff --git a/invokeai/app/invocations/cogview4_denoise.py b/invokeai/app/invocations/cogview4_denoise.py index e8b910f7315..c9563878003 100644 --- a/invokeai/app/invocations/cogview4_denoise.py +++ b/invokeai/app/invocations/cogview4_denoise.py @@ -18,6 +18,7 @@ WithBoard, WithMetadata, ) +from invokeai.app.invocations.latent_noise import validate_noise_tensor_shape from invokeai.app.invocations.model import TransformerField from invokeai.app.invocations.primitives import LatentsOutput from invokeai.app.services.shared.invocation_context import InvocationContext @@ -34,7 +35,7 @@ title="Denoise - CogView4", tags=["image", "cogview4"], category="latents", - version="1.0.0", + version="1.1.0", classification=Classification.Prototype, ) class CogView4DenoiseInvocation(BaseInvocation, WithMetadata, WithBoard): @@ -44,6 +45,9 @@ class CogView4DenoiseInvocation(BaseInvocation, WithMetadata, WithBoard): latents: Optional[LatentsField] = InputField( default=None, description=FieldDescriptions.latents, input=Input.Connection ) + noise: Optional[LatentsField] = InputField( + default=None, description=FieldDescriptions.noise, input=Input.Connection + ) # denoise_mask is used for image-to-image inpainting. Only the masked region is modified. denoise_mask: Optional[DenoiseMaskField] = InputField( default=None, description=FieldDescriptions.denoise_mask, input=Input.Connection @@ -245,15 +249,7 @@ def _run_diffusion( # Generate initial latent noise. num_channels_latents = transformer_info.model.config.in_channels # type: ignore assert isinstance(num_channels_latents, int) - noise = self._get_noise( - batch_size=1, - num_channels_latents=num_channels_latents, - height=self.height, - width=self.width, - dtype=inference_dtype, - device=device, - seed=self.seed, - ) + noise = self._prepare_noise_tensor(context, num_channels_latents, inference_dtype, device) # Prepare input latent image. if init_latents is not None: @@ -356,6 +352,24 @@ def _run_diffusion( return latents + def _prepare_noise_tensor( + self, context: InvocationContext, num_channels_latents: int, inference_dtype: torch.dtype, device: torch.device + ) -> torch.Tensor: + if self.noise is not None: + noise = context.tensors.load(self.noise.latents_name).to(device=device, dtype=inference_dtype) + validate_noise_tensor_shape(noise, "CogView4", self.width, self.height, num_channels=num_channels_latents) + return noise + + return self._get_noise( + batch_size=1, + num_channels_latents=num_channels_latents, + height=self.height, + width=self.width, + dtype=inference_dtype, + device=device, + seed=self.seed, + ) + def _build_step_callback(self, context: InvocationContext) -> Callable[[PipelineIntermediateState], None]: def step_callback(state: PipelineIntermediateState) -> None: context.util.sd_step_callback(state, BaseModelType.CogView4) diff --git a/invokeai/app/invocations/flux2_denoise.py b/invokeai/app/invocations/flux2_denoise.py index d4239e41420..3b9d3d4ce89 100644 --- a/invokeai/app/invocations/flux2_denoise.py +++ b/invokeai/app/invocations/flux2_denoise.py @@ -21,6 +21,7 @@ InputField, LatentsField, ) +from invokeai.app.invocations.latent_noise import validate_noise_tensor_shape from invokeai.app.invocations.model import TransformerField, VAEField from invokeai.app.invocations.primitives import LatentsOutput from invokeai.app.services.shared.invocation_context import InvocationContext @@ -53,7 +54,7 @@ "flux2_denoise", title="FLUX2 Denoise", tags=["image", "flux", "flux2", "klein", "denoise"], - category="image", + category="latents", version="1.5.0", classification=Classification.Prototype, ) @@ -69,6 +70,11 @@ class Flux2DenoiseInvocation(BaseInvocation): description=FieldDescriptions.latents, input=Input.Connection, ) + noise: Optional[LatentsField] = InputField( + default=None, + description=FieldDescriptions.noise, + input=Input.Connection, + ) denoise_mask: Optional[DenoiseMaskField] = InputField( default=None, description=FieldDescriptions.denoise_mask, @@ -247,16 +253,16 @@ def _run_diffusion(self, context: InvocationContext) -> torch.Tensor: if init_latents is not None: init_latents = init_latents.to(device=device, dtype=inference_dtype) - # Prepare input noise (FLUX.2 uses 32 channels) - noise = get_noise_flux2( - num_samples=1, - height=self.height, - width=self.width, - device=device, - dtype=inference_dtype, - seed=self.seed, - ) - b, _c, latent_h, latent_w = noise.shape + # Prepare input noise (FLUX.2 uses 32 channels). + # If noise will never be consumed, avoid validating/loading it. + should_ignore_noise = init_latents is not None and not self.add_noise and self.denoise_mask is None + noise: Optional[torch.Tensor] + if should_ignore_noise: + noise = None + b, _c, latent_h, latent_w = init_latents.shape + else: + noise = self._prepare_noise_tensor(context, inference_dtype, device) + b, _c, latent_h, latent_w = noise.shape packed_h = latent_h // 2 packed_w = latent_w // 2 @@ -314,6 +320,15 @@ def _run_diffusion(self, context: InvocationContext) -> torch.Tensor: # Prepare input latent image if init_latents is not None: if self.add_noise: + assert noise is not None + # Noise the init latents using the first timestep from the clipped + # InvokeAI schedule. + # + # Known limitation: if a scheduler later uses a different first + # effective timestep/sigma than this precomputed schedule, the + # img2img preblend below may not match that scheduler exactly. + # This is an existing pipeline limitation and applies to both + # seed-generated noise and externally supplied noise. t_0 = timesteps[0] x = t_0 * noise + (1.0 - t_0) * init_latents else: @@ -321,6 +336,7 @@ def _run_diffusion(self, context: InvocationContext) -> torch.Tensor: else: if self.denoising_start > 1e-5: raise ValueError("denoising_start should be 0 when initial latents are not provided.") + assert noise is not None x = noise # If len(timesteps) == 1, then short-circuit @@ -337,7 +353,7 @@ def _run_diffusion(self, context: InvocationContext) -> torch.Tensor: # Pack all latent tensors init_latents_packed = pack_flux2(init_latents) if init_latents is not None else None inpaint_mask_packed = pack_flux2(inpaint_mask) if inpaint_mask is not None else None - noise_packed = pack_flux2(noise) + noise_packed = pack_flux2(noise) if noise is not None else None x = pack_flux2(x) # BN normalization for img2img/inpainting: @@ -357,7 +373,8 @@ def _run_diffusion(self, context: InvocationContext) -> torch.Tensor: # Also normalize noise for InpaintExtension - it's used to compute # noised_init_latents = noise * t + init_latents * (1-t) # Both operands must be in the same normalized space - noise_packed = self._bn_normalize(noise_packed, bn_mean, bn_std) + if noise_packed is not None: + noise_packed = self._bn_normalize(noise_packed, bn_mean, bn_std) # For img2img/inpainting, x is computed from init_latents and must also be normalized # For txt2img, x is pure noise (already N(0,1)) - normalizing it would be incorrect # We detect img2img by checking if init_latents was provided @@ -371,6 +388,7 @@ def _run_diffusion(self, context: InvocationContext) -> torch.Tensor: inpaint_extension: Optional[RectifiedFlowInpaintExtension] = None if inpaint_mask_packed is not None: assert init_latents_packed is not None + assert noise_packed is not None inpaint_extension = RectifiedFlowInpaintExtension( init_latents=init_latents_packed, inpaint_mask=inpaint_mask_packed, @@ -385,8 +403,13 @@ def _run_diffusion(self, context: InvocationContext) -> torch.Tensor: is_inpainting = self.denoise_mask is not None or self.denoising_start > 1e-5 # Create scheduler with FLUX.2 Klein configuration - # For inpainting/img2img, use manual Euler stepping to preserve the exact timestep schedule - # For txt2img, use the scheduler with dynamic shifting for optimal results + # For inpainting/img2img, use manual Euler stepping to preserve the exact + # clipped timestep schedule used for the initial latent/noise preblend. + # For txt2img, use the scheduler with dynamic shifting for optimal results. + # + # This split is intentional. Reusing a scheduler for img2img here can + # change the first effective timestep/sigma and break parity with the + # preblend computed above. scheduler = None if self.scheduler in FLUX_SCHEDULER_MAP and not is_inpainting: # Only use scheduler for txt2img - use manual Euler for inpainting to preserve exact timesteps @@ -495,6 +518,23 @@ def _run_diffusion(self, context: InvocationContext) -> torch.Tensor: x = unpack_flux2(x.float(), self.height, self.width) return x + def _prepare_noise_tensor( + self, context: InvocationContext, inference_dtype: torch.dtype, device: torch.device + ) -> torch.Tensor: + if self.noise is not None: + noise = context.tensors.load(self.noise.latents_name).to(device=device, dtype=inference_dtype) + validate_noise_tensor_shape(noise, "FLUX.2", self.width, self.height) + return noise + + return get_noise_flux2( + num_samples=1, + height=self.height, + width=self.width, + device=device, + dtype=inference_dtype, + seed=self.seed, + ) + def _prep_inpaint_mask(self, context: InvocationContext, latents: torch.Tensor) -> Optional[torch.Tensor]: """Prepare the inpaint mask.""" if self.denoise_mask is None: diff --git a/invokeai/app/invocations/flux_denoise.py b/invokeai/app/invocations/flux_denoise.py index 1ad0cc559ed..06147229232 100644 --- a/invokeai/app/invocations/flux_denoise.py +++ b/invokeai/app/invocations/flux_denoise.py @@ -26,6 +26,7 @@ from invokeai.app.invocations.flux_controlnet import FluxControlNetField from invokeai.app.invocations.flux_vae_encode import FluxVaeEncodeInvocation from invokeai.app.invocations.ip_adapter import IPAdapterField +from invokeai.app.invocations.latent_noise import validate_noise_tensor_shape from invokeai.app.invocations.model import ControlLoRAField, LoRAField, TransformerField, VAEField from invokeai.app.invocations.primitives import LatentsOutput from invokeai.app.services.shared.invocation_context import InvocationContext @@ -71,7 +72,7 @@ title="FLUX Denoise", tags=["image", "flux"], category="latents", - version="4.5.1", + version="4.6.0", ) class FluxDenoiseInvocation(BaseInvocation): """Run denoising process with a FLUX transformer model.""" @@ -82,6 +83,11 @@ class FluxDenoiseInvocation(BaseInvocation): description=FieldDescriptions.latents, input=Input.Connection, ) + noise: Optional[LatentsField] = InputField( + default=None, + description=FieldDescriptions.noise, + input=Input.Connection, + ) # denoise_mask is used for image-to-image inpainting. Only the masked region is modified. denoise_mask: Optional[DenoiseMaskField] = InputField( default=None, @@ -211,22 +217,23 @@ def _run_diffusion( context: InvocationContext, ): inference_dtype = torch.bfloat16 + device = TorchDevice.choose_torch_device() # Load the input latents, if provided. init_latents = context.tensors.load(self.latents.latents_name) if self.latents else None if init_latents is not None: - init_latents = init_latents.to(device=TorchDevice.choose_torch_device(), dtype=inference_dtype) + init_latents = init_latents.to(device=device, dtype=inference_dtype) # Prepare input noise. - noise = get_noise( - num_samples=1, - height=self.height, - width=self.width, - device=TorchDevice.choose_torch_device(), - dtype=inference_dtype, - seed=self.seed, - ) - b, _c, latent_h, latent_w = noise.shape + # If noise will never be consumed, avoid validating/loading it. + should_ignore_noise = init_latents is not None and not self.add_noise and self.denoise_mask is None + noise: Optional[torch.Tensor] + if should_ignore_noise: + noise = None + b, _c, latent_h, latent_w = init_latents.shape + else: + noise = self._prepare_noise_tensor(context, inference_dtype, device) + b, _c, latent_h, latent_w = noise.shape packed_h = latent_h // 2 packed_w = latent_w // 2 @@ -237,7 +244,7 @@ def _run_diffusion( packed_height=packed_h, packed_width=packed_w, dtype=inference_dtype, - device=TorchDevice.choose_torch_device(), + device=device, ) neg_text_conditionings: list[FluxTextConditioning] | None = None if self.negative_text_conditioning is not None: @@ -247,14 +254,14 @@ def _run_diffusion( packed_height=packed_h, packed_width=packed_w, dtype=inference_dtype, - device=TorchDevice.choose_torch_device(), + device=device, ) redux_conditionings: list[FluxReduxConditioning] = self._load_redux_conditioning( context=context, redux_cond_field=self.redux_conditioning, packed_height=packed_h, packed_width=packed_w, - device=TorchDevice.choose_torch_device(), + device=device, dtype=inference_dtype, ) pos_regional_prompting_extension = RegionalPromptingExtension.from_text_conditioning( @@ -307,7 +314,16 @@ def _run_diffusion( ) if self.add_noise: - # Noise the orig_latents by the appropriate amount for the first timestep. + assert noise is not None + # Noise the orig_latents by the appropriate amount for the first + # timestep in InvokeAI's clipped schedule. + # + # Known limitation: if the selected scheduler later replaces this + # schedule with its own first effective timestep/sigma (for example + # Heun internal expansion or LCM's scheduler-defined schedule), the + # img2img preblend below may not match that scheduler's true first + # step exactly. This is an existing pipeline limitation and affects + # both internally generated noise and externally supplied noise. t_0 = timesteps[0] x = t_0 * noise + (1.0 - t_0) * init_latents else: @@ -317,6 +333,7 @@ def _run_diffusion( if self.denoising_start > 1e-5: raise ValueError("denoising_start should be 0 when initial latents are not provided.") + assert noise is not None x = noise # If len(timesteps) == 1, then short-circuit. We are just noising the input latents, but not taking any @@ -331,9 +348,7 @@ def _run_diffusion( img_cond: torch.Tensor | None = None is_flux_fill = transformer_config.variant is FluxVariantType.DevFill if is_flux_fill: - img_cond = self._prep_flux_fill_img_cond( - context, device=TorchDevice.choose_torch_device(), dtype=inference_dtype - ) + img_cond = self._prep_flux_fill_img_cond(context, device=device, dtype=inference_dtype) else: if self.fill_conditioning is not None: raise ValueError("fill_conditioning was provided, but the model is not a FLUX Fill model.") @@ -359,6 +374,7 @@ def _run_diffusion( inpaint_extension: RectifiedFlowInpaintExtension | None = None if inpaint_mask is not None: assert init_latents is not None + assert noise is not None inpaint_extension = RectifiedFlowInpaintExtension( init_latents=init_latents, inpaint_mask=inpaint_mask, @@ -391,7 +407,7 @@ def _run_diffusion( if isinstance(self.kontext_conditioning, list) else [self.kontext_conditioning], vae_field=self.controlnet_vae, - device=TorchDevice.choose_torch_device(), + device=device, dtype=inference_dtype, ) @@ -508,6 +524,23 @@ def _run_diffusion( x = unpack(x.float(), self.height, self.width) return x + def _prepare_noise_tensor( + self, context: InvocationContext, inference_dtype: torch.dtype, device: torch.device + ) -> torch.Tensor: + if self.noise is not None: + noise = context.tensors.load(self.noise.latents_name).to(device=device, dtype=inference_dtype) + validate_noise_tensor_shape(noise, "FLUX", self.width, self.height) + return noise + + return get_noise( + num_samples=1, + height=self.height, + width=self.width, + device=device, + dtype=inference_dtype, + seed=self.seed, + ) + def _load_text_conditioning( self, context: InvocationContext, diff --git a/invokeai/app/invocations/latent_noise.py b/invokeai/app/invocations/latent_noise.py new file mode 100644 index 00000000000..266ea87f542 --- /dev/null +++ b/invokeai/app/invocations/latent_noise.py @@ -0,0 +1,136 @@ +from typing import Literal + +import torch + +from invokeai.app.invocations.constants import LATENT_SCALE_FACTOR +from invokeai.backend.util.devices import TorchDevice + +LatentNoiseType = Literal["SD", "FLUX", "FLUX.2", "SD3", "CogView4", "Z-Image", "Anima"] + + +def validate_noise_dimensions(noise_type: LatentNoiseType, width: int, height: int) -> None: + multiple_of = 8 + if noise_type in ("FLUX", "FLUX.2", "SD3", "Z-Image"): + multiple_of = 16 + elif noise_type == "CogView4": + multiple_of = 32 + + if width % multiple_of != 0 or height % multiple_of != 0: + raise ValueError(f"{noise_type} noise width and height must be a multiple of {multiple_of}") + + +def get_expected_noise_shape( + noise_type: LatentNoiseType, width: int, height: int, num_channels: int | None = None +) -> tuple[int, ...]: + validate_noise_dimensions(noise_type, width, height) + + if noise_type == "SD": + return (1, 4, height // LATENT_SCALE_FACTOR, width // LATENT_SCALE_FACTOR) + if noise_type == "FLUX": + return (1, 16, height // LATENT_SCALE_FACTOR, width // LATENT_SCALE_FACTOR) + if noise_type == "FLUX.2": + return (1, 32, height // LATENT_SCALE_FACTOR, width // LATENT_SCALE_FACTOR) + if noise_type == "SD3": + return (1, 16, height // LATENT_SCALE_FACTOR, width // LATENT_SCALE_FACTOR) + if noise_type == "CogView4": + return (1, 16, height // LATENT_SCALE_FACTOR, width // LATENT_SCALE_FACTOR) + if noise_type == "Z-Image": + return (1, 16, height // LATENT_SCALE_FACTOR, width // LATENT_SCALE_FACTOR) + if noise_type == "Anima": + return (1, 16, 1, height // LATENT_SCALE_FACTOR, width // LATENT_SCALE_FACTOR) + raise ValueError(f"Unsupported noise type: {noise_type}") + + +def validate_noise_tensor_shape( + noise: torch.Tensor, noise_type: LatentNoiseType, width: int, height: int, num_channels: int | None = None +) -> None: + expected_shape = get_expected_noise_shape(noise_type, width, height, num_channels) + if tuple(noise.shape) != expected_shape: + raise ValueError(f"Expected noise with shape {expected_shape}, got {tuple(noise.shape)}") + + +def generate_noise_tensor( + noise_type: LatentNoiseType, + width: int, + height: int, + seed: int, + device: torch.device, + dtype: torch.dtype, + use_cpu: bool = True, +) -> torch.Tensor: + validate_noise_dimensions(noise_type, width, height) + rand_device = "cpu" if use_cpu else device.type + rand_dtype = TorchDevice.choose_torch_dtype(device=device) + + if noise_type == "SD": + return torch.randn( + 1, + 4, + height // LATENT_SCALE_FACTOR, + width // LATENT_SCALE_FACTOR, + dtype=rand_dtype, + device=rand_device, + generator=torch.Generator(device=rand_device).manual_seed(seed), + ).to("cpu") + if noise_type == "FLUX": + return torch.randn( + 1, + 16, + height // LATENT_SCALE_FACTOR, + width // LATENT_SCALE_FACTOR, + device=rand_device, + dtype=rand_dtype, + generator=torch.Generator(device=rand_device).manual_seed(seed), + ).to("cpu") + if noise_type == "FLUX.2": + return torch.randn( + 1, + 32, + height // LATENT_SCALE_FACTOR, + width // LATENT_SCALE_FACTOR, + device=rand_device, + dtype=rand_dtype, + generator=torch.Generator(device=rand_device).manual_seed(seed), + ).to("cpu") + if noise_type == "SD3": + return torch.randn( + 1, + 16, + height // LATENT_SCALE_FACTOR, + width // LATENT_SCALE_FACTOR, + device=rand_device, + dtype=rand_dtype, + generator=torch.Generator(device=rand_device).manual_seed(seed), + ).to("cpu") + if noise_type == "CogView4": + return torch.randn( + 1, + 16, + height // LATENT_SCALE_FACTOR, + width // LATENT_SCALE_FACTOR, + device=rand_device, + dtype=rand_dtype, + generator=torch.Generator(device=rand_device).manual_seed(seed), + ).to("cpu") + if noise_type == "Z-Image": + return torch.randn( + 1, + 16, + height // LATENT_SCALE_FACTOR, + width // LATENT_SCALE_FACTOR, + device=rand_device, + dtype=torch.float32, + generator=torch.Generator(device=rand_device).manual_seed(seed), + ).to("cpu") + if noise_type == "Anima": + return torch.randn( + 1, + 16, + 1, + height // LATENT_SCALE_FACTOR, + width // LATENT_SCALE_FACTOR, + device=rand_device, + dtype=torch.float32, + generator=torch.Generator(device=rand_device).manual_seed(seed), + ).to("cpu") + raise ValueError(f"Unsupported noise type: {noise_type}") diff --git a/invokeai/app/invocations/metadata_linked.py b/invokeai/app/invocations/metadata_linked.py index 53f2ea74716..cd733fab648 100644 --- a/invokeai/app/invocations/metadata_linked.py +++ b/invokeai/app/invocations/metadata_linked.py @@ -717,7 +717,13 @@ def _loras_to_json(obj: Union[Any, list[Any]]): md.update({"denoising_start": self.denoising_start}) md.update({"denoising_end": self.denoising_end}) md.update({"model": self.transformer.transformer}) - md.update({"seed": self.seed}) + md.update( + { + "seed": self.noise.seed + if self.noise is not None and self.noise.seed is not None and (self.latents is None or self.add_noise) + else self.seed + } + ) md.update({"cfg_scale": self.cfg_scale}) md.update({"cfg_scale_start_step": self.cfg_scale_start_step}) md.update({"cfg_scale_end_step": self.cfg_scale_end_step}) @@ -735,7 +741,7 @@ def _loras_to_json(obj: Union[Any, list[Any]]): title=f"{ZImageDenoiseInvocation.UIConfig.title} + Metadata", tags=["z-image", "latents", "denoise", "txt2img", "t2i", "t2l", "img2img", "i2i", "l2l"], category="metadata", - version="1.0.0", + version="1.1.0", ) class ZImageDenoiseMetaInvocation(ZImageDenoiseInvocation, WithMetadata): """Run denoising process with a Z-Image transformer model + metadata.""" @@ -766,7 +772,13 @@ def _loras_to_json(obj: Union[Any, list[Any]]): md.update({"denoising_end": self.denoising_end}) md.update({"scheduler": self.scheduler}) md.update({"model": self.transformer.transformer}) - md.update({"seed": self.seed}) + md.update( + { + "seed": self.noise.seed + if self.noise is not None and self.noise.seed is not None and (self.latents is None or self.add_noise) + else self.seed + } + ) if len(self.transformer.loras) > 0: md.update({"loras": _loras_to_json(self.transformer.loras)}) diff --git a/invokeai/app/invocations/noise.py b/invokeai/app/invocations/noise.py index 02b917ebf7c..cfac3f112a9 100644 --- a/invokeai/app/invocations/noise.py +++ b/invokeai/app/invocations/noise.py @@ -1,57 +1,17 @@ -# Copyright (c) 2023 Kyle Schouviller (https://github.com/kyle0654) & the InvokeAI Team - - import torch from pydantic import field_validator from invokeai.app.invocations.baseinvocation import BaseInvocation, BaseInvocationOutput, invocation, invocation_output from invokeai.app.invocations.constants import LATENT_SCALE_FACTOR from invokeai.app.invocations.fields import FieldDescriptions, InputField, LatentsField, OutputField +from invokeai.app.invocations.latent_noise import ( + LatentNoiseType, + generate_noise_tensor, +) from invokeai.app.services.shared.invocation_context import InvocationContext from invokeai.app.util.misc import SEED_MAX from invokeai.backend.util.devices import TorchDevice -""" -Utilities -""" - - -def get_noise( - width: int, - height: int, - device: torch.device, - seed: int = 0, - latent_channels: int = 4, - downsampling_factor: int = 8, - use_cpu: bool = True, - perlin: float = 0.0, -): - """Generate noise for a given image size.""" - noise_device_type = "cpu" if use_cpu else device.type - - # limit noise to only the diffusion image channels, not the mask channels - input_channels = min(latent_channels, 4) - generator = torch.Generator(device=noise_device_type).manual_seed(seed) - - noise_tensor = torch.randn( - [ - 1, - input_channels, - height // downsampling_factor, - width // downsampling_factor, - ], - dtype=TorchDevice.choose_torch_dtype(device=device), - device=noise_device_type, - generator=generator, - ).to("cpu") - - return noise_tensor - - -""" -Nodes -""" - @invocation_output("noise_output") class NoiseOutput(BaseInvocationOutput): @@ -65,8 +25,8 @@ class NoiseOutput(BaseInvocationOutput): def build(cls, latents_name: str, latents: torch.Tensor, seed: int) -> "NoiseOutput": return cls( noise=LatentsField(latents_name=latents_name, seed=seed), - width=latents.size()[3] * LATENT_SCALE_FACTOR, - height=latents.size()[2] * LATENT_SCALE_FACTOR, + width=latents.shape[-1] * LATENT_SCALE_FACTOR, + height=latents.shape[-2] * LATENT_SCALE_FACTOR, ) @@ -75,10 +35,12 @@ def build(cls, latents_name: str, latents: torch.Tensor, seed: int) -> "NoiseOut title="Create Latent Noise", tags=["latents", "noise"], category="latents", - version="1.0.3", + version="1.1.0", ) class NoiseInvocation(BaseInvocation): - """Generates latent noise.""" + """Generates latent noise for supported denoiser architectures.""" + + noise_type: LatentNoiseType = InputField(default="SD", description="Architecture-specific noise type.") seed: int = InputField( default=0, @@ -109,11 +71,13 @@ def modulo_seed(cls, v): return v % (SEED_MAX + 1) def invoke(self, context: InvocationContext) -> NoiseOutput: - noise = get_noise( + noise = generate_noise_tensor( + noise_type=self.noise_type, width=self.width, height=self.height, device=TorchDevice.choose_torch_device(), seed=self.seed, + dtype=TorchDevice.choose_torch_dtype(), use_cpu=self.use_cpu, ) name = context.tensors.save(tensor=noise) diff --git a/invokeai/app/invocations/sd3_denoise.py b/invokeai/app/invocations/sd3_denoise.py index 4b990ee42bb..5e7f4b28999 100644 --- a/invokeai/app/invocations/sd3_denoise.py +++ b/invokeai/app/invocations/sd3_denoise.py @@ -18,6 +18,7 @@ WithBoard, WithMetadata, ) +from invokeai.app.invocations.latent_noise import validate_noise_tensor_shape from invokeai.app.invocations.model import TransformerField from invokeai.app.invocations.primitives import LatentsOutput from invokeai.app.invocations.sd3_text_encoder import SD3_T5_MAX_SEQ_LEN @@ -35,7 +36,7 @@ title="Denoise - SD3", tags=["image", "sd3"], category="latents", - version="1.1.1", + version="1.2.0", ) class SD3DenoiseInvocation(BaseInvocation, WithMetadata, WithBoard): """Run denoising process with a SD3 model.""" @@ -44,6 +45,9 @@ class SD3DenoiseInvocation(BaseInvocation, WithMetadata, WithBoard): latents: Optional[LatentsField] = InputField( default=None, description=FieldDescriptions.latents, input=Input.Connection ) + noise: Optional[LatentsField] = InputField( + default=None, description=FieldDescriptions.noise, input=Input.Connection + ) # denoise_mask is used for image-to-image inpainting. Only the masked region is modified. denoise_mask: Optional[DenoiseMaskField] = InputField( default=None, description=FieldDescriptions.denoise_mask, input=Input.Connection @@ -235,15 +239,7 @@ def _run_diffusion( # Generate initial latent noise. num_channels_latents = transformer_info.model.config.in_channels assert isinstance(num_channels_latents, int) - noise = self._get_noise( - num_samples=1, - num_channels_latents=num_channels_latents, - height=self.height, - width=self.width, - dtype=inference_dtype, - device=device, - seed=self.seed, - ) + noise = self._prepare_noise_tensor(context, num_channels_latents, inference_dtype, device) # Prepare input latent image. if init_latents is not None: @@ -330,6 +326,24 @@ def _run_diffusion( return latents + def _prepare_noise_tensor( + self, context: InvocationContext, num_channels_latents: int, inference_dtype: torch.dtype, device: torch.device + ) -> torch.Tensor: + if self.noise is not None: + noise = context.tensors.load(self.noise.latents_name).to(device=device, dtype=inference_dtype) + validate_noise_tensor_shape(noise, "SD3", self.width, self.height, num_channels=num_channels_latents) + return noise + + return self._get_noise( + num_samples=1, + num_channels_latents=num_channels_latents, + height=self.height, + width=self.width, + dtype=inference_dtype, + device=device, + seed=self.seed, + ) + def _build_step_callback(self, context: InvocationContext) -> Callable[[PipelineIntermediateState], None]: def step_callback(state: PipelineIntermediateState) -> None: context.util.sd_step_callback(state, BaseModelType.StableDiffusion3) diff --git a/invokeai/app/invocations/z_image_denoise.py b/invokeai/app/invocations/z_image_denoise.py index 397e9171129..c1e864ea179 100644 --- a/invokeai/app/invocations/z_image_denoise.py +++ b/invokeai/app/invocations/z_image_denoise.py @@ -21,6 +21,7 @@ LatentsField, ZImageConditioningField, ) +from invokeai.app.invocations.latent_noise import validate_noise_tensor_shape from invokeai.app.invocations.model import TransformerField, VAEField from invokeai.app.invocations.primitives import LatentsOutput from invokeai.app.invocations.z_image_control import ZImageControlField @@ -50,7 +51,7 @@ title="Denoise - Z-Image", tags=["image", "z-image"], category="latents", - version="1.5.0", + version="1.6.0", classification=Classification.Prototype, ) class ZImageDenoiseInvocation(BaseInvocation): @@ -63,6 +64,9 @@ class ZImageDenoiseInvocation(BaseInvocation): latents: Optional[LatentsField] = InputField( default=None, description=FieldDescriptions.latents, input=Input.Connection ) + noise: Optional[LatentsField] = InputField( + default=None, description=FieldDescriptions.noise, input=Input.Connection + ) # denoise_mask is used for image-to-image inpainting. Only the masked region is modified. denoise_mask: Optional[DenoiseMaskField] = InputField( default=None, description=FieldDescriptions.denoise_mask, input=Input.Connection @@ -348,22 +352,27 @@ def _run_diffusion(self, context: InvocationContext) -> torch.Tensor: if init_latents is not None: init_latents = init_latents.to(device=device, dtype=inference_dtype) - # Generate initial noise - num_channels_latents = 16 # Z-Image uses 16 latent channels - noise = self._get_noise( - batch_size=1, - num_channels_latents=num_channels_latents, - height=self.height, - width=self.width, - dtype=inference_dtype, - device=device, - seed=self.seed, - ) + # Generate initial noise. + # If noise will never be consumed, avoid validating/loading it. + should_ignore_noise = init_latents is not None and not self.add_noise and self.denoise_mask is None + noise: torch.Tensor | None + if should_ignore_noise: + noise = None + else: + noise = self._prepare_noise_tensor(context, inference_dtype, device) # Prepare input latent image if init_latents is not None: if self.add_noise: - # Noise the init_latents by the appropriate amount for the first timestep. + assert noise is not None + # Noise the init latents using the first sigma from the clipped + # InvokeAI schedule. + # + # Known limitation: if the selected scheduler later starts from a + # different first effective sigma/timestep than sigmas[0], the + # img2img preblend below may not match that scheduler exactly. + # This is an existing pipeline limitation and affects both + # internally generated noise and externally supplied noise. s_0 = sigmas[0] latents = s_0 * noise + (1.0 - s_0) * init_latents else: @@ -371,6 +380,7 @@ def _run_diffusion(self, context: InvocationContext) -> torch.Tensor: else: if self.denoising_start > 1e-5: raise ValueError("denoising_start should be 0 when initial latents are not provided.") + assert noise is not None latents = noise # Short-circuit if no denoising steps @@ -383,6 +393,7 @@ def _run_diffusion(self, context: InvocationContext) -> torch.Tensor: if inpaint_mask is not None: if init_latents is None: raise ValueError("Initial latents are required when using an inpaint mask (image-to-image inpainting)") + assert noise is not None inpaint_extension = RectifiedFlowInpaintExtension( init_latents=init_latents, inpaint_mask=inpaint_mask, @@ -408,7 +419,9 @@ def _run_diffusion(self, context: InvocationContext) -> torch.Tensor: if not is_lcm and "sigmas" in set_timesteps_sig.parameters: scheduler.set_timesteps(sigmas=sigmas, device=device) else: - # LCM or scheduler doesn't support custom sigmas - use num_inference_steps + # LCM or a scheduler without custom-sigma support computes its own + # schedule from num_inference_steps. That can diverge from sigmas[0] + # used in the img2img preblend above. scheduler.set_timesteps(num_inference_steps=total_steps, device=device) # For Heun scheduler, the number of actual steps may differ @@ -762,6 +775,24 @@ def _run_diffusion(self, context: InvocationContext) -> torch.Tensor: return latents + def _prepare_noise_tensor( + self, context: InvocationContext, inference_dtype: torch.dtype, device: torch.device + ) -> torch.Tensor: + if self.noise is not None: + noise = context.tensors.load(self.noise.latents_name).to(device=device, dtype=inference_dtype) + validate_noise_tensor_shape(noise, "Z-Image", self.width, self.height) + return noise + + return self._get_noise( + batch_size=1, + num_channels_latents=16, + height=self.height, + width=self.width, + dtype=inference_dtype, + device=device, + seed=self.seed, + ) + def _build_step_callback(self, context: InvocationContext) -> Callable[[PipelineIntermediateState], None]: def step_callback(state: PipelineIntermediateState) -> None: context.util.sd_step_callback(state, BaseModelType.ZImage) diff --git a/invokeai/backend/flux/denoise.py b/invokeai/backend/flux/denoise.py index a0ae4cfb3f3..0f4cf07ee5b 100644 --- a/invokeai/backend/flux/denoise.py +++ b/invokeai/backend/flux/denoise.py @@ -58,7 +58,11 @@ def denoise( scheduler.set_timesteps(sigmas=timesteps, device=img.device) else: # LCM or scheduler doesn't support custom sigmas - use num_inference_steps - # The schedule will be computed by the scheduler itself + # The schedule will be computed by the scheduler itself. + # + # Important for img2img callers: if the initial latent/noise blend was + # computed from a separate pre-scheduler schedule, that preblend may not + # match this scheduler's true first step exactly. num_inference_steps = len(timesteps) - 1 scheduler.set_timesteps(num_inference_steps=num_inference_steps, device=img.device) diff --git a/invokeai/backend/flux2/denoise.py b/invokeai/backend/flux2/denoise.py index b4438094f7b..2ff66236ce8 100644 --- a/invokeai/backend/flux2/denoise.py +++ b/invokeai/backend/flux2/denoise.py @@ -106,7 +106,14 @@ def denoise( scheduler.set_timesteps(sigmas=sigmas.tolist(), device=img.device) else: # Scheduler doesn't support sigmas (e.g., Heun, LCM) - use num_inference_steps - scheduler.set_timesteps(num_inference_steps=len(sigmas), device=img.device) + # + # Important for img2img callers: if the initial latent/noise blend was + # computed from a separate pre-scheduler schedule, that preblend may not + # match this scheduler's true first step exactly. + scheduler_kwargs: dict[str, Any] = {"num_inference_steps": len(sigmas), "device": img.device} + if mu is not None and "mu" in set_timesteps_sig.parameters: + scheduler_kwargs["mu"] = mu + scheduler.set_timesteps(**scheduler_kwargs) num_scheduler_steps = len(scheduler.timesteps) is_heun = hasattr(scheduler, "state_in_first_order") user_step = 0 diff --git a/invokeai/frontend/web/openapi.json b/invokeai/frontend/web/openapi.json index 76e96cfd870..6c92db519ac 100644 --- a/invokeai/frontend/web/openapi.json +++ b/invokeai/frontend/web/openapi.json @@ -9489,7 +9489,7 @@ "tags": ["image", "anima"], "title": "Denoise - Anima", "type": "object", - "version": "1.4.0", + "version": "1.6.0", "output": { "$ref": "#/components/schemas/LatentsOutput" } diff --git a/invokeai/frontend/web/src/services/api/schema.ts b/invokeai/frontend/web/src/services/api/schema.ts index 68f24a26ec1..e9de97db96d 100644 --- a/invokeai/frontend/web/src/services/api/schema.ts +++ b/invokeai/frontend/web/src/services/api/schema.ts @@ -3105,6 +3105,11 @@ export type components = { * @default null */ latents?: components["schemas"]["LatentsField"] | null; + /** + * @description Noise tensor + * @default null + */ + noise?: components["schemas"]["LatentsField"] | null; /** * @description A mask of the region to apply the denoising process to. Values of 0.0 represent the regions to be fully denoised, and 1.0 represent the regions to be preserved. * @default null @@ -5764,6 +5769,11 @@ export type components = { * @default null */ latents?: components["schemas"]["LatentsField"] | null; + /** + * @description Noise tensor + * @default null + */ + noise?: components["schemas"]["LatentsField"] | null; /** * @description A mask of the region to apply the denoising process to. Values of 0.0 represent the regions to be fully denoised, and 1.0 represent the regions to be preserved. * @default null @@ -10176,6 +10186,11 @@ export type components = { * @default null */ latents?: components["schemas"]["LatentsField"] | null; + /** + * @description Noise tensor + * @default null + */ + noise?: components["schemas"]["LatentsField"] | null; /** * @description A mask of the region to apply the denoising process to. Values of 0.0 represent the regions to be fully denoised, and 1.0 represent the regions to be preserved. * @default null @@ -10912,6 +10927,11 @@ export type components = { * @default null */ latents?: components["schemas"]["LatentsField"] | null; + /** + * @description Noise tensor + * @default null + */ + noise?: components["schemas"]["LatentsField"] | null; /** * @description A mask of the region to apply the denoising process to. Values of 0.0 represent the regions to be fully denoised, and 1.0 represent the regions to be preserved. * @default null @@ -11106,6 +11126,11 @@ export type components = { * @default null */ latents?: components["schemas"]["LatentsField"] | null; + /** + * @description Noise tensor + * @default null + */ + noise?: components["schemas"]["LatentsField"] | null; /** * @description A mask of the region to apply the denoising process to. Values of 0.0 represent the regions to be fully denoised, and 1.0 represent the regions to be preserved. * @default null @@ -23942,7 +23967,7 @@ export type components = { }; /** * Create Latent Noise - * @description Generates latent noise. + * @description Generates latent noise for supported denoiser architectures. */ NoiseInvocation: { /** @@ -23962,6 +23987,13 @@ export type components = { * @default true */ use_cache?: boolean; + /** + * Noise Type + * @description Architecture-specific noise type. + * @default SD + * @enum {string} + */ + noise_type?: "SD" | "FLUX" | "FLUX.2" | "SD3" | "CogView4" | "Z-Image" | "Anima"; /** * Seed * @description Seed for random number generation @@ -26699,6 +26731,11 @@ export type components = { * @default null */ latents?: components["schemas"]["LatentsField"] | null; + /** + * @description Noise tensor + * @default null + */ + noise?: components["schemas"]["LatentsField"] | null; /** * @description A mask of the region to apply the denoising process to. Values of 0.0 represent the regions to be fully denoised, and 1.0 represent the regions to be preserved. * @default null @@ -32173,6 +32210,11 @@ export type components = { * @default null */ latents?: components["schemas"]["LatentsField"] | null; + /** + * @description Noise tensor + * @default null + */ + noise?: components["schemas"]["LatentsField"] | null; /** * @description A mask of the region to apply the denoising process to. Values of 0.0 represent the regions to be fully denoised, and 1.0 represent the regions to be preserved. * @default null @@ -32306,6 +32348,11 @@ export type components = { * @default null */ latents?: components["schemas"]["LatentsField"] | null; + /** + * @description Noise tensor + * @default null + */ + noise?: components["schemas"]["LatentsField"] | null; /** * @description A mask of the region to apply the denoising process to. Values of 0.0 represent the regions to be fully denoised, and 1.0 represent the regions to be preserved. * @default null diff --git a/tests/app/invocations/test_denoise_noise_inputs.py b/tests/app/invocations/test_denoise_noise_inputs.py new file mode 100644 index 00000000000..556c2e3955b --- /dev/null +++ b/tests/app/invocations/test_denoise_noise_inputs.py @@ -0,0 +1,664 @@ +import inspect +from types import SimpleNamespace +from unittest.mock import MagicMock, patch + +import pytest +import torch + +from invokeai.app.invocations.anima_denoise import AnimaDenoiseInvocation +from invokeai.app.invocations.cogview4_denoise import CogView4DenoiseInvocation +from invokeai.app.invocations.flux2_denoise import Flux2DenoiseInvocation +from invokeai.app.invocations.flux_denoise import FluxDenoiseInvocation +from invokeai.app.invocations.metadata_linked import FluxDenoiseLatentsMetaInvocation, ZImageDenoiseMetaInvocation +from invokeai.app.invocations.primitives import LatentsOutput +from invokeai.app.invocations.sd3_denoise import SD3DenoiseInvocation +from invokeai.app.invocations.z_image_denoise import ZImageDenoiseInvocation +from invokeai.backend.flux.sampling_utils import clip_timestep_schedule_fractional, get_schedule +from invokeai.backend.flux.schedulers import ANIMA_SCHEDULER_MAP, FLUX_SCHEDULER_MAP, ZIMAGE_SCHEDULER_MAP +from invokeai.backend.flux2.sampling_utils import get_schedule_flux2 +from invokeai.backend.model_manager.taxonomy import BaseModelType, ModelType + + +def test_flux_prepare_noise_uses_external_noise(): + invocation = FluxDenoiseInvocation.model_construct( + width=64, height=64, seed=0, noise=MagicMock(latents_name="noise") + ) + mock_context = MagicMock() + expected = torch.zeros(1, 16, 8, 8) + mock_context.tensors.load.return_value = expected + + with patch("invokeai.app.invocations.flux_denoise.get_noise") as mock_get_noise: + noise = invocation._prepare_noise_tensor(mock_context, torch.bfloat16, torch.device("cpu")) + + assert torch.equal(noise, expected.to(dtype=torch.bfloat16)) + mock_get_noise.assert_not_called() + + +def test_flux_prepare_noise_rejects_invalid_shape(): + invocation = FluxDenoiseInvocation.model_construct( + width=64, height=64, seed=0, noise=MagicMock(latents_name="noise") + ) + mock_context = MagicMock() + mock_context.tensors.load.return_value = torch.zeros(1, 15, 8, 8) + + with pytest.raises(ValueError, match="Expected noise with shape"): + invocation._prepare_noise_tensor(mock_context, torch.bfloat16, torch.device("cpu")) + + +def test_flux_add_noise_false_ignores_connected_noise(): + invocation = FluxDenoiseInvocation.model_construct( + latents=MagicMock(latents_name="latents"), + noise=MagicMock(latents_name="noise"), + add_noise=False, + width=64, + height=64, + num_steps=4, + denoising_start=0.25, + denoising_end=0.25, + positive_text_conditioning=MagicMock(conditioning_name="positive"), + transformer=MagicMock(transformer="transformer"), + seed=123, + ) + init_latents = torch.full((1, 16, 8, 8), 2.0) + dummy_conditioning = SimpleNamespace( + t5_embeds=torch.zeros(1, 4, 16), + clip_embeds=torch.zeros(1, 768), + to=lambda **_: dummy_conditioning, + ) + mock_context = MagicMock() + mock_context.tensors.load.return_value = init_latents + mock_context.conditioning.load.return_value = SimpleNamespace(conditionings=[dummy_conditioning]) + mock_context.models.get_config.return_value = SimpleNamespace( + base=BaseModelType.Flux, type=ModelType.Main, variant=None + ) + + with ( + patch( + "invokeai.app.invocations.flux_denoise.TorchDevice.choose_torch_device", return_value=torch.device("cpu") + ), + patch("invokeai.app.invocations.flux_denoise.FLUXConditioningInfo", object), + patch( + "invokeai.app.invocations.flux_denoise.RegionalPromptingExtension.from_text_conditioning", + return_value=MagicMock(), + ), + patch.object(invocation, "_prepare_noise_tensor", side_effect=AssertionError("noise should be ignored")), + patch.object(invocation, "_load_redux_conditioning", return_value=[]), + patch("invokeai.app.invocations.flux_denoise.get_schedule", return_value=[0.75]), + ): + result = invocation._run_diffusion(mock_context) + + assert torch.equal(result, init_latents) + + +def test_flux2_prepare_noise_uses_external_noise(): + invocation = Flux2DenoiseInvocation.model_construct( + width=64, height=64, seed=0, noise=MagicMock(latents_name="noise") + ) + mock_context = MagicMock() + expected = torch.zeros(1, 32, 8, 8) + mock_context.tensors.load.return_value = expected + + with patch("invokeai.app.invocations.flux2_denoise.get_noise_flux2") as mock_get_noise: + noise = invocation._prepare_noise_tensor(mock_context, torch.bfloat16, torch.device("cpu")) + + assert torch.equal(noise, expected.to(dtype=torch.bfloat16)) + mock_get_noise.assert_not_called() + + +def test_flux2_prepare_noise_rejects_invalid_shape(): + invocation = Flux2DenoiseInvocation.model_construct( + width=64, height=64, seed=0, noise=MagicMock(latents_name="noise") + ) + mock_context = MagicMock() + mock_context.tensors.load.return_value = torch.zeros(1, 16, 8, 8) + + with pytest.raises(ValueError, match="Expected noise with shape"): + invocation._prepare_noise_tensor(mock_context, torch.bfloat16, torch.device("cpu")) + + +def test_sd3_prepare_noise_uses_external_noise(): + invocation = SD3DenoiseInvocation.model_construct( + width=64, height=64, seed=0, noise=MagicMock(latents_name="noise") + ) + mock_context = MagicMock() + expected = torch.zeros(1, 16, 8, 8) + mock_context.tensors.load.return_value = expected + + with patch.object(invocation, "_get_noise") as mock_get_noise: + noise = invocation._prepare_noise_tensor(mock_context, 16, torch.bfloat16, torch.device("cpu")) + + assert torch.equal(noise, expected.to(dtype=torch.bfloat16)) + mock_get_noise.assert_not_called() + + +def test_sd3_prepare_noise_rejects_invalid_shape(): + invocation = SD3DenoiseInvocation.model_construct( + width=64, height=64, seed=0, noise=MagicMock(latents_name="noise") + ) + mock_context = MagicMock() + mock_context.tensors.load.return_value = torch.zeros(1, 8, 8, 8) + + with pytest.raises(ValueError, match="Expected noise with shape"): + invocation._prepare_noise_tensor(mock_context, 16, torch.bfloat16, torch.device("cpu")) + + +def test_cogview4_prepare_noise_uses_external_noise(): + invocation = CogView4DenoiseInvocation.model_construct( + width=64, height=64, seed=0, noise=MagicMock(latents_name="noise") + ) + mock_context = MagicMock() + expected = torch.zeros(1, 16, 8, 8) + mock_context.tensors.load.return_value = expected + + with patch.object(invocation, "_get_noise") as mock_get_noise: + noise = invocation._prepare_noise_tensor(mock_context, 16, torch.bfloat16, torch.device("cpu")) + + assert torch.equal(noise, expected.to(dtype=torch.bfloat16)) + mock_get_noise.assert_not_called() + + +def test_cogview4_prepare_noise_rejects_invalid_shape(): + invocation = CogView4DenoiseInvocation.model_construct( + width=64, height=64, seed=0, noise=MagicMock(latents_name="noise") + ) + mock_context = MagicMock() + mock_context.tensors.load.return_value = torch.zeros(1, 4, 8, 8) + + with pytest.raises(ValueError, match="Expected noise with shape"): + invocation._prepare_noise_tensor(mock_context, 16, torch.bfloat16, torch.device("cpu")) + + +def test_z_image_prepare_noise_uses_external_noise(): + invocation = ZImageDenoiseInvocation.model_construct( + width=64, height=64, seed=0, noise=MagicMock(latents_name="noise") + ) + mock_context = MagicMock() + expected = torch.zeros(1, 16, 8, 8) + mock_context.tensors.load.return_value = expected + + with patch.object(invocation, "_get_noise") as mock_get_noise: + noise = invocation._prepare_noise_tensor(mock_context, torch.bfloat16, torch.device("cpu")) + + assert torch.equal(noise, expected.to(dtype=torch.bfloat16)) + mock_get_noise.assert_not_called() + + +def test_z_image_prepare_noise_rejects_invalid_shape(): + invocation = ZImageDenoiseInvocation.model_construct( + width=64, height=64, seed=0, noise=MagicMock(latents_name="noise") + ) + mock_context = MagicMock() + mock_context.tensors.load.return_value = torch.zeros(1, 8, 8, 8) + + with pytest.raises(ValueError, match="Expected noise with shape"): + invocation._prepare_noise_tensor(mock_context, torch.bfloat16, torch.device("cpu")) + + +def test_z_image_add_noise_false_ignores_connected_noise(): + invocation = ZImageDenoiseInvocation.model_construct( + latents=MagicMock(latents_name="latents"), + noise=MagicMock(latents_name="noise"), + add_noise=False, + width=64, + height=64, + steps=4, + denoising_start=0.0, + denoising_end=1.0, + positive_conditioning=SimpleNamespace(conditioning_name="positive", mask=None), + transformer=MagicMock(transformer="transformer"), + seed=123, + scheduler="euler", + ) + init_latents = torch.full((1, 16, 8, 8), 2.0) + dummy_conditioning = SimpleNamespace(prompt_embeds=torch.zeros(1, 4, 16)) + dummy_conditioning.to = lambda **_: dummy_conditioning + regional_extension = SimpleNamespace( + regional_text_conditioning=SimpleNamespace(prompt_embeds=torch.zeros(1, 4, 16)) + ) + loaded_text_conditioning = [SimpleNamespace(prompt_embeds=torch.zeros(1, 4, 16), mask=None)] + mock_context = MagicMock() + mock_context.tensors.load.return_value = init_latents + mock_context.conditioning.load.return_value = SimpleNamespace(conditionings=[dummy_conditioning]) + + with ( + patch( + "invokeai.app.invocations.z_image_denoise.TorchDevice.choose_torch_device", return_value=torch.device("cpu") + ), + patch( + "invokeai.app.invocations.z_image_denoise.TorchDevice.choose_bfloat16_safe_dtype", + return_value=torch.bfloat16, + ), + patch("invokeai.app.invocations.z_image_denoise.ZImageConditioningInfo", object), + patch( + "invokeai.app.invocations.z_image_denoise.ZImageRegionalPromptingExtension.from_text_conditionings", + return_value=regional_extension, + ), + patch.object(invocation, "_load_text_conditioning", return_value=loaded_text_conditioning), + patch.object(invocation, "_prepare_noise_tensor", side_effect=AssertionError("noise should be ignored")), + patch.object(invocation, "_get_sigmas", return_value=[0.75]), + ): + result = invocation._run_diffusion(mock_context) + + assert torch.equal(result, init_latents) + + +def test_anima_prepare_noise_uses_external_noise(): + invocation = AnimaDenoiseInvocation.model_construct( + width=64, height=64, seed=0, noise=MagicMock(latents_name="noise") + ) + mock_context = MagicMock() + expected = torch.zeros(1, 16, 1, 8, 8) + mock_context.tensors.load.return_value = expected + + with patch.object(invocation, "_get_noise") as mock_get_noise: + noise = invocation._prepare_noise_tensor(mock_context, torch.bfloat16, torch.device("cpu")) + + assert torch.equal(noise, expected.to(dtype=torch.bfloat16)) + mock_get_noise.assert_not_called() + + +def test_anima_prepare_noise_rejects_invalid_rank(): + invocation = AnimaDenoiseInvocation.model_construct( + width=64, height=64, seed=0, noise=MagicMock(latents_name="noise") + ) + mock_context = MagicMock() + mock_context.tensors.load.return_value = torch.zeros(1, 16, 8, 8) + + with pytest.raises(ValueError, match="Expected noise with shape"): + invocation._prepare_noise_tensor(mock_context, torch.bfloat16, torch.device("cpu")) + + +def test_anima_add_noise_false_ignores_connected_noise(): + invocation = AnimaDenoiseInvocation.model_construct( + latents=MagicMock(latents_name="latents"), + noise=MagicMock(latents_name="noise"), + add_noise=False, + width=64, + height=64, + steps=4, + denoising_start=0.0, + denoising_end=1.0, + positive_conditioning=SimpleNamespace(conditioning_name="positive", mask=None), + transformer=MagicMock(transformer="transformer"), + seed=123, + scheduler="euler", + ) + init_latents = torch.full((1, 16, 8, 8), 2.0) + loaded_text_conditioning = [SimpleNamespace(mask=None)] + mock_context = MagicMock() + mock_context.tensors.load.return_value = init_latents + mock_context.models.load.return_value = MagicMock() + + with ( + patch( + "invokeai.app.invocations.anima_denoise.TorchDevice.choose_torch_device", return_value=torch.device("cpu") + ), + patch( + "invokeai.app.invocations.anima_denoise.TorchDevice.choose_bfloat16_safe_dtype", return_value=torch.bfloat16 + ), + patch.object(invocation, "_load_text_conditionings", return_value=loaded_text_conditioning), + patch.object(invocation, "_prepare_noise_tensor", side_effect=AssertionError("noise should be ignored")), + patch.object(invocation, "_get_sigmas", return_value=[0.75]), + ): + result = invocation._run_diffusion(mock_context) + + assert torch.equal(result, init_latents) + + +def test_flux2_add_noise_false_ignores_connected_noise(): + invocation = Flux2DenoiseInvocation.model_construct( + latents=MagicMock(latents_name="latents"), + noise=MagicMock(latents_name="noise"), + add_noise=False, + width=64, + height=64, + num_steps=4, + denoising_start=0.25, + denoising_end=0.25, + positive_text_conditioning=MagicMock(conditioning_name="positive"), + transformer=MagicMock(transformer="transformer"), + vae=MagicMock(vae="vae"), + seed=123, + ) + init_latents = torch.full((1, 32, 8, 8), 2.0) + mock_context = MagicMock() + mock_context.tensors.load.return_value = init_latents + mock_context.conditioning.load.return_value = SimpleNamespace( + conditionings=[ + SimpleNamespace( + t5_embeds=torch.zeros(1, 4, 16), to=lambda **_: SimpleNamespace(t5_embeds=torch.zeros(1, 4, 16)) + ) + ] + ) + mock_context.models.get_config.return_value = SimpleNamespace(base=BaseModelType.Flux2, type=ModelType.Main) + + with ( + patch( + "invokeai.app.invocations.flux2_denoise.TorchDevice.choose_torch_device", return_value=torch.device("cpu") + ), + patch("invokeai.app.invocations.flux2_denoise.FLUXConditioningInfo", object), + patch.object(invocation, "_get_bn_stats", return_value=None), + patch.object(invocation, "_prepare_noise_tensor", side_effect=AssertionError("noise should be ignored")), + ): + result = invocation._run_diffusion(mock_context) + + assert torch.equal(result, init_latents) + + +def test_flux_metadata_ignores_external_noise_seed_when_noise_not_used(): + invocation = FluxDenoiseLatentsMetaInvocation.model_construct( + width=64, + height=64, + num_steps=4, + guidance=3.5, + denoising_start=0.0, + denoising_end=1.0, + latents=MagicMock(latents_name="latents"), + transformer=MagicMock(transformer="transformer", loras=[]), + noise=MagicMock(seed=123), + seed=999, + add_noise=False, + ) + mock_context = MagicMock() + output = LatentsOutput.build("latents", torch.zeros(1, 16, 8, 8), seed=None) + + with patch("invokeai.app.invocations.metadata_linked.FluxDenoiseInvocation.invoke", return_value=output): + result = invocation.invoke(mock_context) + + assert result.metadata.root["seed"] == 999 + + +def test_z_image_metadata_ignores_external_noise_seed_when_noise_not_used(): + invocation = ZImageDenoiseMetaInvocation.model_construct( + width=64, + height=64, + steps=8, + guidance_scale=1.0, + denoising_start=0.0, + denoising_end=1.0, + scheduler="euler", + latents=MagicMock(latents_name="latents"), + transformer=MagicMock(transformer="transformer", loras=[]), + noise=MagicMock(seed=123), + seed=999, + add_noise=False, + ) + mock_context = MagicMock() + output = LatentsOutput.build("latents", torch.zeros(1, 16, 8, 8), seed=None) + + with patch("invokeai.app.invocations.metadata_linked.ZImageDenoiseInvocation.invoke", return_value=output): + result = invocation.invoke(mock_context) + + assert result.metadata.root["seed"] == 999 + + +def _get_first_scheduler_sigma( + scheduler, *, scheduler_name: str, sigmas: list[float], mu: float | None = None +) -> float: + set_timesteps_signature = inspect.signature(scheduler.set_timesteps) + if scheduler_name != "lcm" and "sigmas" in set_timesteps_signature.parameters: + kwargs: dict[str, object] = {"sigmas": sigmas, "device": "cpu"} + if mu is not None and "mu" in set_timesteps_signature.parameters: + kwargs["mu"] = mu + scheduler.set_timesteps(**kwargs) + else: + kwargs = {"num_inference_steps": len(sigmas) - 1, "device": "cpu"} + if mu is not None and "mu" in set_timesteps_signature.parameters: + kwargs["mu"] = mu + scheduler.set_timesteps(**kwargs) + return float(scheduler.sigmas[0]) + + +@pytest.mark.parametrize( + "scheduler_name", + [ + "euler", + pytest.param( + "heun", + marks=pytest.mark.xfail( + reason="Known img2img preblend mismatch for FLUX with scheduler-defined first step.", + strict=True, + ), + ), + pytest.param( + "lcm", + marks=pytest.mark.xfail( + reason="Known img2img preblend mismatch for FLUX with scheduler-defined first step.", + strict=True, + ), + ), + ], +) +def test_flux_img2img_preblend_matches_scheduler_first_sigma(scheduler_name: str): + sigmas = clip_timestep_schedule_fractional(get_schedule(num_steps=4, image_seq_len=16, shift=True), 0.25, 1.0) + scheduler_class = FLUX_SCHEDULER_MAP[scheduler_name] + scheduler = scheduler_class(num_train_timesteps=1000) + + assert sigmas[0] == pytest.approx( + _get_first_scheduler_sigma(scheduler, scheduler_name=scheduler_name, sigmas=sigmas) + ) + + +def test_flux2_partial_denoise_short_circuit_uses_first_clipped_timestep(): + invocation = Flux2DenoiseInvocation.model_construct( + latents=MagicMock(latents_name="latents"), + width=64, + height=64, + num_steps=4, + denoising_start=0.25, + denoising_end=0.25, + positive_text_conditioning=MagicMock(conditioning_name="positive"), + transformer=MagicMock(transformer="transformer"), + vae=MagicMock(vae="vae"), + seed=0, + scheduler="lcm", + ) + init_latents = torch.full((1, 32, 8, 8), 2.0) + noise = torch.full((1, 32, 8, 8), 10.0) + dummy_conditioning = SimpleNamespace(t5_embeds=torch.zeros(1, 4, 16)) + dummy_conditioning.to = lambda **_: dummy_conditioning + mock_context = MagicMock() + mock_context.tensors.load.return_value = init_latents + mock_context.conditioning.load.return_value = SimpleNamespace(conditionings=[dummy_conditioning]) + mock_context.models.get_config.return_value = SimpleNamespace(base=BaseModelType.Flux2, type=ModelType.Main) + + with ( + patch( + "invokeai.app.invocations.flux2_denoise.TorchDevice.choose_torch_device", return_value=torch.device("cpu") + ), + patch("invokeai.app.invocations.flux2_denoise.FLUXConditioningInfo", object), + patch.object(invocation, "_get_bn_stats", return_value=None), + patch.object(invocation, "_prepare_noise_tensor", return_value=noise), + ): + result = invocation._run_diffusion(mock_context) + + timesteps = clip_timestep_schedule_fractional(get_schedule_flux2(num_steps=4, image_seq_len=16), 0.25, 0.25) + expected = timesteps[0] * noise + (1.0 - timesteps[0]) * init_latents + assert torch.equal(result, expected) + + +def test_flux2_lcm_scheduler_setup_passes_mu(): + from invokeai.backend.flux2.denoise import denoise + + class DummyScheduler: + def __init__(self) -> None: + self.received_mu = None + self.timesteps = torch.tensor([750.0, 500.0], dtype=torch.float32) + self.sigmas = torch.tensor([0.75, 0.5, 0.0], dtype=torch.float32) + self.config = SimpleNamespace(num_train_timesteps=1000) + + def set_timesteps(self, num_inference_steps: int, device: str | torch.device, mu: float | None = None) -> None: + self.received_mu = mu + + def step(self, model_output: torch.Tensor, timestep: torch.Tensor, sample: torch.Tensor): + return SimpleNamespace(prev_sample=sample) + + class DummyModel(torch.nn.Module): + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor, + timestep: torch.Tensor, + img_ids: torch.Tensor, + txt_ids: torch.Tensor, + guidance: torch.Tensor, + return_dict: bool = False, + ): + return (torch.zeros_like(hidden_states),) + + scheduler = DummyScheduler() + denoise( + model=DummyModel(), + img=torch.zeros(1, 4, 8), + img_ids=torch.zeros(1, 4, 4, dtype=torch.long), + txt=torch.zeros(1, 4, 8), + txt_ids=torch.zeros(1, 4, 4, dtype=torch.long), + timesteps=[0.75, 0.5, 0.0], + step_callback=lambda _: None, + guidance=1.0, + cfg_scale=[1.0, 1.0], + scheduler=scheduler, + mu=0.42, + ) + + assert scheduler.received_mu == pytest.approx(0.42) + + +@pytest.mark.parametrize( + "scheduler_name", + [ + "euler", + pytest.param( + "heun", + marks=pytest.mark.xfail( + reason="Known img2img preblend mismatch for Z-Image with scheduler-defined first step.", + strict=True, + ), + ), + pytest.param( + "lcm", + marks=pytest.mark.xfail( + reason="Known img2img preblend mismatch for Z-Image with scheduler-defined first step.", + strict=True, + ), + ), + ], +) +def test_z_image_img2img_preblend_matches_scheduler_first_sigma(scheduler_name: str): + invocation = ZImageDenoiseInvocation.model_construct(steps=8, width=1024, height=1024) + img_seq_len = (invocation.height // 8 // 2) * (invocation.width // 8 // 2) + shift = invocation._calculate_shift(img_seq_len) + sigmas = invocation._get_sigmas(shift, invocation.steps) + sigmas = sigmas[int(0.25 * (len(sigmas) - 1)) :] + scheduler_class = ZIMAGE_SCHEDULER_MAP[scheduler_name] + scheduler = scheduler_class(num_train_timesteps=1000, shift=1.0) + + assert sigmas[0] == pytest.approx( + _get_first_scheduler_sigma(scheduler, scheduler_name=scheduler_name, sigmas=sigmas) + ) + + +@pytest.mark.parametrize( + "scheduler_name", + [ + "euler", + pytest.param( + "heun", + marks=pytest.mark.xfail( + reason="Known img2img preblend mismatch for Anima with scheduler-defined first step.", + strict=True, + ), + ), + pytest.param( + "lcm", + marks=pytest.mark.xfail( + reason="Known img2img preblend mismatch for Anima with scheduler-defined first step.", + strict=True, + ), + ), + ], +) +def test_anima_img2img_preblend_matches_scheduler_first_sigma(scheduler_name: str): + invocation = AnimaDenoiseInvocation.model_construct(steps=30) + sigmas = invocation._get_sigmas(invocation.steps) + sigmas = sigmas[int(0.25 * (len(sigmas) - 1)) :] + scheduler_class, scheduler_kwargs = ANIMA_SCHEDULER_MAP[scheduler_name] + scheduler = scheduler_class(num_train_timesteps=1000, **scheduler_kwargs) + + assert sigmas[0] == pytest.approx( + _get_first_scheduler_sigma(scheduler, scheduler_name=scheduler_name, sigmas=sigmas) + ) + + +def test_sd3_partial_denoise_short_circuit_uses_first_clipped_timestep(): + invocation = SD3DenoiseInvocation.model_construct( + latents=MagicMock(latents_name="latents"), + width=64, + height=64, + steps=4, + denoising_start=0.25, + denoising_end=0.25, + positive_conditioning=MagicMock(conditioning_name="positive"), + negative_conditioning=MagicMock(conditioning_name="negative"), + transformer=MagicMock(transformer="transformer"), + seed=0, + ) + init_latents = torch.full((1, 16, 8, 8), 2.0) + noise = torch.full((1, 16, 8, 8), 10.0) + mock_context = MagicMock() + mock_context.tensors.load.return_value = init_latents + mock_context.models.load.return_value = MagicMock( + model=MagicMock(config=MagicMock(in_channels=16, joint_attention_dim=4096)) + ) + + with ( + patch("invokeai.app.invocations.sd3_denoise.TorchDevice.choose_torch_device", return_value=torch.device("cpu")), + patch("invokeai.app.invocations.sd3_denoise.TorchDevice.choose_torch_dtype", return_value=torch.float32), + patch.object(invocation, "_prepare_noise_tensor", return_value=noise), + patch.object(invocation, "_load_text_conditioning", return_value=(torch.zeros(1, 1, 1), torch.zeros(1, 1))), + ): + result = invocation._run_diffusion(mock_context) + + timesteps = clip_timestep_schedule_fractional(torch.linspace(1, 0, invocation.steps + 1).tolist(), 0.25, 0.25) + expected = timesteps[0] * noise + (1.0 - timesteps[0]) * init_latents + assert torch.equal(result, expected) + + +def test_cogview4_partial_denoise_short_circuit_uses_first_clipped_sigma(): + invocation = CogView4DenoiseInvocation.model_construct( + latents=MagicMock(latents_name="latents"), + width=64, + height=64, + steps=4, + denoising_start=0.25, + denoising_end=0.25, + positive_conditioning=MagicMock(conditioning_name="positive"), + negative_conditioning=MagicMock(conditioning_name="negative"), + transformer=MagicMock(transformer="transformer"), + seed=0, + ) + init_latents = torch.full((1, 16, 8, 8), 2.0) + noise = torch.full((1, 16, 8, 8), 10.0) + mock_context = MagicMock() + mock_context.tensors.load.return_value = init_latents + transformer_model = MagicMock(config=MagicMock(in_channels=16, patch_size=2)) + mock_context.models.load.return_value = MagicMock(model=transformer_model) + + with ( + patch("invokeai.app.invocations.cogview4_denoise.CogView4Transformer2DModel", object), + patch( + "invokeai.app.invocations.cogview4_denoise.TorchDevice.choose_torch_device", + return_value=torch.device("cpu"), + ), + patch.object(invocation, "_prepare_noise_tensor", return_value=noise), + patch.object(invocation, "_load_text_conditioning", return_value=torch.zeros(1, 1, 1)), + ): + result = invocation._run_diffusion(mock_context) + + timesteps = clip_timestep_schedule_fractional(torch.linspace(1, 0, invocation.steps + 1).tolist(), 0.25, 0.25) + sigmas = invocation._convert_timesteps_to_sigmas( + image_seq_len=((invocation.height // 8) * (invocation.width // 8)) // (2**2), + timesteps=torch.tensor(timesteps), + ) + expected = sigmas[0] * noise + (1.0 - sigmas[0]) * init_latents + assert torch.allclose(result, expected, atol=2e-3, rtol=0) diff --git a/tests/app/invocations/test_latent_noise.py b/tests/app/invocations/test_latent_noise.py new file mode 100644 index 00000000000..16151513e12 --- /dev/null +++ b/tests/app/invocations/test_latent_noise.py @@ -0,0 +1,101 @@ +from unittest.mock import MagicMock + +import pytest +import torch + + +@pytest.mark.parametrize( + ("noise_type", "width", "height", "expected_shape"), + [ + ("SD", 64, 64, (1, 4, 8, 8)), + ("FLUX", 64, 64, (1, 16, 8, 8)), + ("FLUX.2", 64, 64, (1, 32, 8, 8)), + ("SD3", 64, 64, (1, 16, 8, 8)), + ("CogView4", 64, 64, (1, 16, 8, 8)), + ("Z-Image", 64, 64, (1, 16, 8, 8)), + ("Anima", 64, 64, (1, 16, 1, 8, 8)), + ], +) +def test_noise_invocation_generates_expected_shapes(noise_type: str, width: int, height: int, expected_shape): + from invokeai.app.invocations.noise import NoiseInvocation + + mock_context = MagicMock() + mock_context.tensors.save.return_value = "noise-name" + + invocation = NoiseInvocation(noise_type=noise_type, width=width, height=height, seed=123) + + output = invocation.invoke(mock_context) + + saved_tensor = mock_context.tensors.save.call_args.kwargs["tensor"] + assert saved_tensor.shape == expected_shape + assert output.noise.seed == 123 + assert output.width == width + assert output.height == height + + +def test_noise_invocation_defaults_to_sd_shape(): + from invokeai.app.invocations.noise import NoiseInvocation + + mock_context = MagicMock() + mock_context.tensors.save.return_value = "noise-name" + + invocation = NoiseInvocation(width=64, height=64, seed=1) + + invocation.invoke(mock_context) + + saved_tensor = mock_context.tensors.save.call_args.kwargs["tensor"] + assert saved_tensor.shape == (1, 4, 8, 8) + + +@pytest.mark.parametrize( + ("noise_type", "width", "height", "message"), + [ + ("SD", 66, 64, "multiple of 8"), + ("FLUX", 72, 64, "multiple of 16"), + ("FLUX.2", 64, 72, "multiple of 16"), + ("SD3", 72, 64, "multiple of 16"), + ("Z-Image", 64, 72, "multiple of 16"), + ("CogView4", 64, 80, "multiple of 32"), + ("Anima", 66, 64, "multiple of 8"), + ], +) +def test_noise_invocation_rejects_invalid_dimensions(noise_type: str, width: int, height: int, message: str): + from invokeai.app.invocations.noise import NoiseInvocation + + mock_context = MagicMock() + + with pytest.raises(ValueError, match=message): + invocation = NoiseInvocation(noise_type=noise_type, width=width, height=height, seed=0) + invocation.invoke(mock_context) + + +def test_noise_invocation_is_deterministic_for_identical_inputs(): + from invokeai.app.invocations.noise import NoiseInvocation + + mock_context = MagicMock() + mock_context.tensors.save.side_effect = ["noise-1", "noise-2"] + + invocation = NoiseInvocation(noise_type="FLUX", width=64, height=64, seed=7) + + invocation.invoke(mock_context) + first = mock_context.tensors.save.call_args_list[0].kwargs["tensor"] + invocation.invoke(mock_context) + second = mock_context.tensors.save.call_args_list[1].kwargs["tensor"] + assert torch.equal(first, second) + + +@pytest.mark.parametrize(("noise_type", "expected_shape"), [("FLUX", (1, 16, 8, 8)), ("FLUX.2", (1, 32, 8, 8))]) +def test_generate_noise_tensor_honors_use_cpu_false_for_flux_variants(noise_type: str, expected_shape): + from invokeai.app.invocations.latent_noise import generate_noise_tensor + + noise = generate_noise_tensor( + noise_type=noise_type, + width=64, + height=64, + seed=0, + device=torch.device("cpu"), + dtype=torch.float32, + use_cpu=False, + ) + + assert noise.shape == expected_shape