From 111ec2ff94f192ffe37d5b370bdd196da69facbc Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Wed, 13 May 2026 16:37:47 -0700 Subject: [PATCH 01/95] [AMDGPU] Fix conflicted literal test. NFC. (#197587) --- llvm/test/MC/AMDGPU/literals.s | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/llvm/test/MC/AMDGPU/literals.s b/llvm/test/MC/AMDGPU/literals.s index eb34ea0316945..273ed630e104f 100644 --- a/llvm/test/MC/AMDGPU/literals.s +++ b/llvm/test/MC/AMDGPU/literals.s @@ -1986,14 +1986,12 @@ v_add_f64 v[0:1], v[0:1], lit(1) // GFX89: v_add_f64 v[0:1], v[0:1], lit(0x1) ; encoding: [0x00,0x00,0x80,0xd2,0x00,0xff,0x01,0x00] // SICI: v_add_f64 v[0:1], v[0:1], lit(0x1) ; encoding: [0x00,0x00,0xc8,0xd2,0x00,0xff,0x01,0x00] -// FIXME: Forced lit() encoding is not preserved after disasm v_add_f64 v[0:1], v[0:1], lit(1.0) -// GFX11: v_add_f64 v[0:1], v[0:1], lit(0x3ff00000) ; encoding: [0x00,0x00,0x27,0xd7,0x00,0xff,0x01,0x02,0x00,0x00,0xf0,0x3f] -// GFX12: v_add_f64_e64 v[0:1], v[0:1], lit(0x3ff00000) ; encoding: [0x00,0x00,0x02,0xd5,0x00,0xff,0x01,0x02,0x00,0x00,0xf0,0x3f] -// GFX1250-ASM: v_add_f64_e64 v[0:1], v[0:1], lit(0x3ff00000) ; encoding: [0x00,0x00,0x02,0xd5,0x00,0xff,0x01,0x02,0x00,0x00,0xf0,0x3f] -// GFX1250-DIS: v_add_f64_e64 v[0:1], v[0:1], 1.0 ; encoding: [0x00,0x00,0x02,0xd5,0x00,0xe5,0x01,0x02] -// NOGFX89: :[[@LINE-5]]:31: error: literal operands are not supported -// NOSICI: :[[@LINE-6]]:31: error: literal operands are not supported +// NOGFX11: :[[@LINE-1]]:31: error: invalid operand for instruction +// NOGFX12: :[[@LINE-2]]:31: error: invalid operand for instruction +// NOGFX1250: :[[@LINE-3]]:31: error: invalid operand for instruction +// NOGFX89: :[[@LINE-4]]:31: error: invalid operand for instruction +// NOSICI: :[[@LINE-5]]:31: error: invalid operand for instruction v_add_f64 v[0:1], v[0:1], lit64(1.0) // NOGFX11: :[[@LINE-1]]:27: error: lit64 is not supported on this GPU From 
1bb237afc64740b177c61da0a5a16166de2db948 Mon Sep 17 00:00:00 2001 From: Johannes Doerfert Date: Wed, 13 May 2026 16:41:09 -0700 Subject: [PATCH 02/95] [Instrumentor][NFC] Add docs and config-wizard script (#197066) This commit adds initial documentation for the instrumentor to the html/man pages and provides a script that helps new users to setup the config and stubs file interactively. The script and docs have been created with Claude (AI) but proofread/tested and modified afterwards. --- llvm/docs/Instrumentor.rst | 786 +++++++++++++++++++++ llvm/docs/UserGuides.rst | 5 + llvm/utils/instrumentor-config-wizard.py | 834 +++++++++++++++++++++++ 3 files changed, 1625 insertions(+) create mode 100644 llvm/docs/Instrumentor.rst create mode 100755 llvm/utils/instrumentor-config-wizard.py diff --git a/llvm/docs/Instrumentor.rst b/llvm/docs/Instrumentor.rst new file mode 100644 index 0000000000000..cbabfcde6b37d --- /dev/null +++ b/llvm/docs/Instrumentor.rst @@ -0,0 +1,786 @@ +================================== +The LLVM Instrumentor Pass +================================== + +.. contents:: + :local: + +Introduction +============ + +The **Instrumentor** is a highly configurable instrumentation pass for LLVM-IR +that allows users to insert custom runtime function calls at various program +points. Unlike traditional instrumentation tools that are hardcoded for +specific purposes (like sanitizers or profilers), the Instrumentor provides a +flexible, configuration-driven approach where users can specify: + +- **What** to instrument (loads, stores, allocations, function calls, etc.) +- **Where** to instrument (before or after operations) +- **What information** to pass to the runtime (pointers, values, sizes, types, etc.) +- **Whether** to modify program behavior (replace values, redirect pointers, etc.) 
+ +The Instrumentor is designed to support a wide variety of use cases including: + +- Custom memory profilers and trackers +- Performance analysis tools +- Dynamic program analysis +- Debugging and tracing utilities +- Stack usage monitoring +- Custom sanitizers and checkers + +To use the Instrumentor it is recommended to run the wizard script located at +`./llvm/utils/instrumentor-config-wizard.py`. The script will interactively +create a configuration file and a stub runtime which is required to be linked +into the instrumented program. + +Key Features +============ + +Configurable Instrumentation Opportunities +------------------------------------------- + +The Instrumentor supports instrumentation at multiple levels: + +**Instruction-level:** + - **Load instructions**: Instrument memory reads with access to pointer, loaded value, alignment, size, atomicity, etc. + - **Store instructions**: Instrument memory writes with access to pointer, stored value, alignment, size, atomicity, etc. + - **Alloca instructions**: Instrument stack allocations with access to size, alignment, and allocated address + +**Function-level:** + - **Function entry**: Instrument at function start with access to function name, address, arguments, etc. 
+ - **Function exit**: Instrument at function return + +**Future extensions:** + - Basic block entry/exit + - Module-level initialization + - Global variable access + +PRE and POST Instrumentation +----------------------------- + +Each instrumentation opportunity supports two positions: + +- **PRE**: Insert instrumentation **before** the operation occurs + + - For loads: can inspect/modify the pointer before reading + - For stores: can inspect/modify the pointer and value before writing + - For allocas: can modify the allocation size + - For functions: instrument at function entry, inspect/replace arguments + +- **POST**: Insert instrumentation **after** the operation occurs + + - For loads: can inspect/modify the loaded value + - For stores: instrument after the write completes + - For allocas: can inspect/modify the allocated address + - For functions: instrument at function exit + +Selective Argument Passing +--------------------------- + +For each instrumentation opportunity, users can individually enable/disable specific arguments to control: + +- What information is passed to the runtime function +- The signature of the generated runtime function +- Performance overhead (fewer arguments = faster calls) + +For example, for load instrumentation, you can choose to pass: + +- Pointer address +- Pointer address space +- Loaded value +- Value size +- Alignment +- Value type ID +- Atomicity ordering +- Synchronization scope +- Volatility flag +- Unique instrumentation ID + +Value Replacement +----------------- + +The Instrumentor supports **replacing** values returned from the runtime: + +- **Load replacement**: The runtime can provide a different value than what was loaded from memory +- **Store replacement**: The runtime can modify the pointer or value being stored +- **Alloca replacement**: The runtime can provide a different allocation size or replace the allocated address +- **Argument replacement**: The runtime can modify the arguments passed to a function + 
+This enables use cases like: + +- Value redirection for debugging +- Custom memory allocators +- Fault injection +- Taint tracking + +Instrumentation Filtering +---------------- + +The Instrumentor provides fine-grained control over what gets instrumented: + +- **Target regex**: Match against the target triple (e.g., ``x86_64-.*-linux``) +- **Host/GPU toggle**: Separately enable/disable CPU and GPU instrumentation +- **Function filtering**: Exclude runtime functions from instrumentation via a regular expression + +Configuration System +==================== + +The Instrumentor uses a JSON-based configuration system that allows users to: + +1. Generate a default configuration showing all available options +2. Interactively customize the configuration using the wizard +3. Load and modify existing configurations +4. Generate runtime stub implementations + +Configuration File Format +------------------------- + +The configuration file is a JSON document with the following structure: + +.. code-block:: json + + { + "configuration": { + "runtime_prefix": "__instrumentor_", + "target_regex": "", + "host_enabled": true, + "gpu_enabled": true + }, + "function_pre": { + "function": { + "enabled": true, + "address": true, + "name": true, + "id": true + } + }, + "instruction_pre": { + "load": { + "enabled": true, + "pointer": true, + "pointer.replace": false, + "value_size": true, + "id": true + }, + "store": { + "enabled": true, + "pointer": true, + "value": true, + "value_size": true + } + }, + "instruction_post": { + "load": { + "enabled": true, + "value": true, + "value.replace": false + } + } + } + +Configuration Sections +---------------------- + +**configuration** + Global settings that apply to all instrumentation: + + - ``runtime_prefix``: Prefix for all runtime function names (default: ``__instrumentor_``) + - ``target_regex``: Regular expression to filter targets (empty = all targets) + - ``host_enabled``: Enable instrumentation for CPU targets (default: true) + - 
``gpu_enabled``: Enable instrumentation for GPU targets (default: true) + +**function_pre / function_post** + Function-level instrumentation configuration. + +**instruction_pre / instruction_post** + Instruction-level instrumentation configuration, with subsections for each instruction type (``load``, ``store``, ``alloca``, etc.). + +Argument Configuration +---------------------- + +For each instrumentation opportunity, arguments are configured with: + +- **enabled**: Boolean to enable/disable the entire opportunity +- **<argument_name>**: Boolean to enable/disable passing this argument +- **<argument_name>.replace**: Boolean to enable value replacement (only for replaceable arguments) +- **<argument_name>.description**: Human-readable description of the argument + +The Configuration Wizard +========================= + +The Instrumentor includes an interactive configuration wizard that simplifies the process of creating and modifying configurations. + +Running the Wizard +------------------ + +.. code-block:: bash + + # Run the wizard interactively + ./llvm/utils/instrumentor-config-wizard.py + + # Specify output location + ./llvm/utils/instrumentor-config-wizard.py -o my_config.json + + # Use specific opt binary + ./llvm/utils/instrumentor-config-wizard.py --opt-path /path/to/opt + + # Load and modify existing configuration + ./llvm/utils/instrumentor-config-wizard.py --input existing.json -o modified.json + +Wizard Workflow +--------------- + +The wizard guides you through five steps: + +**Step 1: Select Instrumentation Types** + Choose which types of operations to instrument (load, store, alloca, function, etc.). This is a high-level selection - you can configure individual arguments later. + +**Step 2: PRE vs POST Configuration** + Decide whether PRE and POST instrumentation should use the same configuration or different configurations. This saves time when you want both positions to have identical settings. 
+ +**Step 3: Base Configuration** + Configure global settings: + + - Runtime prefix for function names + - Target regex for filtering + - Enable/disable host (CPU) instrumentation + - Enable/disable GPU instrumentation + +**Step 4: Configure Arguments** + For each enabled instrumentation type, select which arguments to pass to the runtime function. You can: + + - Toggle individual arguments on/off + - Enable value replacement for replaceable arguments + - Enable all or disable all arguments + - Configure PRE and POST separately (if selected in Step 2) + +**Step 5: Review and Save** + Review your configuration and optionally generate runtime stub implementations. The wizard displays a summary and provides commands for using the configuration with ``opt`` and ``clang``. + +Generating Runtime Stubs +------------------------- + +The wizard can automatically generate C stub implementations of your runtime functions: + +1. In Step 5, select 'g' to generate stubs +2. Specify the output file path (default: ``_stubs.c``) +3. The wizard creates a C file with stub implementations that print their arguments + +The generated stubs are useful as: + +- Starting templates for implementing your runtime +- Documentation of the expected function signatures +- Quick prototypes for testing instrumentation + +Example stub output: + +.. code-block:: c + + void __instrumentor_pre_load(void *pointer, int32_t pointer_as, + uint64_t value_size, int32_t id) { + printf("load pre -- pointer: %p, pointer_as: %i, " + "value_size: %lu, id: %i\n", + pointer, pointer_as, value_size, id); + } + +Usage Examples +============== + +Basic Usage with opt +-------------------- + +**Step 1: (Optional) Generate a default configuration** + +.. code-block:: bash + + opt -passes=instrumentor \ + -instrumentor-write-config-file=config.json \ + -disable-output \ + input.ll + +This creates ``config.json`` with all available instrumentation opportunities and their arguments. 
+ +**Step 2: Customize the configuration** + +Edit ``config.json`` manually or use the wizard (no input needed): + +.. code-block:: bash + + ./llvm/utils/instrumentor-config-wizard.py --input config.json -o custom.json + +**Step 3: Apply instrumentation** + +.. code-block:: bash + + opt -passes=instrumentor \ + -instrumentor-read-config-file=custom.json \ + input.ll -S -o instrumented.ll + +The instrumented output contains calls to your runtime functions at the configured program points. + +Using with Clang +---------------- + +To instrument during compilation: + +.. code-block:: bash + + clang -mllvm -enable-instrumentor \ + -mllvm -instrumentor-read-config-file=config.json \ + source.c -o program + +Complete Workflow Example +-------------------------- + +Here's a complete example for creating a simple memory access profiler: + +**1. Create configuration with the wizard:** + +.. code-block:: bash + + ./llvm/utils/instrumentor-config-wizard.py -o memory_profiler.json + + # In the wizard: + # - Enable: load, store + # - Use same config for PRE/POST: yes + # - Base config: keep defaults + # - For load/store: enable pointer, value_size, id + # - Generate stubs: yes (memory_profiler_stubs.c) + +**2. Implement the runtime:** + +.. code-block:: c + + // memory_runtime.c + #include <stdio.h> + #include <stdint.h> + + static uint64_t load_count = 0; + static uint64_t store_count = 0; + + void __instrumentor_pre_load(void *pointer, uint64_t value_size, + int32_t id) { + load_count++; + printf("Load from %p (size: %lu, id: %d)\n", + pointer, value_size, id); + } + + void __instrumentor_pre_store(void *pointer, uint64_t value_size, + int32_t id) { + store_count++; + printf("Store to %p (size: %lu, id: %d)\n", + pointer, value_size, id); + } + + __attribute__((destructor)) + void print_stats(void) { + printf("Total loads: %lu\n", load_count); + printf("Total stores: %lu\n", store_count); + } + +**3. Instrument and compile:** + +.. 
code-block:: bash + + # Instrument the program + clang -emit-llvm -S -o program.ll program.c + opt -passes=instrumentor \ + -instrumentor-read-config-file=memory_profiler.json \ + program.ll -S -o program_inst.ll + + # Compile with runtime + clang program_inst.ll memory_runtime.c -o program + +**4. Run and observe:** + +.. code-block:: bash + + ./program + # Output includes: + # Load from 0x7ffc12345678 (size: 4, id: 1) + # Store to 0x7ffc12345680 (size: 8, id: 2) + # ... + # Total loads: 42 + # Total stores: 27 + +Advanced Use Cases +================== + +Stack Usage Profiling +---------------------- + +Configure alloca instrumentation to track stack allocations: + +.. code-block:: json + + { + "instruction_pre": { + "alloca": { + "enabled": true, + "size": true, + "alignment": true, + "id": true + } + }, + "instruction_post": { + "alloca": { + "enabled": true, + "address": true, + "size": true + } + } + } + +Runtime implementation: + +.. code-block:: c + + static uint64_t total_stack_usage = 0; + static uint64_t peak_stack_usage = 0; + static uint64_t current_stack_usage = 0; + + void __instrumentor_post_alloca(void *address, uint64_t size, + int32_t id) { + current_stack_usage += size; + total_stack_usage += size; + if (current_stack_usage > peak_stack_usage) { + peak_stack_usage = current_stack_usage; + } + } + +Value Replacement for Fault Injection +-------------------------------------- + +Use value replacement to inject faults: + +.. code-block:: json + + { + "instruction_post": { + "load": { + "enabled": true, + "value": true, + "value.replace": true, + "pointer": true + } + } + } + +Runtime implementation: + +.. 
code-block:: c + + // Replace every 1000th loaded value with zero + static uint64_t load_counter = 0; + + uint64_t __instrumentor_post_load(uint64_t value, void *pointer) { + if (++load_counter % 1000 == 0) { + printf("Injecting fault at %p\n", pointer); + return 0; // Return fault value + } + return value; // Return original value + } + +Function-Level Tracing +---------------------- + +Instrument function entry and exit: + +.. code-block:: json + + { + "function_pre": { + "function": { + "enabled": true, + "name": true, + "address": true, + "num_arguments": true + } + }, + "function_post": { + "function": { + "enabled": true, + "name": true + } + } + } + +Runtime implementation: + +.. code-block:: c + + static int call_depth = 0; + + void __instrumentor_pre_function(char *name, void *address, + int32_t num_args, int32_t id) { + printf("%*sEntering %s (%p) with %d args\n", + call_depth * 2, "", name, address, num_args); + call_depth++; + } + + void __instrumentor_post_function(char *name, int32_t id) { + call_depth--; + printf("%*sExiting %s\n", call_depth * 2, "", name); + } + +GPU Instrumentation +------------------- + +The Instrumentor supports GPU targets (AMDGPU and NVPTX). Configure GPU-specific instrumentation: + +.. code-block:: json + + { + "configuration": { + "runtime_prefix": "__gpu_runtime_", + "target_regex": "(amdgcn|nvptx).*", + "host_enabled": false, + "gpu_enabled": true + }, + "instruction_pre": { + "load": { + "enabled": true, + "pointer": true, + "pointer_as": true + } + } + } + +Note that GPU runtime functions must be implemented with appropriate device attributes. + +Implementation Details +====================== + +Generated Runtime Function Signatures +-------------------------------------- + +The Instrumentor generates runtime function names following this pattern: + +.. 
code-block:: text + + <prefix><position>_<name>[_ind] + +Where: + +- ``<prefix>``: Configurable prefix (default: ``__instrumentor_``) +- ``<position>``: Either ``pre`` or ``post`` +- ``<name>``: Name of the instrumentation opportunity (``load``, ``store``, ``function``, etc.) +- ``_ind``: Optional suffix when indirection is used (see below) + +Examples: + +- ``__instrumentor_pre_load`` +- ``__instrumentor_post_store`` +- ``__instrumentor_pre_function`` +- ``__instrumentor_pre_load_ind`` (with indirection) + +Direct vs Indirect Arguments +----------------------------- + +The Instrumentor uses two modes for passing arguments: + +**Direct mode** (default): + Arguments are passed by value. This is efficient but requires that all arguments fit in registers or can be passed through the stack efficiently. + +**Indirect mode**: + Arguments are passed by pointer. This is used automatically when: + + - Multiple replaceable arguments are enabled (requires indirection for all replaceable args) + - An argument's value is too large (aggregate types, large values) + +When indirect mode is used, a separate function with the ``_ind`` suffix is generated: + +.. code-block:: c + + // Direct mode + void __instrumentor_pre_load(void *pointer, uint64_t value_size); + + // Indirect mode (automatically generated when needed) + void __instrumentor_pre_load_ind(void **pointer, uint32_t pointer_size, + void *value_size, uint32_t value_size_size); + +Users typically don't need to worry about this - the Instrumentor handles it automatically and the wizard-generated stubs show the correct signatures. + +Unique IDs +---------- + +When the ``id`` argument is enabled, the Instrumentor assigns a unique 32-bit integer to each instrumentation call site: + +- PRE positions get positive IDs (1, 2, 3, ...) +- POST positions get negative IDs (-1, -2, -3, ...) 
+- IDs are consistent across multiple runs + +Caching +------- + +The Instrumentor caches certain argument values between PRE and POST calls when possible: + +- Values computed in PRE are reused in POST (e.g., pointer value) +- This reduces overhead and ensures consistency + +Runtime Function Requirements +------------------------------ + +Runtime functions must be: + +- Defined with external linkage +- Fast and non-blocking (to minimize instrumentation overhead) +- Thread-safe if the program is multi-threaded + +Runtime functions **must not**: + +- Call back into instrumented code (to avoid infinite recursion) + +Performance Considerations +========================== + +Overhead Factors +---------------- + +Instrumentation overhead depends on: + +1. **Number of instrumentation points**: More instrumented operations = more overhead +2. **Number of arguments passed**: Each argument adds instructions and register pressure +3. **Runtime function complexity**: Complex runtime logic increases overhead +4. **Frequency of instrumented operations**: Instrumenting hot loops has high impact + +Optimization Tips +----------------- + +**Minimize arguments:** + Only enable arguments you actually need. Passing fewer arguments reduces overhead. + +**Use PRE or POST, not both:** + If you only need one position, disable the other. + +**Target filtering:** + Use ``target_regex`` to instrument only specific targets or modules. + +**Efficient runtime:** + Keep runtime functions simple and fast. Consider: + + - Lock-free data structures + - Thread-local storage + - Batching outputs instead of per-call I/O + - Sampling (instrument 1 in N calls) + +**Build with optimizations:** + Use ``-O2`` or ``-O3`` when compiling instrumented code. LLVM can optimize away some overhead. + +Troubleshooting +=============== + +Common Issues +------------- + +**"Could not find 'opt' binary"** + The wizard can't locate the opt binary. 
+ + - Specify the path: ``--opt-path /path/to/opt`` + +**"Indirection needed but not indicated"** + An argument value is too large for direct passing. The Instrumentor handles this automatically, but you might see this warning. It's usually harmless - the indirect version of the function will be generated. + +**Infinite recursion / stack overflow** + Your runtime function is calling back into instrumented code. Solutions: + + - Ensure runtime functions don't trigger more instrumentation + +**Linking errors** + Runtime functions are undefined. You must: + + - Implement all enabled runtime functions + - Link the runtime implementation with your program + - Use the exact function signatures (check generated stubs) + +**Unexpected instrumentation** + More instrumentation than expected. Check: + + - The ``enabled`` flag for each opportunity + - ``host_enabled`` / ``gpu_enabled`` settings + - ``target_regex`` matches your target + - Runtime functions aren't being instrumented (they should be automatically excluded) + +Debugging Instrumented Code +---------------------------- + +**View instrumented IR:** + +.. code-block:: bash + + opt -passes=instrumentor \ + -instrumentor-read-config-file=config.json \ + input.ll -S -o output.ll + + # Examine output.ll to see inserted calls + +**Print configuration:** + +.. code-block:: bash + + opt -passes=instrumentor \ + -instrumentor-write-config-file=debug_config.json \ + input.ll -disable-output + + # Examine debug_config.json to see all options + +**Verify IR:** + The Instrumentor automatically verifies the module after instrumentation. If verification fails, there's a bug in the Instrumentor or the configuration is invalid. + +**Use debug builds:** + Build LLVM with assertions enabled (``-DLLVM_ENABLE_ASSERTIONS=ON``) to catch issues early. + +Extending the Instrumentor +=========================== + +The Instrumentor is designed to be extensible. To add new instrumentation opportunities: + +1. 
**Define the opportunity class** inheriting from ``InstrumentationOpportunity`` +2. **Implement getter/setter functions** for the arguments +3. **Add initialization** to populate the opportunity with arguments +4. **Register** the opportunity in ``InstrumentationConfig::populate()`` +5. **Add tests** in ``llvm/test/Transforms/Instrumentor/`` + +See ``llvm/lib/Transforms/IPO/Instrumentor.cpp`` and ``llvm/include/llvm/Transforms/IPO/Instrumentor.h`` for examples (``LoadIO``, ``StoreIO``). + +Future instrumentation opportunities being considered: + +- Basic block entry/exit +- Branch instrumentation +- Call instructions +- Atomic operations +- Vector operations +- Exception handling +- Global variable access + +Reference +========= + +Command-Line Options +-------------------- + +**-instrumentor-read-config-file=<file>** + Load instrumentation configuration from the specified JSON file. + +**-instrumentor-write-config-file=<file>** + Write the default instrumentation configuration to the specified JSON file (useful for generating templates). + +Related Passes +-------------- + +The Instrumentor is more flexible but related to: + +- **AddressSanitizer**: Specialized memory error detector +- **ThreadSanitizer**: Race condition detector +- **MemorySanitizer**: Uninitialized memory detector +- **DataFlowSanitizer**: Taint tracking +- **XRay**: Function call tracing with low overhead + +The Instrumentor can implement similar functionality with custom runtime code, but specialized passes may have better performance for their specific use cases. 
+ +Further Reading +--------------- + +- Source code: ``llvm/lib/Transforms/IPO/Instrumentor.cpp`` +- Header: ``llvm/include/llvm/Transforms/IPO/Instrumentor.h`` +- Configuration wizard: ``llvm/utils/instrumentor-config-wizard.py`` diff --git a/llvm/docs/UserGuides.rst b/llvm/docs/UserGuides.rst index c4b9293b39ea4..5807b63e9aec3 100644 --- a/llvm/docs/UserGuides.rst +++ b/llvm/docs/UserGuides.rst @@ -39,6 +39,7 @@ intermediate LLVM representation. Remarks SourceLevelDebugging HowToUpdateDebugInfo + Instrumentor InstrRefDebugInfo RemoveDIsDebugInfo KeyInstructionsDebugInfo @@ -216,6 +217,10 @@ Optimizations This document specifies guidelines for contributions for InstCombine and related passes. +:doc:`Instrumentor` + A comprehensive guide to the highly configurable Instrumentor pass for custom + program instrumentation, including the interactive configuration wizard. + Code Generation --------------- diff --git a/llvm/utils/instrumentor-config-wizard.py b/llvm/utils/instrumentor-config-wizard.py new file mode 100755 index 0000000000000..c3599ce0c47d8 --- /dev/null +++ b/llvm/utils/instrumentor-config-wizard.py @@ -0,0 +1,834 @@ +#!/usr/bin/env python3 +""" +Interactive wizard for configuring the LLVM Instrumentor pass. + +This script helps users create custom instrumentation configurations by: +1. Generating a default config file using opt +2. Presenting available instrumentation options interactively +3. Allowing users to enable/disable specific instrumentation opportunities +4. 
Saving the customized configuration to a JSON file +""" + +import argparse +import json +import os +import subprocess +import sys +import tempfile +from typing import Dict, List, Any, Optional, Tuple + + +class InstrumentorConfigWizard: + def __init__(self, opt_path: str = None): + """Initialize the wizard with the path to opt.""" + self.opt_path = opt_path or self.find_opt() + self.config = {} + self.enabled_opportunities = set() + self.same_pre_post = True + self.navigation_stack = [] + + def find_opt(self) -> str: + """Find the opt binary in the build directory.""" + # Try common locations relative to this script + script_dir = os.path.dirname(os.path.abspath(__file__)) + repo_root = os.path.dirname(os.path.dirname(script_dir)) + + # Check build/bin/opt + opt_candidates = [ + os.path.join(repo_root, "build", "bin", "opt"), + os.path.join(repo_root, "build", "Debug", "bin", "opt"), + os.path.join(repo_root, "build", "Release", "bin", "opt"), + "opt", # Try system PATH + ] + + for candidate in opt_candidates: + if os.path.exists(candidate): + return candidate + # Check if it's in PATH + try: + subprocess.run( + [candidate, "--version"], capture_output=True, check=True, timeout=5 + ) + return candidate + except ( + subprocess.CalledProcessError, + FileNotFoundError, + subprocess.TimeoutExpired, + ): + continue + + raise FileNotFoundError( + "Could not find 'opt' binary. 
Please specify the path using --opt-path" + ) + + def generate_default_config(self) -> Dict[str, Any]: + """Generate a default configuration by running opt.""" + print(f"Generating default configuration using: {self.opt_path}") + + # Create a minimal LLVM IR module to trigger config generation + minimal_ir = """ +define i32 @main() { + %1 = alloca i32 + store i32 0, ptr %1 + %2 = load i32, ptr %1 + ret i32 %2 +} +""" + + with tempfile.TemporaryDirectory() as tmpdir: + ir_file = os.path.join(tmpdir, "input.ll") + config_file = os.path.join(tmpdir, "config.json") + + # Write minimal IR + with open(ir_file, "w") as f: + f.write(minimal_ir) + + # Run opt with instrumentor to generate config + try: + cmd = [ + self.opt_path, + "-passes=instrumentor", + f"-instrumentor-write-config-file={config_file}", + "-disable-output", + ir_file, + ] + + result = subprocess.run(cmd, capture_output=True, text=True, timeout=30) + + if result.returncode != 0: + print( + f"Warning: opt returned non-zero exit code: {result.returncode}" + ) + if result.stderr: + print(f"stderr: {result.stderr}") + + # Read the generated config + if not os.path.exists(config_file): + raise FileNotFoundError( + f"Config file was not generated at {config_file}" + ) + + with open(config_file, "r") as f: + config = json.load(f) + + print("✓ Default configuration generated successfully\n") + return config + + except subprocess.TimeoutExpired: + raise RuntimeError("opt command timed out") + except Exception as e: + raise RuntimeError(f"Failed to generate config: {e}") + + def clear_screen(self): + """Clear the terminal screen.""" + os.system("clear" if os.name != "nt" else "cls") + + def print_section_header(self, title: str): + """Print a formatted section header.""" + print("\n" + "=" * 70) + print(f" {title}") + print("=" * 70) + + def print_option(self, index: int, name: str, description: str, enabled: bool): + """Print a formatted option.""" + status = "[X]" if enabled else "[ ]" + print(f" {index:2d}. 
{status} {name:30s} - {description}") + + def get_user_choice( + self, prompt: str, valid_choices: List[str] = None, allow_back: bool = True + ) -> Optional[str]: + """Get user input with validation.""" + while True: + try: + nav_hint = " (b=back, q=quit)" if allow_back else " (q=quit)" + choice = input(prompt + nav_hint + ": ").strip().lower() + + if choice == "q": + confirm = input("Really quit? (y/n): ").strip().lower() + if confirm == "y": + print("\nWizard cancelled by user.") + sys.exit(0) + continue + + if choice == "b" and allow_back: + return "BACK" + + if not choice: + return "" + + if valid_choices: + if choice in valid_choices: + return choice + print(f"Please enter one of: {', '.join(valid_choices)}") + else: + return choice + + except KeyboardInterrupt: + print("\n\nWizard interrupted by user.") + sys.exit(0) + + def get_all_opportunity_types(self) -> List[Tuple[str, str]]: + """Extract all unique opportunity types from the config.""" + opportunities = [] + seen = set() + + for location in [ + "function_pre", + "function_post", + "instruction_pre", + "instruction_post", + ]: + if location not in self.config: + continue + + for opp_name in self.config[location].keys(): + if opp_name not in seen: + seen.add(opp_name) + # Get description from first occurrence + opp_config = self.config[location][opp_name] + desc = "No description available" + + # Try to find a description from any field + for key, value in opp_config.items(): + if key == "enabled": + continue + if key.endswith(".description") and value: + desc = value + break + + opportunities.append((opp_name, desc)) + + return sorted(opportunities) + + def select_opportunities(self) -> bool: + """Let user select which instrumentation opportunities to enable.""" + while True: + self.clear_screen() + self.print_section_header("Step 1: Select Instrumentation Types") + + opportunities = self.get_all_opportunity_types() + + print("\nSelect which types of instrumentation you want to configure:") + print( + 
"(You can toggle individual arguments for each type in the next steps)\n" + ) + + for idx, (opp_name, opp_desc) in enumerate(opportunities, 1): + enabled = opp_name in self.enabled_opportunities + self.print_option(idx, opp_name, opp_desc, enabled) + + print("\nCommands:") + print(" - Enter numbers (space-separated) to toggle opportunities") + print(" - 'all' to enable all, 'none' to disable all") + print(" - Press Enter when done to continue") + + choice = self.get_user_choice("\nYour choice", allow_back=False) + + if choice == "BACK": + continue + elif choice == "": + if not self.enabled_opportunities: + print("\n⚠ Please enable at least one instrumentation type!") + input("Press Enter to continue...") + continue + return True + elif choice == "all": + self.enabled_opportunities = {opp[0] for opp in opportunities} + elif choice == "none": + self.enabled_opportunities.clear() + else: + try: + indices = [int(x) for x in choice.split()] + for idx in indices: + if 1 <= idx <= len(opportunities): + opp_name = opportunities[idx - 1][0] + if opp_name in self.enabled_opportunities: + self.enabled_opportunities.remove(opp_name) + else: + self.enabled_opportunities.add(opp_name) + except ValueError: + print( + "\n⚠ Invalid input. Please enter numbers separated by spaces." 
+ ) + input("Press Enter to continue...") + + def configure_pre_post_mode(self) -> bool: + """Ask if PRE and POST should have the same configuration.""" + while True: + self.clear_screen() + self.print_section_header("Step 2: PRE vs POST Configuration") + + print("\nInstrumentation can happen at two points:") + print(" - PRE: Before the instrumented operation") + print(" - POST: After the instrumented operation") + print("\nFor example, for a load instruction:") + print(" - PRE: Can inspect/modify the pointer before the load") + print(" - POST: Can inspect/modify the loaded value after the load") + + print( + f"\nCurrent mode: {'SAME configuration for PRE and POST' if self.same_pre_post else 'DIFFERENT configurations'}" + ) + + choice = self.get_user_choice( + "\nUse same configuration for PRE and POST? (y/n/Enter to keep)", + valid_choices=["y", "yes", "n", "no", ""], + ) + + if choice == "BACK": + return False + elif choice in ["y", "yes"]: + self.same_pre_post = True + return True + elif choice in ["n", "no"]: + self.same_pre_post = False + return True + elif choice == "": + return True + + def configure_base_options(self) -> bool: + """Configure base/global options.""" + while True: + self.clear_screen() + self.print_section_header("Step 3: Base Configuration") + + if "configuration" not in self.config: + self.config["configuration"] = {} + + base_config = self.config["configuration"] + + # Display current settings + print("\nCurrent settings:") + print( + f" 1. Runtime prefix: {base_config.get('runtime_prefix', '__instrumentor_')}" + ) + print( + f" 2. Demangle function names: {base_config.get('demangle_function_names', True)}" + ) + print( + f" 3. Target regex: {base_config.get('target_regex', '(none)')}" + ) + print( + f" 4. Host (CPU) enabled: {base_config.get('host_enabled', True)}" + ) + print( + f" 5. 
GPU enabled: {base_config.get('gpu_enabled', True)}" + ) + + print("\nEnter option number to modify, or press Enter to continue") + choice = self.get_user_choice("Option") + + if choice == "BACK": + return False + elif choice == "": + return True + elif choice == "1": + new_prefix = input("Enter runtime prefix: ").strip() + if new_prefix: + base_config["runtime_prefix"] = new_prefix + elif choice == "2": + demangle = self.get_user_choice( + "Demangle function names? (y/n)", ["y", "n"], allow_back=False + ) + if demangle: + base_config["demangle_function_names"] = demangle == "y" + elif choice == "3": + new_regex = input("Enter target regex (empty for none): ").strip() + base_config["target_regex"] = new_regex + elif choice == "4": + host = self.get_user_choice( + "Enable host instrumentation? (y/n)", ["y", "n"], allow_back=False + ) + if host: + base_config["host_enabled"] = host == "y" + elif choice == "5": + gpu = self.get_user_choice( + "Enable GPU instrumentation? (y/n)", ["y", "n"], allow_back=False + ) + if gpu: + base_config["gpu_enabled"] = gpu == "y" + + def configure_opportunity_args( + self, opp_name: str, location: str, step_prefix: str = "Step 4" + ) -> bool: + """Configure arguments for a specific opportunity at a location.""" + while True: + self.clear_screen() + location_desc = "PRE (before)" if "pre" in location else "POST (after)" + self.print_section_header( + f"{step_prefix}: Configure {opp_name} - {location_desc}" + ) + + if location not in self.config or opp_name not in self.config[location]: + print(f"\n⚠ {opp_name} not found in {location}") + input("Press Enter to continue...") + return True + + opp_config = self.config[location][opp_name] + + # Show enable/disable status + enabled = opp_config.get("enabled", False) + print(f"\nInstrumentation: {'ENABLED ✓' if enabled else 'DISABLED ✗'}") + + # Collect arguments + args = [] + for key, value in sorted(opp_config.items()): + if ( + key == "enabled" + or key.endswith(".description") + or 
key.endswith(".replace") + ): + continue + desc = opp_config.get(f"{key}.description", "No description") + can_replace = f"{key}.replace" in opp_config + replace_enabled = ( + opp_config.get(f"{key}.replace", False) if can_replace else False + ) + args.append((key, value, desc, can_replace, replace_enabled)) + + if args: + print("\nAvailable arguments:") + for idx, ( + arg_name, + arg_enabled, + arg_desc, + can_replace, + replace_enabled, + ) in enumerate(args, 1): + status = "[X]" if arg_enabled else "[ ]" + if can_replace: + replace_status = "REPLACE" if replace_enabled else "observe" + replace_mark = f" [replaceable: {replace_status}]" + else: + replace_mark = "" + print( + f" {idx:2d}. {status} {arg_name:25s} - {arg_desc}{replace_mark}" + ) + + print("\nCommands:") + print(" - 'e' to toggle enabled/disabled") + print(" - Enter numbers (space-separated) to toggle arguments") + print( + " - 'r ' to toggle replacement for replaceable argument (e.g., 'r 1')" + ) + print(" - 'all' to enable all args, 'none' to disable all args") + print(" - Press Enter when done") + + choice = self.get_user_choice("\nYour choice") + + if choice == "BACK": + return False + elif choice == "": + return True + elif choice == "e": + opp_config["enabled"] = not opp_config["enabled"] + elif choice == "all": + for arg_name, _, _, _, _ in args: + opp_config[arg_name] = True + elif choice == "none": + for arg_name, _, _, _, _ in args: + opp_config[arg_name] = False + elif choice.startswith("r "): + # Toggle replacement + try: + parts = choice.split() + if len(parts) == 2: + idx = int(parts[1]) + if 1 <= idx <= len(args): + arg_name, _, _, can_replace, _ = args[idx - 1] + if can_replace: + replace_key = f"{arg_name}.replace" + opp_config[replace_key] = not opp_config.get( + replace_key, False + ) + else: + print(f"\n⚠ Argument '{arg_name}' is not replaceable.") + input("Press Enter to continue...") + else: + print(f"\n⚠ Invalid argument number: {idx}") + input("Press Enter to continue...") + 
else: + print("\n⚠ Usage: r ") + input("Press Enter to continue...") + except ValueError: + print("\n⚠ Invalid input for replacement toggle.") + input("Press Enter to continue...") + else: + try: + indices = [int(x) for x in choice.split()] + for idx in indices: + if 1 <= idx <= len(args): + arg_name = args[idx - 1][0] + opp_config[arg_name] = not opp_config[arg_name] + except ValueError: + print("\n⚠ Invalid input.") + input("Press Enter to continue...") + + def configure_locations(self) -> bool: + """Configure all enabled opportunities for PRE and optionally POST.""" + # First, disable all opportunities that are not in enabled_opportunities + for location in [ + "function_pre", + "function_post", + "instruction_pre", + "instruction_post", + ]: + if location not in self.config: + continue + for opp_name, opp_config in self.config[location].items(): + if opp_name not in self.enabled_opportunities: + opp_config["enabled"] = False + + # Configure PRE locations + step_num = 4 + for idx, opp_name in enumerate(sorted(self.enabled_opportunities), 1): + # Try function_pre first, then instruction_pre + location = None + if ( + "function_pre" in self.config + and opp_name in self.config["function_pre"] + ): + location = "function_pre" + elif ( + "instruction_pre" in self.config + and opp_name in self.config["instruction_pre"] + ): + location = "instruction_pre" + + if location: + if not self.configure_opportunity_args(opp_name, location): + return False + + # If same config, copy PRE to POST + if self.same_pre_post: + for opp_name in self.enabled_opportunities: + # Copy from PRE to POST + if ( + "function_pre" in self.config + and opp_name in self.config["function_pre"] + ): + if ( + "function_post" in self.config + and opp_name in self.config["function_post"] + ): + pre_config = self.config["function_pre"][opp_name] + post_config = self.config["function_post"][opp_name] + # Copy enabled and argument settings + post_config["enabled"] = pre_config.get("enabled", False) + for 
key in pre_config: + if not key.endswith(".description") and key != "enabled": + if key in post_config: + post_config[key] = pre_config[key] + + if ( + "instruction_pre" in self.config + and opp_name in self.config["instruction_pre"] + ): + if ( + "instruction_post" in self.config + and opp_name in self.config["instruction_post"] + ): + pre_config = self.config["instruction_pre"][opp_name] + post_config = self.config["instruction_post"][opp_name] + post_config["enabled"] = pre_config.get("enabled", False) + for key in pre_config: + if not key.endswith(".description") and key != "enabled": + if key in post_config: + post_config[key] = pre_config[key] + else: + # Configure POST locations separately + for opp_name in sorted(self.enabled_opportunities): + location = None + if ( + "function_post" in self.config + and opp_name in self.config["function_post"] + ): + location = "function_post" + elif ( + "instruction_post" in self.config + and opp_name in self.config["instruction_post"] + ): + location = "instruction_post" + + if location: + if not self.configure_opportunity_args(opp_name, location): + return False + + return True + + def generate_runtime_stubs(self, config_path: str, stub_path: str) -> bool: + """Generate runtime stub file using the configuration.""" + print(f"\nGenerating runtime stubs using: {self.opt_path}") + + # Create a minimal LLVM IR module + minimal_ir = """ +define i32 @main() { + %1 = alloca i32 + store i32 0, ptr %1 + %2 = load i32, ptr %1 + ret i32 %2 +} +""" + + with tempfile.TemporaryDirectory() as tmpdir: + ir_file = os.path.join(tmpdir, "input.ll") + temp_config = os.path.join(tmpdir, "temp_config.json") + + # Write minimal IR + with open(ir_file, "w") as f: + f.write(minimal_ir) + + # Create a temporary config with stub file set + temp_cfg = self.config.copy() + if "configuration" not in temp_cfg: + temp_cfg["configuration"] = {} + temp_cfg["configuration"]["runtime_stubs_file"] = stub_path + + with open(temp_config, "w") as f: + 
json.dump(temp_cfg, f, indent=2) + + # Run opt with instrumentor to generate stubs + try: + cmd = [ + self.opt_path, + "-passes=instrumentor", + f"-instrumentor-read-config-file={temp_config}", + "-disable-output", + ir_file, + ] + + result = subprocess.run(cmd, capture_output=True, text=True, timeout=30) + + if result.returncode != 0: + print( + f"Warning: opt returned non-zero exit code: {result.returncode}" + ) + if result.stderr: + print(f"stderr: {result.stderr}") + + # Check if stub file was generated + if os.path.exists(stub_path): + print(f"✓ Runtime stubs generated: {stub_path}") + return True + else: + print(f"✗ Stub file was not generated") + return False + + except subprocess.TimeoutExpired: + print("✗ opt command timed out") + return False + except Exception as e: + print(f"✗ Failed to generate stubs: {e}") + return False + + def review_and_save(self, output_path: str) -> bool: + """Review configuration and save.""" + stub_path = None + + while True: + self.clear_screen() + self.print_section_header("Step 5: Review and Save") + + print("\nEnabled instrumentation types:") + for opp in sorted(self.enabled_opportunities): + print(f" ✓ {opp}") + + print( + f"\nPRE/POST mode: {'Same configuration' if self.same_pre_post else 'Different configurations'}" + ) + print( + f"Runtime prefix: {self.config.get('configuration', {}).get('runtime_prefix', '__instrumentor_')}" + ) + print(f"\nConfiguration file: {output_path}") + if stub_path: + print(f"Runtime stubs file: {stub_path}") + + print("\nCommands:") + print(" - 's' to save configuration and finish") + print(" - 'g' to generate runtime stub file (optional)") + print(" - 'p' to specify different output path") + print(" - 'b' to go back and modify settings") + + choice = self.get_user_choice( + "\nYour choice", valid_choices=["s", "g", "p", "b", ""] + ) + + if choice == "BACK" or choice == "b": + return False + elif choice == "s": + try: + # Remove runtime_stubs_file from config before saving + config_to_save = 
json.loads(json.dumps(self.config)) + if "configuration" in config_to_save: + config_to_save["configuration"].pop("runtime_stubs_file", None) + config_to_save["configuration"].pop( + "runtime_stubs_file.description", None + ) + + with open(output_path, "w") as f: + json.dump(config_to_save, f, indent=2) + print(f"\n✓ Configuration saved to: {output_path}") + + # Generate stubs if requested + if stub_path: + self.generate_runtime_stubs(output_path, stub_path) + + return True + except Exception as e: + print(f"\n✗ Failed to save configuration: {e}") + input("Press Enter to continue...") + elif choice == "g": + print("\nGenerate runtime stub file") + print("This creates a C/C++ file with stub implementations of the") + print( + "instrumentation runtime functions that you can use as a template." + ) + + default_stub = output_path.rsplit(".", 1)[0] + "_stubs.c" + stub_input = input( + f"\nStub file path (default: {default_stub}): " + ).strip() + stub_path = stub_input if stub_input else default_stub + print(f"Will generate stubs to: {stub_path}") + input("Press Enter to continue...") + elif choice == "p": + new_path = input("Enter configuration output path: ").strip() + if new_path: + output_path = new_path + elif choice == "": + continue + + def run_interactive(self, output_path: str): + """Run the interactive configuration wizard.""" + self.clear_screen() + print("=" * 70) + print(" LLVM Instrumentor Configuration Wizard") + print("=" * 70) + print( + "\nThis wizard will help you create a custom instrumentation configuration." 
+ ) + print("You can enable/disable instrumentation opportunities and configure") + print("what information is passed to the runtime functions.") + print("\nNavigation: Use 'b' to go back, 'q' to quit at any prompt.") + input("\nPress Enter to continue...") + + # Generate or load config + try: + self.config = self.generate_default_config() + except Exception as e: + print(f"Error: {e}") + return False + + # State machine for navigation + state = 0 + while True: + if state == 0: # Select opportunities + if self.select_opportunities(): + state = 1 + elif state == 1: # PRE/POST mode + if self.configure_pre_post_mode(): + state = 2 + else: + state = 0 + elif state == 2: # Base configuration + if self.configure_base_options(): + state = 3 + else: + state = 1 + elif state == 3: # Configure locations + if self.configure_locations(): + state = 4 + else: + state = 2 + elif state == 4: # Review and save + if self.review_and_save(output_path): + return True + else: + state = 3 + + +def main(): + parser = argparse.ArgumentParser( + description="Interactive wizard for configuring LLVM Instrumentor pass", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + # Interactive mode (recommended) + %(prog)s + + # Specify custom output location + %(prog)s -o my_config.json + + # Use specific opt binary + %(prog)s --opt-path /path/to/opt + + # Load existing config and modify it + %(prog)s --input existing_config.json -o modified_config.json + """, + ) + + parser.add_argument( + "-o", + "--output", + default="instrumentor_config.json", + help="Output configuration file (default: instrumentor_config.json)", + ) + + parser.add_argument( + "--opt-path", help="Path to the opt binary (default: auto-detect)" + ) + + parser.add_argument( + "--input", help="Load and modify an existing configuration file" + ) + + args = parser.parse_args() + + try: + wizard = InstrumentorConfigWizard(opt_path=args.opt_path) + + # Load existing config if provided + if args.input: + 
print(f"Loading existing configuration from: {args.input}") + with open(args.input, "r") as f: + wizard.config = json.load(f) + print("✓ Configuration loaded\n") + # Extract enabled opportunities from loaded config + for location in [ + "function_pre", + "function_post", + "instruction_pre", + "instruction_post", + ]: + if location in wizard.config: + for opp_name, opp_config in wizard.config[location].items(): + if opp_config.get("enabled", False): + wizard.enabled_opportunities.add(opp_name) + + success = wizard.run_interactive(args.output) + + if success: + print("\n" + "=" * 70) + print("Configuration complete!") + print("=" * 70) + print(f"\nTo use this configuration with opt:") + print(f" opt -passes=instrumentor \\") + print(f" -instrumentor-read-config-file={args.output} \\") + print(f" input.ll -S -o output.ll") + print(f"\nTo use with clang:") + print(f" clang -mllvm -enable-instrumentor \\") + print(f" -mllvm -instrumentor-read-config-file={args.output} \\") + print(f" input.c -o output") + return 0 + else: + return 1 + + except Exception as e: + print(f"\nFatal error: {e}", file=sys.stderr) + import traceback + + traceback.print_exc() + return 1 + + +if __name__ == "__main__": + sys.exit(main()) From 882d0251d44fa2db8dd6e5817a8baa72237f77c7 Mon Sep 17 00:00:00 2001 From: Greg Clayton Date: Wed, 13 May 2026 16:46:59 -0700 Subject: [PATCH 03/95] Improve the executable name detection in ELF core files. (#197341) A previous commit switched us to use the value of the AT_EXECFN, which is an entry in the aux vector, as the executable path. As it turns out, if a symlink is used to launch a program, the symlink path will be in the AT_EXECFN string in core file memory. The PRPSINFO also contains a basename of the program, and it will also be the symlink basename. The best source of information to figure out the executable name is from the NT_FILE note. This always has the resolved path to the executable. 
Now the executable name is found in a reliable way starting with finding the NT_FILE entry for the main executable. This can reliably be done by finding the NT_FILE entry whose address contains the AT_PHDR aux vector value. This value is the address of the program headers for the main executable. If there is no NT_FILE entry we can find, we fall back to the AT_EXECFN entry from memory and then fallback to the basename in the PRPSINFO. This patch also creates a placeholder as the main executable when the executable can't be found to ensure users can see which executable they will need to track down in order to load the core file. The tests added will test the order of precedence. It does this by creating a core file with: - NT_FILE entry with a path of "/path/nt_file_foo" - AT_EXECFN in the aux vector with a path of "/path/execfn_foo" - NT_PRPSINFO entry with a path of "prpsinfo_foo" We then test that the correct entry is found as the best path option is removed from the core file. --- .../Process/elf-core/ProcessElfCore.cpp | 195 +++++++++++++----- .../Plugins/Process/elf-core/ProcessElfCore.h | 10 +- .../postmortem/elf-core/TestLinuxCore.py | 53 ++++- .../elf-NT_FILE-NT_PRPSINFO-AT_EXECFN.yaml | 29 +++ .../elf-core/elf-NT_PRPSINFO-AT_EXECFN.yaml | 29 +++ .../postmortem/elf-core/elf-NT_PRPSINFO.yaml | 18 ++ 6 files changed, 277 insertions(+), 57 deletions(-) create mode 100644 lldb/test/API/functionalities/postmortem/elf-core/elf-NT_FILE-NT_PRPSINFO-AT_EXECFN.yaml create mode 100644 lldb/test/API/functionalities/postmortem/elf-core/elf-NT_PRPSINFO-AT_EXECFN.yaml create mode 100644 lldb/test/API/functionalities/postmortem/elf-core/elf-NT_PRPSINFO.yaml diff --git a/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp b/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp index dffbdceffc9cb..d79cd7e51f1b8 100644 --- a/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp +++ b/lldb/source/Plugins/Process/elf-core/ProcessElfCore.cpp @@ -28,6 +28,7 @@ #include 
"Plugins/DynamicLoader/POSIX-DYLD/DynamicLoaderPOSIXDYLD.h" #include "Plugins/ObjectFile/ELF/ObjectFileELF.h" +#include "Plugins/ObjectFile/Placeholder/ObjectFilePlaceholder.h" #include "Plugins/Process/elf-core/RegisterUtilities.h" #include "ProcessElfCore.h" #include "ThreadElfCore.h" @@ -199,10 +200,10 @@ Status ProcessElfCore::DoLoadCore() { /// PT_AARCH64_MEMTAG_MTE - Contains AArch64 MTE memory tags for a range of /// Process Address Space. for (const elf::ELFProgramHeader &H : segments) { - DataExtractor data = core->GetSegmentData(H); // Parse thread contexts and auxv structure if (H.p_type == llvm::ELF::PT_NOTE) { + DataExtractor data = core->GetSegmentData(H); if (llvm::Error error = ParseThreadContextsFromNoteSegment(H, data)) return Status::FromError(std::move(error)); } @@ -256,42 +257,43 @@ Status ProcessElfCore::DoLoadCore() { // the main executable using data we found in the core file notes. lldb::ModuleSP exe_module_sp = GetTarget().GetExecutableModule(); if (!exe_module_sp) { - if (!m_nt_file_entries.empty()) { - std::string executable_path = GetMainExecutablePath(); - ModuleSpec exe_module_spec; - exe_module_spec.GetArchitecture() = arch; - exe_module_spec.GetUUID() = FindModuleUUID(executable_path); - exe_module_spec.GetFileSpec().SetFile(executable_path, - FileSpec::Style::native); - if (exe_module_spec.GetFileSpec()) { - exe_module_sp = - GetTarget().GetOrCreateModule(exe_module_spec, true /* notify */); + ModuleSpec exe_module_spec; + if (GetMainExecutableModuleSpec(exe_module_spec)) { + exe_module_sp = + GetTarget().GetOrCreateModule(exe_module_spec, true /* notify */); + if (!exe_module_sp) { + // Create an ELF file from memory for the main executable. The dynamic + // loader requires the main executable so that it can extract the + // DT_DEBUG key/value pair from the dynamic section and get the list + // of shared libraries. 
+ std::optional exe_header = + GetNTFileEntryForExecutableELFHeader(); + if (exe_header) { + if (llvm::Expected module_sp_or_err = + ReadModuleFromMemory(exe_module_spec.GetFileSpec(), + exe_header->start, + exe_header->end - exe_header->start)) + exe_module_sp = *module_sp_or_err; + else + llvm::consumeError(module_sp_or_err.takeError()); + } + // Create a placeholder module for the main executable if we failed to + // create an ELF module from memory. if (!exe_module_sp) { - // Create an ELF file from memory for the main executable. The dynamic - // loader requires the main executable so that it can extract the - // DT_DEBUG key/value pair from the dynamic section and get the list - // of shared libraries. - std::optional exe_header_addr; - - // We need to find its load address - for (const NT_FILE_Entry &file_entry : m_nt_file_entries) { - if (file_entry.path == executable_path) { - exe_header_addr = file_entry.start; - break; - } - } - if (exe_header_addr) { - if (llvm::Expected module_sp_or_err = - ReadModuleFromMemory(exe_module_spec.GetFileSpec(), - *exe_header_addr)) - exe_module_sp = *module_sp_or_err; - else - llvm::consumeError(module_sp_or_err.takeError()); - } + lldb::addr_t load_addr = + exe_header ? exe_header->start : LLDB_INVALID_ADDRESS; + lldb::addr_t size = + exe_header ? 
(exe_header->end - exe_header->start) : 0; + exe_module_sp = + Module::CreateModuleFromObjectFile( + exe_module_spec, load_addr, size); + if (exe_module_spec.GetPlatformFileSpec()) + exe_module_sp->SetPlatformFileSpec( + exe_module_spec.GetPlatformFileSpec()); } - if (exe_module_sp) - GetTarget().SetExecutableModule(exe_module_sp, eLoadDependentsNo); } + if (exe_module_sp) + GetTarget().SetExecutableModule(exe_module_sp, eLoadDependentsNo); } } return error; @@ -313,30 +315,69 @@ void ProcessElfCore::UpdateBuildIdForNTFileEntries() { } } -std::string ProcessElfCore::GetMainExecutablePath() { - // Always try to read the program name from core file memory first via the - // AUXV_AT_EXECFN entry. This value is the address of a null terminated C - // string that contains the program path. +/// Correctly create a FileSpec from a path found in a core file. +/// +/// This method will guess the path style more intelligently that specifying +/// a native path style since core files can contain paths from a different +/// system than the host system. +static FileSpec CreateFileSpecFromPath(llvm::StringRef path) { + FileSpec::Style path_style = FileSpec::Style::native; + if (auto guessed_style = FileSpec::GuessPathStyle(path)) + path_style = *guessed_style; + return FileSpec(path, path_style); +} + +bool ProcessElfCore::GetMainExecutableModuleSpec(ModuleSpec &exe_spec) { AuxVector aux_vector(m_auxv); - std::string execfn_str; + exe_spec.GetArchitecture() = GetTarget().GetArchitecture(); + + // Find the NT_FILE_Entry for the main executable's ELF header. + std::optional exe_header = + GetNTFileEntryForExecutableELFHeader(); + if (exe_header) { + exe_spec.GetFileSpec() = CreateFileSpecFromPath(exe_header->path); + exe_spec.GetUUID() = FindModuleUUID(exe_header->path); + } + + // If we failed to find the executable program in the NT_FILE list with the + // program header address, then we can read the executable name from the value + // of the AUXV_AT_EXECFN in the AUX vector. 
The reason we don't use this file + // all of the time is if the program is launched using a symlink, the value of + // the AUXV_AT_EXECFN string will be the symlink itself. The same goes for the + // m_executable_name found in the NT_PRPSINFO section, it will be the name of + // the symlink. Even if we did find a path above, we want to fill in this path + // if it is different from main executable's path in the platform file name + // in case someone needs to know how the executable was launched. if (auto execfn = aux_vector.GetAuxValue(AuxVector::AUXV_AT_EXECFN)) { Status error; - if (ReadCStringFromMemory(*execfn, execfn_str, error)) - return execfn_str; + std::string execfn_str; + if (ReadCStringFromMemory(*execfn, execfn_str, error)) { + // This path can be a symlink path. Set it as the main file spec if one + // hasn't been set, else set the platform file spec. + FileSpec execfn_spec = CreateFileSpecFromPath(execfn_str); + if (exe_spec.GetFileSpec()) { + // Fill in the platform file spec if it differs from the main path from + // the resolved file info in the NT_FILE note. + if (exe_spec.GetFileSpec() != execfn_spec) + exe_spec.GetPlatformFileSpec() = execfn_spec; + } else { + // We don't have an executable file spec yet, lets set it. + exe_spec.GetFileSpec() = execfn_spec; + exe_spec.GetUUID() = FindModuleUUID(execfn_str); + } + } } - if (m_nt_file_entries.empty()) - return {}; - - // The first entry in the NT_FILE might be our executable - std::string executable_path = m_nt_file_entries[0].path; - // Prefer the NT_FILE entry matching m_executable_name as main executable. - for (const NT_FILE_Entry &file_entry : m_nt_file_entries) - if (llvm::StringRef(file_entry.path).ends_with("/" + m_executable_name)) { - executable_path = file_entry.path; - break; - } - return executable_path; + // If we didn't set the executable file spec yet, lets set it from the info + // from the NT_PRPSINFO. 
This usually is just a basename of the actual path + // used to launch the binary, so this can be a symlink basename. But it will + // be better than nothing since we will create a placeholder module for any + // files that don't exist. + if (!exe_spec.GetFileSpec() && !m_executable_name.empty()) + exe_spec.GetFileSpec() = CreateFileSpecFromPath(m_executable_name); + + // We succeeded if we got a path. + return (bool)exe_spec.GetFileSpec(); } UUID ProcessElfCore::FindModuleUUID(const llvm::StringRef path) { @@ -1167,3 +1208,51 @@ bool ProcessElfCore::GetProcessInfo(ProcessInstanceInfo &info) { info.SetArguments(m_process_args.as_args(), /*first_arg_is_executable=*/true); return true; } + +/// Find the NT_FILE entry that contains an address. +std::optional +ProcessElfCore::GetNTFileEntryContainingAddress(lldb::addr_t addr) { + for (const NT_FILE_Entry &file_entry : m_nt_file_entries) { + if (file_entry.start <= addr && addr < file_entry.end) + return file_entry; + } + return std::nullopt; +} + +std::optional +ProcessElfCore::GetNTFileEntryForExecutableELFHeader() { + /// This method will search for the first NT_FILE entry that contains the + /// executable's ELF header. We use the AUXV_AT_PHDR from the aux vector to + /// find the address of the main executable's program headers and then find + /// the NT_FILE entry that contains this address. + /// + /// Previously we would try to find the first NT_FILE entry that had a path + /// that ended with the executable name found in the NT_PRPSINFO note, but + /// this basename can be the name of a symlink and not the actual resolved + /// executable file found in the NT_FILE entry so this could fail for cases + /// where a symlink was used to launch the program, and that symlink's + /// base name was different from the resolved executable file's name in + /// the NT_FILE entry. 
+ if (m_nt_file_entries.empty()) + return std::nullopt; + // The AUX vector has the load address of the program headers from the main + // executable as the value for AUXV_AT_PHDR. We can use this value to find + // the NT_FILE entry that contains this address and this will locate the main + // executable's mapping that contains the ELF header. + AuxVector aux_vector(m_auxv); + if (std::optional opt_value = + aux_vector.GetAuxValue(AuxVector::AUXV_AT_PHDR)) { + if (std::optional nt = + GetNTFileEntryContainingAddress(*opt_value)) + return *nt; + } + // Fall back to trying to find the first NT_FILE entry that contains the entry + // point address. + if (std::optional opt_value = + aux_vector.GetAuxValue(AuxVector::AUXV_AT_ENTRY)) { + if (std::optional nt = + GetNTFileEntryContainingAddress(*opt_value)) + return *nt; + } + return std::nullopt; +} diff --git a/lldb/source/Plugins/Process/elf-core/ProcessElfCore.h b/lldb/source/Plugins/Process/elf-core/ProcessElfCore.h index 2b6b34075252f..e6f1fa0027554 100644 --- a/lldb/source/Plugins/Process/elf-core/ProcessElfCore.h +++ b/lldb/source/Plugins/Process/elf-core/ProcessElfCore.h @@ -172,8 +172,8 @@ class ProcessElfCore : public lldb_private::PostMortemProcess { lldb_private::UUID FindModuleUUID(const llvm::StringRef path) override; - // Returns the main executable path. - std::string GetMainExecutablePath(); + // Extract the executable module spec for the executable in this core file. + bool GetMainExecutableModuleSpec(lldb_private::ModuleSpec &exe_spec); // Returns the value of certain type of note of a given start address lldb_private::UUID FindBuidIdInCoreMemory(lldb::addr_t address); @@ -192,6 +192,12 @@ class ProcessElfCore : public lldb_private::PostMortemProcess { llvm::Error parseNetBSDNotes(llvm::ArrayRef notes); llvm::Error parseOpenBSDNotes(llvm::ArrayRef notes); llvm::Error parseLinuxNotes(llvm::ArrayRef notes); + + /// Find the NT_FILE entry that contains an address. 
+ std::optional + GetNTFileEntryContainingAddress(lldb::addr_t addr); + /// Intelligently find the NT_FILE entry for the executable's ELF header. + std::optional GetNTFileEntryForExecutableELFHeader(); }; #endif // LLDB_SOURCE_PLUGINS_PROCESS_ELF_CORE_PROCESSELFCORE_H diff --git a/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py b/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py index 959339c2c6ca0..da7f0ca7f9e71 100644 --- a/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py +++ b/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py @@ -1101,7 +1101,7 @@ def test_linux_no_exe(self): libraries are available. The "image list" output should look like: (lldb) image list - [ 0] 7BCC1101 0x000055bb04288000 /data/users/gclayton/args/elf-crash (0x000055bb04288000) + [ 0] 9FD61477 0x000055bb04288000 /data/users/gclayton/args/elf-crash (0x000055bb04288000) [ 1] 0x00007f27db200000 /libxx/libstdc++.so.6 [ 2] AF275675-4671-8B49-24C8-A9A657D74115-C80DEE65 0x00007f27db51b000 /libxx/libm.so.6 (0x00007f27db51b000) [ 3] 0x00007f27db4fe000 /libxx/libgcc_s.so.1 @@ -1119,7 +1119,7 @@ def test_linux_no_exe(self): self.assertEqual( m.GetObjectFileHeaderAddress().GetLoadAddress(target), 0x000055BB04288000 ) - self.assertEqual(m.GetUUIDString(), "7BCC1101") + self.assertEqual(m.GetUUIDString(), "9FD61477") m = target.module["/libxx/libstdc++.so.6"] self.assertTrue(m.IsValid()) @@ -1284,6 +1284,55 @@ def do_test(self, filename, pid, region_count, thread_name): self.dbg.DeleteTarget(target) + def test_exe_name_extraction_nt_file(self): + # This core file has: + # - NT_FILE entry for the executable with path '/path/nt_file_foo + # - AT_EXECFN that points to "/path/execfn_foo" + # - NT_PRPSINFO with a pr_fname member set to 'prpsinfo_foo' + # We expect the NT_FILE version to be found since this is a resolved + # file path and it is the best information we can use for the executable + # name. 
+ yaml_path = self.getSourcePath("elf-NT_FILE-NT_PRPSINFO-AT_EXECFN.yaml") + core_path = self.getBuildArtifact("elf-NT_FILE-NT_PRPSINFO-AT_EXECFN.core") + self.yaml2obj(yaml_path, core_path) + target = self.dbg.CreateTarget(None) + process = target.LoadCore(core_path) + exe_module = target.modules[0] + self.assertEqual(exe_module.GetFileSpec().fullpath, "/path/nt_file_foo") + self.dbg.DeleteTarget(target) + + def test_exe_name_extraction_at_execfn(self): + # This core file has: + # - AT_EXECFN that points to "/path/execfn_foo" + # - NT_PRPSINFO with a pr_fname member set to 'prpsinfo_foo' + # There is no NT_FILE in this core file, so we expect the fall back to + # the AT_EXECFN name in memory as it has a full path to the executable. + # This path can differ from the path found in NT_FILE as it might not + # be resolved as it can be a symlink path. + yaml_path = self.getSourcePath("elf-NT_PRPSINFO-AT_EXECFN.yaml") + core_path = self.getBuildArtifact("elf-NT_PRPSINFO-AT_EXECFN.core") + self.yaml2obj(yaml_path, core_path) + target = self.dbg.CreateTarget(None) + process = target.LoadCore(core_path) + exe_module = target.modules[0] + self.assertEqual(exe_module.GetFileSpec().fullpath, "/path/execfn_foo") + self.dbg.DeleteTarget(target) + + def test_exe_name_extraction_nt_prpsinfo(self): + # This core file has: + # - NT_PRPSINFO with a pr_fname member set to 'prpsinfo_foo' + # There is no NT_FILE or AT_EXECFN in the aux vector in this core file. + # We expect the fall back to the info in the NT_PRPSINFO note. 
+ yaml_path = self.getSourcePath("elf-NT_PRPSINFO.yaml") + core_path = self.getBuildArtifact("elf-NT_PRPSINFO.core") + self.yaml2obj(yaml_path, core_path) + target = self.dbg.CreateTarget(None) + process = target.LoadCore(core_path) + exe_module = target.modules[0] + self.assertEqual(exe_module.GetFileSpec().fullpath, "prpsinfo_foo") + self.dbg.DeleteTarget(target) + + def replace_path(binary, replace_from, replace_to): src = replace_from.encode() diff --git a/lldb/test/API/functionalities/postmortem/elf-core/elf-NT_FILE-NT_PRPSINFO-AT_EXECFN.yaml b/lldb/test/API/functionalities/postmortem/elf-core/elf-NT_FILE-NT_PRPSINFO-AT_EXECFN.yaml new file mode 100644 index 0000000000000..5bc949ade451f --- /dev/null +++ b/lldb/test/API/functionalities/postmortem/elf-core/elf-NT_FILE-NT_PRPSINFO-AT_EXECFN.yaml @@ -0,0 +1,29 @@ +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_CORE + Machine: EM_X86_64 +ProgramHeaders: + - Type: PT_NOTE + Align: 0x4 + FileSize: 0x2c4 + Offset: 0xb0 + - Type: PT_LOAD + Flags: [ PF_R ] + VAddr: 0x10000 + Align: 0x4 + FileSize: 0x11 + MemSize: 0x1f + Offset: 0x374 +Sections: + - Type: Fill + Pattern: 
0600000030000000060000004c494e5558000000030000000000000040802804bb5500001f00000000000000000001000000000000000000000000000000000000000000050000005001000001000000434f5245000000000b00000000000000000000000b000000000000000000000000000000000000004a4433005f7220004a4433005f7220000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e063db277f0000b8bd2804bb55000030912804bb550000f8191ec0ff7f0000e0181ec0ff7f00000000000000000000a0acffda277f0000e90000000000000060c060db277f000050c4ffda277f000000000000000000000100000000000000081a1ec0ff7f0000f8191ec0ff7f00000100000000000000ffffffffffffffff4e912804bb55000033000000000000004602010000000000e0181ec0ff7f00002b0000000000000040974fdb277f0000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000000000000050000006a000000454c4946434f5245000000000200000000000000001000000000000000802804bb55000000902804bb55000000000000000000000000e07e147f000000c0e27e147f000020000000000000002f706174682f6e745f66696c655f666f6f002f706174682f6e745f66696c652f6c6962632e736f2e36000000050000008800000003000000434f524500000000024400000000000008044040000000003000000030000000510d0000360d0000d9040000d904000070727073696e666f5f666f6f000000002f706174682f70727073696e666f5f666f6f202d2d766572626f736500000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + Size: 0x2c4 + Offset: 0xb0 + - Type: Fill + Pattern: 2f706174682f65786563666e5f666f6f00 + Size: 0x11 + Offset: 0x374 + - Type: SectionHeaderTable + NoHeaders: true diff --git a/lldb/test/API/functionalities/postmortem/elf-core/elf-NT_PRPSINFO-AT_EXECFN.yaml b/lldb/test/API/functionalities/postmortem/elf-core/elf-NT_PRPSINFO-AT_EXECFN.yaml new file mode 100644 index 0000000000000..c6834dd7dda8f --- /dev/null +++ b/lldb/test/API/functionalities/postmortem/elf-core/elf-NT_PRPSINFO-AT_EXECFN.yaml @@ -0,0 +1,29 @@ +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + 
Type: ET_CORE + Machine: EM_X86_64 +ProgramHeaders: + - Type: PT_NOTE + Align: 0x4 + FileSize: 0x234 + Offset: 0xb0 + - Type: PT_LOAD + Flags: [ PF_R ] + VAddr: 0x10000 + Align: 0x4 + FileSize: 0x11 + MemSize: 0x1f + Offset: 0x2e4 +Sections: + - Type: Fill + Pattern: 0600000020000000060000004c494e55580000001f00000000000000000001000000000000000000000000000000000000000000050000005001000001000000434f5245000000000b00000000000000000000000b000000000000000000000000000000000000004a4433005f7220004a4433005f7220000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e063db277f0000b8bd2804bb55000030912804bb550000f8191ec0ff7f0000e0181ec0ff7f00000000000000000000a0acffda277f0000e90000000000000060c060db277f000050c4ffda277f000000000000000000000100000000000000081a1ec0ff7f0000f8191ec0ff7f00000100000000000000ffffffffffffffff4e912804bb55000033000000000000004602010000000000e0181ec0ff7f00002b0000000000000040974fdb277f0000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000000000000050000008800000003000000434f524500000000024400000000000008044040000000003000000030000000510d0000360d0000d9040000d904000070727073696e666f5f666f6f000000002f706174682f70727073696e666f5f666f6f202d2d766572626f736500000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + Size: 0x234 + Offset: 0xb0 + - Type: Fill + Pattern: 2f706174682f65786563666e5f666f6f00 + Size: 0x11 + Offset: 0x2e4 + - Type: SectionHeaderTable + NoHeaders: true diff --git a/lldb/test/API/functionalities/postmortem/elf-core/elf-NT_PRPSINFO.yaml b/lldb/test/API/functionalities/postmortem/elf-core/elf-NT_PRPSINFO.yaml new file mode 100644 index 0000000000000..99b5c5dde0903 --- /dev/null +++ b/lldb/test/API/functionalities/postmortem/elf-core/elf-NT_PRPSINFO.yaml @@ -0,0 +1,18 @@ +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_CORE + Machine: EM_X86_64 
+ProgramHeaders: + - Type: PT_NOTE + Align: 0x4 + FileSize: 0x224 + Offset: 0x78 +Sections: + - Type: Fill + Pattern: 0600000010000000060000004c494e555800000000000000000000000000000000000000050000005001000001000000434f5245000000000b00000000000000000000000b000000000000000000000000000000000000004a4433005f7220004a4433005f7220000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000e063db277f0000b8bd2804bb55000030912804bb550000f8191ec0ff7f0000e0181ec0ff7f00000000000000000000a0acffda277f0000e90000000000000060c060db277f000050c4ffda277f000000000000000000000100000000000000081a1ec0ff7f0000f8191ec0ff7f00000100000000000000ffffffffffffffff4e912804bb55000033000000000000004602010000000000e0181ec0ff7f00002b0000000000000040974fdb277f0000000000000000000000000000000000000000000000000000000000000000000000000000000000000100000000000000050000008800000003000000434f524500000000024400000000000008044040000000003000000030000000510d0000360d0000d9040000d904000070727073696e666f5f666f6f000000002f706174682f70727073696e666f5f666f6f202d2d766572626f736500000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000 + Size: 0x224 + Offset: 0x78 + - Type: SectionHeaderTable + NoHeaders: true From 7e735ea180fe6d199dd71dacf4396bcbbb3bbb3c Mon Sep 17 00:00:00 2001 From: Jordan R AW Date: Wed, 13 May 2026 16:48:31 -0700 Subject: [PATCH 04/95] [compiler-rt][cmake] Fix check_cxx_compiler_flag calls (#197529) check_cxx_compiler_flag, when passing multiple flags, we must separate them using a SEMICOLON-separated list. Not spaces. These checks succeed incorrectly sometimes because "-Werror -mcrc" has a different return value than "-Werror" "-mcrc" on some systems. This issue was verified with LLVM_ENABLE_PROJECTS=llvm;compiler-rt, and I'm uncertain whether it exists in runtime CMake builds. Nonetheless, it's still a bug. 
See: https://cmake.org/cmake/help/latest/module/CheckCXXCompilerFlag.html This issue was identified downstream in ChromiumOS. ChromiumOS Bug: https://issuetracker.google.com/507177988 --- compiler-rt/cmake/config-ix.cmake | 68 +++++++++++++++---------------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/compiler-rt/cmake/config-ix.cmake b/compiler-rt/cmake/config-ix.cmake index ad5497ba81b5b..d1d89c0a29f8a 100644 --- a/compiler-rt/cmake/config-ix.cmake +++ b/compiler-rt/cmake/config-ix.cmake @@ -102,22 +102,22 @@ check_cxx_compiler_flag(-fno-sanitize=safe-stack COMPILER_RT_HAS_FNO_SANITIZE_SA check_cxx_compiler_flag(-fvisibility=hidden COMPILER_RT_HAS_FVISIBILITY_HIDDEN_FLAG) check_cxx_compiler_flag(-frtti COMPILER_RT_HAS_FRTTI_FLAG) check_cxx_compiler_flag(-fno-rtti COMPILER_RT_HAS_FNO_RTTI_FLAG) -check_cxx_compiler_flag("-Werror -fno-function-sections" COMPILER_RT_HAS_FNO_FUNCTION_SECTIONS_FLAG) +check_cxx_compiler_flag("-Werror;-fno-function-sections" COMPILER_RT_HAS_FNO_FUNCTION_SECTIONS_FLAG) check_cxx_compiler_flag(-ftls-model=initial-exec COMPILER_RT_HAS_FTLS_MODEL_INITIAL_EXEC) check_cxx_compiler_flag(-fno-lto COMPILER_RT_HAS_FNO_LTO_FLAG) check_cxx_compiler_flag(-fno-profile-generate COMPILER_RT_HAS_FNO_PROFILE_GENERATE_FLAG) check_cxx_compiler_flag(-fno-profile-instr-generate COMPILER_RT_HAS_FNO_PROFILE_INSTR_GENERATE_FLAG) check_cxx_compiler_flag(-fno-profile-instr-use COMPILER_RT_HAS_FNO_PROFILE_INSTR_USE_FLAG) check_cxx_compiler_flag(-fno-coverage-mapping COMPILER_RT_HAS_FNO_COVERAGE_MAPPING_FLAG) -check_cxx_compiler_flag("-Werror -mcrc32" COMPILER_RT_HAS_MCRC32_FLAG) -check_cxx_compiler_flag("-Werror -msse4.2" COMPILER_RT_HAS_MSSE4_2_FLAG) +check_cxx_compiler_flag("-Werror;-mcrc32" COMPILER_RT_HAS_MCRC32_FLAG) +check_cxx_compiler_flag("-Werror;-msse4.2" COMPILER_RT_HAS_MSSE4_2_FLAG) check_cxx_compiler_flag(--sysroot=. 
COMPILER_RT_HAS_SYSROOT_FLAG) -check_cxx_compiler_flag("-Werror -mcrc" COMPILER_RT_HAS_MCRC_FLAG) +check_cxx_compiler_flag("-Werror;-mcrc" COMPILER_RT_HAS_MCRC_FLAG) check_cxx_compiler_flag(-fno-partial-inlining COMPILER_RT_HAS_FNO_PARTIAL_INLINING_FLAG) -check_cxx_compiler_flag("-Werror -ftrivial-auto-var-init=pattern" COMPILER_RT_HAS_TRIVIAL_AUTO_INIT) +check_cxx_compiler_flag("-Werror;-ftrivial-auto-var-init=pattern" COMPILER_RT_HAS_TRIVIAL_AUTO_INIT) check_c_compiler_flag(-nogpulib COMPILER_RT_HAS_NOGPULIB_FLAG) check_c_compiler_flag(-flto COMPILER_RT_HAS_FLTO_FLAG) -check_c_compiler_flag("-Xclang -mcode-object-version=none" COMPILER_RT_HAS_CODE_OBJECT_VERSION_FLAG) +check_c_compiler_flag("-Xclang;-mcode-object-version=none" COMPILER_RT_HAS_CODE_OBJECT_VERSION_FLAG) if(NOT WIN32 AND NOT CYGWIN) # MinGW warns if -fvisibility-inlines-hidden is used. @@ -137,24 +137,24 @@ check_cxx_compiler_flag(/Zi COMPILER_RT_HAS_Zi_FLAG) # Warnings. check_cxx_compiler_flag(-Wall COMPILER_RT_HAS_WALL_FLAG) check_cxx_compiler_flag(-Werror COMPILER_RT_HAS_WERROR_FLAG) -check_cxx_compiler_flag("-Werror -Wframe-larger-than=512" COMPILER_RT_HAS_WFRAME_LARGER_THAN_FLAG) -check_cxx_compiler_flag("-Werror -Wglobal-constructors" COMPILER_RT_HAS_WGLOBAL_CONSTRUCTORS_FLAG) -check_cxx_compiler_flag("-Werror -Wc99-extensions" COMPILER_RT_HAS_WC99_EXTENSIONS_FLAG) -check_cxx_compiler_flag("-Werror -Wgnu" COMPILER_RT_HAS_WGNU_FLAG) -check_cxx_compiler_flag("-Werror -Wgnu-anonymous-struct" COMPILER_RT_HAS_WGNU_ANONYMOUS_STRUCT_FLAG) -check_cxx_compiler_flag("-Werror -Wvariadic-macros" COMPILER_RT_HAS_WVARIADIC_MACROS_FLAG) -check_cxx_compiler_flag("-Werror -Wunused-parameter" COMPILER_RT_HAS_WUNUSED_PARAMETER_FLAG) -check_cxx_compiler_flag("-Werror -Wcovered-switch-default" COMPILER_RT_HAS_WCOVERED_SWITCH_DEFAULT_FLAG) -check_cxx_compiler_flag("-Werror -Wsuggest-override" COMPILER_RT_HAS_WSUGGEST_OVERRIDE_FLAG) -check_cxx_compiler_flag("-Werror -Wthread-safety" 
COMPILER_RT_HAS_WTHREAD_SAFETY_FLAG) -check_cxx_compiler_flag("-Werror -Wthread-safety-reference" COMPILER_RT_HAS_WTHREAD_SAFETY_REFERENCE_FLAG) -check_cxx_compiler_flag("-Werror -Wthread-safety-beta" COMPILER_RT_HAS_WTHREAD_SAFETY_BETA_FLAG) +check_cxx_compiler_flag("-Werror;-Wframe-larger-than=512" COMPILER_RT_HAS_WFRAME_LARGER_THAN_FLAG) +check_cxx_compiler_flag("-Werror;-Wglobal-constructors" COMPILER_RT_HAS_WGLOBAL_CONSTRUCTORS_FLAG) +check_cxx_compiler_flag("-Werror;-Wc99-extensions" COMPILER_RT_HAS_WC99_EXTENSIONS_FLAG) +check_cxx_compiler_flag("-Werror;-Wgnu" COMPILER_RT_HAS_WGNU_FLAG) +check_cxx_compiler_flag("-Werror;-Wgnu-anonymous-struct" COMPILER_RT_HAS_WGNU_ANONYMOUS_STRUCT_FLAG) +check_cxx_compiler_flag("-Werror;-Wvariadic-macros" COMPILER_RT_HAS_WVARIADIC_MACROS_FLAG) +check_cxx_compiler_flag("-Werror;-Wunused-parameter" COMPILER_RT_HAS_WUNUSED_PARAMETER_FLAG) +check_cxx_compiler_flag("-Werror;-Wcovered-switch-default" COMPILER_RT_HAS_WCOVERED_SWITCH_DEFAULT_FLAG) +check_cxx_compiler_flag("-Werror;-Wsuggest-override" COMPILER_RT_HAS_WSUGGEST_OVERRIDE_FLAG) +check_cxx_compiler_flag("-Werror;-Wthread-safety" COMPILER_RT_HAS_WTHREAD_SAFETY_FLAG) +check_cxx_compiler_flag("-Werror;-Wthread-safety-reference" COMPILER_RT_HAS_WTHREAD_SAFETY_REFERENCE_FLAG) +check_cxx_compiler_flag("-Werror;-Wthread-safety-beta" COMPILER_RT_HAS_WTHREAD_SAFETY_BETA_FLAG) check_cxx_compiler_flag(-Wno-pedantic COMPILER_RT_HAS_WNO_PEDANTIC) check_cxx_compiler_flag(-Wno-format COMPILER_RT_HAS_WNO_FORMAT) check_cxx_compiler_flag(-Wno-format-pedantic COMPILER_RT_HAS_WNO_FORMAT_PEDANTIC) if(MSVC AND NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang") - check_cxx_compiler_flag("/experimental:external /external:W0" COMPILER_RT_HAS_EXTERNAL_FLAG) + check_cxx_compiler_flag("/experimental:external;/external:W0" COMPILER_RT_HAS_EXTERNAL_FLAG) else() set(COMPILER_RT_HAS_EXTERNAL_FLAG FALSE) endif() @@ -169,21 +169,21 @@ check_cxx_compiler_flag(/wd4391 COMPILER_RT_HAS_WD4391_FLAG) 
check_cxx_compiler_flag(/wd4722 COMPILER_RT_HAS_WD4722_FLAG) check_cxx_compiler_flag(/wd4800 COMPILER_RT_HAS_WD4800_FLAG) -check_cxx_compiler_flag("-Werror -Warray-bounds" COMPILER_RT_HAS_ARRAY_BOUNDS_FLAG) -check_cxx_compiler_flag("-Werror -Wuninitialized" COMPILER_RT_HAS_UNINITIALIZED_FLAG) -check_cxx_compiler_flag("-Werror -Wshadow" COMPILER_RT_HAS_SHADOW_FLAG) -check_cxx_compiler_flag("-Werror -Wempty-body" COMPILER_RT_HAS_EMPTY_BODY_FLAG) -check_cxx_compiler_flag("-Werror -Wsizeof-pointer-memaccess" COMPILER_RT_HAS_SIZEOF_POINTER_MEMACCESS_FLAG) -check_cxx_compiler_flag("-Werror -Wsizeof-array-argument" COMPILER_RT_HAS_SIZEOF_ARRAY_ARGUMENT_FLAG) -check_cxx_compiler_flag("-Werror -Wsuspicious-memaccess" COMPILER_RT_HAS_SUSPICIOUS_MEMACCESS_FLAG) -check_cxx_compiler_flag("-Werror -Wbuiltin-memcpy-chk-size" COMPILER_RT_HAS_BUILTIN_MEMCPY_CHK_SIZE_FLAG) -check_cxx_compiler_flag("-Werror -Warray-bounds-pointer-arithmetic" COMPILER_RT_HAS_ARRAY_BOUNDS_POINTER_ARITHMETIC_FLAG) -check_cxx_compiler_flag("-Werror -Wreturn-stack-address" COMPILER_RT_HAS_RETURN_STACK_ADDRESS_FLAG) -check_cxx_compiler_flag("-Werror -Wsizeof-array-decay" COMPILER_RT_HAS_SIZEOF_ARRAY_DECAY_FLAG) -check_cxx_compiler_flag("-Werror -Wformat-insufficient-args" COMPILER_RT_HAS_FORMAT_INSUFFICIENT_ARGS_FLAG) -check_cxx_compiler_flag("-Werror -Wformat-security" COMPILER_RT_HAS_BUILTIN_FORMAL_SECURITY_FLAG) -check_cxx_compiler_flag("-Werror -Wsizeof-array-div" COMPILER_RT_HAS_SIZEOF_ARRAY_DIV_FLAG) -check_cxx_compiler_flag("-Werror -Wsizeof-pointer-div" COMPILER_RT_HAS_SIZEOF_POINTER_DIV_FLAG) +check_cxx_compiler_flag("-Werror;-Warray-bounds" COMPILER_RT_HAS_ARRAY_BOUNDS_FLAG) +check_cxx_compiler_flag("-Werror;-Wuninitialized" COMPILER_RT_HAS_UNINITIALIZED_FLAG) +check_cxx_compiler_flag("-Werror;-Wshadow" COMPILER_RT_HAS_SHADOW_FLAG) +check_cxx_compiler_flag("-Werror;-Wempty-body" COMPILER_RT_HAS_EMPTY_BODY_FLAG) +check_cxx_compiler_flag("-Werror;-Wsizeof-pointer-memaccess" 
COMPILER_RT_HAS_SIZEOF_POINTER_MEMACCESS_FLAG) +check_cxx_compiler_flag("-Werror;-Wsizeof-array-argument" COMPILER_RT_HAS_SIZEOF_ARRAY_ARGUMENT_FLAG) +check_cxx_compiler_flag("-Werror;-Wsuspicious-memaccess" COMPILER_RT_HAS_SUSPICIOUS_MEMACCESS_FLAG) +check_cxx_compiler_flag("-Werror;-Wbuiltin-memcpy-chk-size" COMPILER_RT_HAS_BUILTIN_MEMCPY_CHK_SIZE_FLAG) +check_cxx_compiler_flag("-Werror;-Warray-bounds-pointer-arithmetic" COMPILER_RT_HAS_ARRAY_BOUNDS_POINTER_ARITHMETIC_FLAG) +check_cxx_compiler_flag("-Werror;-Wreturn-stack-address" COMPILER_RT_HAS_RETURN_STACK_ADDRESS_FLAG) +check_cxx_compiler_flag("-Werror;-Wsizeof-array-decay" COMPILER_RT_HAS_SIZEOF_ARRAY_DECAY_FLAG) +check_cxx_compiler_flag("-Werror;-Wformat-insufficient-args" COMPILER_RT_HAS_FORMAT_INSUFFICIENT_ARGS_FLAG) +check_cxx_compiler_flag("-Werror;-Wformat-security" COMPILER_RT_HAS_BUILTIN_FORMAL_SECURITY_FLAG) +check_cxx_compiler_flag("-Werror;-Wsizeof-array-div" COMPILER_RT_HAS_SIZEOF_ARRAY_DIV_FLAG) +check_cxx_compiler_flag("-Werror;-Wsizeof-pointer-div" COMPILER_RT_HAS_SIZEOF_POINTER_DIV_FLAG) # Symbols. 
check_symbol_exists(__func__ "" COMPILER_RT_HAS_FUNC_SYMBOL) From 3ccc2762ead9d5d4a68206afc0b062f226c1ed00 Mon Sep 17 00:00:00 2001 From: Johannes Doerfert Date: Wed, 13 May 2026 16:50:47 -0700 Subject: [PATCH 05/95] [Instrumentor][FIX] Fix oversight in docs heading (#197594) --- llvm/docs/Instrumentor.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/docs/Instrumentor.rst b/llvm/docs/Instrumentor.rst index cbabfcde6b37d..b908122599d53 100644 --- a/llvm/docs/Instrumentor.rst +++ b/llvm/docs/Instrumentor.rst @@ -114,7 +114,7 @@ This enables use cases like: - Taint tracking Instrumentation Filtering ----------------- +------------------------- The Instrumentor provides fine-grained control over what gets instrumented: From ccc903863af2335e1ff7bb0dc65ebc1fc0ec6e57 Mon Sep 17 00:00:00 2001 From: sstwcw Date: Wed, 13 May 2026 23:51:58 +0000 Subject: [PATCH 06/95] [clang-format] Handle more Verilog attributes (#196455) before ```SystemVerilog (* x = "x" *) foreach(x[x]) x = x; ``` after ```SystemVerilog (* x = "x" *) foreach (x[x]) x = x; ``` The code for handling statements like the `foreach` preceded the part for handling the attributes inside `(* *)`. So there was a problem with some of the statements following attributes. The patch moves the part for the statements down. The loop in the code was also unnecessary. --- clang/lib/Format/UnwrappedLineParser.cpp | 24 +++++++++----------- clang/unittests/Format/FormatTestVerilog.cpp | 2 ++ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index 022fd62ed2bfc..2da8cf93d4a0a 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -1451,6 +1451,17 @@ void UnwrappedLineParser::parseStructuralElement( while (FormatTok->is(tok::l_square) && handleCppAttributes()) { } } else if (Style.isVerilog()) { + // Skip attributes. 
+ while (FormatTok->is(tok::l_paren) && + Tokens->peekNextToken()->is(tok::star)) { + parseParens(); + } + // Skip things that can exist before keywords like 'if' and 'case'. + if (FormatTok->isOneOf(Keywords.kw_priority, Keywords.kw_unique, + Keywords.kw_unique0)) { + nextToken(); + } + if (Keywords.isVerilogStructuredProcedure(*FormatTok)) { parseForOrWhileLoop(/*HasParens=*/false); return; @@ -1464,19 +1475,6 @@ void UnwrappedLineParser::parseStructuralElement( parseIfThenElse(IfKind, /*KeepBraces=*/false, /*IsVerilogAssert=*/true); return; } - - // Skip things that can exist before keywords like 'if' and 'case'. - while (true) { - if (FormatTok->isOneOf(Keywords.kw_priority, Keywords.kw_unique, - Keywords.kw_unique0)) { - nextToken(); - } else if (FormatTok->is(tok::l_paren) && - Tokens->peekNextToken()->is(tok::star)) { - parseParens(); - } else { - break; - } - } } // Tokens that only make sense at the beginning of a line. diff --git a/clang/unittests/Format/FormatTestVerilog.cpp b/clang/unittests/Format/FormatTestVerilog.cpp index 23ff1158e00cf..66295b441d3ce 100644 --- a/clang/unittests/Format/FormatTestVerilog.cpp +++ b/clang/unittests/Format/FormatTestVerilog.cpp @@ -1013,6 +1013,8 @@ TEST_F(FormatTestVerilog, Instantiation) { TEST_F(FormatTestVerilog, Loop) { verifyFormat("foreach (x[x])\n" " x = x;"); + verifyFormat("(* x = \"x\" *) foreach (x[x])\n" + " x = x;"); verifyFormat("repeat (x)\n" " x = x;"); verifyFormat("foreach (x[x]) begin\n" From fe787a82954875c7926d11125721461558db6be7 Mon Sep 17 00:00:00 2001 From: "forking-google-bazel-bot[bot]" <265904573+forking-google-bazel-bot[bot]@users.noreply.github.com> Date: Wed, 13 May 2026 16:59:27 -0700 Subject: [PATCH 07/95] [Bazel] Fixes 882d025 (#197593) This fixes 882d0251d44fa2db8dd6e5817a8baa72237f77c7. 
Co-authored-by: Google Bazel Bot --- utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel index 6b64764110c73..ff7ee4f8567f7 100644 --- a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel @@ -2303,6 +2303,7 @@ cc_library( deps = [ ":PluginDynamicLoaderPosixDYLDHeaders", ":PluginObjectFileELF", + ":PluginObjectFilePlaceholder", ":PluginProcessUtility", "//lldb:Core", "//lldb:Target", From 37c59162aba91858a90965d88ebc2fa4972e6f18 Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Wed, 13 May 2026 17:01:48 -0700 Subject: [PATCH 08/95] [BOLT][NFCI] Drop CFG profile attachment in DataAggregator (#195986) --- bolt/include/bolt/Profile/DataAggregator.h | 14 +---- bolt/lib/Profile/DataAggregator.cpp | 71 +++++----------------- 2 files changed, 15 insertions(+), 70 deletions(-) diff --git a/bolt/include/bolt/Profile/DataAggregator.h b/bolt/include/bolt/Profile/DataAggregator.h index f7c9e31915d74..d3ff37aa1a801 100644 --- a/bolt/include/bolt/Profile/DataAggregator.h +++ b/bolt/include/bolt/Profile/DataAggregator.h @@ -234,21 +234,9 @@ class DataAggregator : public DataReader { /// Return a vector of offsets corresponding to a trace in a function /// if the trace is valid, std::nullopt otherwise. std::optional, 16>> - getFallthroughsInTrace(BinaryFunction &BF, const Trace &Trace, uint64_t Count, + getFallthroughsInTrace(BinaryFunction &BF, const Trace &Trace, bool IsReturn) const; - /// Record external entry into the function \p BF. - /// - /// Return true if the entry is valid, false otherwise. - bool recordEntry(BinaryFunction &BF, uint64_t To, bool Mispred, - uint64_t Count = 1) const; - - /// Record exit from the function \p BF via a call or return. 
- /// - /// Return true if the exit point is valid, false otherwise. - bool recordExit(BinaryFunction &BF, uint64_t From, bool Mispred, - uint64_t Count = 1) const; - /// Branch stacks aggregation statistics uint64_t NumTraces{0}; uint64_t NumInvalidTraces{0}; diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp index 344682f9ae2f4..b4e4417306a38 100644 --- a/bolt/lib/Profile/DataAggregator.cpp +++ b/bolt/lib/Profile/DataAggregator.cpp @@ -717,10 +717,8 @@ Error DataAggregator::preprocessProfile(BinaryContext &BC) { Error DataAggregator::readProfile(BinaryContext &BC) { processProfile(BC); - for (auto &BFI : BC.getBinaryFunctions()) { - BinaryFunction &Function = BFI.second; - convertBranchData(Function); - } + if (Error E = DataReader::readProfile(BC)) + return E; if (opts::AggregateOnly) { if (opts::ProfileFormat == opts::ProfileFormatKind::PF_Fdata) @@ -747,6 +745,12 @@ bool DataAggregator::mayHaveProfileData(const BinaryFunction &Function) { } void DataAggregator::processProfile(BinaryContext &BC) { + // Set for DataReader::readProfile + NoLBRMode = opts::BasicAggregation; + + // Set for DataReader::recordBranch and evaluateProfileData + BATMode = usesBAT(); + if (opts::BasicAggregation) processBasicEvents(); else @@ -772,6 +776,9 @@ void DataAggregator::processProfile(BinaryContext &BC) { llvm::stable_sort(FuncBranches.second.EntryData); } + for (auto &FuncBasicSamples : NamesToBasicSamples) + llvm::stable_sort(FuncBasicSamples.second.Data); + for (auto &MemEvents : NamesToMemEvents) llvm::stable_sort(MemEvents.second.Data); @@ -880,8 +887,6 @@ bool DataAggregator::doInterBranch(BinaryFunction *FromFunc, FromAggrData->Name = SrcFunc; setBranchData(*FromFunc, FromAggrData); } - - recordExit(*FromFunc, From, Mispreds, Count); } if (ToFunc) { DstFunc = getLocationName(*ToFunc, BAT); @@ -891,8 +896,6 @@ bool DataAggregator::doInterBranch(BinaryFunction *FromFunc, ToAggrData->Name = DstFunc; setBranchData(*ToFunc, ToAggrData); } - 
- recordEntry(*ToFunc, To, Mispreds, Count); } if (FromAggrData) @@ -941,10 +944,8 @@ bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count, return false; // Treat recursive control transfers as inter-branches. - if (FromFunc == ToFunc && To != 0) { - recordBranch(*FromFunc, From, To, Count, Mispreds); + if (FromFunc == ToFunc && To != 0) return doIntraBranch(*FromFunc, From, To, Count, Mispreds); - } return doInterBranch(FromFunc, ToFunc, From, To, Count, Mispreds); } @@ -976,7 +977,7 @@ bool DataAggregator::doTrace(const Trace &Trace, uint64_t Count, std::optional FTs = BAT && BAT->isBATFunction(FuncAddress) ? BAT->getFallthroughsInTrace(FuncAddress, From - IsReturn, To) - : getFallthroughsInTrace(*FromFunc, Trace, Count, IsReturn); + : getFallthroughsInTrace(*FromFunc, Trace, IsReturn); if (!FTs) { LLVM_DEBUG(dbgs() << "Invalid trace " << Trace << '\n'); NumInvalidTraces += Count; @@ -993,7 +994,7 @@ bool DataAggregator::doTrace(const Trace &Trace, uint64_t Count, std::optional, 16>> DataAggregator::getFallthroughsInTrace(BinaryFunction &BF, const Trace &Trace, - uint64_t Count, bool IsReturn) const { + bool IsReturn) const { SmallVector, 16> Branches; BinaryContext &BC = BF.getBinaryContext(); @@ -1073,53 +1074,9 @@ DataAggregator::getFallthroughsInTrace(BinaryFunction &BF, const Trace &Trace, BB = NextBB; } - // Record fall-through jumps - for (const auto &[FromOffset, ToOffset] : Branches) { - BinaryBasicBlock *FromBB = BF.getBasicBlockContainingOffset(FromOffset); - BinaryBasicBlock *ToBB = BF.getBasicBlockAtOffset(ToOffset); - assert(FromBB && ToBB); - BinaryBasicBlock::BinaryBranchInfo &BI = FromBB->getBranchInfo(*ToBB); - BI.Count += Count; - } - return Branches; } -bool DataAggregator::recordEntry(BinaryFunction &BF, uint64_t To, bool Mispred, - uint64_t Count) const { - if (To > BF.getSize()) - return false; - - if (!BF.hasProfile()) - BF.ExecutionCount = 0; - - BinaryBasicBlock *EntryBB = nullptr; - if (To == 0) { - BF.ExecutionCount 
+= Count; - if (!BF.empty()) - EntryBB = &BF.front(); - } else if (BinaryBasicBlock *BB = BF.getBasicBlockAtOffset(To)) { - if (BB->isEntryPoint()) - EntryBB = BB; - } - - if (EntryBB) - EntryBB->setExecutionCount(EntryBB->getKnownExecutionCount() + Count); - - return true; -} - -bool DataAggregator::recordExit(BinaryFunction &BF, uint64_t From, bool Mispred, - uint64_t Count) const { - if (!BF.isSimple() || From > BF.getSize()) - return false; - - if (!BF.hasProfile()) - BF.ExecutionCount = 0; - - return true; -} - ErrorOr DataAggregator::parseLBREntry() { LBREntry Res; ErrorOr FromStrRes = parseString('/'); From c5f84148725e69bbc43831aaa88158db50f3722d Mon Sep 17 00:00:00 2001 From: Sadaf Ebrahimi Date: Wed, 13 May 2026 17:11:01 -0700 Subject: [PATCH 09/95] [scudo] Add test for initFlags() Add a test case to verify that initFlags() correctly reads the SCUDO_ALLOCATION_RING_BUFFER_SIZE environment variable and updates the corresponding flag. This increases line coverage for flags.cpp to 100%. 
--- .../lib/scudo/standalone/tests/flags_test.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/compiler-rt/lib/scudo/standalone/tests/flags_test.cpp b/compiler-rt/lib/scudo/standalone/tests/flags_test.cpp index 0f934b87c38cf..591611edc425a 100644 --- a/compiler-rt/lib/scudo/standalone/tests/flags_test.cpp +++ b/compiler-rt/lib/scudo/standalone/tests/flags_test.cpp @@ -121,6 +121,19 @@ TEST(ScudoFlagsTest, AllocatorFlags) { EXPECT_EQ(2048, Flags.quarantine_max_chunk_size); } +TEST(ScudoFlagsTest, InitFlagsEnv) { + const char *OldValue = getenv("SCUDO_ALLOCATION_RING_BUFFER_SIZE"); + setenv("SCUDO_ALLOCATION_RING_BUFFER_SIZE", "123", 1); + scudo::initFlags(); + scudo::Flags *F = scudo::getFlags(); + EXPECT_EQ(123, F->allocation_ring_buffer_size); + if (OldValue) { + setenv("SCUDO_ALLOCATION_RING_BUFFER_SIZE", OldValue, 1); + } else { + unsetenv("SCUDO_ALLOCATION_RING_BUFFER_SIZE"); + } +} + #ifdef GWP_ASAN_HOOKS TEST(ScudoFlagsTest, GWPASanFlags) { scudo::FlagParser Parser; From 8ebd85716101c392fca08fa6483628831d6e2865 Mon Sep 17 00:00:00 2001 From: sstwcw Date: Thu, 14 May 2026 00:15:10 +0000 Subject: [PATCH 10/95] [clang-format][NFC] Correct comment (#197592) --- clang/lib/Format/ContinuationIndenter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp index 485fe382bda3a..338515ec6da21 100644 --- a/clang/lib/Format/ContinuationIndenter.cpp +++ b/clang/lib/Format/ContinuationIndenter.cpp @@ -1579,7 +1579,7 @@ ContinuationIndenter::getNewLineColumn(const LineState &State) { // in ProtoBuf: // optional int32 b = 2 [(foo_options) = {aaaaaaaaaaaaaaaaaaa: 123, // bbbbbbbbbbbbbbbbbbbbbbbb:"baz"}]; - // For Verilog, a quote following a brace is treated as an identifier. And + // For Verilog, a quote preceding a brace is treated as an identifier. And // Both braces and colons get annotated as TT_DictLiteral. So we have to // check. 
if (Current.is(tok::identifier) && Current.Next && From d2a57ec6bd427456c879ea44d443f850f1cd68ec Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Wed, 13 May 2026 17:21:55 -0700 Subject: [PATCH 11/95] [AMDGPU] Add lit64 machine verifier (#196457) --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 17 +++++++++++++---- llvm/test/MachineVerifier/AMDGPU/lit64.mir | 9 +++++++++ 2 files changed, 22 insertions(+), 4 deletions(-) create mode 100644 llvm/test/MachineVerifier/AMDGPU/lit64.mir diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 451b5a4d3da6d..64c070d933ff0 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -5231,17 +5231,13 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } break; case AMDGPU::OPERAND_REG_IMM_INT32: - case AMDGPU::OPERAND_REG_IMM_INT64: case AMDGPU::OPERAND_REG_IMM_INT16: case AMDGPU::OPERAND_REG_IMM_FP32: - case AMDGPU::OPERAND_REG_IMM_V2FP32: case AMDGPU::OPERAND_REG_IMM_BF16: case AMDGPU::OPERAND_REG_IMM_FP16: - case AMDGPU::OPERAND_REG_IMM_FP64: case AMDGPU::OPERAND_REG_IMM_V2FP16: case AMDGPU::OPERAND_REG_IMM_V2FP16_SPLAT: case AMDGPU::OPERAND_REG_IMM_V2INT16: - case AMDGPU::OPERAND_REG_IMM_V2INT32: case AMDGPU::OPERAND_REG_IMM_V2BF16: break; case AMDGPU::OPERAND_REG_IMM_NOINLINE_V2FP16: @@ -5266,6 +5262,19 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } break; } + case AMDGPU::OPERAND_REG_IMM_FP64: + case AMDGPU::OPERAND_REG_IMM_INT64: + case AMDGPU::OPERAND_REG_IMM_V2INT32: + case AMDGPU::OPERAND_REG_IMM_V2FP32: + if (ST.has64BitLiterals() && Desc.getSize() != 4 && MO.isImm() && + !isInlineConstant(MI, i) && + !AMDGPU::isValid32BitLiteral(MO.getImm(), + OpInfo.OperandType == + AMDGPU::OPERAND_REG_IMM_FP64)) { + ErrInfo = "illegal 64-bit immediate value for operand."; + return false; + } + break; case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32: case AMDGPU::OPERAND_INPUT_MODS: if (!MI.getOperand(i).isImm() 
|| !isInlineConstant(MI, i)) { diff --git a/llvm/test/MachineVerifier/AMDGPU/lit64.mir b/llvm/test/MachineVerifier/AMDGPU/lit64.mir new file mode 100644 index 0000000000000..acee2ae2a1aa5 --- /dev/null +++ b/llvm/test/MachineVerifier/AMDGPU/lit64.mir @@ -0,0 +1,9 @@ +# RUN: not --crash llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -verify-machineinstrs -run-pass=none -o - %s 2>&1 | FileCheck %s + +--- + +name: lit64 +body: | + bb.0: + ; CHECK: illegal 64-bit immediate value for operand. + $vgpr0_vgpr1 = V_ADD_F64_e64 0, 68719476721, 0, undef $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $mode From 009fbc90402dc7bd6509686853de76485cbbbb83 Mon Sep 17 00:00:00 2001 From: Yury Plyakhin Date: Wed, 13 May 2026 17:33:35 -0700 Subject: [PATCH 12/95] [clang-sycl-linker] Migrate tests from Driver/ to Tooling/ and use LLVM IR input (#197566) 1. Replace the C++ source test that required compiling with %clangxx and separate Input files with self-contained .ll tests using split-file. 2. Split the test into two files: - clang-sycl-linker.ll: basic tool behavior (link, dev libs, AOT, errors) - clang-sycl-linker-split-mode.ll: device code split mode handling Co-Authored-By: Claude --- clang/test/Driver/Inputs/SYCL/two-kernels.ll | 23 ----- clang/test/Driver/clang-sycl-linker-test.cpp | 99 ------------------- .../Tooling/clang-sycl-linker-split-mode.ll | 51 ++++++++++ clang/test/Tooling/clang-sycl-linker.ll | 97 ++++++++++++++++++ 4 files changed, 148 insertions(+), 122 deletions(-) delete mode 100644 clang/test/Driver/Inputs/SYCL/two-kernels.ll delete mode 100644 clang/test/Driver/clang-sycl-linker-test.cpp create mode 100644 clang/test/Tooling/clang-sycl-linker-split-mode.ll create mode 100644 clang/test/Tooling/clang-sycl-linker.ll diff --git a/clang/test/Driver/Inputs/SYCL/two-kernels.ll b/clang/test/Driver/Inputs/SYCL/two-kernels.ll deleted file mode 100644 index c3c90444b7e72..0000000000000 --- a/clang/test/Driver/Inputs/SYCL/two-kernels.ll +++ /dev/null @@ -1,23 +0,0 @@ -target 
datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64-G1" -target triple = "spirv64" - -define spir_func i32 @helper_shared(i32 %a) { -entry: - %r = add nsw i32 %a, 1 - ret i32 %r -} - -define spir_kernel void @kernel_a(ptr addrspace(1) %out, i32 %a) { -entry: - %r = tail call spir_func i32 @helper_shared(i32 %a) - store i32 %r, ptr addrspace(1) %out, align 4 - ret void -} - -define spir_kernel void @kernel_b(ptr addrspace(1) %out, i32 %a, i32 %b) { -entry: - %h = tail call spir_func i32 @helper_shared(i32 %a) - %r = mul nsw i32 %h, %b - store i32 %r, ptr addrspace(1) %out, align 4 - ret void -} diff --git a/clang/test/Driver/clang-sycl-linker-test.cpp b/clang/test/Driver/clang-sycl-linker-test.cpp deleted file mode 100644 index cd99d4d47b1e1..0000000000000 --- a/clang/test/Driver/clang-sycl-linker-test.cpp +++ /dev/null @@ -1,99 +0,0 @@ -// Tests the clang-sycl-linker tool. -// -// REQUIRES: spirv-registered-target -// -// Test the dry run of a simple case to link two input files. -// Also verifies the default split mode ("none"). -// RUN: %clangxx -emit-llvm -c -target spirv64 %s -o %t_1.bc -// RUN: %clangxx -emit-llvm -c -target spirv64 %s -o %t_2.bc -// RUN: clang-sycl-linker --dry-run -v -triple=spirv64 %t_1.bc %t_2.bc -o %t-spirv.out 2>&1 \ -// RUN: | FileCheck %s --check-prefix=SIMPLE-FO -// SIMPLE-FO: sycl-device-link: inputs: {{.*}}.bc, {{.*}}.bc libfiles: output: [[LLVMLINKOUT:.*]].bc -// SIMPLE-FO-NEXT: sycl-module-split: input: [[LLVMLINKOUT]].bc, output: [[LLVMLINKOUT]].bc, mode: none -// SIMPLE-FO-NEXT: LLVM backend: input: [[LLVMLINKOUT]].bc, output: {{.*}}_0.spv -// -// Test that IMG_SPIRV image kind is set for non-AOT compilation. -// RUN: llvm-objdump --offloading %t-spirv.out | FileCheck %s --check-prefix=IMAGE-KIND-SPIRV -// IMAGE-KIND-SPIRV: kind spir-v -// -// Test the dry run of a simple case with device library files specified. 
-// RUN: mkdir -p %t.dir -// RUN: touch %t.dir/lib1.bc -// RUN: touch %t.dir/lib2.bc -// RUN: clang-sycl-linker --dry-run -v -triple=spirv64 %t_1.bc %t_2.bc --library-path=%t.dir --device-libs=lib1.bc,lib2.bc -o a.spv 2>&1 \ -// RUN: | FileCheck %s --check-prefix=DEVLIBS -// DEVLIBS: sycl-device-link: inputs: {{.*}}.bc libfiles: {{.*}}lib1.bc, {{.*}}lib2.bc output: [[LLVMLINKOUT:.*]].bc -// DEVLIBS-NEXT: sycl-module-split: input: [[LLVMLINKOUT]].bc, output: [[LLVMLINKOUT]].bc, mode: none -// DEVLIBS-NEXT: LLVM backend: input: [[LLVMLINKOUT]].bc, output: a_0.spv -// -// Test a simple case with a random file (not bitcode) as input. -// RUN: touch %t.o -// RUN: not clang-sycl-linker -triple=spirv64 %t.o -o a.spv 2>&1 \ -// RUN: | FileCheck %s --check-prefix=FILETYPEERROR -// FILETYPEERROR: Unsupported file type -// -// Test to see if device library related errors are emitted. -// RUN: not clang-sycl-linker --dry-run -triple=spirv64 %t_1.bc %t_2.bc --library-path=%t.dir --device-libs= -o a.spv 2>&1 \ -// RUN: | FileCheck %s --check-prefix=DEVLIBSERR1 -// DEVLIBSERR1: Number of device library files cannot be zero -// RUN: not clang-sycl-linker --dry-run -triple=spirv64 %t_1.bc %t_2.bc --library-path=%t.dir --device-libs=lib1.bc,lib2.bc,lib3.bc -o a.spv 2>&1 \ -// RUN: | FileCheck %s --check-prefix=DEVLIBSERR2 -// DEVLIBSERR2: '{{.*}}lib3.bc' SYCL device library file is not found -// -// Test AOT compilation for an Intel GPU. 
-// RUN: clang-sycl-linker --dry-run -v -triple=spirv64 -arch=bmg_g21 %t_1.bc %t_2.bc -o %t-aot-gpu.out 2>&1 \ -// RUN: --ocloc-options="-a -b" \ -// RUN: | FileCheck %s --check-prefix=AOT-INTEL-GPU -// AOT-INTEL-GPU: sycl-device-link: inputs: {{.*}}.bc, {{.*}}.bc libfiles: output: [[LLVMLINKOUT:.*]].bc -// AOT-INTEL-GPU-NEXT: sycl-module-split: input: [[LLVMLINKOUT]].bc, output: [[LLVMLINKOUT]].bc, mode: none -// AOT-INTEL-GPU-NEXT: LLVM backend: input: [[LLVMLINKOUT]].bc, output: [[SPIRVTRANSLATIONOUT:.*]]_0.spv -// AOT-INTEL-GPU-NEXT: "{{.*}}ocloc{{.*}}" {{.*}}-device bmg_g21 -a -b {{.*}}-output [[SPIRVTRANSLATIONOUT]]_0.out -file [[SPIRVTRANSLATIONOUT]]_0.spv -// -// Test that IMG_Object image kind is set for AOT compilation (Intel GPU). -// RUN: llvm-objdump --offloading %t-aot-gpu.out | FileCheck %s --check-prefix=IMAGE-KIND-OBJECT -// IMAGE-KIND-OBJECT: kind elf -// -// Test AOT compilation for an Intel CPU. -// RUN: clang-sycl-linker --dry-run -v -triple=spirv64 -arch=graniterapids %t_1.bc %t_2.bc -o %t-aot-cpu.out 2>&1 \ -// RUN: --opencl-aot-options="-a -b" \ -// RUN: | FileCheck %s --check-prefix=AOT-INTEL-CPU -// AOT-INTEL-CPU: sycl-device-link: inputs: {{.*}}.bc, {{.*}}.bc libfiles: output: [[LLVMLINKOUT:.*]].bc -// AOT-INTEL-CPU-NEXT: sycl-module-split: input: [[LLVMLINKOUT]].bc, output: [[LLVMLINKOUT]].bc, mode: none -// AOT-INTEL-CPU-NEXT: LLVM backend: input: [[LLVMLINKOUT]].bc, output: [[SPIRVTRANSLATIONOUT:.*]]_0.spv -// AOT-INTEL-CPU-NEXT: "{{.*}}opencl-aot{{.*}}" {{.*}}--device=cpu -a -b {{.*}}-o [[SPIRVTRANSLATIONOUT]]_0.out [[SPIRVTRANSLATIONOUT]]_0.spv -// -// Test that IMG_Object image kind is set for AOT compilation (Intel CPU). -// RUN: llvm-objdump --offloading %t-aot-cpu.out | FileCheck %s --check-prefix=IMAGE-KIND-OBJECT -// -// Check that the output file must be specified. 
-// RUN: not clang-sycl-linker --dry-run %t_1.bc %t_2.bc 2>&1 \ -// RUN: | FileCheck %s --check-prefix=NOOUTPUT -// NOOUTPUT: Output file must be specified -// -// Check that the target triple must be specified. -// RUN: not clang-sycl-linker --dry-run %t_1.bc %t_2.bc -o a.out 2>&1 \ -// RUN: | FileCheck %s --check-prefix=NOTARGET -// NOTARGET: Target triple must be specified -// -// Test the split mode ("none"): no extra splits are produced. -// RUN: clang-sycl-linker --dry-run -v -triple=spirv64 --module-split-mode=none %t_1.bc %t_2.bc -o %t-split-none.out 2>&1 \ -// RUN: | FileCheck %s --check-prefix=SPLIT-NONE -// SPLIT-NONE: sycl-device-link: inputs: {{.*}}.bc, {{.*}}.bc libfiles: output: [[LLVMLINKOUT:.*]].bc -// SPLIT-NONE-NEXT: sycl-module-split: input: [[LLVMLINKOUT]].bc, output: [[LLVMLINKOUT]].bc, mode: none -// SPLIT-NONE-NEXT: LLVM backend: input: [[LLVMLINKOUT]].bc, output: {{.*}}_0.spv -// SPLIT-NONE-NOT: LLVM backend: input: {{.*}}.bc, output: {{.*}}_1.spv -// -// Test per-kernel split: a module with two SPIR_KERNEL functions produces two -// device images. -// RUN: llvm-as %S/Inputs/SYCL/two-kernels.ll -o %t-two.bc -// RUN: clang-sycl-linker --dry-run -v -triple=spirv64 --module-split-mode=kernel %t-two.bc -o %t-split-kernel.out 2>&1 \ -// RUN: | FileCheck %s --check-prefix=SPLIT-KERNEL -// SPLIT-KERNEL: sycl-device-link: inputs: {{.*}}.bc libfiles: output: [[LLVMLINKOUT:.*]].bc -// SPLIT-KERNEL-NEXT: sycl-module-split: input: [[LLVMLINKOUT]].bc, output: [[SPLIT0:.*]].bc, [[SPLIT1:.*]].bc, mode: kernel -// SPLIT-KERNEL-NEXT: LLVM backend: input: [[SPLIT0]].bc, output: {{.*}}_0.spv -// SPLIT-KERNEL-NEXT: LLVM backend: input: [[SPLIT1]].bc, output: {{.*}}_1.spv -// -// Test that an invalid split mode is rejected. 
-// RUN: not clang-sycl-linker --dry-run -triple=spirv64 --module-split-mode=bogus %t_1.bc -o a.out 2>&1 \ -// RUN: | FileCheck %s --check-prefix=SPLIT-INVALID -// SPLIT-INVALID: module-split-mode value isn't recognized: bogus diff --git a/clang/test/Tooling/clang-sycl-linker-split-mode.ll b/clang/test/Tooling/clang-sycl-linker-split-mode.ll new file mode 100644 index 0000000000000..2b4b1cee4e171 --- /dev/null +++ b/clang/test/Tooling/clang-sycl-linker-split-mode.ll @@ -0,0 +1,51 @@ +; Tests the clang-sycl-linker tool: device code splitting. +; +; REQUIRES: spirv-registered-target +; +; RUN: llvm-as %s -o %t.bc +; +; Test that an invalid split mode is rejected. +; RUN: not clang-sycl-linker --dry-run -triple=spirv64 --module-split-mode=bogus %t.bc -o a.out 2>&1 \ +; RUN: | FileCheck %s --check-prefix=SPLIT-INVALID +; SPLIT-INVALID: module-split-mode value isn't recognized: bogus +; +; Test the split mode ("none"): no extra splits are produced. +; RUN: clang-sycl-linker --dry-run -v -triple=spirv64 --module-split-mode=none %t.bc -o %t-none.out 2>&1 \ +; RUN: | FileCheck %s --check-prefix=SPLIT-NONE +; SPLIT-NONE: sycl-device-link: inputs: {{.*}}.bc libfiles: output: [[LLVMLINKOUT:.*]].bc +; SPLIT-NONE-NEXT: sycl-module-split: input: [[LLVMLINKOUT]].bc, output: [[LLVMLINKOUT]].bc, mode: none +; SPLIT-NONE-NEXT: LLVM backend: input: [[LLVMLINKOUT]].bc, output: {{.*}}_0.spv +; SPLIT-NONE-NOT: LLVM backend: input: {{.*}}.bc, output: {{.*}}_1.spv +; +; Test per-kernel split: a module with two SPIR_KERNEL functions produces two +; device images. 
+; RUN: clang-sycl-linker --dry-run -v -triple=spirv64 --module-split-mode=kernel %t.bc -o %t-split-kernel.out 2>&1 \ +; RUN: | FileCheck %s --check-prefix=SPLIT-KERNEL +; SPLIT-KERNEL: sycl-device-link: inputs: {{.*}}.bc libfiles: output: [[LLVMLINKOUT:.*]].bc +; SPLIT-KERNEL-NEXT: sycl-module-split: input: [[LLVMLINKOUT]].bc, output: [[SPLIT0:.*]].bc, [[SPLIT1:.*]].bc, mode: kernel +; SPLIT-KERNEL-NEXT: LLVM backend: input: [[SPLIT0]].bc, output: {{.*}}_0.spv +; SPLIT-KERNEL-NEXT: LLVM backend: input: [[SPLIT1]].bc, output: {{.*}}_1.spv + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64-G1" +target triple = "spirv64" + +define spir_func i32 @helper_shared(i32 %a) { +entry: + %r = add nsw i32 %a, 1 + ret i32 %r +} + +define spir_kernel void @kernel_a(ptr addrspace(1) %out, i32 %a) { +entry: + %r = tail call spir_func i32 @helper_shared(i32 %a) + store i32 %r, ptr addrspace(1) %out, align 4 + ret void +} + +define spir_kernel void @kernel_b(ptr addrspace(1) %out, i32 %a, i32 %b) { +entry: + %h = tail call spir_func i32 @helper_shared(i32 %a) + %r = mul nsw i32 %h, %b + store i32 %r, ptr addrspace(1) %out, align 4 + ret void +} diff --git a/clang/test/Tooling/clang-sycl-linker.ll b/clang/test/Tooling/clang-sycl-linker.ll new file mode 100644 index 0000000000000..cf0fb33d1bc06 --- /dev/null +++ b/clang/test/Tooling/clang-sycl-linker.ll @@ -0,0 +1,97 @@ +; Tests the clang-sycl-linker tool. +; +; REQUIRES: spirv-registered-target +; +; RUN: rm -rf %t && split-file %s %t +; RUN: llvm-as %t/input1.ll -o %t/input1.bc +; RUN: llvm-as %t/input2.ll -o %t/input2.bc +; +; Test the dry run of a simple case to link two input files. 
+; RUN: clang-sycl-linker --dry-run -v -triple=spirv64 %t/input1.bc %t/input2.bc -o %t/spirv.out 2>&1 \ +; RUN: | FileCheck %s --check-prefix=SIMPLE-FO +; SIMPLE-FO: sycl-device-link: inputs: {{.*}}.bc, {{.*}}.bc libfiles: output: [[LLVMLINKOUT:.*]].bc +; SIMPLE-FO-NEXT: sycl-module-split: input: [[LLVMLINKOUT]].bc, output: [[LLVMLINKOUT]].bc, mode: none +; SIMPLE-FO-NEXT: LLVM backend: input: [[LLVMLINKOUT]].bc, output: {{.*}}_0.spv +; +; Test that IMG_SPIRV image kind is set for non-AOT compilation. +; RUN: llvm-objdump --offloading %t/spirv.out | FileCheck %s --check-prefix=IMAGE-KIND-SPIRV +; IMAGE-KIND-SPIRV: kind spir-v +; +; Test the dry run of a simple case with device library files specified. +; RUN: mkdir -p %t/libs +; RUN: touch %t/libs/lib1.bc +; RUN: touch %t/libs/lib2.bc +; RUN: clang-sycl-linker --dry-run -v -triple=spirv64 %t/input1.bc %t/input2.bc --library-path=%t/libs --device-libs=lib1.bc,lib2.bc -o a.spv 2>&1 \ +; RUN: | FileCheck %s --check-prefix=DEVLIBS +; DEVLIBS: sycl-device-link: inputs: {{.*}}.bc libfiles: {{.*}}lib1.bc, {{.*}}lib2.bc output: [[LLVMLINKOUT:.*]].bc +; DEVLIBS-NEXT: sycl-module-split: input: [[LLVMLINKOUT]].bc, output: [[LLVMLINKOUT]].bc, mode: none +; DEVLIBS-NEXT: LLVM backend: input: [[LLVMLINKOUT]].bc, output: a_0.spv +; +; Test a simple case with a random file (not bitcode) as input. +; RUN: touch %t/dummy.o +; RUN: not clang-sycl-linker -triple=spirv64 %t/dummy.o -o a.spv 2>&1 \ +; RUN: | FileCheck %s --check-prefix=FILETYPEERROR +; FILETYPEERROR: Unsupported file type +; +; Test to see if device library related errors are emitted. 
+; RUN: not clang-sycl-linker --dry-run -triple=spirv64 %t/input1.bc %t/input2.bc --library-path=%t/libs --device-libs= -o a.spv 2>&1 \ +; RUN: | FileCheck %s --check-prefix=DEVLIBSERR1 +; DEVLIBSERR1: Number of device library files cannot be zero +; RUN: not clang-sycl-linker --dry-run -triple=spirv64 %t/input1.bc %t/input2.bc --library-path=%t/libs --device-libs=lib1.bc,lib2.bc,lib3.bc -o a.spv 2>&1 \ +; RUN: | FileCheck %s --check-prefix=DEVLIBSERR2 +; DEVLIBSERR2: '{{.*}}lib3.bc' SYCL device library file is not found +; +; Test AOT compilation for an Intel GPU. +; RUN: clang-sycl-linker --dry-run -v -triple=spirv64 -arch=bmg_g21 %t/input1.bc %t/input2.bc -o %t/aot-gpu.out 2>&1 \ +; RUN: --ocloc-options="-a -b" \ +; RUN: | FileCheck %s --check-prefix=AOT-INTEL-GPU +; AOT-INTEL-GPU: sycl-device-link: inputs: {{.*}}.bc, {{.*}}.bc libfiles: output: [[LLVMLINKOUT:.*]].bc +; AOT-INTEL-GPU-NEXT: sycl-module-split: input: [[LLVMLINKOUT]].bc, output: [[LLVMLINKOUT]].bc, mode: none +; AOT-INTEL-GPU-NEXT: LLVM backend: input: [[LLVMLINKOUT]].bc, output: [[SPIRVTRANSLATIONOUT:.*]]_0.spv +; AOT-INTEL-GPU-NEXT: "{{.*}}ocloc{{.*}}" {{.*}}-device bmg_g21 -a -b {{.*}}-output [[SPIRVTRANSLATIONOUT]]_0.out -file [[SPIRVTRANSLATIONOUT]]_0.spv +; +; Test that IMG_Object image kind is set for AOT compilation (Intel GPU). +; RUN: llvm-objdump --offloading %t/aot-gpu.out | FileCheck %s --check-prefix=IMAGE-KIND-OBJECT +; IMAGE-KIND-OBJECT: kind elf +; +; Test AOT compilation for an Intel CPU. 
+; RUN: clang-sycl-linker --dry-run -v -triple=spirv64 -arch=graniterapids %t/input1.bc %t/input2.bc -o %t/aot-cpu.out 2>&1 \ +; RUN: --opencl-aot-options="-a -b" \ +; RUN: | FileCheck %s --check-prefix=AOT-INTEL-CPU +; AOT-INTEL-CPU: sycl-device-link: inputs: {{.*}}.bc, {{.*}}.bc libfiles: output: [[LLVMLINKOUT:.*]].bc +; AOT-INTEL-CPU-NEXT: sycl-module-split: input: [[LLVMLINKOUT]].bc, output: [[LLVMLINKOUT]].bc, mode: none +; AOT-INTEL-CPU-NEXT: LLVM backend: input: [[LLVMLINKOUT]].bc, output: [[SPIRVTRANSLATIONOUT:.*]]_0.spv +; AOT-INTEL-CPU-NEXT: "{{.*}}opencl-aot{{.*}}" {{.*}}--device=cpu -a -b {{.*}}-o [[SPIRVTRANSLATIONOUT]]_0.out [[SPIRVTRANSLATIONOUT]]_0.spv +; +; Test that IMG_Object image kind is set for AOT compilation (Intel CPU). +; RUN: llvm-objdump --offloading %t/aot-cpu.out | FileCheck %s --check-prefix=IMAGE-KIND-OBJECT +; +; Check that the output file must be specified. +; RUN: not clang-sycl-linker --dry-run %t/input1.bc %t/input2.bc 2>&1 \ +; RUN: | FileCheck %s --check-prefix=NOOUTPUT +; NOOUTPUT: Output file must be specified +; +; Check that the target triple must be specified. 
+; RUN: not clang-sycl-linker --dry-run %t/input1.bc %t/input2.bc -o a.out 2>&1 \ +; RUN: | FileCheck %s --check-prefix=NOTARGET +; NOTARGET: Target triple must be specified + +;--- input1.ll +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64-G1" +target triple = "spirv64" + +define spir_kernel void @kernel_a() #0 { + ret void +} + +attributes #0 = { "sycl-module-id"="TU1.cpp" } + +;--- input2.ll +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64-G1" +target triple = "spirv64" + +define spir_kernel void @kernel_b() #0 { + ret void +} + +attributes #0 = { "sycl-module-id"="TU2.cpp" } From 16a1e05c02778a7b094a8c5343f13003b2e64851 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Wed, 13 May 2026 19:48:08 -0500 Subject: [PATCH 13/95] [lldb] Support building test inferiors without debug info (#197002) Add first class support for building test inferiors without debug info, instead of having to pass `-g0` in the Makefile or the build dictionary. 
``` def test(self): self.build(debug_info="none") ``` rdar://164923931 --- lldb/packages/Python/lldbsuite/test/builders/builder.py | 1 + lldb/packages/Python/lldbsuite/test/make/Makefile.rules | 4 ++++ .../API/commands/frame/var-dil/basics/NoDebugInfo/Makefile | 1 - .../var-dil/basics/NoDebugInfo/TestFrameVarDILNoDebugInfo.py | 3 ++- .../objcxx/objc-from-cpp-frames-without-debuginfo/Makefile | 1 - .../TestObjCFromCppFramesWithoutDebugInfo.py | 5 ++++- 6 files changed, 11 insertions(+), 4 deletions(-) diff --git a/lldb/packages/Python/lldbsuite/test/builders/builder.py b/lldb/packages/Python/lldbsuite/test/builders/builder.py index 03c1af579b018..40db227607ee5 100644 --- a/lldb/packages/Python/lldbsuite/test/builders/builder.py +++ b/lldb/packages/Python/lldbsuite/test/builders/builder.py @@ -259,6 +259,7 @@ def _getDebugInfoArgs(self, debug_info): "debug_names": {"MAKE_DEBUG_NAMES": "YES"}, "dwp": {"MAKE_DSYM": "NO", "MAKE_DWP": "YES"}, "pdb": {"MAKE_PDB": "YES"}, + "none": {"MAKE_DSYM": "NO", "MAKE_NO_DEBUG_INFO": "YES"}, } # Collect all flags, with later options overriding earlier ones diff --git a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules index 677124b8738f7..a3c5d94a570d1 100644 --- a/lldb/packages/Python/lldbsuite/test/make/Makefile.rules +++ b/lldb/packages/Python/lldbsuite/test/make/Makefile.rules @@ -240,6 +240,10 @@ ifeq "$(OS)" "Windows_NT" DEBUG_INFO_FLAG ?= -gdwarf endif +ifeq "$(MAKE_NO_DEBUG_INFO)" "YES" + DEBUG_INFO_FLAG := -g0 +endif + DEBUG_INFO_FLAG ?= -g CFLAGS ?= $(DEBUG_INFO_FLAG) -O0 diff --git a/lldb/test/API/commands/frame/var-dil/basics/NoDebugInfo/Makefile b/lldb/test/API/commands/frame/var-dil/basics/NoDebugInfo/Makefile index df9f4a7b518c7..99998b20bcb05 100644 --- a/lldb/test/API/commands/frame/var-dil/basics/NoDebugInfo/Makefile +++ b/lldb/test/API/commands/frame/var-dil/basics/NoDebugInfo/Makefile @@ -1,4 +1,3 @@ CXX_SOURCES := main.cpp -CFLAGS_EXTRAS := -g0 
include Makefile.rules diff --git a/lldb/test/API/commands/frame/var-dil/basics/NoDebugInfo/TestFrameVarDILNoDebugInfo.py b/lldb/test/API/commands/frame/var-dil/basics/NoDebugInfo/TestFrameVarDILNoDebugInfo.py index defea39826267..10dbd3a6953f2 100644 --- a/lldb/test/API/commands/frame/var-dil/basics/NoDebugInfo/TestFrameVarDILNoDebugInfo.py +++ b/lldb/test/API/commands/frame/var-dil/basics/NoDebugInfo/TestFrameVarDILNoDebugInfo.py @@ -10,9 +10,10 @@ class TestFrameVarDILNoDebugInfo(TestBase): NO_DEBUG_INFO_TESTCASE = True + SHARED_BUILD_TESTCASE = False def test_no_debug_info(self): - self.build() + self.build(debug_info="none") lldbutil.run_to_name_breakpoint(self, "main") self.runCmd("settings set target.experimental.use-DIL true") diff --git a/lldb/test/API/lang/objcxx/objc-from-cpp-frames-without-debuginfo/Makefile b/lldb/test/API/lang/objcxx/objc-from-cpp-frames-without-debuginfo/Makefile index 7c3c32d6f82df..99998b20bcb05 100644 --- a/lldb/test/API/lang/objcxx/objc-from-cpp-frames-without-debuginfo/Makefile +++ b/lldb/test/API/lang/objcxx/objc-from-cpp-frames-without-debuginfo/Makefile @@ -1,4 +1,3 @@ CXX_SOURCES := main.cpp -CXXFLAGS_EXTRAS := -g0 include Makefile.rules diff --git a/lldb/test/API/lang/objcxx/objc-from-cpp-frames-without-debuginfo/TestObjCFromCppFramesWithoutDebugInfo.py b/lldb/test/API/lang/objcxx/objc-from-cpp-frames-without-debuginfo/TestObjCFromCppFramesWithoutDebugInfo.py index 497c0dd128f48..ddc7498a72e8d 100644 --- a/lldb/test/API/lang/objcxx/objc-from-cpp-frames-without-debuginfo/TestObjCFromCppFramesWithoutDebugInfo.py +++ b/lldb/test/API/lang/objcxx/objc-from-cpp-frames-without-debuginfo/TestObjCFromCppFramesWithoutDebugInfo.py @@ -10,8 +10,11 @@ class TestObjCFromCppFramesWithoutDebugInfo(TestBase): + NO_DEBUG_INFO_TESTCASE = True + SHARED_BUILD_TESTCASE = False + def test(self): - self.build() + self.build(debug_info="none") (_, process, _, _) = lldbutil.run_to_name_breakpoint(self, "main") self.assertState(process.GetState(), 
lldb.eStateStopped) From 93bfabb65773d7eff460632f84e5de0ab44dba9c Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Wed, 13 May 2026 20:08:26 -0500 Subject: [PATCH 14/95] [Offload] Make 'llvm-offload-binary' use multi-binaries (#197456) Summary: There's two ways you can put multiple binaries in the section. Either use the version two multi-binary support or just concatenate them. This PR changes the llvm-offload-binary tool to use the multi-support rather than directly concatenating them. The motivation for this is to save space and make it easier to support compression in the future. Compression would be a flag in the header and the compression is only really valuable if it can combine the architecture variants. ELF section compression is a little spotty but would be another good solution. --- .../llvm-offload-binary/llvm-offload-binary.cpp | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/llvm/tools/llvm-offload-binary/llvm-offload-binary.cpp b/llvm/tools/llvm-offload-binary/llvm-offload-binary.cpp index 1b8ed02c8e6d0..1c429f2f85046 100644 --- a/llvm/tools/llvm-offload-binary/llvm-offload-binary.cpp +++ b/llvm/tools/llvm-offload-binary/llvm-offload-binary.cpp @@ -87,8 +87,7 @@ static Error writeFile(StringRef Filename, StringRef Data) { } static Error bundleImages() { - SmallVector BinaryData; - raw_svector_ostream OS(BinaryData); + SmallVector AllImages; for (StringRef Image : DeviceImages) { BumpPtrAllocator Alloc; StringSaver Saver(Alloc); @@ -123,16 +122,16 @@ static Error bundleImages() { ImageBinary.StringData[Key] = Value; } } - llvm::SmallString<0> Buffer = OffloadBinary::write(ImageBinary); - if (Buffer.size() % OffloadBinary::getAlignment() != 0) - return createStringError(inconvertibleErrorCode(), - "Offload binary has invalid size alignment"); - OS << Buffer; + AllImages.emplace_back(std::move(ImageBinary)); } } - if (Error E = writeFile(OutputFile, - StringRef(BinaryData.begin(), BinaryData.size()))) + SmallString<0> 
Buffer = OffloadBinary::write(AllImages); + if (Buffer.size() % OffloadBinary::getAlignment() != 0) + return createStringError(inconvertibleErrorCode(), + "Offload binary has invalid size alignment"); + + if (Error E = writeFile(OutputFile, StringRef(Buffer.data(), Buffer.size()))) return E; return Error::success(); } From 18e73eedf6966e69dfc4cfa43fb9842e960e8497 Mon Sep 17 00:00:00 2001 From: Jon Roelofs Date: Wed, 13 May 2026 18:09:14 -0700 Subject: [PATCH 15/95] [llvm] Add a tablegen !sort operator (#197303) This operator creates a new ``list`` containing the same elements as *list* but in sorted order. To determine the order, TableGen binds the variable *var* to each element and evaluates the *key* expression, which presumably refers to *var*. The key must produce a ``string`` or integer value (``bit``, ``bits``, or ``int``); all keys must be of the same type. Elements with equal keys preserve their original relative order, resulting in a stable sort. For example, to sort a list of records by their ``Name`` field:: ` list sorted = !sort(t, Things, t.Name);` --- llvm/docs/TableGen/ProgRef.rst | 15 +++++ llvm/include/llvm/TableGen/Record.h | 1 + llvm/lib/TableGen/Record.cpp | 63 +++++++++++++++++++- llvm/lib/TableGen/TGLexer.cpp | 1 + llvm/lib/TableGen/TGLexer.h | 1 + llvm/lib/TableGen/TGParser.cpp | 92 +++++++++++++++++++++-------- llvm/lib/TableGen/TGParser.h | 4 +- llvm/test/TableGen/sort.td | 71 ++++++++++++++++++++++ 8 files changed, 219 insertions(+), 29 deletions(-) create mode 100644 llvm/test/TableGen/sort.td diff --git a/llvm/docs/TableGen/ProgRef.rst b/llvm/docs/TableGen/ProgRef.rst index 1f42adaf6b6df..d417559ba31b3 100644 --- a/llvm/docs/TableGen/ProgRef.rst +++ b/llvm/docs/TableGen/ProgRef.rst @@ -2008,6 +2008,21 @@ and non-0 as true. This operator produces the size of the string, list, or dag *a*. The size of a DAG is the number of arguments; the operator does not count. 
+``!sort(``\ *var*\ ``,`` *list*\ ``,`` *key*\ ``)`` + This operator creates a new ``list`` containing the same elements as *list* + but in sorted order. To determine the order, TableGen binds the variable + *var* to each element and evaluates the *key* expression, which presumably + refers to *var*. The key must produce a ``string`` or integer value + (``bit``, ``bits``, or ``int``); all keys must be of the same type. Elements + with equal keys preserve their original relative order, resulting in a + stable sort. + + For example, to sort a list of records by their ``Name`` field: + + .. code-block:: text + + list sorted = !sort(t, Things, t.Name); + ``!sra(``\ *a*\ ``,`` *count*\ ``)`` This operator shifts *a* right arithmetically by *count* bits and produces the resulting value. The operation is performed on a 64-bit integer; the result diff --git a/llvm/include/llvm/TableGen/Record.h b/llvm/include/llvm/TableGen/Record.h index cb2721aba4f25..ce03995bc46c8 100644 --- a/llvm/include/llvm/TableGen/Record.h +++ b/llvm/include/llvm/TableGen/Record.h @@ -968,6 +968,7 @@ class TernOpInit final : public OpInit, public FoldingSetNode { FIND, SETDAGARG, SETDAGNAME, + SORT, }; private: diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp index 3395d2dd10a1b..ce6c63560ed1a 100644 --- a/llvm/lib/TableGen/Record.cpp +++ b/llvm/lib/TableGen/Record.cpp @@ -1788,6 +1788,57 @@ static const Init *FilterHelper(const Init *LHS, const Init *MHS, return nullptr; } +static const Init *SortHelper(const Init *LHS, const Init *MHS, const Init *RHS, + const RecTy *Type, const Record *CurRec) { + const auto *MHSl = dyn_cast(MHS); + if (!MHSl) + return nullptr; + + RecordKeeper &RK = LHS->getRecordKeeper(); + using KV = std::pair; + SmallVector KeyedList; + + for (const Init *Item : MHSl->getElements()) { + const Init *Key = ItemApply(LHS, Item, RHS, CurRec); + if (!Key) + return nullptr; + KeyedList.emplace_back(Key, Item); + } + + if (KeyedList.empty()) + return
ListInit::get({}, cast(Type)->getElementType()); + + // Determine key type from the first element; all keys must agree. + bool UseInt = + dyn_cast_or_null(KeyedList[0].first->convertInitializerTo( + IntRecTy::get(RK))) != nullptr; + for (auto &[Key, Item] : KeyedList) { + if (UseInt) { + if (!dyn_cast_or_null( + Key->convertInitializerTo(IntRecTy::get(RK)))) + return nullptr; + } else { + if (!isa(Key)) + return nullptr; + } + } + + llvm::stable_sort(KeyedList, [&RK, UseInt](const KV &A, const KV &B) { + if (UseInt) + return cast(A.first->convertInitializerTo(IntRecTy::get(RK))) + ->getValue() < + cast(B.first->convertInitializerTo(IntRecTy::get(RK))) + ->getValue(); + return cast(A.first)->getValue() < + cast(B.first)->getValue(); + }); + + SmallVector Result; + for (auto &[Key, Item] : KeyedList) + Result.push_back(Item); + return ListInit::get(Result, cast(Type)->getElementType()); +} + const Init *TernOpInit::Fold(const Record *CurRec) const { RecordKeeper &RK = getRecordKeeper(); switch (getOpcode()) { @@ -1845,6 +1896,12 @@ const Init *TernOpInit::Fold(const Record *CurRec) const { break; } + case SORT: { + if (const Init *Result = SortHelper(LHS, MHS, RHS, getType(), CurRec)) + return Result; + break; + } + case IF: { if (const auto *LHSi = dyn_cast_or_null( LHS->convertInitializerTo(IntRecTy::get(RK)))) { @@ -2004,7 +2061,7 @@ const Init *TernOpInit::resolveReferences(Resolver &R) const { const Init *mhs = MHS->resolveReferences(R); const Init *rhs; - if (getOpcode() == FOREACH || getOpcode() == FILTER) { + if (getOpcode() == FOREACH || getOpcode() == FILTER || getOpcode() == SORT) { ShadowResolver SR(R); SR.addShadow(lhs); rhs = RHS->resolveReferences(SR); @@ -2025,6 +2082,10 @@ std::string TernOpInit::getAsString() const { case DAG: Result = "!dag"; break; case FILTER: Result = "!filter"; UnquotedLHS = true; break; case FOREACH: Result = "!foreach"; UnquotedLHS = true; break; + case SORT: + Result = "!sort"; + UnquotedLHS = true; + break; case IF: Result 
= "!if"; break; case RANGE: Result = "!range"; diff --git a/llvm/lib/TableGen/TGLexer.cpp b/llvm/lib/TableGen/TGLexer.cpp index 3c88f107f790a..be642b30261c6 100644 --- a/llvm/lib/TableGen/TGLexer.cpp +++ b/llvm/lib/TableGen/TGLexer.cpp @@ -676,6 +676,7 @@ tgtok::TokKind TGLexer::LexExclaim() { .Case("listsplat", tgtok::XListSplat) .Case("listremove", tgtok::XListRemove) .Case("range", tgtok::XRange) + .Case("sort", tgtok::XSort) .Case("strconcat", tgtok::XStrConcat) .Case("initialized", tgtok::XInitialized) .Case("interleave", tgtok::XInterleave) diff --git a/llvm/lib/TableGen/TGLexer.h b/llvm/lib/TableGen/TGLexer.h index a0ade6412024e..4490ed55f37ef 100644 --- a/llvm/lib/TableGen/TGLexer.h +++ b/llvm/lib/TableGen/TGLexer.h @@ -156,6 +156,7 @@ enum TokKind { XToLower, XToUpper, XRange, + XSort, XGetDagArg, XGetDagName, XSetDagArg, diff --git a/llvm/lib/TableGen/TGParser.cpp b/llvm/lib/TableGen/TGParser.cpp index c44e067a9da9f..8d9890cea18e7 100644 --- a/llvm/lib/TableGen/TGParser.cpp +++ b/llvm/lib/TableGen/TGParser.cpp @@ -1941,8 +1941,9 @@ const Init *TGParser::ParseOperation(Record *CurRec, const RecTy *ItemType) { } case tgtok::XForEach: - case tgtok::XFilter: { - return ParseOperationForEachFilter(CurRec, ItemType); + case tgtok::XFilter: + case tgtok::XSort: { + return ParseOperationListComprehension(CurRec, ItemType); } case tgtok::XRange: { @@ -2571,12 +2572,13 @@ const Init *TGParser::ParseOperationFind(Record *CurRec, return (TernOpInit::get(Code, LHS, MHS, RHS, Type))->Fold(CurRec); } -/// Parse the !foreach and !filter operations. Return null on error. +/// Parse the !foreach, !filter, and !sort operations. Return null on error. 
/// /// ForEach ::= !foreach(ID, list-or-dag, expr) => list -/// Filter ::= !foreach(ID, list, predicate) ==> list -const Init *TGParser::ParseOperationForEachFilter(Record *CurRec, - const RecTy *ItemType) { +/// Filter ::= !filter(ID, list, predicate) ==> list +/// Sort ::= !sort(ID, list, key-expr) ==> list +const Init *TGParser::ParseOperationListComprehension(Record *CurRec, + const RecTy *ItemType) { SMLoc OpLoc = Lex.getLoc(); tgtok::TokKind Operation = Lex.getCode(); Lex.Lex(); // eat the operation @@ -2628,9 +2630,19 @@ const Init *TGParser::ParseOperationForEachFilter(Record *CurRec, InEltType = InListTy->getElementType(); if (ItemType) { if (const auto *OutListTy = dyn_cast(ItemType)) { - ExprEltType = (Operation == tgtok::XForEach) - ? OutListTy->getElementType() - : IntRecTy::get(Records); + switch (Operation) { + case tgtok::XForEach: + ExprEltType = OutListTy->getElementType(); + break; + case tgtok::XFilter: + ExprEltType = IntRecTy::get(Records); + break; + case tgtok::XSort: + ExprEltType = nullptr; + break; + default: + llvm_unreachable("unexpected token"); + } } else { Error(OpLoc, "expected value of type '" + Twine(ItemType->getAsString()) + @@ -2639,9 +2651,17 @@ const Init *TGParser::ParseOperationForEachFilter(Record *CurRec, } } } else if (const auto *InDagTy = dyn_cast(MHSt->getType())) { - if (Operation == tgtok::XFilter) { + switch (Operation) { + case tgtok::XFilter: TokError("!filter must have a list argument"); return nullptr; + case tgtok::XSort: + TokError("!sort must have a list argument"); + return nullptr; + case tgtok::XForEach: + break; + default: + llvm_unreachable("unexpected token"); } InEltType = InDagTy; if (ItemType && !isa(ItemType)) { @@ -2651,11 +2671,19 @@ const Init *TGParser::ParseOperationForEachFilter(Record *CurRec, } IsDAG = true; } else { - if (Operation == tgtok::XForEach) + switch (Operation) { + case tgtok::XForEach: TokError("!foreach must have a list or dag argument"); - else + return nullptr; + case 
tgtok::XFilter: TokError("!filter must have a list argument"); - return nullptr; + return nullptr; + case tgtok::XSort: + TokError("!sort must have a list argument"); + return nullptr; + default: + llvm_unreachable("unexpected token"); + } } // We need to create a temporary record to provide a scope for the @@ -2680,22 +2708,34 @@ const Init *TGParser::ParseOperationForEachFilter(Record *CurRec, return nullptr; } - const RecTy *OutType = InEltType; - if (Operation == tgtok::XForEach && !IsDAG) { - const auto *RHSt = dyn_cast(RHS); - if (!RHSt) { - TokError("could not get type of !foreach result expression"); - return nullptr; + const RecTy *OutType; + TernOpInit::TernaryOp Opc; + switch (Operation) { + case tgtok::XForEach: + Opc = TernOpInit::FOREACH; + if (IsDAG) { + OutType = InEltType; + } else { + const auto *RHSt = dyn_cast(RHS); + if (!RHSt) { + TokError("could not get type of !foreach result expression"); + return nullptr; + } + OutType = RHSt->getType()->getListTy(); } - OutType = RHSt->getType()->getListTy(); - } else if (Operation == tgtok::XFilter) { + break; + case tgtok::XFilter: + Opc = TernOpInit::FILTER; + OutType = InEltType->getListTy(); + break; + case tgtok::XSort: + Opc = TernOpInit::SORT; OutType = InEltType->getListTy(); + break; + default: + llvm_unreachable("unexpected token"); } - - return (TernOpInit::get((Operation == tgtok::XForEach) ? 
TernOpInit::FOREACH - : TernOpInit::FILTER, - LHS, MHS, RHS, OutType)) - ->Fold(CurRec); + return (TernOpInit::get(Opc, LHS, MHS, RHS, OutType))->Fold(CurRec); } const Init *TGParser::ParseOperationCond(Record *CurRec, diff --git a/llvm/lib/TableGen/TGParser.h b/llvm/lib/TableGen/TGParser.h index 9f0b89f080c9e..3e7cd2c48b56a 100644 --- a/llvm/lib/TableGen/TGParser.h +++ b/llvm/lib/TableGen/TGParser.h @@ -326,8 +326,8 @@ class TGParser { const Init *ParseOperation(Record *CurRec, const RecTy *ItemType); const Init *ParseOperationSubstr(Record *CurRec, const RecTy *ItemType); const Init *ParseOperationFind(Record *CurRec, const RecTy *ItemType); - const Init *ParseOperationForEachFilter(Record *CurRec, - const RecTy *ItemType); + const Init *ParseOperationListComprehension(Record *CurRec, + const RecTy *ItemType); const Init *ParseOperationCond(Record *CurRec, const RecTy *ItemType); const RecTy *ParseOperatorType(); const Init *ParseObjectName(MultiClass *CurMultiClass); diff --git a/llvm/test/TableGen/sort.td b/llvm/test/TableGen/sort.td new file mode 100644 index 0000000000000..29107a7e465a1 --- /dev/null +++ b/llvm/test/TableGen/sort.td @@ -0,0 +1,71 @@ +// RUN: llvm-tblgen %s | FileCheck %s +// RUN: not llvm-tblgen -DERROR_NONLIST %s 2>&1 | FileCheck --check-prefix=ERROR_NONLIST %s +// RUN: not llvm-tblgen -DERROR_KEYTYPE %s 2>&1 | FileCheck --check-prefix=ERROR_KEYTYPE %s + +// Sort an already-sorted list — should be a no-op. +// CHECK-LABEL: def idempotent +// CHECK: already = ["a", "b", "c"]; +def idempotent { + list already = !sort(item, ["a", "b", "c"], item); +} + +// Sort records by a string field or an int field. The sort is stable, so order +// is preserved on elements with equal key. 
+// CHECK-LABEL: def key_expr +// CHECK: by_name = [thing_a20, thing_a10, thing_b20, thing_c30, thing_d10]; +// CHECK: by_value = [thing_a10, thing_d10, thing_a20, thing_b20, thing_c30]; +class Thing { + string Name = N; + int Value = V; +} +def thing_a10 : Thing<"alpha", 10>; +def thing_a20 : Thing<"alpha", 20>; +def thing_b20 : Thing<"beta", 20>; +def thing_c30 : Thing<"charlie", 30>; +def thing_d10 : Thing<"delta", 10>; +defvar Things = [thing_c30, thing_a20, thing_a10, thing_b20, thing_d10]; +def key_expr { + list by_name = !sort(t, Things, t.Name); + list by_value = !sort(t, Things, t.Value); +} + +// Sort a single-element list. +// CHECK-LABEL: def single +// CHECK: one = ["only"]; +def single { + list one = !sort(item, ["only"], item); +} + +// CHECK-LABEL: def sorted +// CHECK: sorted_strings = ["axolotl", "barracuda", "cephalopod", "dragonfly"]; +// CHECK: sorted_ints = [1, 2, 3, 5, 8]; +// CHECK: sorted_empty_strings = []; +// CHECK: sorted_empty_ints = []; +defvar EmptyStrings = []; +defvar Creatures = ["cephalopod", "axolotl", "dragonfly", "barracuda"]; +defvar EmptyInts = []; +defvar Nums = [5, 2, 8, 1, 3]; +def sorted { + list sorted_strings = !sort(item, Creatures, item); + list sorted_ints = !sort(n, Nums, n); + list sorted_empty_strings = !sort(item, EmptyStrings, item); + list sorted_empty_ints = !sort(n, EmptyInts, n); +} + +#ifdef ERROR_NONLIST +// Dag is not a valid second argument. +def myop; +defvar mydag = (myop); +def err_nonlist { + // ERROR_NONLIST: sort.td:[[@LINE+1]]:38: error: !sort must have a list argument + list bad = !sort(x, mydag, !cast(x)); +} +#endif + +#ifdef ERROR_KEYTYPE +// Key that cannot be resolved to int or string leaves the op unfolded. 
+// ERROR_KEYTYPE: sort.td:[[@LINE+1]]:5: error: Initializer of 'bad' in 'err_keytype' could not be fully resolved +def err_keytype { + list bad = !sort(t, Things, t); +} +#endif From 940242e622bffef3b9829c8b72011333359629e3 Mon Sep 17 00:00:00 2001 From: Junji Watanabe Date: Thu, 14 May 2026 10:28:29 +0900 Subject: [PATCH 16/95] [Windows][test] Fix "LLVM" test failures when LLVM_WINDOWS_PREFER_FORWARD_SLASH is ON (#184556) This patch fixes several LLVM test failures on Windows that occur when the LLVM_WINDOWS_PREFER_FORWARD_SLASH CMake option is enabled. The failures were caused by tests either hardcoding backslash expectations in FileCheck or constructing paths with strict backslashes in C++ unit tests, both of which break when the environment is configured to prefer forward slashes. Specific changes: - `llvm-cov` lit tests: Changed the path separators with `-DSEP=%{fs-sep}`. - `llvm-objdump` lit test: Relaxed `source-interleave-prefix-windows.test` to accept either forward or backward slashes using the `{{[/\\]}}` regex. This makes the path matching resilient to the underlying separator preference without losing precision. - CommandLineTest.cpp: Conditionalized the TestRoot variable to use `C:/` instead of `C:\` based on the build configuration. - Path.cpp (makeLongFormPath test): - Updated the OneDir string literal to conditionally use `/` or `\`. - Updated the ContainsDotAndDotDot lambda to check for `.` and `..` components with the correct separator style based on the build configuration. 
--- ...age.linux.test => directory_coverage.test} | 105 +++++++++--------- .../llvm-cov/directory_coverage.win.test | 44 -------- llvm/test/tools/llvm-cov/native_separators.c | 17 +-- .../X86/source-interleave-prefix-windows.test | 12 +- llvm/unittests/Support/CommandLineTest.cpp | 4 +- llvm/unittests/Support/Path.cpp | 7 +- 6 files changed, 79 insertions(+), 110 deletions(-) rename llvm/test/tools/llvm-cov/{directory_coverage.linux.test => directory_coverage.test} (53%) delete mode 100644 llvm/test/tools/llvm-cov/directory_coverage.win.test diff --git a/llvm/test/tools/llvm-cov/directory_coverage.linux.test b/llvm/test/tools/llvm-cov/directory_coverage.test similarity index 53% rename from llvm/test/tools/llvm-cov/directory_coverage.linux.test rename to llvm/test/tools/llvm-cov/directory_coverage.test index 5db76c5ab833f..99dc0ccb82287 100644 --- a/llvm/test/tools/llvm-cov/directory_coverage.linux.test +++ b/llvm/test/tools/llvm-cov/directory_coverage.test @@ -1,53 +1,52 @@ -# REQUIRES: system-linux -# RUN: mkdir -p %t - -# RUN: llvm-cov show %S/Inputs/directory_coverage/main.covmapping \ -# RUN: --instr-profile %S/Inputs/directory_coverage/main.profdata \ -# RUN: --path-equivalence=/tmp/directory_coverage,%S/Inputs/directory_coverage \ -# RUN: --format=text --show-directory-coverage -o %t/report-text - -# RUN: llvm-cov show %S/Inputs/directory_coverage/main.covmapping \ -# RUN: --instr-profile %S/Inputs/directory_coverage/main.profdata \ -# RUN: --path-equivalence=/tmp/directory_coverage,%S/Inputs/directory_coverage \ -# RUN: --format=html --show-directory-coverage -o %t/report-html - -# RUN: FileCheck --input-file %t/report-text/index.txt %s --check-prefix=ROOT -# RUN: FileCheck --input-file %t/report-text/coverage/tmp/directory_coverage/index.txt %s --check-prefix=ROOT -# RUN: FileCheck --input-file %t/report-text/coverage/tmp/directory_coverage/b0/index.txt %s --check-prefix=B0 -# RUN: FileCheck --input-file 
%t/report-text/coverage/tmp/directory_coverage/c0/c1/index.txt %s --check-prefix=C1 - -# RUN: FileCheck --input-file %t/report-html/index.html %s --check-prefix=HTML-TOP --allow-empty -# RUN: FileCheck --input-file %t/report-html/coverage/tmp/directory_coverage/index.html %s --check-prefix=ROOT -# RUN: FileCheck --input-file %t/report-html/coverage/tmp/directory_coverage/b0/index.html %s --check-prefix=B0 -# RUN: FileCheck --input-file %t/report-html/coverage/tmp/directory_coverage/c0/c1/index.html %s --check-prefix=C1 - - - -# HTML-TOP: coverage/index.html - -# ROOT: a0/a1/a2.cc -# ROOT: b0/ -# ROOT-NOT: b1_1.cc -# ROOT-NOT: b1_2.cc -# ROOT: c0/c1/ -# ROOT-NOT: c2_1.cc -# ROOT-NOT: b2_2.cc -# ROOT: main.cc - -# B0: b1_1.cc -# B0: b1_2.cc - -# C1: c2.h -# C1: c2_1.cc - - -For regenerating the test: - -cp -r %S/Inputs/directory_coverage /tmp -cd /tmp/directory_coverage -clang -fprofile-instr-generate -fcoverage-mapping -mllvm -enable-name-compression=false \ - -o main main.cc a0/a1/a2.cc b0/b1_1.cc b0/b1_2.cc c0/c1/c2_1.cc c0/c1/c2_2.cc -./main -llvm-cov convert-for-testing main -o main.covmapping -llvm-profdata merge default.profraw -o main.profdata -rm main default.profraw +# RUN: mkdir -p %t + +# RUN: llvm-cov show %S/Inputs/directory_coverage/main.covmapping \ +# RUN: --instr-profile %S/Inputs/directory_coverage/main.profdata \ +# RUN: --path-equivalence=/tmp/directory_coverage,%S/Inputs/directory_coverage \ +# RUN: --format=text --show-directory-coverage -o %t/report-text + +# RUN: llvm-cov show %S/Inputs/directory_coverage/main.covmapping \ +# RUN: --instr-profile %S/Inputs/directory_coverage/main.profdata \ +# RUN: --path-equivalence=/tmp/directory_coverage,%S/Inputs/directory_coverage \ +# RUN: --format=html --show-directory-coverage -o %t/report-html + +# RUN: FileCheck --input-file %t/report-text/index.txt %s --check-prefix=ROOT -DSEP=%{fs-sep} +# RUN: FileCheck --input-file %t/report-text/coverage/tmp/directory_coverage/index.txt %s --check-prefix=ROOT 
-DSEP=%{fs-sep} +# RUN: FileCheck --input-file %t/report-text/coverage/tmp/directory_coverage/b0/index.txt %s --check-prefix=B0 -DSEP=%{fs-sep} +# RUN: FileCheck --input-file %t/report-text/coverage/tmp/directory_coverage/c0/c1/index.txt %s --check-prefix=C1 -DSEP=%{fs-sep} + +# RUN: FileCheck --input-file %t/report-html/index.html %s --check-prefix=HTML-TOP --allow-empty -DSEP=%{fs-sep} +# RUN: FileCheck --input-file %t/report-html/coverage/tmp/directory_coverage/index.html %s --check-prefix=ROOT -DSEP=%{fs-sep} +# RUN: FileCheck --input-file %t/report-html/coverage/tmp/directory_coverage/b0/index.html %s --check-prefix=B0 -DSEP=%{fs-sep} +# RUN: FileCheck --input-file %t/report-html/coverage/tmp/directory_coverage/c0/c1/index.html %s --check-prefix=C1 -DSEP=%{fs-sep} + + + +# HTML-TOP: coverage[[SEP]]index.html + +# ROOT: a0[[SEP]]a1[[SEP]]a2.cc +# ROOT: b0[[SEP]] +# ROOT-NOT: b1_1.cc +# ROOT-NOT: b1_2.cc +# ROOT: c0[[SEP]]c1[[SEP]] +# ROOT-NOT: c2_1.cc +# ROOT-NOT: b2_2.cc +# ROOT: main.cc + +# B0: b1_1.cc +# B0: b1_2.cc + +# C1: c2.h +# C1: c2_1.cc + + +# The input of this test is generated on Linux. 
+# For regenerating the test: +# cp -r %S/Inputs/directory_coverage /tmp +# cd /tmp/directory_coverage +# clang -fprofile-instr-generate -fcoverage-mapping -mllvm -enable-name-compression=false \ +# -o main main.cc a0/a1/a2.cc b0/b1_1.cc b0/b1_2.cc c0/c1/c2_1.cc c0/c1/c2_2.cc +# ./main +# llvm-cov convert-for-testing main -o main.covmapping +# llvm-profdata merge default.profraw -o main.profdata +# rm main default.profraw diff --git a/llvm/test/tools/llvm-cov/directory_coverage.win.test b/llvm/test/tools/llvm-cov/directory_coverage.win.test deleted file mode 100644 index f948bdcae3a58..0000000000000 --- a/llvm/test/tools/llvm-cov/directory_coverage.win.test +++ /dev/null @@ -1,44 +0,0 @@ -# REQUIRES: system-windows -# RUN: mkdir -p %t - -# RUN: llvm-cov show %S/Inputs/directory_coverage/main.covmapping \ -# RUN: --instr-profile %S/Inputs/directory_coverage/main.profdata \ -# RUN: --path-equivalence=/tmp/directory_coverage,%S/Inputs/directory_coverage \ -# RUN: --format=text --show-directory-coverage -o %t/report-text - -# RUN: llvm-cov show %S/Inputs/directory_coverage/main.covmapping \ -# RUN: --instr-profile %S/Inputs/directory_coverage/main.profdata \ -# RUN: --path-equivalence=/tmp/directory_coverage,%S/Inputs/directory_coverage \ -# RUN: --format=html --show-directory-coverage -o %t/report-html - -# RUN: FileCheck --input-file %t/report-text/index.txt %s --check-prefix=ROOT -# RUN: FileCheck --input-file %t/report-text/coverage/tmp/directory_coverage/index.txt %s --check-prefix=ROOT -# RUN: FileCheck --input-file %t/report-text/coverage/tmp/directory_coverage/b0/index.txt %s --check-prefix=B0 -# RUN: FileCheck --input-file %t/report-text/coverage/tmp/directory_coverage/c0/c1/index.txt %s --check-prefix=C1 - -# RUN: FileCheck --input-file %t/report-html/index.html %s --check-prefix=HTML-TOP --allow-empty -# RUN: FileCheck --input-file %t/report-html/coverage/tmp/directory_coverage/index.html %s --check-prefix=ROOT -# RUN: FileCheck --input-file 
%t/report-html/coverage/tmp/directory_coverage/b0/index.html %s --check-prefix=B0 -# RUN: FileCheck --input-file %t/report-html/coverage/tmp/directory_coverage/c0/c1/index.html %s --check-prefix=C1 - - - -# HTML-TOP: coverage\index.html - -# ROOT: a0\a1\a2.cc -# ROOT: b0\ -# ROOT-NOT: b1_1.cc -# ROOT-NOT: b1_2.cc -# ROOT: c0\c1\ -# ROOT-NOT: c2_1.cc -# ROOT-NOT: b2_2.cc -# ROOT: main.cc - -# B0: b1_1.cc -# B0: b1_2.cc - -# C1: c2.h -# C1: c2_1.cc - - -The input of this test is generated on Linux. See 'directory_coverage.linux.test'. diff --git a/llvm/test/tools/llvm-cov/native_separators.c b/llvm/test/tools/llvm-cov/native_separators.c index 3c768e1014b92..4fe305de4b13e 100644 --- a/llvm/test/tools/llvm-cov/native_separators.c +++ b/llvm/test/tools/llvm-cov/native_separators.c @@ -1,20 +1,21 @@ // To create the covmapping for this file on Linux, copy this file to /tmp // cd into /tmp. Use llvm-cov convert-for-testing to extract the covmapping. // This test is Windows-only. It checks that all paths, which are generated -// in the index and source coverage reports, are native path. For example, -// on Windows all '/' are converted to '\'. +// in the index and source coverage reports, are native paths. For example, +// on Windows all '/' are converted to the native separator, the direction +// of which is controlled by LLVM_WINDOWS_PREFER_FORWARD_SLASH. 
// REQUIRES: system-windows // RUN: llvm-profdata merge %S/Inputs/double_dots.proftext -o %t.profdata // RUN: llvm-cov show %S/Inputs/native_separators.covmapping -instr-profile=%t.profdata -o %t.dir -// RUN: FileCheck -check-prefixes=TEXT-INDEX -input-file=%t.dir/index.txt %s +// RUN: FileCheck -check-prefixes=TEXT-INDEX -input-file=%t.dir/index.txt -DSEP=%{fs-sep} %s // RUN: llvm-cov show -format=html %S/Inputs/native_separators.covmapping -instr-profile=%t.profdata -path-equivalence=/tmp,%S %S/../llvm-"config"/../llvm-"cov"/native_separators.c -o %t.dir -// RUN: FileCheck -check-prefixes=HTML-INDEX -input-file=%t.dir/index.html %s +// RUN: FileCheck -check-prefixes=HTML-INDEX -input-file=%t.dir/index.html -DSEP=%{fs-sep} %s // RUN: llvm-cov show -format=html %S/Inputs/native_separators.covmapping -instr-profile=%t.profdata -path-equivalence=/tmp,%S %s -o %t.dir -// RUN: FileCheck -check-prefixes=HTML -input-file=%t.dir/coverage/tmp/native_separators.c.html %s +// RUN: FileCheck -check-prefixes=HTML -input-file=%t.dir/coverage/tmp/native_separators.c.html -DSEP=%{fs-sep} %s -// TEXT-INDEX: \tmp\native_separators.c -// HTML-INDEX: >tmp\native_separators.c -// HTML:
\tmp\native_separators.c
+// TEXT-INDEX: [[SEP]]tmp[[SEP]]native_separators.c +// HTML-INDEX: >tmp[[SEP]]native_separators.c +// HTML:
[[SEP]]tmp[[SEP]]native_separators.c
int main() {} diff --git a/llvm/test/tools/llvm-objdump/X86/source-interleave-prefix-windows.test b/llvm/test/tools/llvm-objdump/X86/source-interleave-prefix-windows.test index 0f8952daec42e..f75ba94353b68 100644 --- a/llvm/test/tools/llvm-objdump/X86/source-interleave-prefix-windows.test +++ b/llvm/test/tools/llvm-objdump/X86/source-interleave-prefix-windows.test @@ -6,6 +6,14 @@ ; RUN: sed -e "s,SRC_COMPDIR,/Inputs,g" %p/Inputs/source-interleave.ll > %t.ll ; RUN: llc -o %t.o -filetype=obj -mtriple=x86_64-pc-linux %t.ll -; RUN: llvm-objdump --prefix 'myprefix/\' --source %t.o 2>&1 | FileCheck %s -DFILE=%t.o -DPREFIX='myprefix' -; CHECK: warning: '[[FILE]]': failed to find source [[PREFIX]]/Inputs\source-interleave-x86_64.c +; RUN: llvm-objdump --prefix 'myprefix/\' --source %t.o 2>&1 | FileCheck %s -DFILE=%t.o -DPREFIX='myprefix' -DSEP=%{fs-sep} + +;; When --prefix is specified and the file path is absolute, sys::path::append +;; is used to add the file path to the prefix. On Windows, if the file path +;; starts with a slash, sys::path::append currently does a straight +;; concatenation, resulting in the first slash being preserved as-is ('/'). +;; The second slash (after 'Inputs') is generated by llc using +;; sys::path::append to join the directory and filename, which uses the +;; preferred path separator (controlled by LLVM_WINDOWS_PREFER_FORWARD_SLASH). +; CHECK: warning: '[[FILE]]': failed to find source [[PREFIX]]/Inputs[[SEP]]source-interleave-x86_64.c diff --git a/llvm/unittests/Support/CommandLineTest.cpp b/llvm/unittests/Support/CommandLineTest.cpp index fca2d298c460e..f2effbdddfbbf 100644 --- a/llvm/unittests/Support/CommandLineTest.cpp +++ b/llvm/unittests/Support/CommandLineTest.cpp @@ -1015,7 +1015,7 @@ TEST(CommandLineTest, ResponseFiles) { TEST(CommandLineTest, RecursiveResponseFiles) { vfs::InMemoryFileSystem FS; #ifdef _WIN32 - const char *TestRoot = "C:\\"; + const char *TestRoot = LLVM_WINDOWS_PREFER_FORWARD_SLASH ? 
"C:/" : "C:\\"; #else const char *TestRoot = "/"; #endif @@ -1085,7 +1085,7 @@ TEST(CommandLineTest, RecursiveResponseFiles) { TEST(CommandLineTest, ResponseFilesAtArguments) { vfs::InMemoryFileSystem FS; #ifdef _WIN32 - const char *TestRoot = "C:\\"; + const char *TestRoot = LLVM_WINDOWS_PREFER_FORWARD_SLASH ? "C:/" : "C:\\"; #else const char *TestRoot = "/"; #endif diff --git a/llvm/unittests/Support/Path.cpp b/llvm/unittests/Support/Path.cpp index b196dc1d5452b..bb825a1b1e65c 100644 --- a/llvm/unittests/Support/Path.cpp +++ b/llvm/unittests/Support/Path.cpp @@ -11,6 +11,7 @@ #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallVector.h" #include "llvm/BinaryFormat/Magic.h" +#include "llvm/Config/config.h" #include "llvm/Config/llvm-config.h" // for LLVM_ON_UNIX #include "llvm/Support/Compiler.h" #include "llvm/Support/ConvertUTF.h" @@ -2792,7 +2793,9 @@ TEST_F(FileSystemTest, makeLongFormPath) { // Setup: A test directory longer than 8 characters for which a distinct // short 8.3 form name will be created on Windows. Typically, 123456~1. - constexpr const char *OneDir = "\\123456789"; // >8 chars + const char *OneDir = LLVM_WINDOWS_PREFER_FORWARD_SLASH + ? "/123456789" + : "\\123456789"; // >8 chars // Setup: Create a path where even if all components were reduced to short 8.3 // form names, the total length would exceed MAX_PATH. 
@@ -2825,6 +2828,8 @@ TEST_F(FileSystemTest, makeLongFormPath) { ASSERT_FALSE(DotAndDotDot.empty()) << "Expected short 8.3 form path for test directory."; auto ContainsDotAndDotDot = [](llvm::StringRef S) { + if (LLVM_WINDOWS_PREFER_FORWARD_SLASH) + return S.contains("/./") && S.contains("/../"); return S.contains("\\.\\") && S.contains("\\..\\"); }; ASSERT_TRUE(ContainsDotAndDotDot(DotAndDotDot)) From 33f7918d29343fb9dec2b5fa96ee79fd05d36df8 Mon Sep 17 00:00:00 2001 From: Kai Nacke Date: Wed, 13 May 2026 22:21:10 -0400 Subject: [PATCH 17/95] [PowerPC] Simplify lowering for lwat/ldat intrinsics (#194486) This change defines 4 new output patterns, `PAIR8`, `EVEN8`, `AEXT8`, and `TRUNC4`, and uses them to implement the lowering of the intrinsics `int_ppc_amo_l[dw]at` and `int_ppc_amo_l[dw]at_cond` in TableGen. As a result, the output pattern to generate the instructions becomes more understandable, and the C++ code can be removed. --- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 69 --------------------- llvm/lib/Target/PowerPC/PPCInstr64Bit.td | 16 ++--- llvm/lib/Target/PowerPC/PPCInstrInfo.td | 33 +++++++--- 3 files changed, 27 insertions(+), 91 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 68958a8cf32d5..d375a52884b80 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -14814,75 +14814,6 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, .addReg(Ptr); break; } - case PPC::LWAT_PSEUDO: - case PPC::LDAT_PSEUDO: { - DebugLoc DL = MI.getDebugLoc(); - Register DstReg = MI.getOperand(0).getReg(); - Register PtrReg = MI.getOperand(1).getReg(); - Register ValReg = MI.getOperand(2).getReg(); - unsigned FC = MI.getOperand(3).getImm(); - bool IsLwat = MI.getOpcode() == PPC::LWAT_PSEUDO; - Register Val64 = MRI.createVirtualRegister(&PPC::G8RCRegClass); - if (IsLwat) - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::SUBREG_TO_REG),
Val64) - .addReg(ValReg) - .addImm(PPC::sub_32); - else - Val64 = ValReg; - - Register G8rPair = MRI.createVirtualRegister(&PPC::G8pRCRegClass); - Register UndefG8r = MRI.createVirtualRegister(&PPC::G8RCRegClass); - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::IMPLICIT_DEF), UndefG8r); - BuildMI(*BB, MI, DL, TII->get(PPC::REG_SEQUENCE), G8rPair) - .addReg(UndefG8r) - .addImm(PPC::sub_gp8_x0) - .addReg(Val64) - .addImm(PPC::sub_gp8_x1); - - Register PairResult = MRI.createVirtualRegister(&PPC::G8pRCRegClass); - BuildMI(*BB, MI, DL, TII->get(IsLwat ? PPC::LWAT : PPC::LDAT), PairResult) - .addReg(G8rPair) - .addReg(PtrReg) - .addImm(FC); - Register Result64 = MRI.createVirtualRegister(&PPC::G8RCRegClass); - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), Result64) - .addReg(PairResult, {}, PPC::sub_gp8_x0); - if (IsLwat) - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), DstReg) - .addReg(Result64, {}, PPC::sub_32); - else - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), DstReg) - .addReg(Result64); - break; - } - case PPC::LWAT_COND_PSEUDO: - case PPC::LDAT_COND_PSEUDO: { - DebugLoc DL = MI.getDebugLoc(); - Register DstReg = MI.getOperand(0).getReg(); - Register PtrReg = MI.getOperand(1).getReg(); - unsigned FC = MI.getOperand(2).getImm(); - bool IsLwat_Cond = MI.getOpcode() == PPC::LWAT_COND_PSEUDO; - - Register Pair = MRI.createVirtualRegister(&PPC::G8pRCRegClass); - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Pair); - - Register PairResult = MRI.createVirtualRegister(&PPC::G8pRCRegClass); - BuildMI(*BB, MI, DL, TII->get(IsLwat_Cond ? 
PPC::LWAT : PPC::LDAT), - PairResult) - .addReg(Pair) - .addReg(PtrReg) - .addImm(FC); - Register Result64 = MRI.createVirtualRegister(&PPC::G8RCRegClass); - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), Result64) - .addReg(PairResult, {}, PPC::sub_gp8_x0); - if (IsLwat_Cond) - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), DstReg) - .addReg(Result64, {}, PPC::sub_32); - else - BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), DstReg) - .addReg(Result64); - break; - } default: llvm_unreachable("Unexpected instr type to insert"); } diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td index 3c130077f3988..294297645e166 100644 --- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td @@ -348,19 +348,11 @@ def LDAT_CSNE : X_RD5_RS5_IM5<31, 614, (outs g8rc:$RST), (ins ptr_rc_nor0:$RA), Requires<[IsISA3_0]>; } -def LDAT_PSEUDO : PPCCustomInserterPseudo< - (outs g8rc:$dst), - (ins ptr_rc_nor0:$ptr, g8rc:$val, u5imm:$fc), - "#LDAT_PSEUDO", - [(set i64:$dst, (int_ppc_amo_ldat ptr_rc_nor0:$ptr, g8rc:$val, - u5imm_timm:$fc))]>; +def : Pat<(int_ppc_amo_ldat ptr_rc_nor0:$ptr, g8rc:$val, u5imm_timm:$fc), + (EVEN8 (LDAT (PAIR8 (i64 (IMPLICIT_DEF)), $val), $ptr, $fc))>; -def LDAT_COND_PSEUDO : PPCCustomInserterPseudo < - (outs g8rc:$dst), - (ins ptr_rc_nor0:$ptr, u5imm:$fc), - "#LDAT_COND_PSEUDO", - [(set i64:$dst, (int_ppc_amo_ldat_cond ptr_rc_nor0:$ptr, - u5imm_timm:$fc))]>; +def : Pat<(int_ppc_amo_ldat_cond ptr_rc_nor0:$ptr, u5imm_timm:$fc), + (EVEN8 (LDAT (PAIR8 (i64 (IMPLICIT_DEF)), (i64 (IMPLICIT_DEF))), $ptr, $fc))>; let Defs = [X8, X9, X10], Uses = [X9, X10] in def LDAT_CSNE_PSEUDO : PPCPostRAExpPseudo< diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index eb4099b532336..926a184246dd5 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -962,6 +962,24 @@ class BinOpWithoutSImm16Operand : 
def add_without_simm16 : BinOpWithoutSImm16Operand; def mul_without_simm16 : BinOpWithoutSImm16Operand; +//===----------------------------------------------------------------------===// +// Output pattern fragments. +// + +// Create an even/odd register pair. +def PAIR8 : OutPatFrag<(ops node:$even, node:$odd), + (REG_SEQUENCE G8pRC, $even, sub_gp8_x0, $odd, sub_gp8_x1)>; + +// Return the even part of an even/odd register pair. +def EVEN8 : OutPatFrag<(ops node:$pair), (EXTRACT_SUBREG $pair, sub_gp8_x0)>; + +// Any-extend a 32-bit value in GPRC to a 64-bit value in G8RC. +def AEXT8 : OutPatFrag<(ops node:$r), + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $r, sub_32)>; + +// Truncate a 64-bit value in a G8RC value to 32-bit value in GPRC. +def TRUNC4 : OutPatFrag<(ops node:$r), (EXTRACT_SUBREG $r, sub_32)>; + //===----------------------------------------------------------------------===// // PowerPC Flag Definitions. @@ -2129,17 +2147,12 @@ def LWAT_CSNE : X_RD5_RS5_IM5<31, 582, (outs g8rc:$RST), (ins ptr_rc_nor0:$RA), "lwat $RST, $RA, 16", IIC_LdStLoad>, Requires<[IsISA3_0]>; -def LWAT_PSEUDO : PPCCustomInserterPseudo< - (outs gprc:$dst), - (ins ptr_rc_nor0:$ptr, gprc:$val, u5imm:$fc), - "#LWAT_PSEUDO", - [(set i32:$dst, (int_ppc_amo_lwat ptr_rc_nor0:$ptr, gprc:$val, u5imm_timm:$fc))]>; +def : Pat<(int_ppc_amo_lwat ptr_rc_nor0:$ptr, gprc:$val, u5imm_timm:$fc), + (TRUNC4 (LWAT (PAIR8 (i64 (IMPLICIT_DEF)), (AEXT8 $val)), $ptr, $fc))>; -def LWAT_COND_PSEUDO : PPCCustomInserterPseudo < - (outs gprc:$dst), - (ins ptr_rc_nor0:$ptr, u5imm:$fc), - "#LWAT_COND_PSEUDO", - [(set i32:$dst, (int_ppc_amo_lwat_cond ptr_rc_nor0:$ptr, u5imm_timm:$fc))]>; +def : Pat<(int_ppc_amo_lwat_cond ptr_rc_nor0:$ptr, u5imm_timm:$fc), + (TRUNC4 (LWAT (PAIR8 (i64 (IMPLICIT_DEF)), (i64 (IMPLICIT_DEF))), + $ptr, $fc))>; let Defs = [R8, R9, R10], Uses = [R9, R10] in def LWAT_CSNE_PSEUDO : PPCPostRAExpPseudo< From f996980ff486a0cc03b84f87bbb3554ea99be1e2 Mon Sep 17 00:00:00 2001 From: Akhil Goel Date: Wed, 
13 May 2026 19:25:38 -0700 Subject: [PATCH 18/95] [X86][AVX10.2] Add BF16 to (U/S)8 saturating FP to int lowering (#197096) This PR adds BF16 to I8 saturating FP to int convert custom lowering. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 23 +++++++++- .../CodeGen/X86/avx10_2_512fptosi_satcvtds.ll | 22 ++++++++++ .../CodeGen/X86/avx10_2fptosi_satcvtds.ll | 44 +++++++++++++++++++ 3 files changed, 88 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index aa5b864df5936..3ca4e85b671cd 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -321,6 +321,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } } if (Subtarget.hasAVX10_2()) { + for (MVT VT : {MVT::v8i8, MVT::v16i8, MVT::v32i8}) { + setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom); + setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom); + } setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v2i32, Custom); setOperationAction(ISD::FP_TO_SINT_SAT, MVT::v2i32, Custom); setOperationAction(ISD::FP_TO_UINT_SAT, MVT::v8i64, Legal); @@ -22785,13 +22789,21 @@ X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const { EVT SrcVT = Src.getValueType(); EVT DstVT = Node->getValueType(0); EVT TmpVT = DstVT; + EVT SatVT = cast(Node->getOperand(1))->getVT(); + + if (Subtarget.hasAVX10_2() && SrcVT.isVector() && + SrcVT.getVectorElementType() == MVT::bf16 && SatVT == MVT::i8) { + MVT VecI16VT = SrcVT.getSimpleVT().changeVectorElementType(MVT::i16); + SDValue Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2IBS : X86ISD::CVTTP2IUBS, + dl, VecI16VT, Src); + return DAG.getNode(ISD::TRUNCATE, dl, DstVT, Res); + } // This code is only for floats and doubles. Fall back to generic code for // anything else. 
if (!isScalarFPTypeInSSEReg(SrcVT) || isBF16orSoftF16(SrcVT, Subtarget)) return SDValue(); - EVT SatVT = cast(Node->getOperand(1))->getVT(); unsigned SatWidth = SatVT.getScalarSizeInBits(); unsigned DstWidth = DstVT.getScalarSizeInBits(); unsigned TmpWidth = TmpVT.getScalarSizeInBits(); @@ -35343,8 +35355,17 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, EVT VT = N->getValueType(0); SDValue Op = N->getOperand(0); EVT OpVT = Op.getValueType(); + EVT SatVT = cast(N->getOperand(1))->getVT(); SDValue Res; + if (VT == MVT::v8i8 && OpVT == MVT::v8bf16 && SatVT == MVT::i8) { + Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2IBS : X86ISD::CVTTP2IUBS, dl, + MVT::v8i16, Op); + Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); + Results.push_back(Res); + return; + } + if (VT == MVT::v2i32 && OpVT == MVT::v2f64) { if (IsSigned) Res = DAG.getNode(X86ISD::FP_TO_SINT_SAT, dl, MVT::v4i32, Op); diff --git a/llvm/test/CodeGen/X86/avx10_2_512fptosi_satcvtds.ll b/llvm/test/CodeGen/X86/avx10_2_512fptosi_satcvtds.ll index 827570e7311c7..6a3acf36bb1f2 100644 --- a/llvm/test/CodeGen/X86/avx10_2_512fptosi_satcvtds.ll +++ b/llvm/test/CodeGen/X86/avx10_2_512fptosi_satcvtds.ll @@ -217,3 +217,25 @@ define <8 x i64> @test_unsigned_v8i64_v8f32(<8 x float> %f) nounwind { %x = call <8 x i64> @llvm.fptoui.sat.v8i64.v8f32(<8 x float> %f) ret <8 x i64> %x } + +; VCVTTBF162IUBS +define <32 x i8> @test_unsigned_v32i8_v32bf16(<32 x bfloat> %f) nounwind { +; CHECK-LABEL: test_unsigned_v32i8_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttbf162iubs %zmm0, %zmm0 +; CHECK-NEXT: vpmovwb %zmm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <32 x i8> @llvm.fptoui.sat.v32i8.v32bf16(<32 x bfloat> %f) + ret <32 x i8> %x +} + +; VCVTTBF162IBS +define <32 x i8> @test_signed_v32i8_v32bf16(<32 x bfloat> %f) nounwind { +; CHECK-LABEL: test_signed_v32i8_v32bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttbf162ibs %zmm0, %zmm0 +; CHECK-NEXT: vpmovwb %zmm0, %ymm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <32 x i8> 
@llvm.fptosi.sat.v32i8.v32bf16(<32 x bfloat> %f) + ret <32 x i8> %x +} diff --git a/llvm/test/CodeGen/X86/avx10_2fptosi_satcvtds.ll b/llvm/test/CodeGen/X86/avx10_2fptosi_satcvtds.ll index 3d79457eb2a8a..1950de32cc975 100644 --- a/llvm/test/CodeGen/X86/avx10_2fptosi_satcvtds.ll +++ b/llvm/test/CodeGen/X86/avx10_2fptosi_satcvtds.ll @@ -674,3 +674,47 @@ define <4 x i64> @test_unsigned_v4i64_v4f32(<4 x float> %f) nounwind { %x = call <4 x i64> @llvm.fptoui.sat.v4i64.v4f32(<4 x float> %f) ret <4 x i64> %x } + +; VCVTTBF162IUBS +define <8 x i8> @test_unsigned_v8i8_v8bf16(<8 x bfloat> %f) nounwind { +; CHECK-LABEL: test_unsigned_v8i8_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttbf162iubs %xmm0, %xmm0 +; CHECK-NEXT: vpmovwb %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <8 x i8> @llvm.fptoui.sat.v8i8.v8bf16(<8 x bfloat> %f) + ret <8 x i8> %x +} + +define <16 x i8> @test_unsigned_v16i8_v16bf16(<16 x bfloat> %f) nounwind { +; CHECK-LABEL: test_unsigned_v16i8_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttbf162iubs %ymm0, %ymm0 +; CHECK-NEXT: vpmovwb %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: ret{{[l|q]}} + %x = call <16 x i8> @llvm.fptoui.sat.v16i8.v16bf16(<16 x bfloat> %f) + ret <16 x i8> %x +} + +; VCVTTBF162IBS +define <8 x i8> @test_signed_v8i8_v8bf16(<8 x bfloat> %f) nounwind { +; CHECK-LABEL: test_signed_v8i8_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttbf162ibs %xmm0, %xmm0 +; CHECK-NEXT: vpmovwb %xmm0, %xmm0 +; CHECK-NEXT: ret{{[l|q]}} + %x = call <8 x i8> @llvm.fptosi.sat.v8i8.v8bf16(<8 x bfloat> %f) + ret <8 x i8> %x +} + +define <16 x i8> @test_signed_v16i8_v16bf16(<16 x bfloat> %f) nounwind { +; CHECK-LABEL: test_signed_v16i8_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vcvttbf162ibs %ymm0, %ymm0 +; CHECK-NEXT: vpmovwb %ymm0, %xmm0 +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: ret{{[l|q]}} + %x = call <16 x i8> @llvm.fptosi.sat.v16i8.v16bf16(<16 x bfloat> %f) + ret <16 x i8> %x +} From 7c7a47e4be48c8bc2bebd792454f5a5b8f702bbc Mon Sep 17 00:00:00 
2001 From: Wenju He Date: Thu, 14 May 2026 10:57:22 +0800 Subject: [PATCH 19/95] [Clang][HLSL] Use EmitIntrinsicCall instead of EmitRuntimeCall for intrinsic (#197380) Fix HLSL builtin to SPIR-V intrinsic lowering: most intrinsics calls must use CallingConv::C. Relates to #197608 which tries to add CallingConv CHECK to IR Verifier. --- clang/lib/CodeGen/CGCall.cpp | 24 ++++ clang/lib/CodeGen/CGHLSLBuiltins.cpp | 116 +++++++----------- clang/lib/CodeGen/CodeGenFunction.h | 9 ++ .../builtins/AllMemoryBarrier.hlsl | 2 +- .../AllMemoryBarrierWithGroupSync.hlsl | 2 +- .../builtins/DeviceMemoryBarrier.hlsl | 2 +- .../DeviceMemoryBarrierWithGroupSync.hlsl | 2 +- .../builtins/GroupMemoryBarrier.hlsl | 2 +- .../GroupMemoryBarrierWithGroupSync.hlsl | 2 +- .../CodeGenHLSL/builtins/QuadReadAcrossX.hlsl | 88 ++++++------- .../CodeGenHLSL/builtins/QuadReadAcrossY.hlsl | 82 ++++++------- .../builtins/WaveActiveAllEqual.hlsl | 6 +- .../builtins/WaveActiveAllTrue.hlsl | 2 +- .../builtins/WaveActiveAnyTrue.hlsl | 2 +- .../builtins/WaveActiveBallot.hlsl | 2 +- .../builtins/WaveActiveBitAnd.hlsl | 22 ++-- .../CodeGenHLSL/builtins/WaveActiveBitOr.hlsl | 22 ++-- .../builtins/WaveActiveBitXor.hlsl | 22 ++-- .../CodeGenHLSL/builtins/WaveActiveMax.hlsl | 6 +- .../CodeGenHLSL/builtins/WaveActiveMin.hlsl | 6 +- .../builtins/WaveActiveProduct.hlsl | 6 +- .../CodeGenHLSL/builtins/WaveActiveSum.hlsl | 6 +- .../builtins/WavePrefixCountBits.hlsl | 2 +- .../builtins/WavePrefixProduct.hlsl | 6 +- .../CodeGenHLSL/builtins/WavePrefixSum.hlsl | 6 +- .../CodeGenHLSL/builtins/WaveReadLaneAt.hlsl | 18 +-- .../builtins/wave_get_lane_count.hlsl | 4 +- .../builtins/wave_is_first_lane.hlsl | 4 +- 28 files changed, 237 insertions(+), 236 deletions(-) diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index a2b9c945788ee..1b420049fffc1 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -5240,6 +5240,30 @@ llvm::CallInst 
*CodeGenFunction::EmitRuntimeCall(llvm::FunctionCallee callee, return call; } +llvm::CallInst *CodeGenFunction::EmitIntrinsicCall(llvm::Intrinsic::ID ID, + const llvm::Twine &Name) { + return EmitIntrinsicCall(ID, {}, {}, Name); +} + +llvm::CallInst *CodeGenFunction::EmitIntrinsicCall(llvm::Intrinsic::ID ID, + ArrayRef Args, + const llvm::Twine &Name) { + return EmitIntrinsicCall(ID, {}, Args, Name); +} + +llvm::CallInst *CodeGenFunction::EmitIntrinsicCall(llvm::Intrinsic::ID ID, + ArrayRef Types, + ArrayRef Args, + const llvm::Twine &Name) { + llvm::Function *F = + llvm::Intrinsic::getOrInsertDeclaration(&CGM.getModule(), ID, Types); + llvm::CallInst *Call = + Builder.CreateCall(F, Args, getBundlesForFunclet(F), Name); + if (CGM.shouldEmitConvergenceTokens() && Call->isConvergent()) + return cast(addConvergenceControlToken(Call)); + return Call; +} + /// Emits a call or invoke to the given noreturn runtime function. void CodeGenFunction::EmitNoreturnRuntimeCallOrInvoke( llvm::FunctionCallee callee, ArrayRef args) { diff --git a/clang/lib/CodeGen/CGHLSLBuiltins.cpp b/clang/lib/CodeGen/CGHLSLBuiltins.cpp index 82b03d7d5f069..a4cd28f97b6d6 100644 --- a/clang/lib/CodeGen/CGHLSLBuiltins.cpp +++ b/clang/lib/CodeGen/CGHLSLBuiltins.cpp @@ -172,9 +172,8 @@ static Value *handleHlslWaveActiveBallot(CodeGenFunction &CGF, if (CGF.CGM.getTarget().getTriple().isDXIL()) { // Call DXIL intrinsic: returns { i32, i32, i32, i32 } - llvm::Function *Fn = CGF.CGM.getIntrinsic(Intrinsic::dx_wave_ballot, {I32}); - - Value *StructVal = CGF.EmitRuntimeCall(Fn, Cond); + Value *StructVal = + CGF.EmitIntrinsicCall(Intrinsic::dx_wave_ballot, {I32}, {Cond}); assert(StructVal->getType() == Struct4I32 && "dx.wave.ballot must return {i32,i32,i32,i32}"); @@ -190,8 +189,7 @@ static Value *handleHlslWaveActiveBallot(CodeGenFunction &CGF, } if (CGF.CGM.getTarget().getTriple().isSPIRV()) - return CGF.EmitRuntimeCall( - CGF.CGM.getIntrinsic(Intrinsic::spv_subgroup_ballot), Cond); + return 
CGF.EmitIntrinsicCall(Intrinsic::spv_subgroup_ballot, {Cond}); llvm_unreachable( "WaveActiveBallot is only supported for DXIL and SPIRV targets"); @@ -1288,9 +1286,7 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, Intrinsic::ID IID = getPrefixCountBitsIntrinsic(getTarget().getTriple().getArch()); - return EmitRuntimeCall( - Intrinsic::getOrInsertDeclaration(&CGM.getModule(), IID), ArrayRef{Op}, - "hlsl.wave.prefix.bit.count"); + return EmitIntrinsicCall(IID, ArrayRef{Op}, "hlsl.wave.prefix.bit.count"); } case Builtin::BI__builtin_hlsl_select: { Value *OpCond = EmitScalarExpr(E->getArg(0)); @@ -1335,9 +1331,7 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, Value *Op = EmitScalarExpr(E->getArg(0)); Intrinsic::ID ID = CGM.getHLSLRuntime().getWaveActiveAllEqualIntrinsic(); - return EmitRuntimeCall(Intrinsic::getOrInsertDeclaration( - &CGM.getModule(), ID, {Op->getType()}), - {Op}); + return EmitIntrinsicCall(ID, {Op->getType()}, {Op}); } case Builtin::BI__builtin_hlsl_wave_active_all_true: { Value *Op = EmitScalarExpr(E->getArg(0)); @@ -1345,8 +1339,7 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, "Intrinsic WaveActiveAllTrue operand must be a bool"); Intrinsic::ID ID = CGM.getHLSLRuntime().getWaveActiveAllTrueIntrinsic(); - return EmitRuntimeCall( - Intrinsic::getOrInsertDeclaration(&CGM.getModule(), ID), {Op}); + return EmitIntrinsicCall(ID, {Op}); } case Builtin::BI__builtin_hlsl_wave_active_any_true: { Value *Op = EmitScalarExpr(E->getArg(0)); @@ -1354,8 +1347,7 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, "Intrinsic WaveActiveAnyTrue operand must be a bool"); Intrinsic::ID ID = CGM.getHLSLRuntime().getWaveActiveAnyTrueIntrinsic(); - return EmitRuntimeCall( - Intrinsic::getOrInsertDeclaration(&CGM.getModule(), ID), {Op}); + return EmitIntrinsicCall(ID, {Op}); } case Builtin::BI__builtin_hlsl_wave_active_bit_or: { Value *Op = EmitScalarExpr(E->getArg(0)); @@ -1364,9 +1356,8 @@ Value 
*CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, "representation"); Intrinsic::ID ID = CGM.getHLSLRuntime().getWaveActiveBitOrIntrinsic(); - return EmitRuntimeCall(Intrinsic::getOrInsertDeclaration( - &CGM.getModule(), ID, {Op->getType()}), - ArrayRef{Op}, "hlsl.wave.active.bit.or"); + return EmitIntrinsicCall(ID, {Op->getType()}, ArrayRef{Op}, + "hlsl.wave.active.bit.or"); } case Builtin::BI__builtin_hlsl_wave_active_bit_xor: { Value *Op = EmitScalarExpr(E->getArg(0)); @@ -1375,9 +1366,8 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, "representation"); Intrinsic::ID ID = CGM.getHLSLRuntime().getWaveActiveBitXorIntrinsic(); - return EmitRuntimeCall(Intrinsic::getOrInsertDeclaration( - &CGM.getModule(), ID, {Op->getType()}), - ArrayRef{Op}, "hlsl.wave.active.bit.xor"); + return EmitIntrinsicCall(ID, {Op->getType()}, ArrayRef{Op}, + "hlsl.wave.active.bit.xor"); } case Builtin::BI__builtin_hlsl_wave_active_bit_and: { Value *Op = EmitScalarExpr(E->getArg(0)); @@ -1386,9 +1376,8 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, "representation"); Intrinsic::ID ID = CGM.getHLSLRuntime().getWaveActiveBitAndIntrinsic(); - return EmitRuntimeCall(Intrinsic::getOrInsertDeclaration( - &CGM.getModule(), ID, {Op->getType()}), - ArrayRef{Op}, "hlsl.wave.active.bit.and"); + return EmitIntrinsicCall(ID, {Op->getType()}, ArrayRef{Op}, + "hlsl.wave.active.bit.and"); } case Builtin::BI__builtin_hlsl_wave_active_ballot: { [[maybe_unused]] Value *Op = EmitScalarExpr(E->getArg(0)); @@ -1400,9 +1389,7 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, case Builtin::BI__builtin_hlsl_wave_active_count_bits: { Value *OpExpr = EmitScalarExpr(E->getArg(0)); Intrinsic::ID ID = CGM.getHLSLRuntime().getWaveActiveCountBitsIntrinsic(); - return EmitRuntimeCall( - Intrinsic::getOrInsertDeclaration(&CGM.getModule(), ID), - ArrayRef{OpExpr}); + return EmitIntrinsicCall(ID, ArrayRef{OpExpr}); } case Builtin::BI__builtin_hlsl_wave_active_sum: 
{ // Due to the use of variadic arguments, explicitly retrieve argument @@ -1410,9 +1397,8 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, Intrinsic::ID IID = getWaveActiveSumIntrinsic( getTarget().getTriple().getArch(), E->getArg(0)->getType()); - return EmitRuntimeCall(Intrinsic::getOrInsertDeclaration( - &CGM.getModule(), IID, {OpExpr->getType()}), - ArrayRef{OpExpr}, "hlsl.wave.active.sum"); + return EmitIntrinsicCall(IID, {OpExpr->getType()}, ArrayRef{OpExpr}, + "hlsl.wave.active.sum"); } case Builtin::BI__builtin_hlsl_wave_active_product: { // Due to the use of variadic arguments, explicitly retrieve argument @@ -1420,9 +1406,8 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, Intrinsic::ID IID = getWaveActiveProductIntrinsic( getTarget().getTriple().getArch(), E->getArg(0)->getType()); - return EmitRuntimeCall(Intrinsic::getOrInsertDeclaration( - &CGM.getModule(), IID, {OpExpr->getType()}), - ArrayRef{OpExpr}, "hlsl.wave.active.product"); + return EmitIntrinsicCall(IID, {OpExpr->getType()}, ArrayRef{OpExpr}, + "hlsl.wave.active.product"); } case Builtin::BI__builtin_hlsl_wave_active_max: { // Due to the use of variadic arguments, explicitly retrieve argument @@ -1434,9 +1419,8 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, else IID = CGM.getHLSLRuntime().getWaveActiveMaxIntrinsic(); - return EmitRuntimeCall(Intrinsic::getOrInsertDeclaration( - &CGM.getModule(), IID, {OpExpr->getType()}), - ArrayRef{OpExpr}, "hlsl.wave.active.max"); + return EmitIntrinsicCall(IID, {OpExpr->getType()}, ArrayRef{OpExpr}, + "hlsl.wave.active.max"); } case Builtin::BI__builtin_hlsl_wave_active_min: { // Due to the use of variadic arguments, explicitly retrieve argument @@ -1448,9 +1432,8 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, else IID = CGM.getHLSLRuntime().getWaveActiveMinIntrinsic(); - return EmitRuntimeCall(Intrinsic::getOrInsertDeclaration( - &CGM.getModule(), IID, {OpExpr->getType()}), - 
ArrayRef{OpExpr}, "hlsl.wave.active.min"); + return EmitIntrinsicCall(IID, {OpExpr->getType()}, ArrayRef{OpExpr}, + "hlsl.wave.active.min"); } case Builtin::BI__builtin_hlsl_wave_get_lane_index: { // We don't define a SPIR-V intrinsic, instead it is a SPIR-V built-in @@ -1458,8 +1441,7 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, // for the DirectX intrinsic and the demangled builtin name switch (CGM.getTarget().getTriple().getArch()) { case llvm::Triple::dxil: - return EmitRuntimeCall(Intrinsic::getOrInsertDeclaration( - &CGM.getModule(), Intrinsic::dx_wave_getlaneindex)); + return EmitIntrinsicCall(Intrinsic::dx_wave_getlaneindex); case llvm::Triple::spirv: return EmitRuntimeCall(CGM.CreateRuntimeFunction( llvm::FunctionType::get(IntTy, {}, false), @@ -1471,54 +1453,46 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, } case Builtin::BI__builtin_hlsl_wave_is_first_lane: { Intrinsic::ID ID = CGM.getHLSLRuntime().getWaveIsFirstLaneIntrinsic(); - return EmitRuntimeCall( - Intrinsic::getOrInsertDeclaration(&CGM.getModule(), ID)); + return EmitIntrinsicCall(ID); } case Builtin::BI__builtin_hlsl_wave_get_lane_count: { Intrinsic::ID ID = CGM.getHLSLRuntime().getWaveGetLaneCountIntrinsic(); - return EmitRuntimeCall( - Intrinsic::getOrInsertDeclaration(&CGM.getModule(), ID)); + return EmitIntrinsicCall(ID); } case Builtin::BI__builtin_hlsl_wave_read_lane_at: { // Due to the use of variadic arguments we must explicitly retrieve them and // create our function type. 
Value *OpExpr = EmitScalarExpr(E->getArg(0)); Value *OpIndex = EmitScalarExpr(E->getArg(1)); - return EmitRuntimeCall( - Intrinsic::getOrInsertDeclaration( - &CGM.getModule(), CGM.getHLSLRuntime().getWaveReadLaneAtIntrinsic(), - {OpExpr->getType()}), - ArrayRef{OpExpr, OpIndex}, "hlsl.wave.readlane"); + return EmitIntrinsicCall(CGM.getHLSLRuntime().getWaveReadLaneAtIntrinsic(), + {OpExpr->getType()}, ArrayRef{OpExpr, OpIndex}, + "hlsl.wave.readlane"); } case Builtin::BI__builtin_hlsl_wave_prefix_sum: { Value *OpExpr = EmitScalarExpr(E->getArg(0)); Intrinsic::ID IID = getWavePrefixSumIntrinsic( getTarget().getTriple().getArch(), E->getArg(0)->getType()); - return EmitRuntimeCall(Intrinsic::getOrInsertDeclaration( - &CGM.getModule(), IID, {OpExpr->getType()}), - ArrayRef{OpExpr}, "hlsl.wave.prefix.sum"); + return EmitIntrinsicCall(IID, {OpExpr->getType()}, ArrayRef{OpExpr}, + "hlsl.wave.prefix.sum"); } case Builtin::BI__builtin_hlsl_wave_prefix_product: { Value *OpExpr = EmitScalarExpr(E->getArg(0)); Intrinsic::ID IID = getWavePrefixProductIntrinsic( getTarget().getTriple().getArch(), E->getArg(0)->getType()); - return EmitRuntimeCall(Intrinsic::getOrInsertDeclaration( - &CGM.getModule(), IID, {OpExpr->getType()}), - ArrayRef{OpExpr}, "hlsl.wave.prefix.product"); + return EmitIntrinsicCall(IID, {OpExpr->getType()}, ArrayRef{OpExpr}, + "hlsl.wave.prefix.product"); } case Builtin::BI__builtin_hlsl_quad_read_across_x: { Value *OpExpr = EmitScalarExpr(E->getArg(0)); Intrinsic::ID ID = CGM.getHLSLRuntime().getQuadReadAcrossXIntrinsic(); - return EmitRuntimeCall(Intrinsic::getOrInsertDeclaration( - &CGM.getModule(), ID, {OpExpr->getType()}), - ArrayRef{OpExpr}, "hlsl.quad.read.across.x"); + return EmitIntrinsicCall(ID, {OpExpr->getType()}, ArrayRef{OpExpr}, + "hlsl.quad.read.across.x"); } case Builtin::BI__builtin_hlsl_quad_read_across_y: { Value *OpExpr = EmitScalarExpr(E->getArg(0)); Intrinsic::ID ID = CGM.getHLSLRuntime().getQuadReadAcrossYIntrinsic(); - return 
EmitRuntimeCall(Intrinsic::getOrInsertDeclaration( - &CGM.getModule(), ID, {OpExpr->getType()}), - ArrayRef{OpExpr}, "hlsl.quad.read.across.y"); + return EmitIntrinsicCall(ID, {OpExpr->getType()}, ArrayRef{OpExpr}, + "hlsl.quad.read.across.y"); } case Builtin::BI__builtin_hlsl_elementwise_sign: { auto *Arg0 = E->getArg(0); @@ -1576,36 +1550,30 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, return handleHlslClip(E, this); case Builtin::BI__builtin_hlsl_all_memory_barrier: { Intrinsic::ID ID = CGM.getHLSLRuntime().getAllMemoryBarrierIntrinsic(); - return EmitRuntimeCall( - Intrinsic::getOrInsertDeclaration(&CGM.getModule(), ID)); + return EmitIntrinsicCall(ID); } case Builtin::BI__builtin_hlsl_all_memory_barrier_with_group_sync: { Intrinsic::ID ID = CGM.getHLSLRuntime().getAllMemoryBarrierWithGroupSyncIntrinsic(); - return EmitRuntimeCall( - Intrinsic::getOrInsertDeclaration(&CGM.getModule(), ID)); + return EmitIntrinsicCall(ID); } case Builtin::BI__builtin_hlsl_device_memory_barrier: { Intrinsic::ID ID = CGM.getHLSLRuntime().getDeviceMemoryBarrierIntrinsic(); - return EmitRuntimeCall( - Intrinsic::getOrInsertDeclaration(&CGM.getModule(), ID)); + return EmitIntrinsicCall(ID); } case Builtin::BI__builtin_hlsl_device_memory_barrier_with_group_sync: { Intrinsic::ID ID = CGM.getHLSLRuntime().getDeviceMemoryBarrierWithGroupSyncIntrinsic(); - return EmitRuntimeCall( - Intrinsic::getOrInsertDeclaration(&CGM.getModule(), ID)); + return EmitIntrinsicCall(ID); } case Builtin::BI__builtin_hlsl_group_memory_barrier: { Intrinsic::ID ID = CGM.getHLSLRuntime().getGroupMemoryBarrierIntrinsic(); - return EmitRuntimeCall( - Intrinsic::getOrInsertDeclaration(&CGM.getModule(), ID)); + return EmitIntrinsicCall(ID); } case Builtin::BI__builtin_hlsl_group_memory_barrier_with_group_sync: { Intrinsic::ID ID = CGM.getHLSLRuntime().getGroupMemoryBarrierWithGroupSyncIntrinsic(); - return EmitRuntimeCall( - Intrinsic::getOrInsertDeclaration(&CGM.getModule(), ID)); + return 
EmitIntrinsicCall(ID); } case Builtin::BI__builtin_hlsl_elementwise_ddx_coarse: { Value *Op0 = EmitScalarExpr(E->getArg(0)); diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index e0f8e62fb53af..77ca3e0fee84f 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -4610,6 +4610,15 @@ class CodeGenFunction : public CodeGenTypeCache { llvm::CallInst *EmitRuntimeCall(llvm::FunctionCallee callee, ArrayRef args, const Twine &name = ""); + llvm::CallInst *EmitIntrinsicCall(llvm::Intrinsic::ID ID, + const Twine &Name = ""); + llvm::CallInst *EmitIntrinsicCall(llvm::Intrinsic::ID ID, + ArrayRef Args, + const Twine &Name = ""); + llvm::CallInst *EmitIntrinsicCall(llvm::Intrinsic::ID ID, + ArrayRef Types, + ArrayRef Args, + const Twine &Name = ""); llvm::CallInst *EmitNounwindRuntimeCall(llvm::FunctionCallee callee, const Twine &name = ""); llvm::CallInst *EmitNounwindRuntimeCall(llvm::FunctionCallee callee, diff --git a/clang/test/CodeGenHLSL/builtins/AllMemoryBarrier.hlsl b/clang/test/CodeGenHLSL/builtins/AllMemoryBarrier.hlsl index 90d51c716c771..0fa798a16b805 100644 --- a/clang/test/CodeGenHLSL/builtins/AllMemoryBarrier.hlsl +++ b/clang/test/CodeGenHLSL/builtins/AllMemoryBarrier.hlsl @@ -11,7 +11,7 @@ // CHECK-SPIRV: define hidden spir_func void @ void test_AllMemoryBarrier() { // CHECK-DXIL: call void @llvm.[[TARGET]].all.memory.barrier() -// CHECK-SPIRV: call spir_func void @llvm.[[TARGET]].all.memory.barrier() +// CHECK-SPIRV: call void @llvm.[[TARGET]].all.memory.barrier() AllMemoryBarrier(); } diff --git a/clang/test/CodeGenHLSL/builtins/AllMemoryBarrierWithGroupSync.hlsl b/clang/test/CodeGenHLSL/builtins/AllMemoryBarrierWithGroupSync.hlsl index 6ddb69671e094..b4a3371f7628f 100644 --- a/clang/test/CodeGenHLSL/builtins/AllMemoryBarrierWithGroupSync.hlsl +++ b/clang/test/CodeGenHLSL/builtins/AllMemoryBarrierWithGroupSync.hlsl @@ -11,7 +11,7 @@ // CHECK-SPIRV: define hidden spir_func void @ 
void test_AllMemoryBarrierWithGroupSync() { // CHECK-DXIL: call void @llvm.[[TARGET]].all.memory.barrier.with.group.sync() -// CHECK-SPIRV: call spir_func void @llvm.[[TARGET]].all.memory.barrier.with.group.sync() +// CHECK-SPIRV: call void @llvm.[[TARGET]].all.memory.barrier.with.group.sync() AllMemoryBarrierWithGroupSync(); } diff --git a/clang/test/CodeGenHLSL/builtins/DeviceMemoryBarrier.hlsl b/clang/test/CodeGenHLSL/builtins/DeviceMemoryBarrier.hlsl index e2c08f7775c8c..d9613aedc1cc6 100644 --- a/clang/test/CodeGenHLSL/builtins/DeviceMemoryBarrier.hlsl +++ b/clang/test/CodeGenHLSL/builtins/DeviceMemoryBarrier.hlsl @@ -11,7 +11,7 @@ // CHECK-SPIRV: define hidden spir_func void @ void test_DeviceMemoryBarrier() { // CHECK-DXIL: call void @llvm.[[TARGET]].device.memory.barrier() -// CHECK-SPIRV: call spir_func void @llvm.[[TARGET]].device.memory.barrier() +// CHECK-SPIRV: call void @llvm.[[TARGET]].device.memory.barrier() DeviceMemoryBarrier(); } diff --git a/clang/test/CodeGenHLSL/builtins/DeviceMemoryBarrierWithGroupSync.hlsl b/clang/test/CodeGenHLSL/builtins/DeviceMemoryBarrierWithGroupSync.hlsl index fa455f5f8338b..bea7d7391aec2 100644 --- a/clang/test/CodeGenHLSL/builtins/DeviceMemoryBarrierWithGroupSync.hlsl +++ b/clang/test/CodeGenHLSL/builtins/DeviceMemoryBarrierWithGroupSync.hlsl @@ -11,7 +11,7 @@ // CHECK-SPIRV: define hidden spir_func void @ void test_DeviceMemoryBarrierWithGroupSync() { // CHECK-DXIL: call void @llvm.[[TARGET]].device.memory.barrier.with.group.sync() -// CHECK-SPIRV: call spir_func void @llvm.[[TARGET]].device.memory.barrier.with.group.sync() +// CHECK-SPIRV: call void @llvm.[[TARGET]].device.memory.barrier.with.group.sync() DeviceMemoryBarrierWithGroupSync(); } diff --git a/clang/test/CodeGenHLSL/builtins/GroupMemoryBarrier.hlsl b/clang/test/CodeGenHLSL/builtins/GroupMemoryBarrier.hlsl index b52819973f677..d33baeac940b6 100644 --- a/clang/test/CodeGenHLSL/builtins/GroupMemoryBarrier.hlsl +++ 
b/clang/test/CodeGenHLSL/builtins/GroupMemoryBarrier.hlsl @@ -11,7 +11,7 @@ // CHECK-SPIRV: define hidden spir_func void @ void test_GroupMemoryBarrier() { // CHECK-DXIL: call void @llvm.[[TARGET]].group.memory.barrier() -// CHECK-SPIRV: call spir_func void @llvm.[[TARGET]].group.memory.barrier() +// CHECK-SPIRV: call void @llvm.[[TARGET]].group.memory.barrier() GroupMemoryBarrier(); } diff --git a/clang/test/CodeGenHLSL/builtins/GroupMemoryBarrierWithGroupSync.hlsl b/clang/test/CodeGenHLSL/builtins/GroupMemoryBarrierWithGroupSync.hlsl index e709ed3616f0d..b69f67cb8dfaa 100644 --- a/clang/test/CodeGenHLSL/builtins/GroupMemoryBarrierWithGroupSync.hlsl +++ b/clang/test/CodeGenHLSL/builtins/GroupMemoryBarrierWithGroupSync.hlsl @@ -11,7 +11,7 @@ // CHECK-SPIRV: define hidden spir_func void @ void test_GroupMemoryBarrierWithGroupSync() { // CHECK-DXIL: call void @llvm.[[TARGET]].group.memory.barrier.with.group.sync() -// CHECK-SPIRV: call spir_func void @llvm.[[TARGET]].group.memory.barrier.with.group.sync() +// CHECK-SPIRV: call void @llvm.[[TARGET]].group.memory.barrier.with.group.sync() GroupMemoryBarrierWithGroupSync(); } diff --git a/clang/test/CodeGenHLSL/builtins/QuadReadAcrossX.hlsl b/clang/test/CodeGenHLSL/builtins/QuadReadAcrossX.hlsl index 54dd82b9fd485..f6bf05e524964 100644 --- a/clang/test/CodeGenHLSL/builtins/QuadReadAcrossX.hlsl +++ b/clang/test/CodeGenHLSL/builtins/QuadReadAcrossX.hlsl @@ -1,169 +1,169 @@ // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: dxil-pc-shadermodel6.3-compute %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,CHECK-NATIVE_HALF -DTARGET=dx -DCC="" +// RUN: --check-prefixes=CHECK,CHECK-NATIVE_HALF -DTARGET=dx // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: dxil-pc-shadermodel6.3-compute %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NO_HALF -DTARGET=dx 
-DCC="" +// RUN: -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NO_HALF -DTARGET=dx // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: spirv-unknown-vulkan-compute %s -fnative-half-type -fnative-int16-type \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,CHECK-NATIVE_HALF -DTARGET=spv -DCC="spir_func " +// RUN: --check-prefixes=CHECK,CHECK-NATIVE_HALF -DTARGET=spv // RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ // RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NO_HALF -DTARGET=spv -DCC="spir_func " +// RUN: -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NO_HALF -DTARGET=spv -// CHECK: %[[RET:.*]] = call [[CC]]i32 @llvm.[[TARGET]].quad.read.across.x.i32(i32 %[[#]]) +// CHECK: %[[RET:.*]] = call i32 @llvm.[[TARGET]].quad.read.across.x.i32(i32 %[[#]]) // CHECK: ret i32 %[[RET]] int test_int(int expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<2 x i32> @llvm.[[TARGET]].quad.read.across.x.v2i32(<2 x i32> %[[#]]) +// CHECK: %[[RET:.*]] = call <2 x i32> @llvm.[[TARGET]].quad.read.across.x.v2i32(<2 x i32> %[[#]]) // CHECK: ret <2 x i32> %[[RET]] int2 test_int2(int2 expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<3 x i32> @llvm.[[TARGET]].quad.read.across.x.v3i32(<3 x i32> %[[#]]) +// CHECK: %[[RET:.*]] = call <3 x i32> @llvm.[[TARGET]].quad.read.across.x.v3i32(<3 x i32> %[[#]]) // CHECK: ret <3 x i32> %[[RET]] int3 test_int3(int3 expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<4 x i32> @llvm.[[TARGET]].quad.read.across.x.v4i32(<4 x i32> %[[#]]) +// CHECK: %[[RET:.*]] = call <4 x i32> @llvm.[[TARGET]].quad.read.across.x.v4i32(<4 x i32> %[[#]]) // CHECK: ret <4 x i32> %[[RET]] int4 test_int4(int4 expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]i32 @llvm.[[TARGET]].quad.read.across.x.i32(i32 %[[#]]) 
+// CHECK: %[[RET:.*]] = call i32 @llvm.[[TARGET]].quad.read.across.x.i32(i32 %[[#]]) // CHECK: ret i32 %[[RET]] uint test_uint(uint expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<2 x i32> @llvm.[[TARGET]].quad.read.across.x.v2i32(<2 x i32> %[[#]]) +// CHECK: %[[RET:.*]] = call <2 x i32> @llvm.[[TARGET]].quad.read.across.x.v2i32(<2 x i32> %[[#]]) // CHECK: ret <2 x i32> %[[RET]] uint2 test_uint2(uint2 expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<3 x i32> @llvm.[[TARGET]].quad.read.across.x.v3i32(<3 x i32> %[[#]]) +// CHECK: %[[RET:.*]] = call <3 x i32> @llvm.[[TARGET]].quad.read.across.x.v3i32(<3 x i32> %[[#]]) // CHECK: ret <3 x i32> %[[RET]] uint3 test_uint3(uint3 expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<4 x i32> @llvm.[[TARGET]].quad.read.across.x.v4i32(<4 x i32> %[[#]]) +// CHECK: %[[RET:.*]] = call <4 x i32> @llvm.[[TARGET]].quad.read.across.x.v4i32(<4 x i32> %[[#]]) // CHECK: ret <4 x i32> %[[RET]] uint4 test_uint4(uint4 expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]i64 @llvm.[[TARGET]].quad.read.across.x.i64(i64 %[[#]]) +// CHECK: %[[RET:.*]] = call i64 @llvm.[[TARGET]].quad.read.across.x.i64(i64 %[[#]]) // CHECK: ret i64 %[[RET]] int64_t test_int64_t(int64_t expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<2 x i64> @llvm.[[TARGET]].quad.read.across.x.v2i64(<2 x i64> %[[#]]) +// CHECK: %[[RET:.*]] = call <2 x i64> @llvm.[[TARGET]].quad.read.across.x.v2i64(<2 x i64> %[[#]]) // CHECK: ret <2 x i64> %[[RET]] int64_t2 test_int64_t2(int64_t2 expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<3 x i64> @llvm.[[TARGET]].quad.read.across.x.v3i64(<3 x i64> %[[#]]) +// CHECK: %[[RET:.*]] = call <3 x i64> @llvm.[[TARGET]].quad.read.across.x.v3i64(<3 x i64> %[[#]]) // CHECK: ret <3 x i64> %[[RET]] int64_t3 test_int64_t3(int64_t3 expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = 
call [[CC]]<4 x i64> @llvm.[[TARGET]].quad.read.across.x.v4i64(<4 x i64> %[[#]]) +// CHECK: %[[RET:.*]] = call <4 x i64> @llvm.[[TARGET]].quad.read.across.x.v4i64(<4 x i64> %[[#]]) // CHECK: ret <4 x i64> %[[RET]] int64_t4 test_int64_t4(int64_t4 expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]i64 @llvm.[[TARGET]].quad.read.across.x.i64(i64 %[[#]]) +// CHECK: %[[RET:.*]] = call i64 @llvm.[[TARGET]].quad.read.across.x.i64(i64 %[[#]]) // CHECK: ret i64 %[[RET]] uint64_t test_uint64_t(uint64_t expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<2 x i64> @llvm.[[TARGET]].quad.read.across.x.v2i64(<2 x i64> %[[#]]) +// CHECK: %[[RET:.*]] = call <2 x i64> @llvm.[[TARGET]].quad.read.across.x.v2i64(<2 x i64> %[[#]]) // CHECK: ret <2 x i64> %[[RET]] uint64_t2 test_uint64_t2(uint64_t2 expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<3 x i64> @llvm.[[TARGET]].quad.read.across.x.v3i64(<3 x i64> %[[#]]) +// CHECK: %[[RET:.*]] = call <3 x i64> @llvm.[[TARGET]].quad.read.across.x.v3i64(<3 x i64> %[[#]]) // CHECK: ret <3 x i64> %[[RET]] uint64_t3 test_uint64_t3(uint64_t3 expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<4 x i64> @llvm.[[TARGET]].quad.read.across.x.v4i64(<4 x i64> %[[#]]) +// CHECK: %[[RET:.*]] = call <4 x i64> @llvm.[[TARGET]].quad.read.across.x.v4i64(<4 x i64> %[[#]]) // CHECK: ret <4 x i64> %[[RET]] uint64_t4 test_uint64_t4(uint64_t4 expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]float @llvm.[[TARGET]].quad.read.across.x.f32(float %[[#]]) +// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn float @llvm.[[TARGET]].quad.read.across.x.f32(float %[[#]]) // CHECK: ret float %[[RET]] float test_float(float expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<2 x float> @llvm.[[TARGET]].quad.read.across.x.v2f32(<2 x float> %[[#]]) +// CHECK: 
%[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.[[TARGET]].quad.read.across.x.v2f32(<2 x float> %[[#]]) // CHECK: ret <2 x float> %[[RET]] float2 test_float2(float2 expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<3 x float> @llvm.[[TARGET]].quad.read.across.x.v3f32(<3 x float> %[[#]]) +// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.[[TARGET]].quad.read.across.x.v3f32(<3 x float> %[[#]]) // CHECK: ret <3 x float> %[[RET]] float3 test_float3(float3 expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<4 x float> @llvm.[[TARGET]].quad.read.across.x.v4f32(<4 x float> %[[#]]) +// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.[[TARGET]].quad.read.across.x.v4f32(<4 x float> %[[#]]) // CHECK: ret <4 x float> %[[RET]] float4 test_float4(float4 expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]double @llvm.[[TARGET]].quad.read.across.x.f64(double %[[#]]) +// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn double @llvm.[[TARGET]].quad.read.across.x.f64(double %[[#]]) // CHECK: ret double %[[RET]] double test_double(double expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<2 x double> @llvm.[[TARGET]].quad.read.across.x.v2f64(<2 x double> %[[#]]) +// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <2 x double> @llvm.[[TARGET]].quad.read.across.x.v2f64(<2 x double> %[[#]]) // CHECK: ret <2 x double> %[[RET]] double2 test_double2(double2 expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<3 x double> @llvm.[[TARGET]].quad.read.across.x.v3f64(<3 x double> %[[#]]) +// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <3 x double> @llvm.[[TARGET]].quad.read.across.x.v3f64(<3 x double> %[[#]]) // 
CHECK: ret <3 x double> %[[RET]] double3 test_double3(double3 expr) { return QuadReadAcrossX(expr); } -// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<4 x double> @llvm.[[TARGET]].quad.read.across.x.v4f64(<4 x double> %[[#]]) +// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <4 x double> @llvm.[[TARGET]].quad.read.across.x.v4f64(<4 x double> %[[#]]) // CHECK: ret <4 x double> %[[RET]] double4 test_double4(double4 expr) { return QuadReadAcrossX(expr); } -// CHECK-NATIVE_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]half @llvm.[[TARGET]].quad.read.across.x.f16(half %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn half @llvm.[[TARGET]].quad.read.across.x.f16(half %[[#]]) // CHECK-NATIVE_HALF: ret half %[[RET]] -// CHECK-NO_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]float @llvm.[[TARGET]].quad.read.across.x.f32(float %[[#]]) +// CHECK-NO_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn float @llvm.[[TARGET]].quad.read.across.x.f32(float %[[#]]) // CHECK-NO_HALF: ret float %[[RET]] half test_half(half expr) { return QuadReadAcrossX(expr); } -// CHECK-NATIVE_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<2 x half> @llvm.[[TARGET]].quad.read.across.x.v2f16(<2 x half> %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.[[TARGET]].quad.read.across.x.v2f16(<2 x half> %[[#]]) // CHECK-NATIVE_HALF: ret <2 x half> %[[RET]] -// CHECK-NO_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<2 x float> @llvm.[[TARGET]].quad.read.across.x.v2f32(<2 x float> %[[#]]) +// CHECK-NO_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.[[TARGET]].quad.read.across.x.v2f32(<2 x float> %[[#]]) // CHECK-NO_HALF: ret <2 x float> %[[RET]] half2 test_half2(half2 expr) { return QuadReadAcrossX(expr); } -// CHECK-NATIVE_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<3 x half> 
@llvm.[[TARGET]].quad.read.across.x.v3f16(<3 x half> %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.[[TARGET]].quad.read.across.x.v3f16(<3 x half> %[[#]]) // CHECK-NATIVE_HALF: ret <3 x half> %[[RET]] -// CHECK-NO_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<3 x float> @llvm.[[TARGET]].quad.read.across.x.v3f32(<3 x float> %[[#]]) +// CHECK-NO_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.[[TARGET]].quad.read.across.x.v3f32(<3 x float> %[[#]]) // CHECK-NO_HALF: ret <3 x float> %[[RET]] half3 test_half3(half3 expr) { return QuadReadAcrossX(expr); } -// CHECK-NATIVE_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<4 x half> @llvm.[[TARGET]].quad.read.across.x.v4f16(<4 x half> %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.[[TARGET]].quad.read.across.x.v4f16(<4 x half> %[[#]]) // CHECK-NATIVE_HALF: ret <4 x half> %[[RET]] -// CHECK-NO_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<4 x float> @llvm.[[TARGET]].quad.read.across.x.v4f32(<4 x float> %[[#]]) +// CHECK-NO_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.[[TARGET]].quad.read.across.x.v4f32(<4 x float> %[[#]]) // CHECK-NO_HALF: ret <4 x float> %[[RET]] half4 test_half4(half4 expr) { return QuadReadAcrossX(expr); } #ifdef __HLSL_ENABLE_16_BIT -// CHECK-NATIVE_HALF: %[[RET:.*]] = call [[CC]]i16 @llvm.[[TARGET]].quad.read.across.x.i16(i16 %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call i16 @llvm.[[TARGET]].quad.read.across.x.i16(i16 %[[#]]) // CHECK-NATIVE_HALF: ret i16 %[[RET]] int16_t test_int16_t(int16_t expr) { return QuadReadAcrossX(expr); } -// CHECK-NATIVE_HALF: %[[RET:.*]] = call [[CC]]<2 x i16> @llvm.[[TARGET]].quad.read.across.x.v2i16(<2 x i16> %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call <2 x i16> @llvm.[[TARGET]].quad.read.across.x.v2i16(<2 x i16> %[[#]]) // CHECK-NATIVE_HALF: ret <2 x i16> 
%[[RET]] int16_t2 test_int16_t2(int16_t2 expr) { return QuadReadAcrossX(expr); } -// CHECK-NATIVE_HALF: %[[RET:.*]] = call [[CC]]<3 x i16> @llvm.[[TARGET]].quad.read.across.x.v3i16(<3 x i16> %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call <3 x i16> @llvm.[[TARGET]].quad.read.across.x.v3i16(<3 x i16> %[[#]]) // CHECK-NATIVE_HALF: ret <3 x i16> %[[RET]] int16_t3 test_int16_t3(int16_t3 expr) { return QuadReadAcrossX(expr); } -// CHECK-NATIVE_HALF: %[[RET:.*]] = call [[CC]]<4 x i16> @llvm.[[TARGET]].quad.read.across.x.v4i16(<4 x i16> %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call <4 x i16> @llvm.[[TARGET]].quad.read.across.x.v4i16(<4 x i16> %[[#]]) // CHECK-NATIVE_HALF: ret <4 x i16> %[[RET]] int16_t4 test_int16_t4(int16_t4 expr) { return QuadReadAcrossX(expr); } -// CHECK-NATIVE_HALF: %[[RET:.*]] = call [[CC]]i16 @llvm.[[TARGET]].quad.read.across.x.i16(i16 %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call i16 @llvm.[[TARGET]].quad.read.across.x.i16(i16 %[[#]]) // CHECK-NATIVE_HALF: ret i16 %[[RET]] uint16_t test_uint16_t(uint16_t expr) { return QuadReadAcrossX(expr); } -// CHECK-NATIVE_HALF: %[[RET:.*]] = call [[CC]]<2 x i16> @llvm.[[TARGET]].quad.read.across.x.v2i16(<2 x i16> %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call <2 x i16> @llvm.[[TARGET]].quad.read.across.x.v2i16(<2 x i16> %[[#]]) // CHECK-NATIVE_HALF: ret <2 x i16> %[[RET]] uint16_t2 test_uint16_t2(uint16_t2 expr) { return QuadReadAcrossX(expr); } -// CHECK-NATIVE_HALF: %[[RET:.*]] = call [[CC]]<3 x i16> @llvm.[[TARGET]].quad.read.across.x.v3i16(<3 x i16> %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call <3 x i16> @llvm.[[TARGET]].quad.read.across.x.v3i16(<3 x i16> %[[#]]) // CHECK-NATIVE_HALF: ret <3 x i16> %[[RET]] uint16_t3 test_uint16_t3(uint16_t3 expr) { return QuadReadAcrossX(expr); } -// CHECK-NATIVE_HALF: %[[RET:.*]] = call [[CC]]<4 x i16> @llvm.[[TARGET]].quad.read.across.x.v4i16(<4 x i16> %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call <4 x i16> 
@llvm.[[TARGET]].quad.read.across.x.v4i16(<4 x i16> %[[#]]) // CHECK-NATIVE_HALF: ret <4 x i16> %[[RET]] uint16_t4 test_uint16_t4(uint16_t4 expr) { return QuadReadAcrossX(expr); } #endif diff --git a/clang/test/CodeGenHLSL/builtins/QuadReadAcrossY.hlsl b/clang/test/CodeGenHLSL/builtins/QuadReadAcrossY.hlsl index 313c287dc1a7d..9d70545f90a28 100644 --- a/clang/test/CodeGenHLSL/builtins/QuadReadAcrossY.hlsl +++ b/clang/test/CodeGenHLSL/builtins/QuadReadAcrossY.hlsl @@ -15,157 +15,157 @@ // RUN: -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV,CHECK-NO_HALF // Capture the expected interchange format so not every check needs to be duplicated -// CHECK-DXIL: %[[RET:.*]] = call [[CC:]]i32 @llvm.[[ICF:dx]].quad.read.across.y.i32(i32 %[[#]]) -// CHECK-SPIRV: %[[RET:.*]] = call [[CC:spir_func ]]i32 @llvm.[[ICF:spv]].quad.read.across.y.i32(i32 %[[#]]) +// CHECK-DXIL: %[[RET:.*]] = call i32 @llvm.[[ICF:dx]].quad.read.across.y.i32(i32 %[[#]]) +// CHECK-SPIRV: %[[RET:.*]] = call i32 @llvm.[[ICF:spv]].quad.read.across.y.i32(i32 %[[#]]) // CHECK: ret i32 %[[RET]] int test_int(int expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<2 x i32> @llvm.[[ICF]].quad.read.across.y.v2i32(<2 x i32> %[[#]]) +// CHECK: %[[RET:.*]] = call <2 x i32> @llvm.[[ICF]].quad.read.across.y.v2i32(<2 x i32> %[[#]]) // CHECK: ret <2 x i32> %[[RET]] int2 test_int2(int2 expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<3 x i32> @llvm.[[ICF]].quad.read.across.y.v3i32(<3 x i32> %[[#]]) +// CHECK: %[[RET:.*]] = call <3 x i32> @llvm.[[ICF]].quad.read.across.y.v3i32(<3 x i32> %[[#]]) // CHECK: ret <3 x i32> %[[RET]] int3 test_int3(int3 expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<4 x i32> @llvm.[[ICF]].quad.read.across.y.v4i32(<4 x i32> %[[#]]) +// CHECK: %[[RET:.*]] = call <4 x i32> @llvm.[[ICF]].quad.read.across.y.v4i32(<4 x i32> %[[#]]) // CHECK: ret <4 x i32> %[[RET]] int4 test_int4(int4 expr) { return 
QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]i32 @llvm.[[ICF]].quad.read.across.y.i32(i32 %[[#]]) +// CHECK: %[[RET:.*]] = call i32 @llvm.[[ICF]].quad.read.across.y.i32(i32 %[[#]]) // CHECK: ret i32 %[[RET]] uint test_uint(uint expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<2 x i32> @llvm.[[ICF]].quad.read.across.y.v2i32(<2 x i32> %[[#]]) +// CHECK: %[[RET:.*]] = call <2 x i32> @llvm.[[ICF]].quad.read.across.y.v2i32(<2 x i32> %[[#]]) // CHECK: ret <2 x i32> %[[RET]] uint2 test_uint2(uint2 expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<3 x i32> @llvm.[[ICF]].quad.read.across.y.v3i32(<3 x i32> %[[#]]) +// CHECK: %[[RET:.*]] = call <3 x i32> @llvm.[[ICF]].quad.read.across.y.v3i32(<3 x i32> %[[#]]) // CHECK: ret <3 x i32> %[[RET]] uint3 test_uint3(uint3 expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<4 x i32> @llvm.[[ICF]].quad.read.across.y.v4i32(<4 x i32> %[[#]]) +// CHECK: %[[RET:.*]] = call <4 x i32> @llvm.[[ICF]].quad.read.across.y.v4i32(<4 x i32> %[[#]]) // CHECK: ret <4 x i32> %[[RET]] uint4 test_uint4(uint4 expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]i64 @llvm.[[ICF]].quad.read.across.y.i64(i64 %[[#]]) +// CHECK: %[[RET:.*]] = call i64 @llvm.[[ICF]].quad.read.across.y.i64(i64 %[[#]]) // CHECK: ret i64 %[[RET]] int64_t test_int64_t(int64_t expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<2 x i64> @llvm.[[ICF]].quad.read.across.y.v2i64(<2 x i64> %[[#]]) +// CHECK: %[[RET:.*]] = call <2 x i64> @llvm.[[ICF]].quad.read.across.y.v2i64(<2 x i64> %[[#]]) // CHECK: ret <2 x i64> %[[RET]] int64_t2 test_int64_t2(int64_t2 expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<3 x i64> @llvm.[[ICF]].quad.read.across.y.v3i64(<3 x i64> %[[#]]) +// CHECK: %[[RET:.*]] = call <3 x i64> @llvm.[[ICF]].quad.read.across.y.v3i64(<3 x i64> %[[#]]) // CHECK: ret <3 x i64> %[[RET]] int64_t3 
test_int64_t3(int64_t3 expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<4 x i64> @llvm.[[ICF]].quad.read.across.y.v4i64(<4 x i64> %[[#]]) +// CHECK: %[[RET:.*]] = call <4 x i64> @llvm.[[ICF]].quad.read.across.y.v4i64(<4 x i64> %[[#]]) // CHECK: ret <4 x i64> %[[RET]] int64_t4 test_int64_t4(int64_t4 expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]i64 @llvm.[[ICF]].quad.read.across.y.i64(i64 %[[#]]) +// CHECK: %[[RET:.*]] = call i64 @llvm.[[ICF]].quad.read.across.y.i64(i64 %[[#]]) // CHECK: ret i64 %[[RET]] uint64_t test_uint64_t(uint64_t expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<2 x i64> @llvm.[[ICF]].quad.read.across.y.v2i64(<2 x i64> %[[#]]) +// CHECK: %[[RET:.*]] = call <2 x i64> @llvm.[[ICF]].quad.read.across.y.v2i64(<2 x i64> %[[#]]) // CHECK: ret <2 x i64> %[[RET]] uint64_t2 test_uint64_t2(uint64_t2 expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<3 x i64> @llvm.[[ICF]].quad.read.across.y.v3i64(<3 x i64> %[[#]]) +// CHECK: %[[RET:.*]] = call <3 x i64> @llvm.[[ICF]].quad.read.across.y.v3i64(<3 x i64> %[[#]]) // CHECK: ret <3 x i64> %[[RET]] uint64_t3 test_uint64_t3(uint64_t3 expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call [[CC]]<4 x i64> @llvm.[[ICF]].quad.read.across.y.v4i64(<4 x i64> %[[#]]) +// CHECK: %[[RET:.*]] = call <4 x i64> @llvm.[[ICF]].quad.read.across.y.v4i64(<4 x i64> %[[#]]) // CHECK: ret <4 x i64> %[[RET]] uint64_t4 test_uint64_t4(uint64_t4 expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]float @llvm.[[ICF]].quad.read.across.y.f32(float %[[#]]) +// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn float @llvm.[[ICF]].quad.read.across.y.f32(float %[[#]]) // CHECK: ret float %[[RET]] float test_float(float expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<2 x float> 
@llvm.[[ICF]].quad.read.across.y.v2f32(<2 x float> %[[#]]) +// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.[[ICF]].quad.read.across.y.v2f32(<2 x float> %[[#]]) // CHECK: ret <2 x float> %[[RET]] float2 test_float2(float2 expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<3 x float> @llvm.[[ICF]].quad.read.across.y.v3f32(<3 x float> %[[#]]) +// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.[[ICF]].quad.read.across.y.v3f32(<3 x float> %[[#]]) // CHECK: ret <3 x float> %[[RET]] float3 test_float3(float3 expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<4 x float> @llvm.[[ICF]].quad.read.across.y.v4f32(<4 x float> %[[#]]) +// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.[[ICF]].quad.read.across.y.v4f32(<4 x float> %[[#]]) // CHECK: ret <4 x float> %[[RET]] float4 test_float4(float4 expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]double @llvm.[[ICF]].quad.read.across.y.f64(double %[[#]]) +// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn double @llvm.[[ICF]].quad.read.across.y.f64(double %[[#]]) // CHECK: ret double %[[RET]] double test_double(double expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<2 x double> @llvm.[[ICF]].quad.read.across.y.v2f64(<2 x double> %[[#]]) +// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <2 x double> @llvm.[[ICF]].quad.read.across.y.v2f64(<2 x double> %[[#]]) // CHECK: ret <2 x double> %[[RET]] double2 test_double2(double2 expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<3 x double> @llvm.[[ICF]].quad.read.across.y.v3f64(<3 x double> %[[#]]) +// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <3 x double> 
@llvm.[[ICF]].quad.read.across.y.v3f64(<3 x double> %[[#]]) // CHECK: ret <3 x double> %[[RET]] double3 test_double3(double3 expr) { return QuadReadAcrossY(expr); } -// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<4 x double> @llvm.[[ICF]].quad.read.across.y.v4f64(<4 x double> %[[#]]) +// CHECK: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <4 x double> @llvm.[[ICF]].quad.read.across.y.v4f64(<4 x double> %[[#]]) // CHECK: ret <4 x double> %[[RET]] double4 test_double4(double4 expr) { return QuadReadAcrossY(expr); } -// CHECK-NATIVE_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]half @llvm.[[ICF]].quad.read.across.y.f16(half %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn half @llvm.[[ICF]].quad.read.across.y.f16(half %[[#]]) // CHECK-NATIVE_HALF: ret half %[[RET]] -// CHECK-NO_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]float @llvm.[[ICF]].quad.read.across.y.f32(float %[[#]]) +// CHECK-NO_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn float @llvm.[[ICF]].quad.read.across.y.f32(float %[[#]]) // CHECK-NO_HALF: ret float %[[RET]] half test_half(half expr) { return QuadReadAcrossY(expr); } -// CHECK-NATIVE_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<2 x half> @llvm.[[ICF]].quad.read.across.y.v2f16(<2 x half> %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <2 x half> @llvm.[[ICF]].quad.read.across.y.v2f16(<2 x half> %[[#]]) // CHECK-NATIVE_HALF: ret <2 x half> %[[RET]] -// CHECK-NO_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<2 x float> @llvm.[[ICF]].quad.read.across.y.v2f32(<2 x float> %[[#]]) +// CHECK-NO_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <2 x float> @llvm.[[ICF]].quad.read.across.y.v2f32(<2 x float> %[[#]]) // CHECK-NO_HALF: ret <2 x float> %[[RET]] half2 test_half2(half2 expr) { return QuadReadAcrossY(expr); } -// CHECK-NATIVE_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp 
afn [[CC]]<3 x half> @llvm.[[ICF]].quad.read.across.y.v3f16(<3 x half> %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <3 x half> @llvm.[[ICF]].quad.read.across.y.v3f16(<3 x half> %[[#]]) // CHECK-NATIVE_HALF: ret <3 x half> %[[RET]] -// CHECK-NO_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<3 x float> @llvm.[[ICF]].quad.read.across.y.v3f32(<3 x float> %[[#]]) +// CHECK-NO_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <3 x float> @llvm.[[ICF]].quad.read.across.y.v3f32(<3 x float> %[[#]]) // CHECK-NO_HALF: ret <3 x float> %[[RET]] half3 test_half3(half3 expr) { return QuadReadAcrossY(expr); } -// CHECK-NATIVE_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<4 x half> @llvm.[[ICF]].quad.read.across.y.v4f16(<4 x half> %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <4 x half> @llvm.[[ICF]].quad.read.across.y.v4f16(<4 x half> %[[#]]) // CHECK-NATIVE_HALF: ret <4 x half> %[[RET]] -// CHECK-NO_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[CC]]<4 x float> @llvm.[[ICF]].quad.read.across.y.v4f32(<4 x float> %[[#]]) +// CHECK-NO_HALF: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn <4 x float> @llvm.[[ICF]].quad.read.across.y.v4f32(<4 x float> %[[#]]) // CHECK-NO_HALF: ret <4 x float> %[[RET]] half4 test_half4(half4 expr) { return QuadReadAcrossY(expr); } #ifdef __HLSL_ENABLE_16_BIT -// CHECK-NATIVE_HALF: %[[RET:.*]] = call [[CC]]i16 @llvm.[[ICF]].quad.read.across.y.i16(i16 %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call i16 @llvm.[[ICF]].quad.read.across.y.i16(i16 %[[#]]) // CHECK-NATIVE_HALF: ret i16 %[[RET]] int16_t test_int16_t(int16_t expr) { return QuadReadAcrossY(expr); } -// CHECK-NATIVE_HALF: %[[RET:.*]] = call [[CC]]<2 x i16> @llvm.[[ICF]].quad.read.across.y.v2i16(<2 x i16> %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call <2 x i16> @llvm.[[ICF]].quad.read.across.y.v2i16(<2 x i16> %[[#]]) // CHECK-NATIVE_HALF: ret <2 x i16> %[[RET]] int16_t2 
test_int16_t2(int16_t2 expr) { return QuadReadAcrossY(expr); } -// CHECK-NATIVE_HALF: %[[RET:.*]] = call [[CC]]<3 x i16> @llvm.[[ICF]].quad.read.across.y.v3i16(<3 x i16> %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call <3 x i16> @llvm.[[ICF]].quad.read.across.y.v3i16(<3 x i16> %[[#]]) // CHECK-NATIVE_HALF: ret <3 x i16> %[[RET]] int16_t3 test_int16_t3(int16_t3 expr) { return QuadReadAcrossY(expr); } -// CHECK-NATIVE_HALF: %[[RET:.*]] = call [[CC]]<4 x i16> @llvm.[[ICF]].quad.read.across.y.v4i16(<4 x i16> %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call <4 x i16> @llvm.[[ICF]].quad.read.across.y.v4i16(<4 x i16> %[[#]]) // CHECK-NATIVE_HALF: ret <4 x i16> %[[RET]] int16_t4 test_int16_t4(int16_t4 expr) { return QuadReadAcrossY(expr); } -// CHECK-NATIVE_HALF: %[[RET:.*]] = call [[CC]]i16 @llvm.[[ICF]].quad.read.across.y.i16(i16 %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call i16 @llvm.[[ICF]].quad.read.across.y.i16(i16 %[[#]]) // CHECK-NATIVE_HALF: ret i16 %[[RET]] uint16_t test_uint16_t(uint16_t expr) { return QuadReadAcrossY(expr); } -// CHECK-NATIVE_HALF: %[[RET:.*]] = call [[CC]]<2 x i16> @llvm.[[ICF]].quad.read.across.y.v2i16(<2 x i16> %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call <2 x i16> @llvm.[[ICF]].quad.read.across.y.v2i16(<2 x i16> %[[#]]) // CHECK-NATIVE_HALF: ret <2 x i16> %[[RET]] uint16_t2 test_uint16_t2(uint16_t2 expr) { return QuadReadAcrossY(expr); } -// CHECK-NATIVE_HALF: %[[RET:.*]] = call [[CC]]<3 x i16> @llvm.[[ICF]].quad.read.across.y.v3i16(<3 x i16> %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call <3 x i16> @llvm.[[ICF]].quad.read.across.y.v3i16(<3 x i16> %[[#]]) // CHECK-NATIVE_HALF: ret <3 x i16> %[[RET]] uint16_t3 test_uint16_t3(uint16_t3 expr) { return QuadReadAcrossY(expr); } -// CHECK-NATIVE_HALF: %[[RET:.*]] = call [[CC]]<4 x i16> @llvm.[[ICF]].quad.read.across.y.v4i16(<4 x i16> %[[#]]) +// CHECK-NATIVE_HALF: %[[RET:.*]] = call <4 x i16> @llvm.[[ICF]].quad.read.across.y.v4i16(<4 x i16> %[[#]]) // CHECK-NATIVE_HALF: ret <4 x i16> 
%[[RET]] uint16_t4 test_uint16_t4(uint16_t4 expr) { return QuadReadAcrossY(expr); } #endif diff --git a/clang/test/CodeGenHLSL/builtins/WaveActiveAllEqual.hlsl b/clang/test/CodeGenHLSL/builtins/WaveActiveAllEqual.hlsl index 323aa439984f9..f8bcdfdb3333f 100644 --- a/clang/test/CodeGenHLSL/builtins/WaveActiveAllEqual.hlsl +++ b/clang/test/CodeGenHLSL/builtins/WaveActiveAllEqual.hlsl @@ -9,7 +9,7 @@ // CHECK-LABEL: test_int bool test_int(int expr) { - // CHECK-SPIRV: %[[RET:.*]] = call spir_func i1 @llvm.spv.wave.all.equal.i32(i32 + // CHECK-SPIRV: %[[RET:.*]] = call i1 @llvm.spv.wave.all.equal.i32(i32 // CHECK-DXIL: %[[RET:.*]] = call i1 @llvm.dx.wave.all.equal.i32(i32 // CHECK: ret i1 %[[RET]] return WaveActiveAllEqual(expr); @@ -20,7 +20,7 @@ bool test_int(int expr) { // CHECK-LABEL: test_uint64_t bool test_uint64_t(uint64_t expr) { - // CHECK-SPIRV: %[[RET:.*]] = call spir_func i1 @llvm.spv.wave.all.equal.i64(i64 + // CHECK-SPIRV: %[[RET:.*]] = call i1 @llvm.spv.wave.all.equal.i64(i64 // CHECK-DXIL: %[[RET:.*]] = call i1 @llvm.dx.wave.all.equal.i64(i64 // CHECK: ret i1 %[[RET]] return WaveActiveAllEqual(expr); @@ -33,7 +33,7 @@ bool test_uint64_t(uint64_t expr) { // CHECK-LABEL: test_floatv4 bool4 test_floatv4(float4 expr) { - // CHECK-SPIRV: %[[RET1:.*]] = call spir_func <4 x i1> @llvm.spv.wave.all.equal.v4f32(<4 x float> + // CHECK-SPIRV: %[[RET1:.*]] = call <4 x i1> @llvm.spv.wave.all.equal.v4f32(<4 x float> // CHECK-DXIL: %[[RET1:.*]] = call <4 x i1> @llvm.dx.wave.all.equal.v4f32(<4 x float> // CHECK: ret <4 x i1> %[[RET1]] return WaveActiveAllEqual(expr); diff --git a/clang/test/CodeGenHLSL/builtins/WaveActiveAllTrue.hlsl b/clang/test/CodeGenHLSL/builtins/WaveActiveAllTrue.hlsl index f499fc97f43fc..94060ceb97e66 100644 --- a/clang/test/CodeGenHLSL/builtins/WaveActiveAllTrue.hlsl +++ b/clang/test/CodeGenHLSL/builtins/WaveActiveAllTrue.hlsl @@ -10,7 +10,7 @@ // CHECK-LABEL: define {{.*}}test bool test(bool p1) { // CHECK-SPIRV: %[[#entry_tok0:]] = call token 
@llvm.experimental.convergence.entry() - // CHECK-SPIRV: %[[RET:.*]] = call spir_func i1 @llvm.spv.wave.all(i1 %{{[a-zA-Z0-9]+}}) [ "convergencectrl"(token %[[#entry_tok0]]) ] + // CHECK-SPIRV: %[[RET:.*]] = call i1 @llvm.spv.wave.all(i1 %{{[a-zA-Z0-9]+}}) [ "convergencectrl"(token %[[#entry_tok0]]) ] // CHECK-DXIL: %[[RET:.*]] = call i1 @llvm.dx.wave.all(i1 %{{[a-zA-Z0-9]+}}) // CHECK: ret i1 %[[RET]] return WaveActiveAllTrue(p1); diff --git a/clang/test/CodeGenHLSL/builtins/WaveActiveAnyTrue.hlsl b/clang/test/CodeGenHLSL/builtins/WaveActiveAnyTrue.hlsl index 3655cdb443fa9..c4b8239448f2c 100644 --- a/clang/test/CodeGenHLSL/builtins/WaveActiveAnyTrue.hlsl +++ b/clang/test/CodeGenHLSL/builtins/WaveActiveAnyTrue.hlsl @@ -10,7 +10,7 @@ // CHECK-LABEL: define {{.*}}test bool test(bool p1) { // CHECK-SPIRV: %[[#entry_tok0:]] = call token @llvm.experimental.convergence.entry() - // CHECK-SPIRV: %[[RET:.*]] = call spir_func i1 @llvm.spv.wave.any(i1 %{{[a-zA-Z0-9]+}}) [ "convergencectrl"(token %[[#entry_tok0]]) ] + // CHECK-SPIRV: %[[RET:.*]] = call i1 @llvm.spv.wave.any(i1 %{{[a-zA-Z0-9]+}}) [ "convergencectrl"(token %[[#entry_tok0]]) ] // CHECK-DXIL: %[[RET:.*]] = call i1 @llvm.dx.wave.any(i1 %{{[a-zA-Z0-9]+}}) // CHECK: ret i1 %[[RET]] return WaveActiveAnyTrue(p1); diff --git a/clang/test/CodeGenHLSL/builtins/WaveActiveBallot.hlsl b/clang/test/CodeGenHLSL/builtins/WaveActiveBallot.hlsl index df2d854a64247..4c7d5cd2a1c4a 100644 --- a/clang/test/CodeGenHLSL/builtins/WaveActiveBallot.hlsl +++ b/clang/test/CodeGenHLSL/builtins/WaveActiveBallot.hlsl @@ -10,7 +10,7 @@ // CHECK-LABEL: define {{.*}}test uint4 test(bool p1) { // CHECK-SPIRV: %[[#entry_tok0:]] = call token @llvm.experimental.convergence.entry() - // CHECK-SPIRV: %[[SPIRVRET:.*]] = call spir_func <4 x i32> @llvm.spv.subgroup.ballot(i1 %{{[a-zA-Z0-9]+}}) [ "convergencectrl"(token %[[#entry_tok0]]) ] + // CHECK-SPIRV: %[[SPIRVRET:.*]] = call <4 x i32> @llvm.spv.subgroup.ballot(i1 %{{[a-zA-Z0-9]+}}) [ 
"convergencectrl"(token %[[#entry_tok0]]) ] // CHECK-DXIL: %[[WAB:.*]] = call { i32, i32, i32, i32 } @llvm.dx.wave.ballot.i32(i1 %{{[a-zA-Z0-9]+}}) // CHECK-DXIL-NEXT: extractvalue { i32, i32, i32, i32 } {{.*}} 0 // CHECK-DXIL-NEXT: insertelement <4 x i32> poison, i32 {{.*}}, i32 0 diff --git a/clang/test/CodeGenHLSL/builtins/WaveActiveBitAnd.hlsl b/clang/test/CodeGenHLSL/builtins/WaveActiveBitAnd.hlsl index a6da9678d7275..78b3feb5ade66 100644 --- a/clang/test/CodeGenHLSL/builtins/WaveActiveBitAnd.hlsl +++ b/clang/test/CodeGenHLSL/builtins/WaveActiveBitAnd.hlsl @@ -1,17 +1,17 @@ // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -triple \ // RUN: dxil-pc-shadermodel6.3-compute %s -emit-llvm -disable-llvm-passes -o - | \ -// RUN: FileCheck %s --check-prefixes=CHECK,DXCHECK -DCALL="call" +// RUN: FileCheck %s --check-prefixes=CHECK,DXCHECK // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -triple \ // RUN: spirv-pc-vulkan-compute %s -emit-llvm -disable-llvm-passes -o - | \ -// RUN: FileCheck %s --check-prefixes=CHECK,SPVCHECK -DCALL="call spir_func" +// RUN: FileCheck %s --check-prefixes=CHECK,SPVCHECK // Test basic lowering to runtime function call. 
// CHECK-LABEL: test_uint uint test_uint(uint expr) { - // DXCHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF:dx]].wave.reduce.and.i32([[TY]] %[[#]]) - // SPVCHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF:spv]].wave.reduce.and.i32([[TY]] %[[#]]) + // DXCHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF:dx]].wave.reduce.and.i32([[TY]] %[[#]]) + // SPVCHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF:spv]].wave.reduce.and.i32([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitAnd(expr); } @@ -20,7 +20,7 @@ uint test_uint(uint expr) { // CHECK-LABEL: test_uint2 uint2 test_uint2(uint2 expr) { - // CHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF]].wave.reduce.and.v2i32([[TY]] %[[#]]) + // CHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF]].wave.reduce.and.v2i32([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitAnd(expr); } @@ -29,7 +29,7 @@ uint2 test_uint2(uint2 expr) { // CHECK-LABEL: test_uint3 uint3 test_uint3(uint3 expr) { - // CHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF]].wave.reduce.and.v3i32([[TY]] %[[#]]) + // CHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF]].wave.reduce.and.v3i32([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitAnd(expr); } @@ -38,7 +38,7 @@ uint3 test_uint3(uint3 expr) { // CHECK-LABEL: test_uint4 uint4 test_uint4(uint4 expr) { - // CHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF]].wave.reduce.and.v4i32([[TY]] %[[#]]) + // CHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF]].wave.reduce.and.v4i32([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitAnd(expr); } @@ -47,7 +47,7 @@ uint4 test_uint4(uint4 expr) { // CHECK-LABEL: test_uint64_t uint64_t test_uint64_t(uint64_t expr) { - // CHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF]].wave.reduce.and.i64([[TY]] %[[#]]) + // CHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF]].wave.reduce.and.i64([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitAnd(expr); } @@ -56,7 +56,7 @@ uint64_t test_uint64_t(uint64_t 
expr) { // CHECK-LABEL: test_uint64_t2 uint64_t2 test_uint64_t2(uint64_t2 expr) { - // CHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF]].wave.reduce.and.v2i64([[TY]] %[[#]]) + // CHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF]].wave.reduce.and.v2i64([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitAnd(expr); } @@ -65,7 +65,7 @@ uint64_t2 test_uint64_t2(uint64_t2 expr) { // CHECK-LABEL: test_uint64_t3 uint64_t3 test_uint64_t3(uint64_t3 expr) { - // CHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF]].wave.reduce.and.v3i64([[TY]] %[[#]]) + // CHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF]].wave.reduce.and.v3i64([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitAnd(expr); } @@ -74,7 +74,7 @@ uint64_t3 test_uint64_t3(uint64_t3 expr) { // CHECK-LABEL: test_uint64_t4 uint64_t4 test_uint64_t4(uint64_t4 expr) { - // CHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF]].wave.reduce.and.v4i64([[TY]] %[[#]]) + // CHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF]].wave.reduce.and.v4i64([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitAnd(expr); } diff --git a/clang/test/CodeGenHLSL/builtins/WaveActiveBitOr.hlsl b/clang/test/CodeGenHLSL/builtins/WaveActiveBitOr.hlsl index 80364724448fa..f92dec830256c 100644 --- a/clang/test/CodeGenHLSL/builtins/WaveActiveBitOr.hlsl +++ b/clang/test/CodeGenHLSL/builtins/WaveActiveBitOr.hlsl @@ -1,17 +1,17 @@ // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -triple \ // RUN: dxil-pc-shadermodel6.3-compute %s -emit-llvm -disable-llvm-passes -o - | \ -// RUN: FileCheck %s --check-prefixes=CHECK,DXCHECK -DCALL="call" +// RUN: FileCheck %s --check-prefixes=CHECK,DXCHECK // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -triple \ // RUN: spirv-pc-vulkan-compute %s -emit-llvm -disable-llvm-passes -o - | \ -// RUN: FileCheck %s --check-prefixes=CHECK,SPVCHECK -DCALL="call spir_func" +// RUN: FileCheck %s --check-prefixes=CHECK,SPVCHECK // Test basic lowering to runtime function call. 
// CHECK-LABEL: test_uint uint test_uint(uint expr) { - // DXCHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF:dx]].wave.reduce.or.i32([[TY]] %[[#]]) - // SPVCHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF:spv]].wave.reduce.or.i32([[TY]] %[[#]]) + // DXCHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF:dx]].wave.reduce.or.i32([[TY]] %[[#]]) + // SPVCHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF:spv]].wave.reduce.or.i32([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitOr(expr); } @@ -20,7 +20,7 @@ uint test_uint(uint expr) { // CHECK-LABEL: test_uint2 uint2 test_uint2(uint2 expr) { - // CHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF]].wave.reduce.or.v2i32([[TY]] %[[#]]) + // CHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF]].wave.reduce.or.v2i32([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitOr(expr); } @@ -29,7 +29,7 @@ uint2 test_uint2(uint2 expr) { // CHECK-LABEL: test_uint3 uint3 test_uint3(uint3 expr) { - // CHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF]].wave.reduce.or.v3i32([[TY]] %[[#]]) + // CHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF]].wave.reduce.or.v3i32([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitOr(expr); } @@ -38,7 +38,7 @@ uint3 test_uint3(uint3 expr) { // CHECK-LABEL: test_uint4 uint4 test_uint4(uint4 expr) { - // CHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF]].wave.reduce.or.v4i32([[TY]] %[[#]]) + // CHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF]].wave.reduce.or.v4i32([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitOr(expr); } @@ -47,7 +47,7 @@ uint4 test_uint4(uint4 expr) { // CHECK-LABEL: test_uint64_t uint64_t test_uint64_t(uint64_t expr) { - // CHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF]].wave.reduce.or.i64([[TY]] %[[#]]) + // CHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF]].wave.reduce.or.i64([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitOr(expr); } @@ -56,7 +56,7 @@ uint64_t test_uint64_t(uint64_t expr) { // 
CHECK-LABEL: test_uint64_t2 uint64_t2 test_uint64_t2(uint64_t2 expr) { - // CHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF]].wave.reduce.or.v2i64([[TY]] %[[#]]) + // CHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF]].wave.reduce.or.v2i64([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitOr(expr); } @@ -65,7 +65,7 @@ uint64_t2 test_uint64_t2(uint64_t2 expr) { // CHECK-LABEL: test_uint64_t3 uint64_t3 test_uint64_t3(uint64_t3 expr) { - // CHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF]].wave.reduce.or.v3i64([[TY]] %[[#]]) + // CHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF]].wave.reduce.or.v3i64([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitOr(expr); } @@ -74,7 +74,7 @@ uint64_t3 test_uint64_t3(uint64_t3 expr) { // CHECK-LABEL: test_uint64_t4 uint64_t4 test_uint64_t4(uint64_t4 expr) { - // CHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF]].wave.reduce.or.v4i64([[TY]] %[[#]]) + // CHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF]].wave.reduce.or.v4i64([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitOr(expr); } diff --git a/clang/test/CodeGenHLSL/builtins/WaveActiveBitXor.hlsl b/clang/test/CodeGenHLSL/builtins/WaveActiveBitXor.hlsl index 9c94663390843..9d04ba92a3242 100644 --- a/clang/test/CodeGenHLSL/builtins/WaveActiveBitXor.hlsl +++ b/clang/test/CodeGenHLSL/builtins/WaveActiveBitXor.hlsl @@ -1,17 +1,17 @@ // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -triple \ // RUN: dxil-pc-shadermodel6.3-compute %s -emit-llvm -disable-llvm-passes -o - | \ -// RUN: FileCheck %s --check-prefixes=CHECK,DXCHECK -DCALL="call" +// RUN: FileCheck %s --check-prefixes=CHECK,DXCHECK // RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -triple \ // RUN: spirv-pc-vulkan-compute %s -emit-llvm -disable-llvm-passes -o - | \ -// RUN: FileCheck %s --check-prefixes=CHECK,SPVCHECK -DCALL="call spir_func" +// RUN: FileCheck %s --check-prefixes=CHECK,SPVCHECK // Test basic lowering to runtime function call. 
// CHECK-LABEL: test_uint uint test_uint(uint expr) { - // DXCHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF:dx]].wave.reduce.xor.i32([[TY]] %[[#]]) - // SPVCHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF:spv]].wave.reduce.xor.i32([[TY]] %[[#]]) + // DXCHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF:dx]].wave.reduce.xor.i32([[TY]] %[[#]]) + // SPVCHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF:spv]].wave.reduce.xor.i32([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitXor(expr); } @@ -20,7 +20,7 @@ uint test_uint(uint expr) { // CHECK-LABEL: test_uint2 uint2 test_uint2(uint2 expr) { - // CHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF]].wave.reduce.xor.v2i32([[TY]] %[[#]]) + // CHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF]].wave.reduce.xor.v2i32([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitXor(expr); } @@ -29,7 +29,7 @@ uint2 test_uint2(uint2 expr) { // CHECK-LABEL: test_uint3 uint3 test_uint3(uint3 expr) { - // CHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF]].wave.reduce.xor.v3i32([[TY]] %[[#]]) + // CHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF]].wave.reduce.xor.v3i32([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitXor(expr); } @@ -38,7 +38,7 @@ uint3 test_uint3(uint3 expr) { // CHECK-LABEL: test_uint4 uint4 test_uint4(uint4 expr) { - // CHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF]].wave.reduce.xor.v4i32([[TY]] %[[#]]) + // CHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF]].wave.reduce.xor.v4i32([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitXor(expr); } @@ -47,7 +47,7 @@ uint4 test_uint4(uint4 expr) { // CHECK-LABEL: test_uint64_t uint64_t test_uint64_t(uint64_t expr) { - // CHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF]].wave.reduce.xor.i64([[TY]] %[[#]]) + // CHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF]].wave.reduce.xor.i64([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitXor(expr); } @@ -56,7 +56,7 @@ uint64_t test_uint64_t(uint64_t 
expr) { // CHECK-LABEL: test_uint64_t2 uint64_t2 test_uint64_t2(uint64_t2 expr) { - // CHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF]].wave.reduce.xor.v2i64([[TY]] %[[#]]) + // CHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF]].wave.reduce.xor.v2i64([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitXor(expr); } @@ -65,7 +65,7 @@ uint64_t2 test_uint64_t2(uint64_t2 expr) { // CHECK-LABEL: test_uint64_t3 uint64_t3 test_uint64_t3(uint64_t3 expr) { - // CHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF]].wave.reduce.xor.v3i64([[TY]] %[[#]]) + // CHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF]].wave.reduce.xor.v3i64([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitXor(expr); } @@ -74,7 +74,7 @@ uint64_t3 test_uint64_t3(uint64_t3 expr) { // CHECK-LABEL: test_uint64_t4 uint64_t4 test_uint64_t4(uint64_t4 expr) { - // CHECK: %[[RET:.*]] = [[CALL]] [[TY:.*]] @llvm.[[ICF]].wave.reduce.xor.v4i64([[TY]] %[[#]]) + // CHECK: %[[RET:.*]] = call [[TY:.*]] @llvm.[[ICF]].wave.reduce.xor.v4i64([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveBitXor(expr); } diff --git a/clang/test/CodeGenHLSL/builtins/WaveActiveMax.hlsl b/clang/test/CodeGenHLSL/builtins/WaveActiveMax.hlsl index be05a17cc3692..a4628ad103e0d 100644 --- a/clang/test/CodeGenHLSL/builtins/WaveActiveMax.hlsl +++ b/clang/test/CodeGenHLSL/builtins/WaveActiveMax.hlsl @@ -9,7 +9,7 @@ // CHECK-LABEL: test_int int test_int(int expr) { - // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.reduce.max.i32([[TY]] %[[#]]) + // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.reduce.max.i32([[TY]] %[[#]]) // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.reduce.max.i32([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveMax(expr); @@ -20,7 +20,7 @@ int test_int(int expr) { // CHECK-LABEL: test_uint64_t uint64_t test_uint64_t(uint64_t expr) { - // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.reduce.umax.i64([[TY]] %[[#]]) 
+ // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.reduce.umax.i64([[TY]] %[[#]]) // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.reduce.umax.i64([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveMax(expr); @@ -33,7 +33,7 @@ uint64_t test_uint64_t(uint64_t expr) { // CHECK-LABEL: test_floatv4 float4 test_floatv4(float4 expr) { - // CHECK-SPIRV: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn spir_func [[TY1:.*]] @llvm.spv.wave.reduce.max.v4f32([[TY1]] %[[#]] + // CHECK-SPIRV: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn [[TY1:.*]] @llvm.spv.wave.reduce.max.v4f32([[TY1]] %[[#]] // CHECK-DXIL: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn [[TY1:.*]] @llvm.dx.wave.reduce.max.v4f32([[TY1]] %[[#]]) // CHECK: ret [[TY1]] %[[RET1]] return WaveActiveMax(expr); diff --git a/clang/test/CodeGenHLSL/builtins/WaveActiveMin.hlsl b/clang/test/CodeGenHLSL/builtins/WaveActiveMin.hlsl index 1194f842deed6..f2e3686947f51 100644 --- a/clang/test/CodeGenHLSL/builtins/WaveActiveMin.hlsl +++ b/clang/test/CodeGenHLSL/builtins/WaveActiveMin.hlsl @@ -9,7 +9,7 @@ // CHECK-LABEL: test_int int test_int(int expr) { - // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.reduce.min.i32([[TY]] %[[#]]) + // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.reduce.min.i32([[TY]] %[[#]]) // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.reduce.min.i32([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveMin(expr); @@ -20,7 +20,7 @@ int test_int(int expr) { // CHECK-LABEL: test_uint64_t uint64_t test_uint64_t(uint64_t expr) { - // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.reduce.umin.i64([[TY]] %[[#]]) + // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.reduce.umin.i64([[TY]] %[[#]]) // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.reduce.umin.i64([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveMin(expr); @@ -33,7 +33,7 @@ uint64_t test_uint64_t(uint64_t 
expr) { // CHECK-LABEL: test_floatv4 float4 test_floatv4(float4 expr) { - // CHECK-SPIRV: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn spir_func [[TY1:.*]] @llvm.spv.wave.reduce.min.v4f32([[TY1]] %[[#]] + // CHECK-SPIRV: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn [[TY1:.*]] @llvm.spv.wave.reduce.min.v4f32([[TY1]] %[[#]] // CHECK-DXIL: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn [[TY1:.*]] @llvm.dx.wave.reduce.min.v4f32([[TY1]] %[[#]]) // CHECK: ret [[TY1]] %[[RET1]] return WaveActiveMin(expr); diff --git a/clang/test/CodeGenHLSL/builtins/WaveActiveProduct.hlsl b/clang/test/CodeGenHLSL/builtins/WaveActiveProduct.hlsl index 3a8320e7333fc..0247b7cbeb0f6 100644 --- a/clang/test/CodeGenHLSL/builtins/WaveActiveProduct.hlsl +++ b/clang/test/CodeGenHLSL/builtins/WaveActiveProduct.hlsl @@ -9,7 +9,7 @@ // CHECK-LABEL: test_int int test_int(int expr) { - // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.product.i32([[TY]] %[[#]]) + // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.product.i32([[TY]] %[[#]]) // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.product.i32([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveProduct(expr); @@ -20,7 +20,7 @@ int test_int(int expr) { // CHECK-LABEL: test_uint64_t uint64_t test_uint64_t(uint64_t expr) { - // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.product.i64([[TY]] %[[#]]) + // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.product.i64([[TY]] %[[#]]) // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.uproduct.i64([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveProduct(expr); @@ -33,7 +33,7 @@ uint64_t test_uint64_t(uint64_t expr) { // CHECK-LABEL: test_floatv4 float4 test_floatv4(float4 expr) { - // CHECK-SPIRV: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn spir_func [[TY1:.*]] @llvm.spv.wave.product.v4f32([[TY1]] %[[#]] + // CHECK-SPIRV: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn [[TY1:.*]] 
@llvm.spv.wave.product.v4f32([[TY1]] %[[#]] // CHECK-DXIL: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn [[TY1:.*]] @llvm.dx.wave.product.v4f32([[TY1]] %[[#]]) // CHECK: ret [[TY1]] %[[RET1]] return WaveActiveProduct(expr); diff --git a/clang/test/CodeGenHLSL/builtins/WaveActiveSum.hlsl b/clang/test/CodeGenHLSL/builtins/WaveActiveSum.hlsl index 1fc93c62c8db0..6caa3d775f0d2 100644 --- a/clang/test/CodeGenHLSL/builtins/WaveActiveSum.hlsl +++ b/clang/test/CodeGenHLSL/builtins/WaveActiveSum.hlsl @@ -9,7 +9,7 @@ // CHECK-LABEL: test_int int test_int(int expr) { - // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.reduce.sum.i32([[TY]] %[[#]]) + // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.reduce.sum.i32([[TY]] %[[#]]) // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.reduce.sum.i32([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveSum(expr); @@ -20,7 +20,7 @@ int test_int(int expr) { // CHECK-LABEL: test_uint64_t uint64_t test_uint64_t(uint64_t expr) { - // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.reduce.sum.i64([[TY]] %[[#]]) + // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.reduce.sum.i64([[TY]] %[[#]]) // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.reduce.usum.i64([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveActiveSum(expr); @@ -33,7 +33,7 @@ uint64_t test_uint64_t(uint64_t expr) { // CHECK-LABEL: test_floatv4 float4 test_floatv4(float4 expr) { - // CHECK-SPIRV: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn spir_func [[TY1:.*]] @llvm.spv.wave.reduce.sum.v4f32([[TY1]] %[[#]] + // CHECK-SPIRV: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn [[TY1:.*]] @llvm.spv.wave.reduce.sum.v4f32([[TY1]] %[[#]] // CHECK-DXIL: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn [[TY1:.*]] @llvm.dx.wave.reduce.sum.v4f32([[TY1]] %[[#]]) // CHECK: ret [[TY1]] %[[RET1]] return WaveActiveSum(expr); diff --git 
a/clang/test/CodeGenHLSL/builtins/WavePrefixCountBits.hlsl b/clang/test/CodeGenHLSL/builtins/WavePrefixCountBits.hlsl index 25d9074b08a68..bfd42740ac4ed 100644 --- a/clang/test/CodeGenHLSL/builtins/WavePrefixCountBits.hlsl +++ b/clang/test/CodeGenHLSL/builtins/WavePrefixCountBits.hlsl @@ -18,7 +18,7 @@ int test_int(bool expr) { // CHECK: %[[LOADEDVAL:.*]] = load i32, ptr %[[EXPRADDR]], align 4 // CHECK: %[[TRUNCLOADEDVAL:.*]] = icmp ne i32 %[[LOADEDVAL]], 0 - // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.subgroup.prefix.bit.count(i1 %[[TRUNCLOADEDVAL]]) + // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.subgroup.prefix.bit.count(i1 %[[TRUNCLOADEDVAL]]) // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.prefix.bit.count(i1 %[[TRUNCLOADEDVAL]]) // CHECK: ret [[TY]] %[[RET]] return WavePrefixCountBits(expr); diff --git a/clang/test/CodeGenHLSL/builtins/WavePrefixProduct.hlsl b/clang/test/CodeGenHLSL/builtins/WavePrefixProduct.hlsl index a45cbf29b87f2..a4dc01527a7f2 100644 --- a/clang/test/CodeGenHLSL/builtins/WavePrefixProduct.hlsl +++ b/clang/test/CodeGenHLSL/builtins/WavePrefixProduct.hlsl @@ -9,7 +9,7 @@ // CHECK-LABEL: test_int int test_int(int expr) { - // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.prefix.product.i32([[TY]] %[[#]]) + // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.prefix.product.i32([[TY]] %[[#]]) // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.prefix.product.i32([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WavePrefixProduct(expr); @@ -20,7 +20,7 @@ int test_int(int expr) { // CHECK-LABEL: test_uint64_t uint64_t test_uint64_t(uint64_t expr) { - // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.prefix.product.i64([[TY]] %[[#]]) + // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.prefix.product.i64([[TY]] %[[#]]) // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.prefix.uproduct.i64([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] 
return WavePrefixProduct(expr); @@ -33,7 +33,7 @@ uint64_t test_uint64_t(uint64_t expr) { // CHECK-LABEL: test_floatv4 float4 test_floatv4(float4 expr) { - // CHECK-SPIRV: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn spir_func [[TY1:.*]] @llvm.spv.wave.prefix.product.v4f32([[TY1]] %[[#]] + // CHECK-SPIRV: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn [[TY1:.*]] @llvm.spv.wave.prefix.product.v4f32([[TY1]] %[[#]] // CHECK-DXIL: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn [[TY1:.*]] @llvm.dx.wave.prefix.product.v4f32([[TY1]] %[[#]]) // CHECK: ret [[TY1]] %[[RET1]] return WavePrefixProduct(expr); diff --git a/clang/test/CodeGenHLSL/builtins/WavePrefixSum.hlsl b/clang/test/CodeGenHLSL/builtins/WavePrefixSum.hlsl index f22aa69ba45d5..a1df3fe02c802 100644 --- a/clang/test/CodeGenHLSL/builtins/WavePrefixSum.hlsl +++ b/clang/test/CodeGenHLSL/builtins/WavePrefixSum.hlsl @@ -9,7 +9,7 @@ // CHECK-LABEL: test_int int test_int(int expr) { - // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.prefix.sum.i32([[TY]] %[[#]]) + // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.prefix.sum.i32([[TY]] %[[#]]) // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.prefix.sum.i32([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WavePrefixSum(expr); @@ -20,7 +20,7 @@ int test_int(int expr) { // CHECK-LABEL: test_uint64_t uint64_t test_uint64_t(uint64_t expr) { - // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.prefix.sum.i64([[TY]] %[[#]]) + // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.prefix.sum.i64([[TY]] %[[#]]) // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.prefix.usum.i64([[TY]] %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WavePrefixSum(expr); @@ -33,7 +33,7 @@ uint64_t test_uint64_t(uint64_t expr) { // CHECK-LABEL: test_floatv4 float4 test_floatv4(float4 expr) { - // CHECK-SPIRV: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn spir_func [[TY1:.*]] 
@llvm.spv.wave.prefix.sum.v4f32([[TY1]] %[[#]] + // CHECK-SPIRV: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn [[TY1:.*]] @llvm.spv.wave.prefix.sum.v4f32([[TY1]] %[[#]] // CHECK-DXIL: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn [[TY1:.*]] @llvm.dx.wave.prefix.sum.v4f32([[TY1]] %[[#]]) // CHECK: ret [[TY1]] %[[RET1]] return WavePrefixSum(expr); diff --git a/clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl b/clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl index da6cbc40a79bb..24252f3fa3207 100644 --- a/clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl +++ b/clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl @@ -10,7 +10,7 @@ // CHECK-LABEL: test_int int test_int(int expr, uint idx) { // CHECK-SPIRV: %[[#entry_tok0:]] = call token @llvm.experimental.convergence.entry() - // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.readlane.i32([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok0]]) ] + // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.readlane.i32([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok0]]) ] // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.readlane.i32([[TY]] %[[#]], i32 %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveReadLaneAt(expr, idx); @@ -22,7 +22,7 @@ int test_int(int expr, uint idx) { // CHECK-LABEL: test_uint uint test_uint(uint expr, uint idx) { // CHECK-SPIRV: %[[#entry_tok0:]] = call token @llvm.experimental.convergence.entry() - // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.readlane.i32([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok0]]) ] + // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.readlane.i32([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok0]]) ] // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.readlane.i32([[TY]] %[[#]], i32 %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveReadLaneAt(expr, idx); @@ -31,7 +31,7 @@ uint test_uint(uint expr, uint idx) { // 
CHECK-LABEL: test_int64_t int64_t test_int64_t(int64_t expr, uint idx) { // CHECK-SPIRV: %[[#entry_tok0:]] = call token @llvm.experimental.convergence.entry() - // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.readlane.i64([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok0]]) ] + // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.readlane.i64([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok0]]) ] // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.readlane.i64([[TY]] %[[#]], i32 %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveReadLaneAt(expr, idx); @@ -43,7 +43,7 @@ int64_t test_int64_t(int64_t expr, uint idx) { // CHECK-LABEL: test_uint64_t uint64_t test_uint64_t(uint64_t expr, uint idx) { // CHECK-SPIRV: %[[#entry_tok0:]] = call token @llvm.experimental.convergence.entry() - // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.readlane.i64([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok0]]) ] + // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.readlane.i64([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok0]]) ] // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.readlane.i64([[TY]] %[[#]], i32 %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveReadLaneAt(expr, idx); @@ -53,7 +53,7 @@ uint64_t test_uint64_t(uint64_t expr, uint idx) { // CHECK-LABEL: test_int16 int16_t test_int16(int16_t expr, uint idx) { // CHECK-SPIRV: %[[#entry_tok1:]] = call token @llvm.experimental.convergence.entry() - // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.readlane.i16([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok1]]) ] + // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.readlane.i16([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok1]]) ] // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.readlane.i16([[TY]] %[[#]], i32 %[[#]]) // CHECK: ret [[TY]] %[[RET]] return 
WaveReadLaneAt(expr, idx); @@ -65,7 +65,7 @@ int16_t test_int16(int16_t expr, uint idx) { // CHECK-LABEL: test_uint16 uint16_t test_uint16(uint16_t expr, uint idx) { // CHECK-SPIRV: %[[#entry_tok1:]] = call token @llvm.experimental.convergence.entry() - // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.readlane.i16([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok1]]) ] + // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.readlane.i16([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok1]]) ] // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.readlane.i16([[TY]] %[[#]], i32 %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveReadLaneAt(expr, idx); @@ -77,7 +77,7 @@ uint16_t test_uint16(uint16_t expr, uint idx) { // CHECK-LABEL: test_half half test_half(half expr, uint idx) { // CHECK-SPIRV: %[[#entry_tok2:]] = call token @llvm.experimental.convergence.entry() - // CHECK-SPIRV: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn spir_func [[TY:.*]] @llvm.spv.wave.readlane.f16([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok2]]) ] + // CHECK-SPIRV: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[TY:.*]] @llvm.spv.wave.readlane.f16([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok2]]) ] // CHECK-DXIL: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[TY:.*]] @llvm.dx.wave.readlane.f16([[TY]] %[[#]], i32 %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveReadLaneAt(expr, idx); @@ -89,7 +89,7 @@ half test_half(half expr, uint idx) { // CHECK-LABEL: test_double double test_double(double expr, uint idx) { // CHECK-SPIRV: %[[#entry_tok3:]] = call token @llvm.experimental.convergence.entry() - // CHECK-SPIRV: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn spir_func [[TY:.*]] @llvm.spv.wave.readlane.f64([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok3]]) ] + // CHECK-SPIRV: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[TY:.*]] 
@llvm.spv.wave.readlane.f64([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok3]]) ] // CHECK-DXIL: %[[RET:.*]] = call reassoc nnan ninf nsz arcp afn [[TY:.*]] @llvm.dx.wave.readlane.f64([[TY]] %[[#]], i32 %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveReadLaneAt(expr, idx); @@ -101,7 +101,7 @@ double test_double(double expr, uint idx) { // CHECK-LABEL: test_floatv4 float4 test_floatv4(float4 expr, uint idx) { // CHECK-SPIRV: %[[#entry_tok4:]] = call token @llvm.experimental.convergence.entry() - // CHECK-SPIRV: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn spir_func [[TY1:.*]] @llvm.spv.wave.readlane.v4f32([[TY1]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok4]]) ] + // CHECK-SPIRV: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn [[TY1:.*]] @llvm.spv.wave.readlane.v4f32([[TY1]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok4]]) ] // CHECK-DXIL: %[[RET1:.*]] = call reassoc nnan ninf nsz arcp afn [[TY1:.*]] @llvm.dx.wave.readlane.v4f32([[TY1]] %[[#]], i32 %[[#]]) // CHECK: ret [[TY1]] %[[RET1]] return WaveReadLaneAt(expr, idx); diff --git a/clang/test/CodeGenHLSL/builtins/wave_get_lane_count.hlsl b/clang/test/CodeGenHLSL/builtins/wave_get_lane_count.hlsl index 8072f6d4ea206..fdf019262d8cb 100644 --- a/clang/test/CodeGenHLSL/builtins/wave_get_lane_count.hlsl +++ b/clang/test/CodeGenHLSL/builtins/wave_get_lane_count.hlsl @@ -14,13 +14,13 @@ void main() { while (a) { // CHECK-DXIL: %[[#]] = call i32 @llvm.dx.wave.get.lane.count() -// CHECK-SPIRV: %[[#]] = call spir_func i32 @llvm.spv.wave.get.lane.count() +// CHECK-SPIRV: %[[#]] = call i32 @llvm.spv.wave.get.lane.count() // CHECK-SPIRV-SAME: [ "convergencectrl"(token %[[#loop_tok]]) ] a = WaveGetLaneCount(); } // CHECK-DXIL: %[[#]] = call i32 @llvm.dx.wave.get.lane.count() -// CHECK-SPIRV: %[[#]] = call spir_func i32 @llvm.spv.wave.get.lane.count() +// CHECK-SPIRV: %[[#]] = call i32 @llvm.spv.wave.get.lane.count() // CHECK-SPIRV-SAME: [ "convergencectrl"(token 
%[[#entry_tok]]) ] b = WaveGetLaneCount(); } diff --git a/clang/test/CodeGenHLSL/builtins/wave_is_first_lane.hlsl b/clang/test/CodeGenHLSL/builtins/wave_is_first_lane.hlsl index 2fb6defb896f9..18860c321eb91 100644 --- a/clang/test/CodeGenHLSL/builtins/wave_is_first_lane.hlsl +++ b/clang/test/CodeGenHLSL/builtins/wave_is_first_lane.hlsl @@ -13,7 +13,7 @@ void main() { while (true) { // CHECK-DXIL: %[[#]] = call i1 @llvm.dx.wave.is.first.lane() -// CHECK-SPIRV: %[[#]] = call spir_func i1 @llvm.spv.wave.is.first.lane() +// CHECK-SPIRV: %[[#]] = call i1 @llvm.spv.wave.is.first.lane() // CHECK-SPIRV-SAME: [ "convergencectrl"(token %[[#loop_tok]]) ] if (WaveIsFirstLane()) { break; @@ -21,7 +21,7 @@ void main() { } // CHECK-DXIL: %[[#]] = call i1 @llvm.dx.wave.is.first.lane() -// CHECK-SPIRV: %[[#]] = call spir_func i1 @llvm.spv.wave.is.first.lane() +// CHECK-SPIRV: %[[#]] = call i1 @llvm.spv.wave.is.first.lane() // CHECK-SPIRV-SAME: [ "convergencectrl"(token %[[#entry_tok]]) ] if (WaveIsFirstLane()) { return; From f0ad8ee185bc3e78f908c9fc6673489dd6ab03c1 Mon Sep 17 00:00:00 2001 From: Jason Molenda Date: Wed, 13 May 2026 20:17:00 -0700 Subject: [PATCH 20/95] [lldb] Don't read live memory for assembly inst emulation (#197601) In 2021, Augusto changed the Target::ReadMemory API from taking a `prefer_file_cache` argument to taking a `force_live_memory` argument, with opposite meanings - where we used to pass true, the callers now needed to pass false. The default argument was false, so many callers omitted the argument altogether after the change. One of the edits to UnwindAssemblyInstEmulation::GetNonCallSiteUnwindPlanFromAssembly unintentionally swapped the intended behavior -- this method which reads the bytes of a function's instructions for emulation should get the bytes from the local binary, if possible, else read from live memory. But it was changed to force reading from live memory unconditionally. 
This leads to an extra memory read for every function we see for the first time in a single `lldb` process run (the UnwindTable they are added to is part of the Module, and kept in the global Module cache). It's not a major perf regression, but these are extra memory reads that we don't need to be doing. I audited all the other changes in the 2021 PR and this was the only mistake like this. rdar://177026608 --- .../InstEmulation/UnwindAssemblyInstEmulation.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp b/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp index 19ae1cf392efa..553998c903eea 100644 --- a/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp +++ b/lldb/source/Plugins/UnwindAssembly/InstEmulation/UnwindAssemblyInstEmulation.cpp @@ -41,10 +41,9 @@ bool UnwindAssemblyInstEmulation::GetNonCallSiteUnwindPlanFromAssembly( ProcessSP process_sp(thread.GetProcess()); if (process_sp) { Status error; - const bool force_live_memory = true; if (process_sp->GetTarget().ReadMemory( range.GetBaseAddress(), function_text.data(), range.GetByteSize(), - error, force_live_memory) != range.GetByteSize()) { + error) != range.GetByteSize()) { return false; } } From 2ee4669dd4f03339e501fd0e4f3cd7aa7692ac83 Mon Sep 17 00:00:00 2001 From: Erich Keane Date: Wed, 13 May 2026 20:35:34 -0700 Subject: [PATCH 21/95] [CIR] Lower 'init' functions for global TLS (#197460) This is the last patch for global/namespace thread-local variables. This patch emits the final 'init' function, which calls all other init functions, plus does the guard variable for the unordered variants. 
--- .../Dialect/Transforms/LoweringPrepare.cpp | 105 +++++++++++++++++- .../test/CIR/CodeGen/global-tls-dyn-init.cpp | 40 +++++-- .../CIR/CodeGen/global-tls-simple-init.cpp | 40 +++++-- .../test/CIR/CodeGen/global-tls-templates.cpp | 42 ++++++- 4 files changed, 202 insertions(+), 25 deletions(-) diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp index d61273b24c3d3..b910ca3c8286c 100644 --- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp +++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp @@ -129,6 +129,12 @@ struct LoweringPreparePass /// Get the declaration for the 'wrapper' function for a global-TLS variable. cir::FuncOp getOrCreateThreadLocalWrapper(CIRBaseBuilderTy &builder, cir::GlobalOp op); + // Function that generates the guard global variable, get-global, and 'if' + // condition for global TLS init function generation. This inserts an 'if' + // with the store at the beginning of the 'then' region, so inserts into the + // body should happen after that. + cir::IfOp buildGlobalTlsGuardCheck(CIRBaseBuilderTy &builder, + mlir::Location loc, cir::GlobalOp guard); /// Handle the dtor region by registering destructor with __cxa_atexit cir::FuncOp getOrCreateDtorFunc(CIRBaseBuilderTy &builder, cir::GlobalOp op, mlir::Region &dtorRegion, @@ -181,6 +187,10 @@ struct LoweringPreparePass /// Get or create the __init_tls function. cir::FuncOp getTlsInitFn(); + // Create the __tls_guard variable. + cir::GlobalOp createGlobalThreadLocalGuard(CIRBaseBuilderTy &builder, + mlir::Location loc); + /// Create a guard global variable for a static local. 
cir::GlobalOp createGuardGlobalOp(CIRBaseBuilderTy &builder, mlir::Location loc, llvm::StringRef name, @@ -377,6 +387,9 @@ struct LoweringPreparePass entryBB.getOperations().splice(entryBB.end(), dtorBlock.getOperations(), dtorBlock.begin(), std::prev(dtorBlock.end())); + // make sure we leave the insert location after the operations we just + // inserted. + builder.setInsertionPointToEnd(&entryBB); } /// Emit the guarded initialization for a static local variable. @@ -1143,20 +1156,44 @@ LoweringPreparePass::buildCXXGlobalVarDeclInitFunc(cir::GlobalOp op) { // the function entry, and discard extra blocks (which contain only // unreachable terminators from EH cleanup paths). mlir::Block *entryBB = f.addEntryBlock(); + builder.setInsertionPointToStart(entryBB); + + // If this is a global TLS variable (that is, declared at namespace scope), we + // have to emit the guard variable here. + bool needsTlsGuard = op.getDynTlsRefs() && op.getDynTlsRefs()->getGuardName(); + cir::IfOp guardIf; + if (needsTlsGuard) { + guardIf = buildGlobalTlsGuardCheck( + builder, op.getLoc(), + getOrCreateStaticLocalDeclGuardAddress( + builder, op, op.getDynTlsRefs()->getGuardName().getValue(), + /*isLocalVarDecl=*/false, + /*useInt8GuardVariable=*/op.hasInternalLinkage())); + builder.setInsertionPointToEnd(&guardIf.getThenRegion().front()); + } + if (!op.getCtorRegion().empty()) { mlir::Block &block = op.getCtorRegion().front(); - entryBB->getOperations().splice(entryBB->begin(), block.getOperations(), - block.begin(), std::prev(block.end())); + mlir::Block *insertBlock = builder.getBlock(); + insertBlock->getOperations().splice(insertBlock->end(), + block.getOperations(), block.begin(), + std::prev(block.end())); } // Register the destructor call with __cxa_atexit mlir::Region &dtorRegion = op.getDtorRegion(); if (!dtorRegion.empty()) { assert(!cir::MissingFeatures::astVarDeclInterface()); - assert(!cir::MissingFeatures::opGlobalThreadLocal()); emitGlobalGuardedDtorRegion(builder, op, 
dtorRegion, - op.getTlsModel().has_value(), *entryBB); + op.getTlsModel().has_value(), + *builder.getBlock()); + } + + // If we're actually in the 'if' above, create a yield. + if (needsTlsGuard) { + builder.setInsertionPointToEnd(&guardIf.getThenRegion().back()); + cir::YieldOp::create(builder, op.getLoc()); } // Replace cir.yield with cir.return @@ -1710,9 +1747,56 @@ void LoweringPreparePass::buildGlobalCtorDtorList() { } } +cir::GlobalOp +LoweringPreparePass::createGlobalThreadLocalGuard(CIRBaseBuilderTy &builder, + mlir::Location loc) { + mlir::OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPointToStart(mlirModule.getBody()); + + // The TLS Guard is always an Int8Ty. + cir::IntType guardTy = builder.getSIntNTy(8); + auto g = cir::GlobalOp::create(builder, loc, "__tls_guard", guardTy); + g.setLinkageAttr(cir::GlobalLinkageKindAttr::get( + builder.getContext(), cir::GlobalLinkageKind::InternalLinkage)); + g.setAlignment(clang::CharUnits::One().getAsAlign().value()); + // At the moment, we only have implementation for this mode, as it is the + // default. At one point we might need to load this mode from the module. + g.setTlsModel(TLS_Model::GeneralDynamic); + g.setInitialValueAttr(cir::IntAttr::get(guardTy, 0)); + return g; +} + +cir::IfOp LoweringPreparePass::buildGlobalTlsGuardCheck( + CIRBaseBuilderTy &builder, mlir::Location loc, cir::GlobalOp guard) { + cir::GetGlobalOp getGuard = builder.createGetGlobal(guard, /*tls=*/true); + mlir::Value getGuardValue = getGuard; + + // Classic codegen always just loads the first byte of the guard instead of + // the whole thing. __tls_guard is already only 8 bits, but for the case of + // unordered TLS, it gets created as 64 bits. 
+ if (guard.getSymType() != builder.getSIntNTy(8)) + getGuardValue = builder.createBitcast( + getGuard, cir::PointerType::get(builder.getSIntNTy(8))); + + mlir::Value guardLoad = + builder.createAlignedLoad(loc, getGuardValue, *guard.getAlignment()); + auto zero = builder.getConstantInt(loc, builder.getSIntNTy(8), 0); + cir::CmpOp compare = + builder.createCompare(loc, cir::CmpOpKind::eq, guardLoad, zero); + return cir::IfOp::create( + builder, loc, compare, + /*withElseRegion=*/false, [&](mlir::OpBuilder &, mlir::Location loc) { + // Classic codegen still does this store as a i8, but it doesn't seem + // reasonable to do an i8 store into a 64 bit value? + builder.createStore( + loc, builder.getConstantInt(loc, guard.getSymType(), 1), getGuard); + }); +} + void LoweringPreparePass::buildCXXGlobalTlsFunc() { if (globalThreadLocalInitializers.empty()) return; + // The global-ordered-init function for TLS variables just calls each of the // init-functions in order after doing a guard. @@ -1721,9 +1805,20 @@ void LoweringPreparePass::buildCXXGlobalTlsFunc() { CIRBaseBuilderTy builder(getContext()); mlir::Block *entryBB = tlsInit.addEntryBlock(); builder.setInsertionPointToStart(entryBB); - // Note: a followup patch will emit the body here correctly. + + cir::IfOp ifOperation = buildGlobalTlsGuardCheck( + builder, loc, createGlobalThreadLocalGuard(builder, loc)); + + // Emit the body of the guarded spot. 
+ builder.setInsertionPointToEnd(&ifOperation.getThenRegion().front()); + for (cir::FuncOp initFunc : globalThreadLocalInitializers) + builder.createCallOp(loc, initFunc, {}); + cir::YieldOp::create(builder, loc); + + builder.setInsertionPointAfter(ifOperation); cir::ReturnOp::create(builder, loc); } + void LoweringPreparePass::buildCXXGlobalInitFunc() { if (dynamicInitializers.empty()) return; diff --git a/clang/test/CIR/CodeGen/global-tls-dyn-init.cpp b/clang/test/CIR/CodeGen/global-tls-dyn-init.cpp index f471d586aa850..1359ac84b3cc3 100644 --- a/clang/test/CIR/CodeGen/global-tls-dyn-init.cpp +++ b/clang/test/CIR/CodeGen/global-tls-dyn-init.cpp @@ -10,6 +10,7 @@ struct CtorDtor { int i; }; +// LLVM-BOTH-DAG: @__tls_guard = internal thread_local global i8 0, align 1 // LLVM-BOTH-DAG: @__dso_handle = external hidden global i8 // LLVM-BOTH-DAG: @tls_cd = thread_local global %struct.CtorDtor { i32 5 }, align 4 // LLVM-BOTH-DAG: @tls_cd_dyn = thread_local global %struct.CtorDtor zeroinitializer, align 4 @@ -22,6 +23,7 @@ struct CtorDtor { // LLVM-BOTH-DAG: @_ZTH6tls_cd = alias void (), ptr @__tls_init // Wrappers & aliases. 
+// CIR: cir.global internal tls_dyn @__tls_guard = #cir.int<0> : !s8i {alignment = 1 : i64} // CIR-LABEL: cir.func comdat weak_odr private hidden @_ZTW19tls_cd_dyn_not_used() -> !cir.ptr { // CIR: cir.call @_ZTH19tls_cd_dyn_not_used() : () -> () // CIR: %[[GET_GLOB:.*]] = cir.get_global thread_local @tls_cd_dyn_not_used : !cir.ptr @@ -47,6 +49,18 @@ struct CtorDtor { // CIR: cir.func @_ZTH6tls_cd() alias(@__tls_init) // CIR-LABEL: cir.func internal private @__tls_init() { +// CIR: %[[GET_GUARD:.*]] = cir.get_global thread_local @__tls_guard : !cir.ptr +// CIR: %[[LOAD_GUARD:.*]] = cir.load align(1) %[[GET_GUARD]] : !cir.ptr, !s8i +// CIR: %[[ZERO:.*]] = cir.const #cir.int<0> : !s8i +// CIR: %[[CMP:.*]] = cir.cmp eq %[[LOAD_GUARD]], %[[ZERO]] : !s8i +// CIR: cir.if %[[CMP]] { +// CIR: %[[ONE:.*]] = cir.const #cir.int<1> : !s8i +// CIR: cir.store %[[ONE]], %[[GET_GUARD]] : !s8i, !cir.ptr +// CIR: cir.call @[[TLS_CD_INIT:.*]]() : () -> () +// CIR: cir.call @[[TLS_CD_DYN_INIT:.*]]() : () -> () +// CIR: cir.call @[[TLS_CD_REF_INIT:.*]]() : () -> () +// CIR: cir.call @[[TLS_CD_DYN_NOT_USED_INIT:.*]]() : () -> () +// CIR: } // CIR: cir.return // LLVM: define weak_odr hidden ptr @_ZTW19tls_cd_dyn_not_used() { @@ -74,6 +88,16 @@ struct CtorDtor { // LLVM: } // // LLVM: define internal void @__tls_init() { +// LLVM: %[[GET_GUARD:.*]] = call ptr @llvm.threadlocal.address.p0(ptr @__tls_guard) +// LLVM: %[[LOAD_GUARD:.*]] = load i8, ptr %[[GET_GUARD]], align 1 +// LLVM: %[[IS_UNINIT:.*]] = icmp eq i8 %[[LOAD_GUARD]], 0 +// LLVM: br i1 %[[IS_UNINIT]] +// LLVM +// LLVM: store i8 1, ptr %[[GET_GUARD]], align 1 +// LLVM: call void @[[TLS_CD_INIT:.*]]() +// LLVM: call void @[[TLS_CD_DYN_INIT:.*]]() +// LLVM: call void @[[TLS_CD_REF_INIT:.*]]() +// LLVM: call void @[[TLS_CD_DYN_NOT_USED_INIT:.*]]() thread_local CtorDtor tls_cd = 5; // CIR-BEFORE-LPP: cir.global external tls_dyn dyn_tls_refs = <"_ZTW6tls_cd", "_ZTH6tls_cd"> @tls_cd = #cir.const_record<{#cir.int<5> : !s32i}> : 
!rec_CtorDtor dtor { @@ -81,7 +105,7 @@ thread_local CtorDtor tls_cd = 5; // CIR-BEFORE-LPP: cir.call @_ZN8CtorDtorD1Ev(%[[GET_GLOB]]) : (!cir.ptr) -> () // CIR-BEFORE-LPP: } // CIR: cir.global external tls_dyn dyn_tls_refs = <"_ZTW6tls_cd", "_ZTH6tls_cd"> @tls_cd = #cir.const_record<{#cir.int<5> : !s32i}> : !rec_CtorDtor -// CIR: cir.func internal private @[[TLS_CD_INIT:.*]]() { +// CIR: cir.func internal private @[[TLS_CD_INIT]]() { // CIR: %[[GET_GLOB:.*]] = cir.get_global thread_local @tls_cd : !cir.ptr // CIR: %[[GET_DTOR:.*]] = cir.get_global @_ZN8CtorDtorD1Ev : !cir.ptr)>> // CIR: %[[DTOR_DECAY:.*]] = cir.cast bitcast %[[GET_DTOR]] : !cir.ptr)>> -> !cir.ptr)>> @@ -90,7 +114,7 @@ thread_local CtorDtor tls_cd = 5; // CIR: cir.call @__cxa_thread_atexit(%[[DTOR_DECAY]], %[[GLOB_DECAY]], %[[DSOHANDLE]]) : (!cir.ptr)>>, !cir.ptr, !cir.ptr) -> () // CIR: cir.return // -// LLVM: define internal void @[[TLS_CD_INIT:.*]]() { +// LLVM: define internal void @[[TLS_CD_INIT]]() { // OGCG: define internal void @[[TLS_CD_INIT:.*]]() {{.*}}{ // LLVM: %[[GET_GLOB:.*]] = call ptr @llvm.threadlocal.address.p0(ptr @tls_cd) // LLVM: call void @__cxa_thread_atexit(ptr @_ZN8CtorDtorD1Ev, ptr %[[GET_GLOB]], ptr @__dso_handle) @@ -107,7 +131,7 @@ thread_local CtorDtor tls_cd_dyn = get_i(); // CIR-BEFORE-LPP: cir.call @_ZN8CtorDtorD1Ev(%[[GET_GLOB]]) : (!cir.ptr) -> () // CIR-BEFORE-LPP: } // CIR: cir.global external tls_dyn dyn_tls_refs = <"_ZTW10tls_cd_dyn", "_ZTH10tls_cd_dyn"> @tls_cd_dyn = #cir.zero : !rec_CtorDtor -// CIR: cir.func internal private @[[TLS_CD_DYN_INIT:.*]]() { +// CIR: cir.func internal private @[[TLS_CD_DYN_INIT]]() { // CIR: %[[GET_GLOB:.*]] = cir.get_global thread_local @tls_cd_dyn : !cir.ptr // CIR: %[[CALL:.*]] = cir.call @_Z5get_iv() : () -> (!s32i {llvm.noundef}) // CIR: cir.call @_ZN8CtorDtorC1Ei(%[[GET_GLOB]], %[[CALL]]) @@ -119,7 +143,7 @@ thread_local CtorDtor tls_cd_dyn = get_i(); // CIR: cir.call @__cxa_thread_atexit(%[[DTOR_DECAY]], %[[GLOB_DECAY]], 
%[[DSOHANDLE]]) : (!cir.ptr)>>, !cir.ptr, !cir.ptr) -> () // CIR: cir.return // -// LLVM: define internal void @[[TLS_CD_DYN_INIT:.*]]() { +// LLVM: define internal void @[[TLS_CD_DYN_INIT]]() { // OGCG: define internal void @[[TLS_CD_DYN_INIT:.*]]() {{.*}} { // LLVM: %[[GET_GLOB:.*]] = call ptr @llvm.threadlocal.address.p0(ptr @tls_cd_dyn) // LLVM-BOTH: %[[CALL:.*]] = call noundef i32 @_Z5get_iv() @@ -137,13 +161,13 @@ thread_local CtorDtor &tls_cd_ref = tls_cd_dyn; // CIR-BEFORE-LPP: cir.store {{.*}}%[[CALL]], %[[GET_GLOB]] : !cir.ptr, !cir.ptr> // CIR-BEFORE-LPP: } // CIR: cir.global external tls_dyn dyn_tls_refs = <"_ZTW10tls_cd_ref", "_ZTH10tls_cd_ref"> @tls_cd_ref = #cir.ptr : !cir.ptr -// CIR: cir.func internal private @[[TLS_CD_REF_INIT:.*]]() { +// CIR: cir.func internal private @[[TLS_CD_REF_INIT]]() { // CIR: %[[GET_GLOB:.*]] = cir.get_global thread_local @tls_cd_ref : !cir.ptr> // CIR: %[[GET_DYN:.*]] = cir.call @_ZTW10tls_cd_dyn() : () -> !cir.ptr // CIR: cir.store align(8) %[[GET_DYN]], %[[GET_GLOB]] : !cir.ptr, !cir.ptr> // CIR: cir.return // -// LLVM: define internal void @[[TLS_CD_REF_INIT:.*]]() { +// LLVM: define internal void @[[TLS_CD_REF_INIT]]() { // OGCG: define internal void @[[TLS_CD_REF_INIT:.*]]() {{.*}} { // LLVM: %[[GET_GLOB:.*]] = call ptr @llvm.threadlocal.address.p0(ptr @tls_cd_ref) // LLVM-BOTH: %[[CALL:.*]] = call ptr @_ZTW10tls_cd_dyn() @@ -167,7 +191,7 @@ thread_local CtorDtor tls_cd_dyn_not_used = get_i(); // CIR-BEFORE-LPP: cir.call @_ZN8CtorDtorD1Ev(%[[GET_GLOB]]) : (!cir.ptr) -> () // CIR-BEFORE-LPP: } // CIR: cir.global external tls_dyn dyn_tls_refs = <"_ZTW19tls_cd_dyn_not_used", "_ZTH19tls_cd_dyn_not_used"> @tls_cd_dyn_not_used = #cir.zero : !rec_CtorDtor -// CIR: cir.func internal private @[[TLS_CD_DYN_NOT_USED_INIT:.*]]() { +// CIR: cir.func internal private @[[TLS_CD_DYN_NOT_USED_INIT]]() { // CIR: %[[GET_GLOB:.*]] = cir.get_global thread_local @tls_cd_dyn_not_used : !cir.ptr // CIR: %[[CALL:.*]] = cir.call 
@_Z5get_iv() : () -> (!s32i {llvm.noundef}) // CIR: cir.call @_ZN8CtorDtorC1Ei(%[[GET_GLOB]], %[[CALL]]) @@ -179,7 +203,7 @@ thread_local CtorDtor tls_cd_dyn_not_used = get_i(); // CIR: cir.call @__cxa_thread_atexit(%[[DTOR_DECAY]], %[[GLOB_DECAY]], %[[DSOHANDLE]]) : (!cir.ptr)>>, !cir.ptr, !cir.ptr) -> () // CIR: cir.return // -// LLVM: define internal void @[[TLS_CD_DYN_NOT_USED_INIT:.*]]() { +// LLVM: define internal void @[[TLS_CD_DYN_NOT_USED_INIT]]() { // OGCG: define internal void @[[TLS_CD_DYN_NOT_USED_INIT:.*]]() {{.*}} { // LLVM: %[[GET_GLOB:.*]] = call ptr @llvm.threadlocal.address.p0(ptr @tls_cd_dyn_not_used) // LLVM-BOTH: %[[CALL:.*]] = call noundef i32 @_Z5get_iv() diff --git a/clang/test/CIR/CodeGen/global-tls-simple-init.cpp b/clang/test/CIR/CodeGen/global-tls-simple-init.cpp index c14c21358ebff..4d480bfa64f5b 100644 --- a/clang/test/CIR/CodeGen/global-tls-simple-init.cpp +++ b/clang/test/CIR/CodeGen/global-tls-simple-init.cpp @@ -54,12 +54,25 @@ struct CtorDtor { // Full init of all variables (func names below). 
// CIR-LABEL: cir.func internal private @__tls_init() { +// CIR: %[[GET_GUARD:.*]] = cir.get_global thread_local @__tls_guard : !cir.ptr +// CIR: %[[LOAD_GUARD:.*]] = cir.load align(1) %[[GET_GUARD]] : !cir.ptr, !s8i +// CIR: %[[ZERO:.*]] = cir.const #cir.int<0> : !s8i +// CIR: %[[CMP:.*]] = cir.cmp eq %[[LOAD_GUARD]], %[[ZERO]] : !s8i +// CIR: cir.if %[[CMP]] { +// CIR: %[[ONE:.*]] = cir.const #cir.int<1> : !s8i +// CIR: cir.store %[[ONE]], %[[GET_GUARD]] : !s8i, !cir.ptr +// CIR: cir.call @[[TLS_INT_DYN_INIT:.*]]() : () -> () +// CIR: cir.call @[[TLS_INT_REF_INIT:.*]]() : () -> () +// CIR: cir.call @[[TLS_INT_SELF_REF_INIT:.*]]() : () -> () +// CIR: cir.call @[[DEF_INITED_DYN:.*]]() : () -> () +// CIR: } // CIR: cir.return // CIR-LABEL: cir.func comdat weak_odr private hidden @_ZTW7tls_int() -> !cir.ptr { // CIR: %[[GET_GLOB:.*]] = cir.get_global thread_local @tls_int : !cir.ptr // CIR: cir.return %[[GET_GLOB]] +// LLVM-BOTH-DAG: @__tls_guard = internal thread_local global i8 0, align 1 // LLVM-BOTH-DAG: @tls_int = thread_local global i32 5, align 4 // LLVM-BOTH-DAG: @tls_int_dyn = thread_local global i32 0, align 4 // LLVM-BOTH-DAG: @tls_int_ref = thread_local global ptr null, align 8 @@ -107,6 +120,17 @@ struct CtorDtor { // LLVM: ret ptr %[[GET_GLOB]] // LLVM: define internal void @__tls_init() { +// LLVM: %[[GET_GUARD:.*]] = call ptr @llvm.threadlocal.address.p0(ptr @__tls_guard) +// LLVM: %[[LOAD_GUARD:.*]] = load i8, ptr %[[GET_GUARD]], align 1 +// LLVM: %[[IS_UNINIT:.*]] = icmp eq i8 %[[LOAD_GUARD]], 0 +// LLVM: br i1 %[[IS_UNINIT]] +// +// LLVM: store i8 1, ptr %[[GET_GUARD]], align 1 +// LLVM: call void @[[TLS_INT_DYN_INIT:.*]]() +// LLVM: call void @[[TLS_INT_REF_INIT:.*]]() +// LLVM: call void @[[TLS_INT_SELF_REF_INIT:.*]]() +// LLVM: call void @[[DEF_INITED_DYN:.*]]() +// LLVM: br label // LLVM: ret void // LLVM: define weak_odr hidden ptr @_ZTW7tls_int() { @@ -126,12 +150,12 @@ thread_local int tls_int_dyn = get_i(); // CIR-BEFORE-LPP: cir.store 
{{.*}}%[[CALL]], %[[GET_GLOB]] : !s32i, !cir.ptr // CIR-BEFORE-LPP: } // CIR: cir.global external tls_dyn dyn_tls_refs = <"_ZTW11tls_int_dyn", "_ZTH11tls_int_dyn"> @tls_int_dyn = #cir.int<0> : !s32i -// CIR: cir.func internal private @[[TLS_INT_DYN_INIT:.*]]() { +// CIR: cir.func internal private @[[TLS_INT_DYN_INIT]]() { // CIR: %[[GET_GLOB:.*]] = cir.get_global thread_local @tls_int_dyn : !cir.ptr // CIR: %[[CALL:.*]] = cir.call @_Z5get_iv() : () -> (!s32i {llvm.noundef}) // CIR: cir.store {{.*}}%[[CALL]], %[[GET_GLOB]] : !s32i, !cir.ptr // CIR: cir.return -// LLVM: define internal void @[[TLS_INT_DYN_INIT:.*]]() { +// LLVM: define internal void @[[TLS_INT_DYN_INIT]]() { // OGCG: define internal void @[[TLS_INT_DYN_INIT:.*]]() // OGCG: %[[CALL:.*]] = call noundef i32 @_Z5get_iv() // LLVM-BOTH: %[[GET_GLOB:.*]] = call {{.*}}ptr @llvm.threadlocal.address.p0(ptr {{.*}}@tls_int_dyn) @@ -146,12 +170,12 @@ thread_local int &tls_int_ref = tls_int_dyn; // CIR-BEFORE-LPP: cir.store {{.*}}%[[GET_OTHER]], %[[GET_GLOB]] : !cir.ptr, !cir.ptr> // CIR-BEFORE-LPP: } // CIR: cir.global external tls_dyn dyn_tls_refs = <"_ZTW11tls_int_ref", "_ZTH11tls_int_ref"> @tls_int_ref = #cir.ptr : !cir.ptr -// CIR: cir.func internal private @[[TLS_INT_REF_INIT:.*]]() { +// CIR: cir.func internal private @[[TLS_INT_REF_INIT]]() { // CIR: %[[GET_GLOB:.*]] = cir.get_global thread_local @tls_int_ref : !cir.ptr> // CIR: %[[GET_REF:.*]] = cir.call @_ZTW11tls_int_dyn() : () -> !cir.ptr // CIR: cir.store {{.*}}%[[GET_REF]], %[[GET_GLOB]] : !cir.ptr, !cir.ptr> // CIR: cir.return -// LLVM: define internal void @[[TLS_INT_REF_INIT:.*]]() { +// LLVM: define internal void @[[TLS_INT_REF_INIT]]() { // OGCG: define internal void @[[TLS_INT_REF_INIT:.*]]() // OGCG: %[[GET_REF:.*]] = call ptr @_ZTW11tls_int_dyn() // LLVM-BOTH: %[[GET_GLOB:.*]] = call {{.*}}ptr @llvm.threadlocal.address.p0(ptr {{.*}}@tls_int_ref) @@ -174,7 +198,7 @@ thread_local int tls_int_self_init = tls_int_self_init + get_i(); // 
CIR-BEFORE-LPP: cir.store {{.*}}%[[ADD]], %[[GET_GLOB]] : !s32i, !cir.ptr // CIR-BEFORE-LPP: } // CIR: cir.global external tls_dyn dyn_tls_refs = <"_ZTW17tls_int_self_init", "_ZTH17tls_int_self_init"> @tls_int_self_init = #cir.int<0> : !s32i -// CIR: cir.func internal private @[[TLS_INT_SELF_REF_INIT:.*]]() { +// CIR: cir.func internal private @[[TLS_INT_SELF_REF_INIT]]() { // CIR: %[[GET_GLOB:.*]] = cir.get_global thread_local @tls_int_self_init : !cir.ptr // CIR: %[[GET_SELF_FROM_WRAPPER:.*]] = cir.call @_ZTW17tls_int_self_init() : () -> !cir.ptr // CIR: %[[SELF_LOAD:.*]] = cir.load {{.*}}%[[GET_SELF_FROM_WRAPPER]] : !cir.ptr, !s32i @@ -182,7 +206,7 @@ thread_local int tls_int_self_init = tls_int_self_init + get_i(); // CIR: %[[ADD:.*]] = cir.add nsw %[[SELF_LOAD]], %[[CALL]] : !s32i // CIR: cir.store{{.*}} %[[ADD]], %[[GET_GLOB]] : !s32i, !cir.ptr // CIR: cir.return -// LLVM: define internal void @[[TLS_INT_SELF_REF_INIT:.*]]() { +// LLVM: define internal void @[[TLS_INT_SELF_REF_INIT]]() { // OGCG: define internal void @[[TLS_INT_SELF_REF_INIT:.*]]() // LLVM: %[[GET_GLOB:.*]] = call {{.*}}ptr @llvm.threadlocal.address.p0(ptr {{.*}}@tls_int_self_init) // LLVM-BOTH: %[[GET_SELF_FROM_WRAPPER:.*]] = call ptr @_ZTW17tls_int_self_init() @@ -209,12 +233,12 @@ extern thread_local int definitely_inited_dyn = get_i(); // CIR-BEFORE-LPP: cir.store {{.*}}%[[CALL]], %[[GET_GLOB]] : !s32i, !cir.ptr // CIR-BEFORE-LPP: } // CIR: cir.global external tls_dyn dyn_tls_refs = <"_ZTW21definitely_inited_dyn", "_ZTH21definitely_inited_dyn"> @definitely_inited_dyn = #cir.int<0> : !s32i -// CIR: cir.func internal private @[[DEF_INITED_DYN:.*]]() { +// CIR: cir.func internal private @[[DEF_INITED_DYN]]() { // CIR: %[[GET_GLOB:.*]] = cir.get_global thread_local @definitely_inited_dyn : !cir.ptr // CIR: %[[CALL:.*]] = cir.call @_Z5get_iv() : () -> (!s32i {llvm.noundef}) // CIR: cir.store align(4) %[[CALL]], %[[GET_GLOB]] : !s32i, !cir.ptr // CIR: cir.return -// LLVM: define internal void 
@[[DEF_INITED_DYN:.*]]() { +// LLVM: define internal void @[[DEF_INITED_DYN]]() { // OGCG: define internal void @[[DEF_INITED_DYN:.*]]() // LLVM: %[[GET_GLOB:.*]] = call ptr @llvm.threadlocal.address.p0(ptr @definitely_inited_dyn) // LLVM-BOTH: %[[CALL:.*]] = call noundef i32 @_Z5get_iv() diff --git a/clang/test/CIR/CodeGen/global-tls-templates.cpp b/clang/test/CIR/CodeGen/global-tls-templates.cpp index 95cf1a26069e9..d6af8e90d2229 100644 --- a/clang/test/CIR/CodeGen/global-tls-templates.cpp +++ b/clang/test/CIR/CodeGen/global-tls-templates.cpp @@ -38,6 +38,7 @@ thread_local T tls_templ = {get_i()}; // Alias: Ctor/Dtor: // CIR: cir.func linkonce_odr @_ZTH9tls_templI8CtorDtorE() alias(@[[CTOR_DTOR_INIT:[^)]*]]) // TLS Guard: Ctor/Dtor: +// CIR: cir.global "private" linkonce_odr comdat tls_dyn @_ZGV9tls_templI8CtorDtorE = #cir.int<0> : !s64i // Wrapper: int // CIR-LABEL: cir.func comdat weak_odr private hidden @_ZTW9tls_templIiE() -> !cir.ptr @@ -48,15 +49,28 @@ thread_local T tls_templ = {get_i()}; // Alias: int // CIR: cir.func linkonce_odr @_ZTH9tls_templIiE() alias(@[[INT_INIT:[^)]*]]) +// TLS Guard: int +// CIR: cir.global "private" linkonce_odr comdat tls_dyn @_ZGV9tls_templIiE = #cir.int<0> : !s64i // Global: int // CIR: cir.global linkonce_odr comdat tls_dyn dyn_tls_refs = <"_ZTW9tls_templIiE", "_ZTH9tls_templIiE", "_ZGV9tls_templIiE"> @_Z9tls_templIiE = #cir.int<0> : !s32i // Init Func: int // CIR: cir.func internal private @[[INT_INIT]]() { +// CIR: %[[GET_GUARD:.*]] = cir.get_global thread_local @_ZGV9tls_templIiE : !cir.ptr +// CIR: %[[GUARD_CAST:.*]] = cir.cast bitcast %[[GET_GUARD]] : !cir.ptr -> !cir.ptr +// CIR: %[[LOAD_GUARD:.*]] = cir.load align(8) %[[GUARD_CAST]] : !cir.ptr, !s8i +// CIR: %[[ZERO:.*]] = cir.const #cir.int<0> : !s8i +// CIR: %[[ISUNINIT:.*]] = cir.cmp eq %[[LOAD_GUARD]], %[[ZERO]] : !s8i +// CIR: cir.if %[[ISUNINIT]] { +// CIR: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i +// CIR: cir.store %[[ONE]], %[[GET_GUARD]] : !s64i, !cir.ptr 
// CIR: %[[GET_GLOB:.*]] = cir.get_global thread_local @_Z9tls_templIiE : !cir.ptr // CIR: %[[CALL:.*]] = cir.call @_Z5get_iv() : () -> (!s32i {llvm.noundef}) // CIR: cir.store {{.*}}%[[CALL]], %[[GET_GLOB]] : !s32i, !cir.ptr +// CIR: } +// CIR: cir.return +// CIR: } // Global: Ctor/Dotr: @@ -64,6 +78,14 @@ thread_local T tls_templ = {get_i()}; // Init Func: Ctor/Dtor: // CIR: cir.func internal private @[[CTOR_DTOR_INIT]]() { +// CIR: %[[GET_GUARD:.*]] = cir.get_global thread_local @_ZGV9tls_templI8CtorDtorE : !cir.ptr +// CIR: %[[GUARD_CAST:.*]] = cir.cast bitcast %[[GET_GUARD]] : !cir.ptr -> !cir.ptr +// CIR: %[[LOAD_GUARD:.*]] = cir.load align(8) %[[GUARD_CAST]] : !cir.ptr, !s8i +// CIR: %[[ZERO:.*]] = cir.const #cir.int<0> : !s8i +// CIR: %[[ISUNINIT:.*]] = cir.cmp eq %[[LOAD_GUARD]], %[[ZERO]] : !s8i +// CIR: cir.if %[[ISUNINIT]] { +// CIR: %[[ONE:.*]] = cir.const #cir.int<1> : !s64i +// CIR: cir.store %[[ONE]], %[[GET_GUARD]] : !s64i, !cir.ptr // CIR: %[[GET_GLOB:.*]] = cir.get_global thread_local @_Z9tls_templI8CtorDtorE : !cir.ptr // CIR: %[[CALL:.*]] = cir.call @_Z5get_iv() : () -> (!s32i {llvm.noundef}) // CIR: cir.call @_ZN8CtorDtorC1Ei(%[[GET_GLOB]], %[[CALL]]) : (!cir.ptr {{.*}}, !s32i {llvm.noundef}) -> () @@ -73,9 +95,15 @@ thread_local T tls_templ = {get_i()}; // CIR: %[[GLOB_DECAY:.*]] = cir.cast bitcast %[[GET_GLOB:.*]] : !cir.ptr -> !cir.ptr // CIR: %[[DSO_HANDLE:.*]] = cir.get_global @__dso_handle : !cir.ptr // CIR: cir.call @__cxa_thread_atexit(%[[DTOR_FPTR]], %[[GLOB_DECAY]], %[[DSO_HANDLE]]) : (!cir.ptr)>>, !cir.ptr, !cir.ptr) -> () +// CIR: } +// CIR: cir.return +// CIR: } // FIXME: These have inconsistent COMDAT with classic codegen, but we don't // currently specify 'comdat' with a name. 
+// Guards: +// LLVM-BOTH-DAG: @_ZGV9tls_templI8CtorDtorE = linkonce_odr thread_local global i64 0, comdat{{.*}}, align 8 +// LLVM-BOTH-DAG: @_ZGV9tls_templIiE = linkonce_odr thread_local global i64 0, comdat{{.*}}, align 8 // Globals: // LLVM-BOTH-DAG: @_Z9tls_templIiE = linkonce_odr thread_local global i32 0, comdat, align 4 // LLVM-BOTH-DAG: @_Z9tls_templI8CtorDtorE = linkonce_odr thread_local global %struct.CtorDtor zeroinitializer, comdat, align 4 @@ -121,10 +149,13 @@ thread_local T tls_templ = {get_i()}; // but ALWAYS treats the load/stores as i8. This is likely a 'bug' in OGCG, but one that // doesn't really matter at all. // LLVM-BOTH: define internal void @[[INT_INIT]]() +// LLVM: %[[GET_GUARD:.*]] = call ptr @llvm.threadlocal.address.p0(ptr @_ZGV9tls_templIiE) +// LLVM: %[[LOAD_GUARD:.*]] = load i8, ptr %[[GET_GUARD]], align 8 // OGCG: %[[LOAD_GUARD:.*]] = load i8, ptr @_ZGV9tls_templIiE, align 8 -// OGCG: %[[ISUNINIT:.*]] = icmp eq i{{.*}} %[[LOAD_GUARD]], 0 -// OGCG: br i1 %[[ISUNINIT]] +// LLVM-BOTH: %[[ISUNINIT:.*]] = icmp eq i{{.*}} %[[LOAD_GUARD]], 0 +// LLVM-BOTH: br i1 %[[ISUNINIT]] // +// LLVM: store i64 1, ptr %[[GET_GUARD]], align 8 // OGCG: store i8 1, ptr @_ZGV9tls_templIiE, align 8 // LLVM: %[[GET_GLOB:.*]] = call {{.*}}ptr @llvm.threadlocal.address.p0(ptr {{.*}}@_Z9tls_templIiE) // LLVM: %[[CALL:.*]] = call noundef i32 @_Z5get_iv() @@ -133,10 +164,13 @@ thread_local T tls_templ = {get_i()}; // LLVM-BOTH: store i32 %[[CALL]], ptr %[[GET_GLOB]] // LLVM-BOTH: define internal void @[[CTOR_DTOR_INIT]]() +// LLVM: %[[GET_GUARD:.*]] = call ptr @llvm.threadlocal.address.p0(ptr @_ZGV9tls_templI8CtorDtorE) +// LLVM: %[[LOAD_GUARD:.*]] = load i8, ptr %[[GET_GUARD]], align 8 // OGCG: %[[LOAD_GUARD:.*]] = load i8, ptr @_ZGV9tls_templI8CtorDtorE, align 8 -// OGCG: %[[ISUNINIT:.*]] = icmp eq i{{.*}} %[[LOAD_GUARD]], 0 -// OGCG: br i1 %[[ISUNINIT]] +// LLVM-BOTH: %[[ISUNINIT:.*]] = icmp eq i{{.*}} %[[LOAD_GUARD]], 0 +// LLVM-BOTH: br i1 %[[ISUNINIT]] // 
+// LLVM: store i64 1, ptr %[[GET_GUARD]], align 8 // OGCG: store i8 1, ptr @_ZGV9tls_templI8CtorDtorE, align 8 // // LLVM: %[[GET_GLOB:.*]] = call {{.*}}ptr @llvm.threadlocal.address.p0(ptr {{.*}}@_Z9tls_templI8CtorDtorE) From f7f6040b327d5caa5a5f526a71f9ec66d301235a Mon Sep 17 00:00:00 2001 From: Erich Keane Date: Wed, 13 May 2026 20:35:58 -0700 Subject: [PATCH 22/95] [CIR] Materialize temp adjustments (#197585) This is a pretty trivial bit of adjustments that have to happen when emitting a materialized temporary, and is effectively a clone of classic codegen. Our output is effectively identical (other than some minor re-ordering problems). --- clang/lib/CIR/CodeGen/CIRGenExpr.cpp | 27 ++++++-- .../temporary-materialization-adjust.cpp | 61 +++++++++++++++++++ 2 files changed, 84 insertions(+), 4 deletions(-) create mode 100644 clang/test/CIR/CodeGen/temporary-materialization-adjust.cpp diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp index 7c2729e07a0e4..d2f30a9d6562c 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp @@ -1999,10 +1999,29 @@ LValue CIRGenFunction::emitMaterializeTemporaryExpr( // Perform derived-to-base casts and/or field accesses, to get from the // temporary object we created (and, potentially, for which we extended // the lifetime) to the subobject we're binding the reference to.
- if (!adjustments.empty()) { - cgm.errorNYI(e->getSourceRange(), - "emitMaterializeTemporaryExpr: Adjustments"); - return {}; + for (SubobjectAdjustment &adjustment : llvm::reverse(adjustments)) { + switch (adjustment.Kind) { + case SubobjectAdjustment::DerivedToBaseAdjustment: + object = + getAddressOfBaseClass(object, adjustment.DerivedToBase.DerivedClass, + adjustment.DerivedToBase.BasePath->path(), + /*nullCheckValue=*/false, e->getExprLoc()); + break; + case SubobjectAdjustment::FieldAdjustment: { + LValue lv = makeAddrLValue(object, e->getType(), AlignmentSource::Decl); + lv = emitLValueForField(lv, adjustment.Field); + assert(lv.isSimple() && + "materialized temporary field is not a simple lvalue"); + object = lv.getAddress(); + break; + } + case SubobjectAdjustment::MemberPointerAdjustment: { + mlir::Value ptr = emitScalarExpr(adjustment.Ptr.RHS); + object = emitCXXMemberDataPointerAddress( + e, object, ptr, adjustment.Ptr.MPT, /*baseInfo=*/nullptr); + break; + } + } } return makeAddrLValue(object, m->getType(), AlignmentSource::Decl); diff --git a/clang/test/CIR/CodeGen/temporary-materialization-adjust.cpp b/clang/test/CIR/CodeGen/temporary-materialization-adjust.cpp new file mode 100644 index 0000000000000..b4761f56300b4 --- /dev/null +++ b/clang/test/CIR/CodeGen/temporary-materialization-adjust.cpp @@ -0,0 +1,61 @@ +// RUN: %clang_cc1 -std=c++98 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR +// RUN: %clang_cc1 -std=c++98 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll +// RUN: FileCheck --input-file=%t-cir.ll %s --check-prefix=LLVM +// RUN: %clang_cc1 -std=c++98 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll +// RUN: FileCheck --input-file=%t.ll %s --check-prefix=LLVM + +struct Base { int x; }; + +void Field() { + const int &r = Base().x; +} +// CIR-LABEL: cir.func {{.*}}@_Z5Fieldv() +// CIR: %[[TEMP_ALLOCA:.*]] = cir.alloca !rec_Base, 
!cir.ptr, ["ref.tmp0"] +// CIR: %[[R_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["r", init, const] +// CIR: %[[GET_MEM:.*]] = cir.get_member %[[TEMP_ALLOCA]][0] {name = "x"} : !cir.ptr -> !cir.ptr +// CIR: cir.store align(8) %[[GET_MEM]], %[[R_ALLOCA]] : !cir.ptr, !cir.ptr> + +// LLVM-LABEL: define {{.*}}@_Z5Fieldv() +// LLVM-DAG: %[[TEMP_ALLOCA:.*]] = alloca %struct.Base +// LLVM-DAG: %[[R_ALLOCA:.*]] = alloca ptr +// LLVM: %[[GET_MEM:.*]] = getelementptr inbounds nuw %struct.Base, ptr %[[TEMP_ALLOCA]], i32 0, i32 0 +// LLVM: store ptr %[[GET_MEM]], ptr %[[R_ALLOCA]], align 8 + +void MemPtr(int Base::*mp) { + const int &r = Base().*mp; +} +// CIR-LABEL: cir.func {{.*}}@_Z6MemPtrM4Basei +// CIR: %[[MP_ALLOCA:.*]] = cir.alloca !s64i, !cir.ptr, ["mp", init] {alignment = 8 : i64} +// CIR: %[[TEMP_ALLOCA:.*]] = cir.alloca !rec_Base, !cir.ptr, ["ref.tmp0"] {alignment = 4 : i64} +// CIR: %[[R_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["r", init, const] {alignment = 8 : i64} +// CIR: %[[ARG_LOAD:.*]] = cir.load align(8) %[[MP_ALLOCA]] : !cir.ptr, !s64i +// CIR: %[[TEMP_LOAD:.*]] = cir.cast bitcast %[[TEMP_ALLOCA]] : !cir.ptr -> !cir.ptr +// CIR: %[[STRIDE:.*]] = cir.ptr_stride %[[TEMP_LOAD]], %[[ARG_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CIR: %[[TO_INT:.*]] = cir.cast bitcast %[[STRIDE:.*]] : !cir.ptr -> !cir.ptr +// CIR: cir.store align(8) %[[TO_INT]], %[[R_ALLOCA]] : !cir.ptr, !cir.ptr> + +// LLVM-LABEL: define {{.*}}@_Z6MemPtrM4Basei +// LLVM: %[[MP_ALLOCA:.*]] = alloca i64 +// LLVM-DAG: %[[TEMP_ALLOCA:.*]] = alloca %struct.Base +// LLVM-DAG: %[[R_ALLOCA:.*]] = alloca ptr +// LLVM: %[[ARG_LOAD:.*]] = load i64, ptr %[[MP_ALLOCA]], align 8 +// LLVM: %[[STRIDE:.*]] = getelementptr {{.*}}i8, ptr %[[TEMP_ALLOCA]], i64 %[[ARG_LOAD]] +// LLVM: store ptr %[[STRIDE]], ptr %[[R_ALLOCA]], align 8 + +struct Derived : Base {}; +void DerivedToBase() { + const int &r = Derived().x; +} +// CIR-LABEL: cir.func {{.*}}@_Z13DerivedToBasev() +// CIR: %[[TEMP_ALLOCA:.*]] = 
cir.alloca !rec_Derived, !cir.ptr, ["ref.tmp0"] {alignment = 4 : i64} +// CIR: %[[R_ALLOCA:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["r", init, const] {alignment = 8 : i64} +// CIR: %[[BASE:.*]] = cir.base_class_addr %[[TEMP_ALLOCA]] : !cir.ptr nonnull [0] -> !cir.ptr +// CIR: %[[GET_MEM:.*]] = cir.get_member %[[BASE]][0] {name = "x"} : !cir.ptr -> !cir.ptr +// CIR: cir.store align(8) %[[GET_MEM]], %[[R_ALLOCA]] : !cir.ptr, !cir.ptr> + +// LLVM-LABEL: define {{.*}}@_Z13DerivedToBasev +// LLVM-DAG: %[[TEMP_ALLOCA:.*]] = alloca %struct.Derived +// LLVM-DAG: %[[R_ALLOCA:.*]] = alloca ptr +// LLVM: %[[GET_MEM:.*]] = getelementptr inbounds nuw %struct.Base, ptr %[[TEMP_ALLOCA]], i32 0, i32 0 +// LLVM: store ptr %[[GET_MEM]], ptr %[[R_ALLOCA]], align 8 From b638763d6c8b5227f1ba9addd9f5964fa796385b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Wed, 13 May 2026 20:40:43 -0700 Subject: [PATCH 23/95] [flang][cuda] Use wider cudaMemcpy2D rows for descriptor transfers (#197563) --- flang-rt/lib/cuda/memory.cpp | 58 ++++++++- flang-rt/unittests/Runtime/CUDA/Memory.cpp | 144 +++++++++++++++++++++ 2 files changed, 198 insertions(+), 4 deletions(-) diff --git a/flang-rt/lib/cuda/memory.cpp b/flang-rt/lib/cuda/memory.cpp index 575d7bbc9c29a..05302ee47e093 100644 --- a/flang-rt/lib/cuda/memory.cpp +++ b/flang-rt/lib/cuda/memory.cpp @@ -30,9 +30,9 @@ struct Memcpy2DLayout { std::size_t pitchBytes; }; -// Get cudaMemcpy2D layout information if both descriptors have equal element -// counts and regular positive-stride layouts. Returns a nullopt otherwise to -// fallback on the runtime assignment. +// Get cudaMemcpy2D layout information for a descriptor that can be represented +// as fixed-pitch rows of widthBytes. Returns nullopt for layouts that need the +// general runtime assignment path. 
static std::optional GetMemcpy2DLayout( const Descriptor &desc, std::size_t widthBytes) { if (desc.rank() == 0 || desc.Elements() == 0) { @@ -84,13 +84,63 @@ static std::optional GetMemcpy2DLayout( return layout; } +// Collect candidate row widths from the descriptor's leading contiguous +// dimensions, starting with one element. +static int GetContiguousLeadingBytes( + const Descriptor &desc, std::size_t *bytes) { + const auto elemBytes = desc.ElementBytes(); + if (elemBytes == 0) { + return 0; + } + + int count = 0; + bytes[count++] = elemBytes; + std::size_t contiguousBytes = elemBytes; + for (int j = 0; j < desc.rank(); ++j) { + const auto &dim = desc.GetDimension(j); + if (dim.Extent() != 1 && + (dim.ByteStride() < 0 || + static_cast(dim.ByteStride()) != contiguousBytes)) { + break; + } + contiguousBytes *= dim.Extent(); + if (contiguousBytes != bytes[count - 1]) { + bytes[count++] = contiguousBytes; + } + } + return count; +} + +// Choose the largest row width that is contiguous in both descriptors, so +// leading-dimension slices can be copied as wider cudaMemcpy2D rows. +static std::size_t GetMemcpy2DWidthBytes( + const Descriptor &dst, const Descriptor &src) { + std::size_t dstBytes[maxRank + 1]; + std::size_t srcBytes[maxRank + 1]; + const int dstCount = GetContiguousLeadingBytes(dst, dstBytes); + const int srcCount = GetContiguousLeadingBytes(src, srcBytes); + for (int j = dstCount - 1; j >= 0; --j) { + for (int k = srcCount - 1; k >= 0; --k) { + if (dstBytes[j] == srcBytes[k]) { + return dstBytes[j]; + } + } + } + return 0; +} + +// Try to use cudaMemcpy2D for a memcpy of two descriptors, returning true if +// successful. False if the 2D data transfer is not possible. 
static bool DoMemcpy2D(const Descriptor &dst, const Descriptor &src, cudaMemcpyKind kind, const char *sourceFile, int sourceLine) { if (dst.ElementBytes() != src.ElementBytes() || dst.Elements() != src.Elements()) return false; - std::size_t widthBytes = dst.ElementBytes(); + std::size_t widthBytes = GetMemcpy2DWidthBytes(dst, src); + if (widthBytes == 0) { + return false; + } auto dstLayout = GetMemcpy2DLayout(dst, widthBytes); auto srcLayout = GetMemcpy2DLayout(src, widthBytes); if (!dstLayout || !srcLayout) { diff --git a/flang-rt/unittests/Runtime/CUDA/Memory.cpp b/flang-rt/unittests/Runtime/CUDA/Memory.cpp index 907df0ffb985a..a3aceb884cd83 100644 --- a/flang-rt/unittests/Runtime/CUDA/Memory.cpp +++ b/flang-rt/unittests/Runtime/CUDA/Memory.cpp @@ -160,3 +160,147 @@ TEST(MemoryCUFTest, CUFDataTransferDescDescStrided) { EXPECT_EQ(recvStorage[i * stride + 1], -2); } } + +TEST(MemoryCUFTest, CUFDataTransferDescDescLeadingSliceRank2) { + using Fortran::common::TypeCategory; + static constexpr int nx{8}; + static constexpr int ny{4}; + static constexpr int elements{nx * ny}; + SubscriptValue sliceExtent[]{nx - 2, ny}; + + std::int32_t hostStorage[elements]{}; + for (int j{0}; j < ny; ++j) { + for (int i{1}; i < nx - 1; ++i) { + hostStorage[i + nx * j] = i + 10 * j; + } + } + + std::int32_t *devStorage{static_cast(RTNAME(CUFMemAlloc)( + sizeof(hostStorage), kMemTypeDevice, __FILE__, __LINE__))}; + ASSERT_NE(devStorage, nullptr); + cudaMemset(devStorage, 0xff, sizeof(hostStorage)); + + StaticDescriptor<2> hostStaticDesc; + Descriptor &hostDesc{hostStaticDesc.descriptor()}; + hostDesc.Establish(TypeCode{TypeCategory::Integer, 4}, sizeof(std::int32_t), + hostStorage + 1, 2, sliceExtent); + hostDesc.GetDimension(0).SetByteStride(sizeof(std::int32_t)); + hostDesc.GetDimension(1).SetByteStride(nx * sizeof(std::int32_t)); + + StaticDescriptor<2> devStaticDesc; + Descriptor &devDesc{devStaticDesc.descriptor()}; + devDesc.Establish(TypeCode{TypeCategory::Integer, 4}, 
sizeof(std::int32_t), + devStorage + 1, 2, sliceExtent); + devDesc.GetDimension(0).SetByteStride(sizeof(std::int32_t)); + devDesc.GetDimension(1).SetByteStride(nx * sizeof(std::int32_t)); + + RTNAME(CUFDataTransferDescDesc) + (&devDesc, &hostDesc, kHostToDevice, __FILE__, __LINE__); + + std::int32_t result[elements]{}; + RTNAME(CUFDataTransferPtrPtr) + (result, devStorage, sizeof(result), kDeviceToHost, __FILE__, __LINE__); + + std::int32_t recvStorage[elements]{}; + for (int i{0}; i < elements; ++i) { + recvStorage[i] = -2; + } + StaticDescriptor<2> recvStaticDesc; + Descriptor &recvDesc{recvStaticDesc.descriptor()}; + recvDesc.Establish(TypeCode{TypeCategory::Integer, 4}, sizeof(std::int32_t), + recvStorage + 1, 2, sliceExtent); + recvDesc.GetDimension(0).SetByteStride(sizeof(std::int32_t)); + recvDesc.GetDimension(1).SetByteStride(nx * sizeof(std::int32_t)); + RTNAME(CUFDataTransferDescDesc) + (&recvDesc, &devDesc, kDeviceToHost, __FILE__, __LINE__); + + RTNAME(CUFMemFree)(devStorage, kMemTypeDevice, __FILE__, __LINE__); + + for (int j{0}; j < ny; ++j) { + EXPECT_EQ(result[nx * j], -1); + EXPECT_EQ(result[nx - 1 + nx * j], -1); + EXPECT_EQ(recvStorage[nx * j], -2); + EXPECT_EQ(recvStorage[nx - 1 + nx * j], -2); + for (int i{1}; i < nx - 1; ++i) { + const int index{i + nx * j}; + EXPECT_EQ(result[index], hostStorage[index]); + EXPECT_EQ(recvStorage[index], hostStorage[index]); + } + } +} + +TEST(MemoryCUFTest, CUFDataTransferDescDescLeadingSlice) { + using Fortran::common::TypeCategory; + static constexpr int nx{8}; + static constexpr int ny{4}; + static constexpr int nz{3}; + static constexpr int elements{nx * ny * nz}; + SubscriptValue sliceExtent[]{nx - 2, ny, nz}; + + std::int32_t hostStorage[elements]{}; + for (int k{0}; k < nz; ++k) { + for (int j{0}; j < ny; ++j) { + for (int i{1}; i < nx - 1; ++i) { + hostStorage[i + nx * (j + ny * k)] = i + 10 * j + 100 * k; + } + } + } + + std::int32_t *devStorage{static_cast(RTNAME(CUFMemAlloc)( + sizeof(hostStorage), 
kMemTypeDevice, __FILE__, __LINE__))}; + ASSERT_NE(devStorage, nullptr); + cudaMemset(devStorage, 0xff, sizeof(hostStorage)); + + StaticDescriptor<3> hostStaticDesc; + Descriptor &hostDesc{hostStaticDesc.descriptor()}; + hostDesc.Establish(TypeCode{TypeCategory::Integer, 4}, sizeof(std::int32_t), + hostStorage + 1, 3, sliceExtent); + hostDesc.GetDimension(0).SetByteStride(sizeof(std::int32_t)); + hostDesc.GetDimension(1).SetByteStride(nx * sizeof(std::int32_t)); + hostDesc.GetDimension(2).SetByteStride(nx * ny * sizeof(std::int32_t)); + + StaticDescriptor<3> devStaticDesc; + Descriptor &devDesc{devStaticDesc.descriptor()}; + devDesc.Establish(TypeCode{TypeCategory::Integer, 4}, sizeof(std::int32_t), + devStorage + 1, 3, sliceExtent); + devDesc.GetDimension(0).SetByteStride(sizeof(std::int32_t)); + devDesc.GetDimension(1).SetByteStride(nx * sizeof(std::int32_t)); + devDesc.GetDimension(2).SetByteStride(nx * ny * sizeof(std::int32_t)); + + RTNAME(CUFDataTransferDescDesc) + (&devDesc, &hostDesc, kHostToDevice, __FILE__, __LINE__); + + std::int32_t result[elements]{}; + RTNAME(CUFDataTransferPtrPtr) + (result, devStorage, sizeof(result), kDeviceToHost, __FILE__, __LINE__); + + std::int32_t recvStorage[elements]{}; + for (int i{0}; i < elements; ++i) { + recvStorage[i] = -2; + } + StaticDescriptor<3> recvStaticDesc; + Descriptor &recvDesc{recvStaticDesc.descriptor()}; + recvDesc.Establish(TypeCode{TypeCategory::Integer, 4}, sizeof(std::int32_t), + recvStorage + 1, 3, sliceExtent); + recvDesc.GetDimension(0).SetByteStride(sizeof(std::int32_t)); + recvDesc.GetDimension(1).SetByteStride(nx * sizeof(std::int32_t)); + recvDesc.GetDimension(2).SetByteStride(nx * ny * sizeof(std::int32_t)); + RTNAME(CUFDataTransferDescDesc) + (&recvDesc, &devDesc, kDeviceToHost, __FILE__, __LINE__); + + RTNAME(CUFMemFree)(devStorage, kMemTypeDevice, __FILE__, __LINE__); + + for (int k{0}; k < nz; ++k) { + for (int j{0}; j < ny; ++j) { + EXPECT_EQ(result[nx * (j + ny * k)], -1); + 
EXPECT_EQ(result[nx - 1 + nx * (j + ny * k)], -1); + EXPECT_EQ(recvStorage[nx * (j + ny * k)], -2); + EXPECT_EQ(recvStorage[nx - 1 + nx * (j + ny * k)], -2); + for (int i{1}; i < nx - 1; ++i) { + const int index{i + nx * (j + ny * k)}; + EXPECT_EQ(result[index], hostStorage[index]); + EXPECT_EQ(recvStorage[index], hostStorage[index]); + } + } + } +} From 923a29a1b4ba855c7504128ed3cfe177d709d73d Mon Sep 17 00:00:00 2001 From: "Yaxun (Sam) Liu" Date: Wed, 13 May 2026 23:54:58 -0400 Subject: [PATCH 24/95] [compiler-rt][profile][test] Match clang_rt.profile CRT model on MSVC (#197474) On MSVC, Profile-* tests must link with the same CRT model as the clang_rt.profile static archive they exercise. When that archive pulls in RTInterception / RTSanitizerCommon object libraries, those are built with MultiThreadedDLL (/MD), so the .objs reference `__imp_*` symbols. The test binary defaults to /MT and fails to link with LNK2019 (`__imp__stricmp` from `interception_win.cpp`) and LNK4098 default-lib conflicts. Match the DLL CRT on the test side so test executables and the static archive use the same runtime. The change is gated on `COMPILER_RT_HAS_INTERCEPTION` and `!COMPILER_RT_PROFILE_BAREMETAL`, so configurations that don't pull interception into profile are unaffected. Split out as NFC from #177665 per review feedback. --- compiler-rt/test/profile/CMakeLists.txt | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/compiler-rt/test/profile/CMakeLists.txt b/compiler-rt/test/profile/CMakeLists.txt index a6d8a9684508d..213a05032ed80 100644 --- a/compiler-rt/test/profile/CMakeLists.txt +++ b/compiler-rt/test/profile/CMakeLists.txt @@ -22,6 +22,17 @@ pythonize_bool(LLVM_ENABLE_CURL) foreach(arch ${PROFILE_TEST_ARCH}) set(PROFILE_TEST_TARGET_ARCH ${arch}) get_test_cc_for_arch(${arch} PROFILE_TEST_TARGET_CC PROFILE_TEST_TARGET_CFLAGS) + # On MSVC, Profile-* tests must link with the same CRT model as the + # clang_rt.profile static archive they exercise. 
When that archive pulls + # in RTInterception / RTSanitizerCommon object libraries, those are built + # with MultiThreadedDLL (/MD), so the .objs reference __imp_* symbols; + # the test binary defaults to /MT and fails to link (LNK2019 __imp__stricmp + # from interception_win.cpp, LNK4098 default-lib conflicts). Match the + # DLL CRT here so test executables link against the same runtime. + if(MSVC AND COMPILER_RT_HAS_INTERCEPTION AND NOT COMPILER_RT_PROFILE_BAREMETAL) + string(APPEND PROFILE_TEST_TARGET_CFLAGS + " -D_MT -D_DLL -Wl,-nodefaultlib:libcmt,-defaultlib:msvcrt,-defaultlib:oldnames") + endif() set(CONFIG_NAME Profile-${arch}) configure_lit_site_cfg( ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.py.in From 98f2f8ccb7fb0ecd13390b7ed5252679912ae1e8 Mon Sep 17 00:00:00 2001 From: Zeyi Xu Date: Thu, 14 May 2026 12:32:14 +0800 Subject: [PATCH 25/95] [clang-tidy] Remove 80 char limit checking in CI. NFC. (#197609) The [RFC](https://discourse.llvm.org/t/rfc-remove-80-column-limit-in-documentation-files/89678/41) on removing 80 columns limit got accepted. So we should no longer enforce that rule in clang-tidy's code-linter workflow. --- clang-tools-extra/clang-tidy/doc8.ini | 1 + 1 file changed, 1 insertion(+) diff --git a/clang-tools-extra/clang-tidy/doc8.ini b/clang-tools-extra/clang-tidy/doc8.ini index 14cac344989b3..514e75ad01df5 100644 --- a/clang-tools-extra/clang-tidy/doc8.ini +++ b/clang-tools-extra/clang-tidy/doc8.ini @@ -1,2 +1,3 @@ [doc8] ignore-path = clang-tools-extra/docs/clang-tidy/Integrations.rst +ignore = D001 From e2b50489f9612c720e7e17c5652972194917c632 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Wed, 13 May 2026 22:04:34 -0700 Subject: [PATCH 26/95] [AMDGPU] Validate forced lit() immediate (#196623) Right now it takes validation path of an inline constant if fits even though it is forced to literal encoding. 
--- .../Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 8 +++++++- llvm/test/MC/AMDGPU/literals.s | 15 +++++++-------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index cc8cea8f8411c..e15cc2c072334 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -86,6 +86,7 @@ class AMDGPUOperand : public MCParsedAsmOperand { bool hasFPModifiers() const { return Abs || Neg; } bool hasIntModifiers() const { return Sext; } bool hasModifiers() const { return hasFPModifiers() || hasIntModifiers(); } + bool isForcedLit() const { return Lit == LitModifier::Lit; } bool isForcedLit64() const { return Lit == LitModifier::Lit64; } int64_t getFPModifiersOperand() const { @@ -1053,6 +1054,10 @@ class AMDGPUOperand : public MCParsedAsmOperand { return getModifiers().hasIntModifiers(); } + bool isForcedLit() const { + return isImmLiteral() && getModifiers().isForcedLit(); + } + bool isForcedLit64() const { return isImmLiteral() && getModifiers().isForcedLit64(); } @@ -5133,11 +5138,12 @@ bool AMDGPUAsmParser::validateVOPLiteral(const MCInst &Inst, Imm = getLitValue(MO.getExpr()); bool IsAnotherLiteral = false; + bool IsForcedLit = findMCOperand(Operands, OpIdx).isForcedLit(); bool IsForcedLit64 = findMCOperand(Operands, OpIdx).isForcedLit64(); if (!Imm.has_value()) { // Literal value not known, so we conservately assume it's different. 
IsAnotherLiteral = true; - } else if (IsForcedLit64 || !isInlineConstant(Inst, OpIdx)) { + } else if (IsForcedLit || IsForcedLit64 || !isInlineConstant(Inst, OpIdx)) { uint64_t Value = *Imm; bool IsForcedFP64 = Desc.operands()[OpIdx].OperandType == AMDGPU::OPERAND_KIMM64 || diff --git a/llvm/test/MC/AMDGPU/literals.s b/llvm/test/MC/AMDGPU/literals.s index 273ed630e104f..e14d55ff62757 100644 --- a/llvm/test/MC/AMDGPU/literals.s +++ b/llvm/test/MC/AMDGPU/literals.s @@ -494,7 +494,7 @@ v_pk_add_u16 v5, exec_lo, lit(1.0) // GFX11: v_pk_add_u16 v5, exec_lo, lit(0x3f800000) ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xfe,0x01,0x1a,0x00,0x00,0x80,0x3f] // GFX12XX: v_pk_add_u16 v5, exec_lo, lit(0x3f800000) ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xfe,0x01,0x1a,0x00,0x00,0x80,0x3f] // NOCI: :[[@LINE-3]]:1: error: instruction not supported on this GPU (bonaire): v_pk_add_u16 -// NOGFX9: :[[@LINE-4]]:31: error: invalid operand (violates constant bus restrictions) +// NOGFX9: :[[@LINE-4]]:31: error: literal operands are not supported // NOSI: :[[@LINE-5]]:1: error: instruction not supported on this GPU (tahiti): v_pk_add_u16 // NOVI: :[[@LINE-6]]:1: error: instruction not supported on this GPU (tonga): v_pk_add_u16 @@ -980,7 +980,7 @@ v_pk_add_u16 v5, exec_lo, lit(1) // GFX11: v_pk_add_u16 v5, exec_lo, lit(0x1) ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xfe,0x01,0x1a,0x01,0x00,0x00,0x00] // GFX12XX: v_pk_add_u16 v5, exec_lo, lit(0x1) ; encoding: [0x05,0x40,0x0a,0xcc,0x7e,0xfe,0x01,0x1a,0x01,0x00,0x00,0x00] // NOCI: :[[@LINE-3]]:1: error: instruction not supported on this GPU (bonaire): v_pk_add_u16 -// NOGFX9: :[[@LINE-4]]:31: error: invalid operand (violates constant bus restrictions) +// NOGFX9: :[[@LINE-4]]:31: error: literal operands are not supported // NOSI: :[[@LINE-5]]:1: error: instruction not supported on this GPU (tahiti): v_pk_add_u16 // NOVI: :[[@LINE-6]]:1: error: instruction not supported on this GPU (tonga): v_pk_add_u16 @@ -1979,12 +1979,11 @@ v_add_nc_u64 v[0:1], 
v[0:1], lit64(1) // NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU v_add_f64 v[0:1], v[0:1], lit(1) -// GFX11: v_add_f64 v[0:1], v[0:1], lit(0x1) ; encoding: [0x00,0x00,0x27,0xd7,0x00,0xff,0x01,0x02,0x01,0x00,0x00,0x00] -// GFX12: v_add_f64_e64 v[0:1], v[0:1], lit(0x1) ; encoding: [0x00,0x00,0x02,0xd5,0x00,0xff,0x01,0x02,0x01,0x00,0x00,0x00] -// GFX1250-ASM: v_add_f64_e64 v[0:1], v[0:1], lit(0x1) ; encoding: [0x00,0x00,0x02,0xd5,0x00,0xff,0x01,0x02,0x01,0x00,0x00,0x00] -// GFX1250-DIS: v_add_f64_e64 v[0:1], v[0:1], 0x1 ; encoding: [0x00,0x00,0x02,0xd5,0x00,0xff,0x01,0x02,0x01,0x00,0x00,0x00] -// GFX89: v_add_f64 v[0:1], v[0:1], lit(0x1) ; encoding: [0x00,0x00,0x80,0xd2,0x00,0xff,0x01,0x00] -// SICI: v_add_f64 v[0:1], v[0:1], lit(0x1) ; encoding: [0x00,0x00,0xc8,0xd2,0x00,0xff,0x01,0x00] +// NOGFX11: :[[@LINE-1]]:31: error: invalid operand for instruction +// NOGFX12: :[[@LINE-2]]:31: error: invalid operand for instruction +// NOGFX1250: :[[@LINE-3]]:31: error: invalid operand for instruction +// NOGFX89: :[[@LINE-4]]:31: error: invalid operand for instruction +// NOSICI: :[[@LINE-5]]:31: error: invalid operand for instruction v_add_f64 v[0:1], v[0:1], lit(1.0) // NOGFX11: :[[@LINE-1]]:31: error: invalid operand for instruction From 4f60fb9a1ec66e551307e460faf4fc01c622c4be Mon Sep 17 00:00:00 2001 From: Zhen Wang Date: Wed, 13 May 2026 22:07:22 -0700 Subject: [PATCH 27/95] [flang][cuda] Honor !dir$ ignore_tkr(m) under -gpu=mem:{unified,managed} (#197518) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A device-typed dummy with `!dir$ ignore_tkr(m)` is meant to be an overload discriminator (only selected for actuals with an explicit `device/managed/unified` attribute). Skip the host->device relaxation in AreCompatibleCUDADataAttrs when `IgnoreTKR::Managed` is set so unattributed host actuals no longer bind to such a dummy. 
Also document the §3.2.3 matching distance table next to GetMatchingDistance and add LIT tests for the full Table 2 grid and the ignore_tkr(m) carve-out. --- flang/docs/Directives.md | 36 ++++++++ flang/lib/Semantics/expression.cpp | 25 +++++- flang/lib/Support/Fortran.cpp | 18 ++-- .../test/Semantics/cuf-ignore-tkr-m-error.cuf | 32 +++++++ .../Semantics/cuf-ignore-tkr-m-generic.cuf | 56 ++++++++++++ .../test/Semantics/cuf-matching-distance.cuf | 90 +++++++++++++++++++ 6 files changed, 250 insertions(+), 7 deletions(-) create mode 100644 flang/test/Semantics/cuf-ignore-tkr-m-error.cuf create mode 100644 flang/test/Semantics/cuf-ignore-tkr-m-generic.cuf create mode 100644 flang/test/Semantics/cuf-matching-distance.cuf diff --git a/flang/docs/Directives.md b/flang/docs/Directives.md index 3ff56dbded1d7..385d44b7ced07 100644 --- a/flang/docs/Directives.md +++ b/flang/docs/Directives.md @@ -29,6 +29,42 @@ A list of non-standard directives supported by Flang argument's descriptor and passed as a raw pointer. The letter (P) ignores pointer and allocatable matching, so that one can pass an allocatable array to routine with pointer array argument and vice versa. + The letter (M) disables matching of the actual argument's CUDA storage + (managed/unified) against the dummy's. Its main use is in host modules that + overload the same routine with both a host-typed and a `device`-typed + specific: placing (M) on the device-typed dummy turns that specific into an + overload discriminator. Under `-gpu=mem:unified` or `-gpu=mem:managed`, an + unattributed host actual is normally allowed to bind to a `device` dummy + (the host-to-device attribute check is relaxed). (M) on that dummy opts it + out of the relaxation: an unattributed host actual then binds to the + host-typed specific in the same overload set, while actuals with an + explicit `device`, `managed`, or `unified` attribute continue to bind to + the device-typed specific. 
For example: +``` + interface compute + module procedure compute_host + module procedure compute_device + end interface +contains + subroutine compute_host(alpha) + real :: alpha + end + subroutine compute_device(alpha) + real, device :: alpha + !dir$ ignore_tkr(m) alpha + end + ! ... + real :: a ! plain host scalar + real, device :: d ! device scalar + call compute(a) ! always binds to compute_host + call compute(d) ! always binds to compute_device +``` + For contrast: without `ignore_tkr(m)` on `compute_device`, + `call compute(a)` compiled with `-gpu=mem:unified` would instead resolve + to `compute_device`, because the matching rules let `a` bind to the + device dummy and rank it as a closer match than the host one (see the + "Attributed Argument Matching Distance Values" table in section 3.2.3 + of the CUDA Fortran Programming Guide). For example, if one wanted to call a "set all bytes to zero" utility that could be applied to arrays of any type or rank: ``` diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp index dad401f0baa74..8ee0613bdfc5f 100644 --- a/flang/lib/Semantics/expression.cpp +++ b/flang/lib/Semantics/expression.cpp @@ -2839,8 +2839,29 @@ static int CompareCudaMatchingDistance( return 0; } -// Compute the matching distance as described in section 3.2.3 of the CUDA -// Fortran references. +// Compute the matching distance for one (dummy, actual) pair as described +// in section 3.2.3 ("Table 2: Attributed Argument Matching Distance Values") +// of the CUDA Fortran Programming Guide. The column applied for the actual +// depends on its CUDA data attribute and (for unattributed actuals) on the +// active -gpu=mem:{unified,managed} mode. 
+// +// Distance values returned (smaller is a better match; INF means +// incompatible and disqualifies the candidate): +// +// Actual argument attribute +// None ACC gpu= gpu= +// Dummy attr (Host) Device Managed Unified use_dev unified managed +// ----------+--------+-------+--------+-------+--------+--------+--------+ +// None(host)| 0 | INF | 3 | 3 | 3 | 3 | 3 | +// Device | INF | 0 | 2 | 2 | 0 | 2 | 2 | +// Managed | INF | INF | 0 | 1 | INF | 1 | 0 | +// Unified | INF | INF | 1 | 0 | INF | 0 | 1 | +// +// In addition: a dummy declared TYPE(*) (assumed-size/rank opaque buffer) +// is "CUDA address space agnostic" and accepts any attributed actual at a +// non-zero distance (3) so an explicit Device overload still wins. The +// "ACC use_dev" column applies to actuals appearing in a surrounding +// ACC HOST_DATA USE_DEVICE clause. static int GetMatchingDistance(const common::LanguageFeatureControl &features, const characteristics::DummyArgument &dummy, const std::optional &actual) { diff --git a/flang/lib/Support/Fortran.cpp b/flang/lib/Support/Fortran.cpp index d38e7dc051562..39a3d64a464fc 100644 --- a/flang/lib/Support/Fortran.cpp +++ b/flang/lib/Support/Fortran.cpp @@ -148,11 +148,19 @@ bool AreCompatibleCUDADataAttrs(std::optional x, } } else { if (*x == CUDADataAttr::Device) { - if ((y && - (*y == CUDADataAttr::Managed || *y == CUDADataAttr::Unified || - *y == CUDADataAttr::Shared || - *y == CUDADataAttr::Constant)) || - (!y && (isCudaUnified || isCudaManaged))) { + if (y && + (*y == CUDADataAttr::Managed || *y == CUDADataAttr::Unified || + *y == CUDADataAttr::Shared || *y == CUDADataAttr::Constant)) { + return true; + } + // A device dummy carrying !dir$ ignore_tkr(m) opts out of the + // -gpu=mem:{unified,managed} relaxation that would otherwise let + // an unattributed host actual bind to it. 
The (m) letter is used + // by host modules to mark device-typed dummies as overload + // discriminators that should only accept actuals with an explicit + // device/managed/unified attribute. + if (!y && (isCudaUnified || isCudaManaged) && + !ignoreTKR.test(IgnoreTKR::Managed)) { return true; } } else if (*x == CUDADataAttr::Managed) { diff --git a/flang/test/Semantics/cuf-ignore-tkr-m-error.cuf b/flang/test/Semantics/cuf-ignore-tkr-m-error.cuf new file mode 100644 index 0000000000000..05863c3e1ca50 --- /dev/null +++ b/flang/test/Semantics/cuf-ignore-tkr-m-error.cuf @@ -0,0 +1,32 @@ +! RUN: not bbc -emit-hlfir -fcuda -gpu=unified %s -o /dev/null 2>&1 | FileCheck %s + +! A device-attributed dummy carrying !dir$ ignore_tkr(m) opts out of +! the -gpu=mem:unified relaxation that would otherwise let an +! unattributed host actual bind to it. If a generic exposes ONLY such +! a specific, no viable candidate remains for a plain host actual and +! the call must be diagnosed. +! +! (cuf14.cuf and cuf-ignore-tkr-m-generic.cuf cover the contrasting +! cases where the device dummy has no ignore_tkr(m) and either a host +! specific is present alongside it, or it is the only specific. Those +! flows are unaffected by this carve-out.) + +module m + interface gen_only_device + module procedure sub_device_ignore_m + end interface + +contains + subroutine sub_device_ignore_m(x) + real, device :: x + !dir$ ignore_tkr(m) x + end subroutine +end module + +subroutine caller + use m + real :: a + call gen_only_device(a) +end subroutine + +! CHECK: No specific subroutine of generic 'gen_only_device' matches the actual arguments diff --git a/flang/test/Semantics/cuf-ignore-tkr-m-generic.cuf b/flang/test/Semantics/cuf-ignore-tkr-m-generic.cuf new file mode 100644 index 0000000000000..62e54b35a38a2 --- /dev/null +++ b/flang/test/Semantics/cuf-ignore-tkr-m-generic.cuf @@ -0,0 +1,56 @@ +! RUN: bbc -emit-hlfir -fcuda -gpu=unified %s -o - | FileCheck %s + +! 
Under -gpu=mem:unified, a device-attributed dummy that carries +! !dir$ ignore_tkr(m) opts out of the relaxation that lets an +! unattributed host actual bind to a device dummy. Such dummies act +! purely as overload discriminators -- the (m) indicates they should +! only be selected when the actual has an explicit +! device/managed/unified attribute. When the same generic also has a +! plain host-typed specific, that host specific must therefore be +! selected for an unattributed host actual. +! +! For comparison, a device specific without ignore_tkr(m) still wins +! over a host specific for an unattributed host actual under +! -gpu=mem:unified (this is what cuf14.cuf already covers, and is +! reproduced here for contrast). + +module m + interface gen_pair + module procedure sub_host + module procedure sub_device_ignore_m + end interface + + interface gen_pair_no_ignore + module procedure sub_host + module procedure sub_device_plain + end interface + +contains + subroutine sub_host(x) + real :: x + end subroutine + + subroutine sub_device_ignore_m(x) + real, device :: x + !dir$ ignore_tkr(m) x + end subroutine + + subroutine sub_device_plain(x) + real, device :: x + end subroutine +end module + +subroutine caller + use m + real :: a, b + ! ignore_tkr(m) on sub_device_ignore_m's dummy opts that specific + ! out of accepting an unattributed host actual -> sub_host wins. + call gen_pair(a) + ! No ignore_tkr(m); the device specific accepts the host actual and + ! is preferred over the host specific (this case is unchanged). + call gen_pair_no_ignore(b) +end subroutine + +! CHECK-LABEL: func.func @_QPcaller +! CHECK: fir.call @_QMmPsub_host +! CHECK: fir.call @_QMmPsub_device_plain diff --git a/flang/test/Semantics/cuf-matching-distance.cuf b/flang/test/Semantics/cuf-matching-distance.cuf new file mode 100644 index 0000000000000..a852aff259ee8 --- /dev/null +++ b/flang/test/Semantics/cuf-matching-distance.cuf @@ -0,0 +1,90 @@ +! 
RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s --check-prefix=NORM +! RUN: bbc -emit-hlfir -fcuda -gpu=unified %s -o - | FileCheck %s --check-prefix=UNI +! RUN: bbc -emit-hlfir -fcuda -gpu=managed %s -o - | FileCheck %s --check-prefix=MAN + +! Comprehensive coverage of Table 2 ("Attributed Argument Matching +! Distance Values") from CUDA Fortran Programming Guide §3.2.3. +! +! One generic exposes a host, device, managed, and unified specific. +! Each call site picks the winning specific based on the actual's CUDA +! attribute (or, for unattributed actuals, the active -gpu=mem mode). +! +! Actual argument attribute +! None gpu= gpu= +! Dummy attr (Host) Device Managed Unified unified managed +! ----------+--------+-------+--------+--------+--------+--------+ +! None(host)| 0 | INF | 3 | 3 | 3 | 3 | +! Device | INF | 0 | 2 | 2 | 2 | 2 | +! Managed | INF | INF | 0 | 1 | 1 | 0 | +! Unified | INF | INF | 1 | 0 | 0 | 1 | + +module m + interface gen + module procedure sub_host + module procedure sub_device + module procedure sub_managed + module procedure sub_unified + end interface +contains + subroutine sub_host(x) + integer :: x(:) + end subroutine + subroutine sub_device(x) + integer, device :: x(:) + end subroutine + subroutine sub_managed(x) + integer, managed :: x(:) + end subroutine + subroutine sub_unified(x) + integer, unified :: x(:) + end subroutine +end module + +! Test driver: one call per actual attribute. Each compilation mode +! (no flag, -gpu=unified, -gpu=managed) yields a different winner for +! the unattributed allocatable (the "None" actual). +subroutine driver + use m + integer, allocatable :: act_none(:) + integer, device, allocatable :: act_dev(:) + integer, managed, allocatable :: act_man(:) + integer, unified, allocatable :: act_uni(:) + allocate(act_none(4), act_dev(4), act_man(4), act_uni(4)) + + call gen(act_none) + call gen(act_dev) + call gen(act_man) + call gen(act_uni) +end subroutine + +! 
Without any -gpu=mem mode, an unattributed actual matches the host +! specific (Table 2 column "Actual None (host)"). Explicit Device, +! Managed and Unified actuals each select their corresponding +! specific (distance 0 down the diagonal). +! NORM-LABEL: func.func @_QPdriver +! NORM: fir.call @_QMmPsub_host +! NORM: fir.call @_QMmPsub_device +! NORM: fir.call @_QMmPsub_managed +! NORM: fir.call @_QMmPsub_unified + +! Under -gpu=mem:unified, an unattributed actual matches the unified +! specific (Table 2 column "Actual None (gpu=mem:unified)": Unified=0 +! beats Managed=1, Device=2, Host=3). Explicitly attributed actuals +! still pick their exact-match specific. +! UNI-LABEL: func.func @_QPdriver +! UNI: fir.call @_QMmPsub_unified +! UNI: fir.call @_QMmPsub_device +! UNI: fir.call @_QMmPsub_managed +! UNI: fir.call @_QMmPsub_unified + +! Under -gpu=mem:managed, an unattributed actual matches the managed +! specific (Table 2 column "Actual None (gpu=mem:managed)": Managed=0 +! beats Unified=1, Device=2, Host=3). Explicit Device/Managed/Unified +! actuals are unaffected by the -gpu mode and pick their exact-match +! specific -- in particular, an explicit Unified actual still binds +! to the Unified specific (Unified=0 < Managed=1), matching Table 2. +! MAN-LABEL: func.func @_QPdriver +! MAN: fir.call @_QMmPsub_managed +! MAN: fir.call @_QMmPsub_device +! MAN: fir.call @_QMmPsub_managed +! MAN: fir.call @_QMmPsub_unified From 83ae5ccb300eea674352716239ee57c959b3e14c Mon Sep 17 00:00:00 2001 From: Andre Kuhlenschmidt Date: Wed, 13 May 2026 22:18:45 -0700 Subject: [PATCH 28/95] [flang][openacc] allow duplicate data sharing clauses (#197019) This PR allows duplicate OpenACC `private` and `firstprivate` clauses. While maintaining the restriction on `reduction` clauses. 
--- flang/docs/OpenACC.md | 2 +- flang/include/flang/Semantics/semantics.h | 11 ++ flang/lib/Semantics/resolve-directives.cpp | 43 +++--- flang/lib/Semantics/rewrite-parse-tree.cpp | 10 ++ .../test/Lower/OpenACC/acc-dedup-private.f90 | 63 +++++++++ flang/test/Parser/acc-dedup-unparse.f90 | 28 ++++ .../OpenACC/acc-dataclause-dedup.f90 | 122 ++++++++++++++++++ 7 files changed, 262 insertions(+), 17 deletions(-) create mode 100644 flang/test/Lower/OpenACC/acc-dedup-private.f90 create mode 100644 flang/test/Parser/acc-dedup-unparse.f90 create mode 100644 flang/test/Semantics/OpenACC/acc-dataclause-dedup.f90 diff --git a/flang/docs/OpenACC.md b/flang/docs/OpenACC.md index 9a166aa9bdde4..720afa5c830e4 100644 --- a/flang/docs/OpenACC.md +++ b/flang/docs/OpenACC.md @@ -33,7 +33,7 @@ local: or module, but it is allowed with a warning when same clause is used. * The OpenACC specification does not prohibit the same variable from appearing in multiple data clauses, but this is disallowed for variables appearing in - `private`, `firstprivate`, or `reduction` clauses. + `reduction` clauses. * The OpenACC specification does not prohibit the same variable from appearing multiple times in a `use_device` clause on a `host_data` construct, but this is disallowed. diff --git a/flang/include/flang/Semantics/semantics.h b/flang/include/flang/Semantics/semantics.h index 0abfd150cefe0..6893f51c97122 100644 --- a/flang/include/flang/Semantics/semantics.h +++ b/flang/include/flang/Semantics/semantics.h @@ -33,6 +33,7 @@ class IntrinsicTypeDefaultKinds; } namespace Fortran::parser { +struct AccObject; struct Name; struct Program; class AllCookedSources; @@ -336,6 +337,15 @@ class SemanticsContext { void NoteUsedSymbols(const UnorderedSymbolSet &); bool IsSymbolUsed(const Symbol &) const; + // Track same-kind duplicate AccObjects between resolve-directives and + // rewrite-parse-tree (e.g. the second `x` in `private(x, x)`). 
+ void MarkAccObjectDuplicate(const parser::AccObject *o) { + accObjectDuplicates_.insert(o); + } + bool IsAccObjectDuplicate(const parser::AccObject *o) const { + return accObjectDuplicates_.count(o) != 0; + } + void DumpSymbols(llvm::raw_ostream &); // Top-level ProgramTrees are owned by the SemanticsContext for persistence. @@ -395,6 +405,7 @@ class SemanticsContext { std::map moduleFileOutputRenamings_; UnorderedSymbolSet isDefined_; UnorderedSymbolSet isUsed_; + std::set accObjectDuplicates_; std::list programTrees_; }; diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp index 7ae867e19f276..25fb489d57475 100644 --- a/flang/lib/Semantics/resolve-directives.cpp +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -384,8 +384,8 @@ class AccAttributeVisitor : DirectiveAttributeVisitor { Symbol *ResolveAccCommonBlockName(const parser::Name *); Symbol *DeclareOrMarkOtherAccessEntity(const parser::Name &, Symbol::Flag); Symbol *DeclareOrMarkOtherAccessEntity(Symbol &, Symbol::Flag); - void CheckMultipleAppearances( - const parser::Name &, const Symbol &, Symbol::Flag); + void CheckMultipleAppearances(const parser::Name &, const Symbol &, + Symbol::Flag, const parser::AccObject *occurrence = nullptr); void AllowOnlyArrayAndSubArray(const parser::AccObjectList &objectList); void DoNotAllowAssumedSizedArray(const parser::AccObjectList &objectList); void AllowOnlyVariable(const parser::AccObject &object); @@ -1875,7 +1875,7 @@ void AccAttributeVisitor::ResolveAccObject( if (auto *symbol{ResolveAcc(*name, accFlag, currScope())}) { AddToContextObjectWithDSA(*symbol, accFlag); if (dataSharingAttributeFlags.test(accFlag)) { - CheckMultipleAppearances(*name, *symbol, accFlag); + CheckMultipleAppearances(*name, *symbol, accFlag, &accObject); } } } else { @@ -1940,20 +1940,31 @@ Symbol *AccAttributeVisitor::DeclareOrMarkOtherAccessEntity( return &object; } -static bool WithMultipleAppearancesAccException( - const Symbol &symbol, 
Symbol::Flag flag) { - return false; // Place holder -} - -void AccAttributeVisitor::CheckMultipleAppearances( - const parser::Name &name, const Symbol &symbol, Symbol::Flag accFlag) { +void AccAttributeVisitor::CheckMultipleAppearances(const parser::Name &name, + const Symbol &symbol, Symbol::Flag accFlag, + const parser::AccObject *occurrence) { const auto *target{&symbol}; - if (HasDataSharingAttributeObject(*target) && - !WithMultipleAppearancesAccException(symbol, accFlag)) { - context_.Say(name.source, - "'%s' appears in more than one data-sharing clause " - "on the same OpenACC directive"_err_en_US, - name.ToString()); + if (HasDataSharingAttributeObject(*target)) { + // A same-kind duplicate (e.g. private(x, x) or private(x) private(x)) + // is benign: warn and tag this AccObject occurrence so rewrite-parse-tree + // can drop it from the clause list. Cross-kind duplicates (e.g. + // private(x) firstprivate(x)) remain hard errors. + // + // Reduction is excluded from the benign case: two reduction clauses + // with the same Symbol::Flag may still differ in operator, which is a + // real conflict that dedup would silently hide. 
+ auto firstFlag{GetContext().FindSymbolWithDSA(*target)}; + if (occurrence && firstFlag && *firstFlag == accFlag && + accFlag != Symbol::Flag::AccReduction) { + context_.Warn(common::UsageWarning::OpenAccUsage, name.source, + "'%s' appears more than once in the same kind of data-sharing clause on an OpenACC directive; duplicate ignored"_warn_en_US, + name.ToString()); + context_.MarkAccObjectDuplicate(occurrence); + } else { + context_.Say(name.source, + "'%s' appears in more than one data-sharing clause on the same OpenACC directive"_err_en_US, + name.ToString()); + } } else { AddDataSharingAttributeObject(*target); } diff --git a/flang/lib/Semantics/rewrite-parse-tree.cpp b/flang/lib/Semantics/rewrite-parse-tree.cpp index 7352c2a324616..500dd3f225889 100644 --- a/flang/lib/Semantics/rewrite-parse-tree.cpp +++ b/flang/lib/Semantics/rewrite-parse-tree.cpp @@ -62,6 +62,7 @@ class RewriteMutator { void Post(parser::IfConstruct &); void Post(parser::ReadStmt &); void Post(parser::WriteStmt &); + void Post(parser::AccObjectList &); // Name resolution yet implemented: // TODO: Can some/all of these now be enabled? @@ -496,6 +497,15 @@ void RewriteMutator::Post(parser::WriteStmt &x) { FixMisparsedUntaggedNamelistName(x); } +// Erase AccObjects recorded in the context by resolve-directives as same-kind +// data-sharing duplicates. Cross-kind duplicates remain hard errors and never +// reach this pass. +void RewriteMutator::Post(parser::AccObjectList &x) { + x.v.remove_if([this](const parser::AccObject &o) { + return context_.IsAccObjectDuplicate(&o); + }); +} + bool RewriteParseTree(SemanticsContext &context, parser::Program &program) { RewriteMutator mutator{context}; parser::Walk(program, mutator); diff --git a/flang/test/Lower/OpenACC/acc-dedup-private.f90 b/flang/test/Lower/OpenACC/acc-dedup-private.f90 new file mode 100644 index 0000000000000..6399324070831 --- /dev/null +++ b/flang/test/Lower/OpenACC/acc-dedup-private.f90 @@ -0,0 +1,63 @@ +! 
RUN: bbc -fopenacc -emit-hlfir %s -o - 2>/dev/null | FileCheck %s + +! Check that same-kind duplicate variables in OpenACC private/firstprivate +! clauses lower without failure, and that each variable produces exactly one +! acc.private / acc.firstprivate op (deduplication by rewrite-parse-tree). + +! ----------------------------------------------------------------------- +! private(x, x) -- duplicate within one clause + +subroutine test_private_pair(i) + integer :: x, i + !$acc parallel loop private(x, x) + do i = 1, 10 + end do +end subroutine + +! CHECK-LABEL: func.func @_QPtest_private_pair +! x is privatized exactly once. +! CHECK: acc.private varPtr({{.*}}) recipe(@privatization_ref_i32) -> !fir.ref {name = "x"} +! CHECK-NOT: acc.private varPtr({{.*}}) recipe(@privatization_ref_i32) -> !fir.ref {name = "x"} + +! ----------------------------------------------------------------------- +! private(x, x, x) -- two duplicates (from the triple-occurrence review note) + +subroutine test_private_triple(i) + integer :: x, i + !$acc parallel loop private(x, x, x) + do i = 1, 10 + end do +end subroutine + +! CHECK-LABEL: func.func @_QPtest_private_triple +! x is privatized exactly once even with three source occurrences. +! CHECK: acc.private varPtr({{.*}}) recipe(@privatization_ref_i32) -> !fir.ref {name = "x"} +! CHECK-NOT: acc.private varPtr({{.*}}) recipe(@privatization_ref_i32) -> !fir.ref {name = "x"} + +! ----------------------------------------------------------------------- +! private(x) private(x) -- duplicate across two separate clauses + +subroutine test_private_two_clauses(i) + integer :: x, i + !$acc parallel loop private(x) private(x) + do i = 1, 10 + end do +end subroutine + +! CHECK-LABEL: func.func @_QPtest_private_two_clauses +! CHECK: acc.private varPtr({{.*}}) recipe(@privatization_ref_i32) -> !fir.ref {name = "x"} +! CHECK-NOT: acc.private varPtr({{.*}}) recipe(@privatization_ref_i32) -> !fir.ref {name = "x"} + +! 
----------------------------------------------------------------------- +! firstprivate(x, x) + +subroutine test_firstprivate_pair(i) + integer :: x, i + !$acc parallel loop firstprivate(x, x) + do i = 1, 10 + end do +end subroutine + +! CHECK-LABEL: func.func @_QPtest_firstprivate_pair +! CHECK: acc.firstprivate varPtr({{.*}}) recipe(@firstprivatization_ref_i32) -> !fir.ref {name = "x"} +! CHECK-NOT: acc.firstprivate varPtr({{.*}}) recipe(@firstprivatization_ref_i32) -> !fir.ref {name = "x"} diff --git a/flang/test/Parser/acc-dedup-unparse.f90 b/flang/test/Parser/acc-dedup-unparse.f90 new file mode 100644 index 0000000000000..26fa422ff6aad --- /dev/null +++ b/flang/test/Parser/acc-dedup-unparse.f90 @@ -0,0 +1,28 @@ +! RUN: %flang_fc1 -fopenacc -fdebug-unparse -w %s | FileCheck %s + +! Verify that same-kind duplicate variables in OpenACC data-sharing clauses are +! removed by rewrite-parse-tree, so each variable appears at most once when +! unparsed. + +subroutine dedup_pair(x, i) + integer, intent(inout) :: x, i + !$acc parallel loop private(x, x) + do i = 1, 10 + end do +end subroutine +! CHECK-LABEL: SUBROUTINE dedup_pair +! CHECK: PRIVATE(x) +! CHECK-NOT: PRIVATE(x,x) +! CHECK-NOT: PRIVATE(x, x) + +subroutine dedup_triple(x, i) + integer, intent(inout) :: x, i + !$acc parallel loop private(x, x, x) + do i = 1, 10 + end do +end subroutine +! CHECK-LABEL: SUBROUTINE dedup_triple +! Three occurrences reduce to one. +! CHECK: PRIVATE(x) +! CHECK-NOT: PRIVATE(x,x) +! CHECK-NOT: PRIVATE(x, x) diff --git a/flang/test/Semantics/OpenACC/acc-dataclause-dedup.f90 b/flang/test/Semantics/OpenACC/acc-dataclause-dedup.f90 new file mode 100644 index 0000000000000..e29332578a0bf --- /dev/null +++ b/flang/test/Semantics/OpenACC/acc-dataclause-dedup.f90 @@ -0,0 +1,122 @@ +! RUN: %python %S/../test_errors.py %s %flang -fopenacc + +! Same-kind data-sharing duplicates on an OpenACC directive (e.g. +! private(x, x), private(x) private(x), copyin(x, x) ...) are not errors: +! 
resolve-directives warns and rewrite-parse-tree drops the duplicate +! occurrences from the clause object lists. Cross-kind duplicates +! (e.g. private(x) firstprivate(x)) and reduction duplicates remain +! hard errors. + +program test_dataclause_dedup + implicit none + integer :: x, y, z, i + + ! passThis1.f90 pattern: duplicate within a single PRIVATE clause. + !WARNING: 'x' appears more than once in the same kind of data-sharing clause on an OpenACC directive; duplicate ignored [-Wopenacc-usage] + !$acc parallel loop private(x, x) + do i = 1, 10 + end do + + ! passThis2.f90 pattern: duplicate within a single PRIVATE clause across + ! a continuation, with another variable in between. + !$acc parallel loop private(x, & + !WARNING: 'x' appears more than once in the same kind of data-sharing clause on an OpenACC directive; duplicate ignored [-Wopenacc-usage] + !$acc& y, x) + do i = 1, 10 + end do + + ! passThis3.f90 pattern: duplicate across two separate PRIVATE clauses + ! on the same directive. + !$acc parallel loop private(x) & + !WARNING: 'x' appears more than once in the same kind of data-sharing clause on an OpenACC directive; duplicate ignored [-Wopenacc-usage] + !$acc& private(y, x) + do i = 1, 10 + end do + + ! Same patterns generalize to FIRSTPRIVATE. + !WARNING: 'x' appears more than once in the same kind of data-sharing clause on an OpenACC directive; duplicate ignored [-Wopenacc-usage] + !$acc parallel loop firstprivate(x, x) + do i = 1, 10 + end do + + !WARNING: 'x' appears more than once in the same kind of data-sharing clause on an OpenACC directive; duplicate ignored [-Wopenacc-usage] + !$acc parallel loop firstprivate(x) firstprivate(y, x) + do i = 1, 10 + end do + + ! Multiple distinct duplicates on a single directive. 
+ !WARNING: 'x' appears more than once in the same kind of data-sharing clause on an OpenACC directive; duplicate ignored [-Wopenacc-usage] + !WARNING: 'y' appears more than once in the same kind of data-sharing clause on an OpenACC directive; duplicate ignored [-Wopenacc-usage] + !$acc parallel loop private(x, y, x, y) + do i = 1, 10 + end do + + ! Triple occurrence: two duplicates, both warned, only one survives dedup. + !WARNING: 'x' appears more than once in the same kind of data-sharing clause on an OpenACC directive; duplicate ignored [-Wopenacc-usage] + !WARNING: 'x' appears more than once in the same kind of data-sharing clause on an OpenACC directive; duplicate ignored [-Wopenacc-usage] + !$acc parallel loop private(x, x, x) + do i = 1, 10 + end do + + ! Cross-kind duplicates on the same directive remain hard errors. + !ERROR: 'x' appears in more than one data-sharing clause on the same OpenACC directive + !$acc parallel loop private(x) firstprivate(x) + do i = 1, 10 + end do + + ! Reduction is excluded from the benign case: same-flag duplicates may + ! differ in operator, which is a real conflict. + !ERROR: 'x' appears in more than one data-sharing clause on the same OpenACC directive + !$acc parallel loop reduction(+:x) reduction(*:x) + do i = 1, 10 + end do + + ! Regression coverage for non-bare designators: the dedup machinery only + ! examines simple-Name DataRefs, so distinct array elements and array + ! sections must pass through untouched, with no warning and no erasure. + block + integer :: arr(10) + integer, target :: t1, t2 + integer, pointer :: p + type :: pt + integer :: a + integer :: b + end type + type(pt) :: s + + ! Different array elements -- not duplicates. + !$acc parallel loop private(arr(1), arr(2)) + do i = 1, 10 + end do + + ! Different array sections -- not duplicates. + !$acc parallel loop private(arr(1:5), arr(6:10)) + do i = 1, 10 + end do + + ! Same array element listed twice -- not deduped, since GetDesignatorName- + ! 
IfDataRef returns null for ArrayElement and CheckMultipleAppearances + ! is never invoked. Compiles without diagnostics. + !$acc parallel loop private(arr(1), arr(1)) + do i = 1, 10 + end do + + ! Same array section listed twice -- same reasoning, no diagnostic. + !$acc parallel loop private(arr(1:5), arr(1:5)) + do i = 1, 10 + end do + + ! Distinct structure components -- not duplicates. + !$acc parallel loop private(s%a, s%b) + do i = 1, 10 + end do + + ! Mixing a bare-name designator and an array-element designator on the + ! same symbol must not trigger dedup -- the array element doesn't go + ! through the duplicate check at all. + !$acc parallel loop private(arr, arr(1)) + do i = 1, 10 + end do + end block + +end program From 6cdd3286fdb0fb1dcd40db1b88dfdd76d4f65546 Mon Sep 17 00:00:00 2001 From: Owen Rodley Date: Thu, 14 May 2026 15:24:15 +1000 Subject: [PATCH 29/95] Handle typeidCompatibleVTable in skipModuleSummaryEntry (#196849) This method needs to match the set of cases handled in parseSummaryEntry. --- llvm/lib/AsmParser/LLParser.cpp | 11 ++++++----- llvm/test/Assembler/thinlto-bad-summary1.ll | 2 +- llvm/test/Assembler/thinlto-vtable-skip.ll | 11 +++++++++++ 3 files changed, 18 insertions(+), 6 deletions(-) create mode 100644 llvm/test/Assembler/thinlto-vtable-skip.ll diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index 67e66c85033ba..990febaacbe48 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -1067,11 +1067,12 @@ bool LLParser::skipModuleSummaryEntry() { // support is in place we will look for the tokens corresponding to the // expected tags. 
if (Lex.getKind() != lltok::kw_gv && Lex.getKind() != lltok::kw_module && - Lex.getKind() != lltok::kw_typeid && Lex.getKind() != lltok::kw_flags && - Lex.getKind() != lltok::kw_blockcount) - return tokError( - "Expected 'gv', 'module', 'typeid', 'flags' or 'blockcount' at the " - "start of summary entry"); + Lex.getKind() != lltok::kw_typeid && + Lex.getKind() != lltok::kw_typeidCompatibleVTable && + Lex.getKind() != lltok::kw_flags && Lex.getKind() != lltok::kw_blockcount) + return tokError("Expected 'gv', 'module', 'typeid', " + "'typeidCompatibleVTable', 'flags' or 'blockcount' at the " + "start of summary entry"); if (Lex.getKind() == lltok::kw_flags) return parseSummaryIndexFlags(); if (Lex.getKind() == lltok::kw_blockcount) diff --git a/llvm/test/Assembler/thinlto-bad-summary1.ll b/llvm/test/Assembler/thinlto-bad-summary1.ll index 8ff5e06b189a6..900ad73cc8435 100644 --- a/llvm/test/Assembler/thinlto-bad-summary1.ll +++ b/llvm/test/Assembler/thinlto-bad-summary1.ll @@ -2,7 +2,7 @@ ; summary type label. ; RUN: not opt %s 2>&1 | FileCheck %s -; CHECK: error: Expected 'gv', 'module', 'typeid', 'flags' or 'blockcount' at the start of summary entry +; CHECK: error: Expected 'gv', 'module', 'typeid', 'typeidCompatibleVTable', 'flags' or 'blockcount' at the start of summary entry ; ModuleID = 'thinlto-function-summary-callgraph.ll' target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Assembler/thinlto-vtable-skip.ll b/llvm/test/Assembler/thinlto-vtable-skip.ll new file mode 100644 index 0000000000000..b7e71251bc44d --- /dev/null +++ b/llvm/test/Assembler/thinlto-vtable-skip.ll @@ -0,0 +1,11 @@ +; Disabling output means we'll just skip the summary entries, which is the code +; path we're trying to test. There's no output to check against, so we have no +; CHECKs. 
+; +; RUN: opt %s -disable-output + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +^0 = module: (path: "thinlto-vtable-skip.ll", hash: (0, 0, 0, 0, 0)) +^1 = typeidCompatibleVTable: (name: "_ZTS1A", summary: ((offset: 16, ^0))) From 131d66c5332f44b0284ee8f56915a454edbdd16f Mon Sep 17 00:00:00 2001 From: Maksim Panchenko Date: Wed, 13 May 2026 22:40:00 -0700 Subject: [PATCH 30/95] [BOLT][DWARF] Support DW_FORM_ref_udata and DW_OP_regval_type (#197565) Add support for DWARF opcodes seen in GCC-generated binaries: - DW_FORM_ref_udata: ULEB128-encoded CU-relative DIE reference. - DW_OP_regval_type (0xa5): DWARF5 expression opcode with operands (SizeLEB, BaseTypeRef). The BaseTypeRef was not being updated when DIEs were relocated because cloneExpression only handled (Size1, BaseTypeRef) patterns. Generalized the first-operand copying to use raw bytes from the data stream instead of assuming a single byte. Fixes #188250 Assisted-by: Claude Opus 4.6/4.7 --- bolt/lib/Core/DIEBuilder.cpp | 13 ++-- bolt/test/X86/dwarf5-form-ref-udata.s | 70 ++++++++++++++++++ bolt/test/X86/dwarf5-locexpr-regval-type.s | 83 ++++++++++++++++++++++ 3 files changed, 161 insertions(+), 5 deletions(-) create mode 100644 bolt/test/X86/dwarf5-form-ref-udata.s create mode 100644 bolt/test/X86/dwarf5-locexpr-regval-type.s diff --git a/bolt/lib/Core/DIEBuilder.cpp b/bolt/lib/Core/DIEBuilder.cpp index ef7ba54ff6ddc..072274f2578bb 100644 --- a/bolt/lib/Core/DIEBuilder.cpp +++ b/bolt/lib/Core/DIEBuilder.cpp @@ -705,7 +705,8 @@ bool DIEBuilder::cloneExpression(const DataExtractor &Data, Description.Op[0] == Encoding::BaseTypeRef) || (Description.Op.size() == 2 && Description.Op[1] == Encoding::BaseTypeRef && - Description.Op[0] != Encoding::Size1)) + Description.Op[0] != Encoding::Size1 && + Description.Op[0] != Encoding::SizeLEB)) BC.outs() << "BOLT-WARNING: [internal-dwarf-error]: unsupported DW_OP " "encoding.\n"; @@ -713,9 +714,8 @@ bool 
DIEBuilder::cloneExpression(const DataExtractor &Data, Description.Op[0] == Encoding::BaseTypeRef) || (Description.Op.size() == 2 && Description.Op[1] == Encoding::BaseTypeRef && - Description.Op[0] == Encoding::Size1)) { - // This code assumes that the other non-typeref operand fits into 1 - // byte. + (Description.Op[0] == Encoding::Size1 || + Description.Op[0] == Encoding::SizeLEB))) { assert(OpOffset < Op.getEndOffset()); const uint32_t ULEBsize = Op.getEndOffset() - OpOffset - 1; (void)ULEBsize; @@ -727,7 +727,9 @@ bool DIEBuilder::cloneExpression(const DataExtractor &Data, if (Description.Op.size() == 1) { RefOffset = Op.getRawOperand(0); } else { - OutputBuffer.push_back(Op.getRawOperand(0)); + const StringRef FirstOpBytes = + Data.getData().slice(OpOffset + 1, Op.getOperandEndOffset(0)); + OutputBuffer.append(FirstOpBytes.begin(), FirstOpBytes.end()); RefOffset = Op.getRawOperand(1); } uint32_t Offset = 0; @@ -903,6 +905,7 @@ void DIEBuilder::cloneAttribute( case dwarf::DW_FORM_ref2: case dwarf::DW_FORM_ref4: case dwarf::DW_FORM_ref8: + case dwarf::DW_FORM_ref_udata: cloneDieOffsetReferenceAttribute(Die, U, InputDIE, AttrSpec, Val.getUnit()->getOffset() + *Val.getAsRelativeReference()); diff --git a/bolt/test/X86/dwarf5-form-ref-udata.s b/bolt/test/X86/dwarf5-form-ref-udata.s new file mode 100644 index 0000000000000..0b63a1711423d --- /dev/null +++ b/bolt/test/X86/dwarf5-form-ref-udata.s @@ -0,0 +1,70 @@ +# REQUIRES: system-linux + +# RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %s -o %t.o +# RUN: %clang %cflags -dwarf-5 %t.o -o %t.exe -Wl,-q +# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections 2>&1 | \ +# RUN: FileCheck %s --check-prefix CHECK-BOLT +# RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt | FileCheck %s + +## Verify BOLT preserves DW_FORM_ref_udata (CU-relative ULEB128 DIE reference), +## a form GCC may emit instead of DW_FORM_ref4. 
+ +# CHECK: DW_TAG_subprogram +# CHECK: DW_AT_type [DW_FORM_ref_udata] +# CHECK-SAME: "int" + +# CHECK-BOLT-NOT: BOLT-WARNING + + .text + .file 0 "." "main.cpp" + .globl main +main: +.Lfunc_begin0: + .loc 0 1 0 + xorl %eax, %eax + retq +.Lfunc_end0: + .size main, .Lfunc_end0-main + +## Force relocations against .text +.reloc 0, R_X86_64_NONE + + .section .debug_abbrev,"",@progbits + .byte 1, 17, 1 # CU, has children + .byte 17, 1 # DW_AT_low_pc, DW_FORM_addr + .byte 18, 6 # DW_AT_high_pc, DW_FORM_data4 + .byte 16, 23 # DW_AT_stmt_list, DW_FORM_sec_offset + .byte 0, 0 + .byte 2, 46, 0 # subprogram, no children + .byte 17, 1 # DW_AT_low_pc, DW_FORM_addr + .byte 18, 6 # DW_AT_high_pc, DW_FORM_data4 + .byte 73, 21 # DW_AT_type, DW_FORM_ref_udata + .byte 0, 0 + .byte 3, 36, 0 # base_type, no children + .byte 3, 8 # DW_AT_name, DW_FORM_string + .byte 0, 0 + .byte 0 + + .section .debug_info,"",@progbits +.Lcu_begin0: + .long .Ldebug_info_end0-.Ldebug_info_start0 +.Ldebug_info_start0: + .short 5 # DWARF version + .byte 1 # DW_UT_compile + .byte 8 # Address size + .long .debug_abbrev # Abbrev offset + .byte 1 # CU + .quad .Lfunc_begin0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .long .Lline_table_start0 # DW_AT_stmt_list + .byte 2 # subprogram + .quad .Lfunc_begin0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .uleb128 .Ltype_int-.Lcu_begin0 # DW_AT_type (DW_FORM_ref_udata) +.Ltype_int: + .byte 3 # base_type + .asciz "int" # DW_AT_name + .byte 0 # End children of CU +.Ldebug_info_end0: + .section .debug_line,"",@progbits +.Lline_table_start0: diff --git a/bolt/test/X86/dwarf5-locexpr-regval-type.s b/bolt/test/X86/dwarf5-locexpr-regval-type.s new file mode 100644 index 0000000000000..f60604c33b245 --- /dev/null +++ b/bolt/test/X86/dwarf5-locexpr-regval-type.s @@ -0,0 +1,83 @@ +# REQUIRES: system-linux + +# RUN: llvm-mc -dwarf-version=5 -filetype=obj -triple x86_64-unknown-linux %s -o %t.o +# RUN: %clang %cflags -dwarf-5 %t.o -o 
%t.exe -Wl,-q +# RUN: llvm-bolt %t.exe -o %t.bolt --update-debug-sections 2>&1 | \ +# RUN: FileCheck %s --check-prefix CHECK-BOLT +# RUN: llvm-dwarfdump --show-form --verbose --debug-info %t.bolt | FileCheck %s + +## Verify BOLT correctly handles DW_OP_regval_type. Its operands are +## (ULEB128 register, ULEB128 base type DIE offset). The base type +## reference must be updated when DIEs are relocated. Use a register +## number that requires multi-byte ULEB128 encoding to exercise the +## first-operand byte-copy path. + +# CHECK: DW_TAG_variable +# CHECK: DW_AT_location [DW_FORM_exprloc] +# CHECK-SAME: DW_OP_regval_type 0xc8 (0x[[#%.8x,TYPE:]] -> +# CHECK: 0x[[#TYPE]]: DW_TAG_base_type + +# CHECK-BOLT-NOT: BOLT-WARNING + + .text + .file 0 "." "main.cpp" + .globl main +main: +.Lfunc_begin0: + .loc 0 1 0 + xorl %eax, %eax + retq +.Lfunc_end0: + .size main, .Lfunc_end0-main + +## Force relocations against .text +.reloc 0, R_X86_64_NONE + + .section .debug_abbrev,"",@progbits + .byte 1, 17, 1 # CU, has children + .byte 17, 1 # DW_AT_low_pc, DW_FORM_addr + .byte 18, 6 # DW_AT_high_pc, DW_FORM_data4 + .byte 16, 23 # DW_AT_stmt_list, DW_FORM_sec_offset + .byte 0, 0 + .byte 2, 46, 1 # subprogram, has children + .byte 17, 1 # DW_AT_low_pc, DW_FORM_addr + .byte 18, 6 # DW_AT_high_pc, DW_FORM_data4 + .byte 0, 0 + .byte 3, 52, 0 # variable, no children + .byte 2, 24 # DW_AT_location, DW_FORM_exprloc + .byte 0, 0 + .byte 4, 36, 0 # base_type, no children + .byte 3, 8 # DW_AT_name, DW_FORM_string + .byte 0, 0 + .byte 0 + + .section .debug_info,"",@progbits +.Lcu_begin0: + .long .Ldebug_info_end0-.Ldebug_info_start0 +.Ldebug_info_start0: + .short 5 # DWARF version + .byte 1 # DW_UT_compile + .byte 8 # Address size + .long .debug_abbrev # Abbrev offset + .byte 1 # CU + .quad .Lfunc_begin0 # DW_AT_low_pc + .long .Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .long .Lline_table_start0 # DW_AT_stmt_list + .byte 2 # subprogram + .quad .Lfunc_begin0 # DW_AT_low_pc + .long 
.Lfunc_end0-.Lfunc_begin0 # DW_AT_high_pc + .byte 3 # variable + .byte .Lloc_end-.Lloc_start # exprloc length +.Lloc_start: + .byte 0xa5 # DW_OP_regval_type + .uleb128 200 # register 200 (multi-byte ULEB128) + .uleb128 .Ltype_int-.Lcu_begin0 # base type DIE offset +.Lloc_end: + .byte 0 # End children of subprogram +.Ltype_int: + .byte 4 # base_type + .asciz "int" # DW_AT_name + .byte 0 # End children of CU +.Ldebug_info_end0: + .section .debug_line,"",@progbits +.Lline_table_start0: From 35f5d7ea802eae78b26a5fb2a46f072acd15f49d Mon Sep 17 00:00:00 2001 From: Cullen Rhodes Date: Thu, 14 May 2026 06:46:23 +0100 Subject: [PATCH 31/95] [AArch64][GlobalISel] Fast-path common G_CONSTANT/G_BRCOND/G_FRAME_INDEX regbank mappings (#197383) Returning the default register-bank mapping directly for these opcodes is a -0.17% compile-time improvement on aarch64-O0-g. https://llvm-compile-time-tracker.com/compare.php?from=b4aa4d4dcb6f1c8a00d1d1e53d2b353c97ec98b7&to=0779891fc6bf6a01e4f14d3f359e212c6ec52c0d&stat=instructions%3Au Assisted-by: codex --- .../AArch64/GISel/AArch64RegisterBankInfo.cpp | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp index d65ffb1c36814..e814316b0f2ed 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp @@ -969,6 +969,26 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // We only care about the mapping of the destination for COPY. /*NumOperands*/ Opc == TargetOpcode::G_BITCAST ? 2 : 1); } + case TargetOpcode::G_CONSTANT: { + LLT DstTy = MRI.getType(MI.getOperand(0).getReg()); + TypeSize Size = DstTy.getSizeInBits(); + if (!DstTy.isPointer() && (!DstTy.isScalar() || Size < 32 || Size > 64)) + break; + // Scalar constants materialize in GPRs. 
+ [[fallthrough]]; + } + case TargetOpcode::G_BRCOND: + case TargetOpcode::G_FRAME_INDEX: { + // Operand 0 is the only banked operand and is mapped to GPR. + return getInstructionMapping( + DefaultMappingID, /*Cost=*/1, + getOperandsMapping( + {getValueMapping( + PMI_FirstGPR, + MRI.getType(MI.getOperand(0).getReg()).getSizeInBits()), + nullptr}), + /*NumOperands=*/2); + } default: break; } From 29206d7964435016ee03b45b2e7b95fbc1c32248 Mon Sep 17 00:00:00 2001 From: Kevin Sala Penades Date: Wed, 13 May 2026 22:51:02 -0700 Subject: [PATCH 32/95] [OpenMP] Fix launch_bounds for OpenMP ompx_attribute (#195665) This commit fixes the handling of `launch_bounds` within OpenMP's `ompx_attribute`. The third attribute value, the maximum blocks, was not parsed correctly. --- clang/include/clang/Sema/Sema.h | 8 +++++--- clang/lib/Parse/ParseOpenMP.cpp | 5 +++-- clang/lib/Sema/SemaDeclAttr.cpp | 25 ++++++++++++++++--------- clang/test/OpenMP/thread_limit_gpu.c | 14 ++++++++++---- 4 files changed, 34 insertions(+), 18 deletions(-) diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 5639034d5ae05..5202244cee2a7 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -5100,11 +5100,13 @@ class Sema final : public SemaBase { /// otherwise setting numParams to the appropriate value. bool CheckRegparmAttr(const ParsedAttr &attr, unsigned &value); - /// Create an CUDALaunchBoundsAttr attribute. + /// Create a CUDALaunchBoundsAttr attribute. By default, the function only + /// supports nvptx target architectures and skips MaxBlocks if it is previous + /// to sm_90. Use \p IgnoreArch to skip the architecture check. CUDALaunchBoundsAttr *CreateLaunchBoundsAttr(const AttributeCommonInfo &CI, Expr *MaxThreads, - Expr *MinBlocks, - Expr *MaxBlocks); + Expr *MinBlocks, Expr *MaxBlocks, + bool IgnoreArch = false); /// AddLaunchBoundsAttr - Adds a launch_bounds attribute to a particular /// declaration. 
diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp index 45a47ec797f01..7f3c575fb68bb 100644 --- a/clang/lib/Parse/ParseOpenMP.cpp +++ b/clang/lib/Parse/ParseOpenMP.cpp @@ -3846,12 +3846,13 @@ OMPClause *Parser::ParseOpenMPOMPXAttributesClause(bool ParseOnly) { continue; case ParsedAttr::AT_CUDALaunchBounds: if (!PA.checkAtLeastNumArgs(Actions, 1) || - !PA.checkAtMostNumArgs(Actions, 2)) + !PA.checkAtMostNumArgs(Actions, 3)) continue; if (auto *A = Actions.CreateLaunchBoundsAttr( PA, PA.getArgAsExpr(0), PA.getNumArgs() > 1 ? PA.getArgAsExpr(1) : nullptr, - PA.getNumArgs() > 2 ? PA.getArgAsExpr(2) : nullptr)) + PA.getNumArgs() > 2 ? PA.getArgAsExpr(2) : nullptr, + /*IgnoreArch=*/true)) Attrs.push_back(A); continue; default: diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 55b6cbcbba57d..364f4de077ca7 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -6017,7 +6017,8 @@ static Expr *makeLaunchBoundsArgExpr(Sema &S, Expr *E, CUDALaunchBoundsAttr * Sema::CreateLaunchBoundsAttr(const AttributeCommonInfo &CI, Expr *MaxThreads, - Expr *MinBlocks, Expr *MaxBlocks) { + Expr *MinBlocks, Expr *MaxBlocks, + bool IgnoreArch) { CUDALaunchBoundsAttr TmpAttr(Context, CI, MaxThreads, MinBlocks, MaxBlocks); MaxThreads = makeLaunchBoundsArgExpr(*this, MaxThreads, TmpAttr, 0); if (!MaxThreads) @@ -6030,14 +6031,20 @@ Sema::CreateLaunchBoundsAttr(const AttributeCommonInfo &CI, Expr *MaxThreads, } if (MaxBlocks) { - // '.maxclusterrank' ptx directive requires .target sm_90 or higher. 
- auto SM = getOffloadArch(Context.getTargetInfo()); - if (SM == OffloadArch::Unknown || SM < OffloadArch::SM_90) { - Diag(MaxBlocks->getBeginLoc(), diag::warn_cuda_maxclusterrank_sm_90) - << OffloadArchToString(SM) << CI << MaxBlocks->getSourceRange(); - // Ignore it by setting MaxBlocks to null; - MaxBlocks = nullptr; - } else { + // We might want to ignore the nvptx arch check, e.g., when processing the + // launch bounds attribute within ompx_attribute to support other archs. + if (!IgnoreArch) { + // '.maxclusterrank' ptx directive requires .target sm_90 or higher. + auto SM = getOffloadArch(Context.getTargetInfo()); + if (SM == OffloadArch::Unknown || SM < OffloadArch::SM_90) { + Diag(MaxBlocks->getBeginLoc(), diag::warn_cuda_maxclusterrank_sm_90) + << OffloadArchToString(SM) << CI << MaxBlocks->getSourceRange(); + // Ignore it by setting MaxBlocks to null; + MaxBlocks = nullptr; + } + } + + if (MaxBlocks) { MaxBlocks = makeLaunchBoundsArgExpr(*this, MaxBlocks, TmpAttr, 2); if (!MaxBlocks) return nullptr; diff --git a/clang/test/OpenMP/thread_limit_gpu.c b/clang/test/OpenMP/thread_limit_gpu.c index 4d4f9159fd4b4..829b0a1b02d22 100644 --- a/clang/test/OpenMP/thread_limit_gpu.c +++ b/clang/test/OpenMP/thread_limit_gpu.c @@ -15,10 +15,13 @@ void foo(int N) { #pragma omp target teams distribute parallel for simd thread_limit(4) for (int i = 0; i < N; ++i) ; -#pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42)))) +#pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 84)))) for (int i = 0; i < N; ++i) ; -#pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 42)))) num_threads(22) +#pragma omp target teams distribute parallel for simd ompx_attribute(__attribute__((launch_bounds(42, 84)))) num_threads(22) + for (int i = 0; i < N; ++i) + ; +#pragma omp target teams distribute parallel for simd 
ompx_attribute(__attribute__((launch_bounds(42, 84, 86)))) num_threads(20) for (int i = 0; i < N; ++i) ; } @@ -29,13 +32,16 @@ void foo(int N) { // CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l15({{.*}}) #[[ATTR2:.+]] { // CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l18({{.*}}) #[[ATTR3:.+]] { // CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l21({{.*}}) #[[ATTR4:.+]] { +// CHECK: define weak_odr protected {{amdgpu|spir}}_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+__Z3fooi_}}l24({{.*}}) #[[ATTR5:.+]] { // CHECK-AMDGPU: attributes #[[ATTR1]] = { {{.*}} "amdgpu-flat-work-group-size"="1,256" {{.*}} } // CHECK-AMDGPU: attributes #[[ATTR2]] = { {{.*}} "amdgpu-flat-work-group-size"="1,4" {{.*}} } // CHECK-AMDGPU: attributes #[[ATTR3]] = { {{.*}} "amdgpu-flat-work-group-size"="1,42" {{.*}} } // CHECK-AMDGPU: attributes #[[ATTR4]] = { {{.*}} "amdgpu-flat-work-group-size"="1,22" {{.*}} } +// CHECK-AMDGPU: attributes #[[ATTR5]] = { {{.*}} "amdgpu-flat-work-group-size"="1,20" "amdgpu-max-num-workgroups"="86,1,1" {{.*}} } // CHECK-SPIRV: attributes #[[ATTR1]] = { {{.*}} "omp_target_thread_limit"="256" {{.*}} } // CHECK-SPIRV: attributes #[[ATTR2]] = { {{.*}} "omp_target_thread_limit"="4" {{.*}} } -// CHECK-SPIRV: attributes #[[ATTR3]] = { {{.*}} "omp_target_num_teams"="42" "omp_target_thread_limit"="42" {{.*}} } -// CHECK-SPIRV: attributes #[[ATTR4]] = { {{.*}} "omp_target_num_teams"="42" "omp_target_thread_limit"="22" {{.*}} } +// CHECK-SPIRV: attributes #[[ATTR3]] = { {{.*}} "omp_target_num_teams"="84" "omp_target_thread_limit"="42" {{.*}} } +// CHECK-SPIRV: attributes #[[ATTR4]] = { {{.*}} "omp_target_num_teams"="84" "omp_target_thread_limit"="22" {{.*}} } +// CHECK-SPIRV: attributes #[[ATTR5]] = { {{.*}} "omp_target_num_teams"="84" "omp_target_thread_limit"="20" {{.*}} } 
From 1d93fc4f74fe29481964bb11dd838b800544ca43 Mon Sep 17 00:00:00 2001 From: Jeff Bailey Date: Thu, 14 May 2026 06:01:13 +0000 Subject: [PATCH 33/95] [libc] Add LLVM_LIBC_ENABLE_EXPERIMENTAL_ENTRYPOINTS CMake flag (#197537) Adds a new CMake option, OFF by default, to gate entrypoints with known-incomplete implementations. This lets developers build and test partially-implemented functions without exposing them to production users. The motivating case is `sysconf`, which only handles three of the required `_SC_*` constants (`_SC_PAGESIZE`, `_SC_NPROCESSORS_CONF`, `_SC_NPROCESSORS_ONLN`) and returns `EINVAL` for everything else. Functions like this are useful to have in a build for testing progress, but shouldn't be part of a default full build until the implementation is complete. Changes: - `libc/CMakeLists.txt`: adds `option(LLVM_LIBC_ENABLE_EXPERIMENTAL_ENTRYPOINTS ... OFF)` - `libc/cmake/modules/LLVMLibCCompileOptionRules.cmake`: propagates `-DLIBC_EXPERIMENTAL_ENTRYPOINTS` when ON - `libc/cmake/modules/LLVMLibCTestRules.cmake`: same for test compile options - `libc/config/linux/{x86_64,aarch64,riscv}/entrypoints.txt`: moves `sysconf` behind the new flag The flag does not require `LLVM_LIBC_FULL_BUILD` since overlay builds may also have incomplete entrypoints that benefit from this gating. 
--- libc/CMakeLists.txt | 2 ++ libc/config/linux/aarch64/entrypoints.txt | 7 ++++++- libc/config/linux/riscv/entrypoints.txt | 7 ++++++- libc/config/linux/x86_64/entrypoints.txt | 7 ++++++- 4 files changed, 20 insertions(+), 3 deletions(-) diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt index cf2a8cad42154..3b5f3949b286d 100644 --- a/libc/CMakeLists.txt +++ b/libc/CMakeLists.txt @@ -140,6 +140,8 @@ option(LLVM_LIBC_FULL_BUILD "Build and test LLVM libc as if it is the full libc" option(LLVM_LIBC_IMPLEMENTATION_DEFINED_TEST_BEHAVIOR "Build LLVM libc tests assuming our implementation-defined behavior" ON) option(LLVM_LIBC_ENABLE_LINTING "Enables linting of libc source files" OFF) option(LLVM_LIBC_ALL_HEADERS "Outputs all functions in header files, regardless of whether they are enabled on this target" OFF) +option(LLVM_LIBC_ENABLE_EXPERIMENTAL_ENTRYPOINTS + "Enable entrypoints with known-incomplete implementations (off by default)" OFF) option(LIBC_CONFIG_PATH "The path to user provided folder that configures the build for the target system." 
OFF) diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index e62bc67e2d5ca..e61b127e42102 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -387,7 +387,6 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.unistd.setsid libc.src.unistd.symlink libc.src.unistd.symlinkat - libc.src.unistd.sysconf libc.src.unistd.truncate libc.src.unistd.unlink libc.src.unistd.unlinkat @@ -1265,6 +1264,12 @@ if(LLVM_LIBC_FULL_BUILD) ) endif() +if(LLVM_LIBC_ENABLE_EXPERIMENTAL_ENTRYPOINTS) + list(APPEND TARGET_LIBC_ENTRYPOINTS + libc.src.unistd.sysconf + ) +endif() + set(TARGET_LIBMVEC_ENTRYPOINTS) if(LIBC_COMPILER_HAS_EXT_VECTOR_TYPE) diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index d1c52dffdb6e7..7a34cc5fba201 100644 --- a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -390,7 +390,6 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.unistd.setsid libc.src.unistd.symlink libc.src.unistd.symlinkat - libc.src.unistd.sysconf libc.src.unistd.truncate libc.src.unistd.unlink libc.src.unistd.unlinkat @@ -1399,6 +1398,12 @@ if(LLVM_LIBC_FULL_BUILD) ) endif() +if(LLVM_LIBC_ENABLE_EXPERIMENTAL_ENTRYPOINTS) + list(APPEND TARGET_LIBC_ENTRYPOINTS + libc.src.unistd.sysconf + ) +endif() + set(TARGET_LLVMLIBC_ENTRYPOINTS ${TARGET_LIBC_ENTRYPOINTS} ${TARGET_LIBM_ENTRYPOINTS} diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 73b4b3fcd191f..00c94e1e9b5a0 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -408,7 +408,6 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.unistd.setsid libc.src.unistd.symlink libc.src.unistd.symlinkat - libc.src.unistd.sysconf libc.src.unistd.truncate libc.src.unistd.unlink libc.src.unistd.unlinkat @@ -1489,6 +1488,12 @@ if(LLVM_LIBC_FULL_BUILD) ) endif() +if(LLVM_LIBC_ENABLE_EXPERIMENTAL_ENTRYPOINTS) + 
list(APPEND TARGET_LIBC_ENTRYPOINTS + libc.src.unistd.sysconf + ) +endif() + set(TARGET_LIBMVEC_ENTRYPOINTS) if(LIBC_COMPILER_HAS_EXT_VECTOR_TYPE) From 3272c569e0848806c3d6bef7b5a8ffdb01c8955a Mon Sep 17 00:00:00 2001 From: Piotr Sobczak Date: Thu, 14 May 2026 08:52:49 +0200 Subject: [PATCH 34/95] [AMDGPU] Remove RCP_IFLAG combine (#197426) The combine was added in D48569 8 years ago with the aim of preserving flags, but the current LangRef says the status flags are not observable in the default FP environment. The main motivation for this change is to enable scalar float reciprocal generation v_s_rcp_f32 on newer hardware. There is no v_s_rcp_iflag_f32, so the combine effectively blocks the selection. See: pseudo-scalar-transcendental.ll. --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 6 - .../AMDGPU/agpr-copy-no-free-registers.ll | 4 +- ...amdgpu-codegenprepare-fold-binop-select.ll | 4 +- .../AMDGPU/amdgpu-codegenprepare-idiv.ll | 320 +++++++++--------- .../amdgpu-simplify-libcall-rootn-codegen.ll | 6 +- llvm/test/CodeGen/AMDGPU/bypass-div.ll | 10 +- llvm/test/CodeGen/AMDGPU/idiv-licm.ll | 48 +-- .../insert_waitcnt_for_precise_memory.ll | 13 +- llvm/test/CodeGen/AMDGPU/med3-knownbits.ll | 2 +- .../CodeGen/AMDGPU/mul24-pass-ordering.ll | 2 +- llvm/test/CodeGen/AMDGPU/permute_i8.ll | 64 ++-- llvm/test/CodeGen/AMDGPU/pr155452.ll | 2 +- .../AMDGPU/pseudo-scalar-transcendental.ll | 29 +- llvm/test/CodeGen/AMDGPU/rcp_iflag.ll | 85 ++++- llvm/test/CodeGen/AMDGPU/sdiv.ll | 66 ++-- llvm/test/CodeGen/AMDGPU/sdiv64.ll | 48 +-- llvm/test/CodeGen/AMDGPU/sdivrem24.ll | 26 +- llvm/test/CodeGen/AMDGPU/srem.ll | 42 +-- llvm/test/CodeGen/AMDGPU/srem64.ll | 40 +-- llvm/test/CodeGen/AMDGPU/udiv.ll | 104 +++--- llvm/test/CodeGen/AMDGPU/udiv64.ll | 36 +- llvm/test/CodeGen/AMDGPU/udivrem.ll | 28 +- llvm/test/CodeGen/AMDGPU/udivrem24.ll | 86 +++-- llvm/test/CodeGen/AMDGPU/urem64.ll | 36 +- llvm/test/CodeGen/AMDGPU/wave32.ll | 8 +- 25 files changed, 577 insertions(+), 538 deletions(-) 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 5973231eb5656..3c830726bf98a 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -15467,12 +15467,6 @@ SDValue SITargetLowering::performRcpCombine(SDNode *N, SDLoc(N), VT); } - if (VT == MVT::f32 && (N0.getOpcode() == ISD::UINT_TO_FP || - N0.getOpcode() == ISD::SINT_TO_FP)) { - return DCI.DAG.getNode(AMDGPUISD::RCP_IFLAG, SDLoc(N), VT, N0, - N->getFlags()); - } - // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here. if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) && N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) { diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index 4c10e4d459849..daf7fed3731f5 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -520,7 +520,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_sub_i32 s1, 0, s7 ; GFX908-NEXT: v_cvt_f32_f16_e32 v18, s0 ; GFX908-NEXT: v_mov_b32_e32 v17, 0 -; GFX908-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX908-NEXT: v_rcp_f32_e32 v0, v0 ; GFX908-NEXT: v_mov_b32_e32 v1, 0 ; GFX908-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX908-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -683,7 +683,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_sub_i32 s1, 0, s7 ; GFX90A-NEXT: v_mov_b32_e32 v19, 0 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], 0, 0 -; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 ; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v0 ; GFX90A-NEXT: v_cvt_f32_f16_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll 
b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll index 62059cd989ba2..a2e3c9aa8acc5 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll @@ -140,7 +140,7 @@ define i32 @select_sdiv_lhs_opaque_const0_i32(i1 %cond) { ; GCN-NEXT: v_cvt_f32_u32_e32 v2, v1 ; GCN-NEXT: v_sub_u32_e32 v3, vcc, 0, v1 ; GCN-NEXT: s_mov_b32 s4, 0xf4240 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GCN-NEXT: v_rcp_f32_e32 v2, v2 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 31, v0 ; GCN-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 @@ -222,7 +222,7 @@ define i32 @select_sdiv_lhs_opaque_const1_i32(i1 %cond) { ; GCN-NEXT: v_cvt_f32_u32_e32 v2, v1 ; GCN-NEXT: v_sub_u32_e32 v3, vcc, 0, v1 ; GCN-NEXT: s_mov_b32 s4, 0xf4240 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GCN-NEXT: v_rcp_f32_e32 v2, v2 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 31, v0 ; GCN-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index 018eb779fc815..1659ca62a0516 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -45,7 +45,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 ; GFX6-NEXT: s_sub_i32 s2, 0, s5 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 @@ -76,7 +76,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX9-NEXT: s_sub_i32 s4, 0, s3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; 
GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s5, v0 @@ -142,7 +142,7 @@ define amdgpu_kernel void @urem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 ; GFX6-NEXT: s_sub_i32 s2, 0, s5 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s2, v0 @@ -170,7 +170,7 @@ define amdgpu_kernel void @urem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX9-NEXT: s_sub_i32 s4, 0, s3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s5, v0 @@ -247,7 +247,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX6-NEXT: s_sub_i32 s2, 0, s6 ; GFX6-NEXT: s_abs_i32 s7, s4 ; GFX6-NEXT: s_xor_b32 s4, s4, s5 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_ashr_i32 s4, s4, 31 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -285,7 +285,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX9-NEXT: s_sub_i32 s5, 0, s4 ; GFX9-NEXT: s_xor_b32 s3, s2, s3 ; GFX9-NEXT: s_abs_i32 s2, s2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_ashr_i32 s3, s3, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -364,7 +364,7 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX6-NEXT: s_sub_i32 s2, 0, s5 ; GFX6-NEXT: s_abs_i32 s6, s4 ; GFX6-NEXT: s_ashr_i32 s4, s4, 31 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_mov_b32 s3, 
0xf000 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -398,7 +398,7 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX9-NEXT: s_sub_i32 s5, 0, s3 ; GFX9-NEXT: s_ashr_i32 s4, s2, 31 ; GFX9-NEXT: s_abs_i32 s2, s2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0 @@ -457,7 +457,7 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; GFX6-NEXT: s_and_b32 s0, s0, 0xffff ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_rcp_f32_e32 v2, v0 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -478,7 +478,7 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; GFX9-NEXT: s_and_b32 s0, s0, 0xffff ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX9-NEXT: v_rcp_f32_e32 v2, v0 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v2 @@ -527,7 +527,7 @@ define amdgpu_kernel void @urem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; GFX6-NEXT: s_and_b32 s0, s6, 0xffff ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_rcp_f32_e32 v2, v0 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -550,7 +550,7 @@ define amdgpu_kernel void @urem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; GFX9-NEXT: s_and_b32 s0, s2, 0xffff ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX9-NEXT: 
v_rcp_f32_e32 v2, v0 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX9-NEXT: v_trunc_f32_e32 v2, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -606,7 +606,7 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; GFX6-NEXT: s_sext_i32_i16 s5, s6 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_rcp_f32_e32 v2, v0 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -631,7 +631,7 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; GFX9-NEXT: s_sext_i32_i16 s2, s2 ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s2 ; GFX9-NEXT: s_xor_b32 s2, s2, s3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX9-NEXT: v_rcp_f32_e32 v3, v0 ; GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: s_or_b32 s4, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 @@ -687,7 +687,7 @@ define amdgpu_kernel void @srem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; GFX6-NEXT: s_sext_i32_i16 s2, s6 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s2 ; GFX6-NEXT: s_xor_b32 s2, s2, s7 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_rcp_f32_e32 v2, v0 ; GFX6-NEXT: s_ashr_i32 s2, s2, 30 ; GFX6-NEXT: s_or_b32 s4, s2, 1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -715,7 +715,7 @@ define amdgpu_kernel void @srem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) { ; GFX9-NEXT: s_sext_i32_i16 s2, s6 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s2 ; GFX9-NEXT: s_xor_b32 s2, s2, s7 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX9-NEXT: v_rcp_f32_e32 v2, v0 ; GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: s_or_b32 s4, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -766,7 +766,7 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_ubyte1_e32 v0, s6 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX6-NEXT: v_rcp_f32_e32 v1, v0 ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s6 ; 
GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 @@ -784,7 +784,7 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s2 ; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 @@ -830,7 +830,7 @@ define amdgpu_kernel void @urem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_ubyte1_e32 v0, s6 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX6-NEXT: v_rcp_f32_e32 v1, v0 ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s6 ; GFX6-NEXT: s_lshr_b32 s2, s6, 8 ; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1 @@ -851,7 +851,7 @@ define amdgpu_kernel void @urem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v0, s2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s2 ; GFX9-NEXT: s_lshr_b32 s3, s2, 8 ; GFX9-NEXT: v_mul_f32_e32 v1, v2, v1 @@ -908,7 +908,7 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; GFX6-NEXT: s_sext_i32_i8 s5, s6 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_rcp_f32_e32 v2, v0 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -933,7 +933,7 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; GFX9-NEXT: s_sext_i32_i8 s2, s2 ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s2 ; GFX9-NEXT: s_xor_b32 s2, s2, s3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX9-NEXT: v_rcp_f32_e32 v3, v0 ; GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: s_or_b32 s4, s2, 1 ; 
GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 @@ -989,7 +989,7 @@ define amdgpu_kernel void @srem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; GFX6-NEXT: s_sext_i32_i8 s3, s6 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s3 ; GFX6-NEXT: s_xor_b32 s2, s3, s2 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_rcp_f32_e32 v2, v0 ; GFX6-NEXT: s_ashr_i32 s2, s2, 30 ; GFX6-NEXT: s_lshr_b32 s4, s6, 8 ; GFX6-NEXT: s_or_b32 s5, s2, 1 @@ -1018,7 +1018,7 @@ define amdgpu_kernel void @srem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) { ; GFX9-NEXT: s_sext_i32_i8 s3, s6 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s3 ; GFX9-NEXT: s_xor_b32 s2, s3, s2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX9-NEXT: v_rcp_f32_e32 v2, v0 ; GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: s_lshr_b32 s4, s6, 8 ; GFX9-NEXT: s_or_b32 s5, s2, 1 @@ -1184,13 +1184,13 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: s_sub_i32 s0, 0, s12 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s13 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s14 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s15 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 +; GFX6-NEXT: v_rcp_f32_e32 v4, v4 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; GFX6-NEXT: v_rcp_f32_e32 v6, v6 ; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 ; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1 @@ -1281,8 +1281,8 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 ; GFX9-NEXT: s_sub_i32 s2, 0, s12 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s14 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, 
v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 @@ -1307,7 +1307,7 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_mul_i32 s3, s3, s4 ; GFX9-NEXT: s_mul_hi_u32 s3, s4, s3 ; GFX9-NEXT: s_add_i32 s4, s4, s3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v2 +; GFX9-NEXT: v_rcp_f32_e32 v0, v2 ; GFX9-NEXT: s_mul_hi_u32 s3, s9, s4 ; GFX9-NEXT: s_mul_i32 s4, s3, s13 ; GFX9-NEXT: s_sub_i32 s4, s9, s4 @@ -1327,7 +1327,7 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_mul_i32 s4, s4, s5 ; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 ; GFX9-NEXT: s_add_i32 s5, s5, s4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_mul_hi_u32 s4, s10, s5 ; GFX9-NEXT: s_mul_i32 s5, s4, s14 ; GFX9-NEXT: s_sub_i32 s5, s10, s5 @@ -1502,8 +1502,8 @@ define amdgpu_kernel void @urem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX6-NEXT: s_sub_i32 s0, 0, s12 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s13 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 @@ -1524,7 +1524,7 @@ define amdgpu_kernel void @urem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: s_cselect_b32 s6, s1, s0 ; GFX6-NEXT: s_sub_i32 s0, 0, s13 ; GFX6-NEXT: v_mul_lo_u32 v0, s0, v1 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0 @@ -1542,7 +1542,7 @@ define amdgpu_kernel void @urem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: s_cselect_b32 s7, s1, s0 ; GFX6-NEXT: s_sub_i32 s0, 0, s14 ; GFX6-NEXT: v_mul_lo_u32 v0, s0, v1 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX6-NEXT: v_rcp_f32_e32 
v2, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0 @@ -1590,9 +1590,9 @@ define amdgpu_kernel void @urem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s13 ; GFX9-NEXT: s_sub_i32 s2, 0, s12 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s14 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v1, v1 +; GFX9-NEXT: v_rcp_f32_e32 v2, v2 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 @@ -1632,7 +1632,7 @@ define amdgpu_kernel void @urem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_mul_i32 s4, s4, s5 ; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4 ; GFX9-NEXT: s_add_i32 s5, s5, s4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_mul_hi_u32 s4, s10, s5 ; GFX9-NEXT: s_mul_i32 s4, s4, s14 ; GFX9-NEXT: s_sub_i32 s4, s10, s4 @@ -1849,7 +1849,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GFX6-NEXT: s_sub_i32 s1, 0, s0 ; GFX6-NEXT: s_xor_b32 s2, s8, s12 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0 @@ -1873,7 +1873,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: s_sub_i32 s3, 0, s2 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX6-NEXT: s_xor_b32 s6, s9, s13 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 ; GFX6-NEXT: v_add_i32_e32 v1, vcc, 1, v0 ; GFX6-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[0:1] ; GFX6-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 @@ -1900,7 +1900,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr 
addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: s_sub_i32 s7, 0, s6 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; GFX6-NEXT: v_add_i32_e32 v3, vcc, 1, v2 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; GFX6-NEXT: v_rcp_f32_e32 v4, v4 ; GFX6-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 ; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 ; GFX6-NEXT: v_mul_lo_u32 v5, s7, v4 @@ -1925,7 +1925,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: s_sub_i32 s0, 0, s12 ; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc ; GFX6-NEXT: v_add_i32_e32 v5, vcc, 1, v4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v6 +; GFX6-NEXT: v_rcp_f32_e32 v1, v6 ; GFX6-NEXT: s_abs_i32 s1, s11 ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s8, v0 ; GFX6-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 @@ -1970,7 +1970,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_sub_i32 s3, 0, s0 ; GFX9-NEXT: s_abs_i32 s2, s8 ; GFX9-NEXT: s_xor_b32 s1, s8, s12 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_ashr_i32 s1, s1, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -1994,7 +1994,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_xor_b32 s0, s0, s1 ; GFX9-NEXT: s_sub_i32 s7, 0, s2 ; GFX9-NEXT: s_sub_i32 s8, s0, s1 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_abs_i32 s6, s9 ; GFX9-NEXT: s_xor_b32 s3, s9, s13 ; GFX9-NEXT: s_ashr_i32 s3, s3, 31 @@ -2020,7 +2020,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_xor_b32 s0, s0, s3 ; GFX9-NEXT: s_sub_i32 s7, 0, s1 ; GFX9-NEXT: s_sub_i32 s3, s0, s3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_abs_i32 s6, s10 ; GFX9-NEXT: s_xor_b32 s2, s10, s14 ; GFX9-NEXT: s_ashr_i32 s2, s2, 31 @@ -2047,7 +2047,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr 
addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_xor_b32 s5, s6, s2 ; GFX9-NEXT: s_sub_i32 s6, 0, s7 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX9-NEXT: v_rcp_f32_e32 v2, v2 ; GFX9-NEXT: s_sub_i32 s2, s5, s2 ; GFX9-NEXT: s_abs_i32 s4, s11 ; GFX9-NEXT: s_xor_b32 s3, s11, s15 @@ -2247,7 +2247,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GFX6-NEXT: s_sub_i32 s1, 0, s0 ; GFX6-NEXT: s_ashr_i32 s2, s8, 31 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0 @@ -2269,7 +2269,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: s_sub_i32 s3, 0, s1 ; GFX6-NEXT: s_xor_b32 s0, s0, s2 ; GFX6-NEXT: s_sub_i32 s7, s0, s2 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_ashr_i32 s6, s9, 31 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -2292,7 +2292,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: s_sub_i32 s2, 0, s1 ; GFX6-NEXT: s_xor_b32 s0, s0, s6 ; GFX6-NEXT: s_sub_i32 s6, s0, s6 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_ashr_i32 s8, s10, 31 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -2315,7 +2315,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s10 ; GFX6-NEXT: s_sub_i32 s0, 0, s10 ; GFX6-NEXT: s_mov_b32 s2, -1 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v2, v0 ; GFX6-NEXT: v_mov_b32_e32 v0, s7 @@ -2356,7 +2356,7 @@ define amdgpu_kernel void @srem_v4i32(ptr 
addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_sub_i32 s3, 0, s0 ; GFX9-NEXT: s_abs_i32 s2, s8 ; GFX9-NEXT: s_ashr_i32 s1, s8, 31 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0 @@ -2377,7 +2377,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_xor_b32 s0, s0, s1 ; GFX9-NEXT: s_sub_i32 s7, 0, s2 ; GFX9-NEXT: s_sub_i32 s8, s0, s1 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_abs_i32 s6, s9 ; GFX9-NEXT: s_ashr_i32 s3, s9, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -2400,7 +2400,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_xor_b32 s0, s0, s3 ; GFX9-NEXT: s_sub_i32 s7, 0, s1 ; GFX9-NEXT: s_sub_i32 s3, s0, s3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_abs_i32 s6, s10 ; GFX9-NEXT: s_ashr_i32 s2, s10, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -2423,7 +2423,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_xor_b32 s5, s6, s2 ; GFX9-NEXT: s_sub_i32 s6, 0, s7 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GFX9-NEXT: v_rcp_f32_e32 v2, v1 ; GFX9-NEXT: s_sub_i32 s2, s5, s2 ; GFX9-NEXT: s_abs_i32 s4, s11 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 @@ -2554,10 +2554,10 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6-NEXT: s_and_b32 s4, s8, 0xffff ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX6-NEXT: v_rcp_f32_e32 v3, v0 ; GFX6-NEXT: s_lshr_b32 s4, s8, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 +; GFX6-NEXT: v_rcp_f32_e32 v5, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 ; 
GFX6-NEXT: v_trunc_f32_e32 v3, v3 ; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 @@ -2572,7 +2572,7 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GFX6-NEXT: v_rcp_f32_e32 v6, v4 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v1, vcc ; GFX6-NEXT: v_mul_f32_e32 v1, v5, v6 @@ -2583,7 +2583,7 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6-NEXT: s_lshr_b32 s4, s9, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s4 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 +; GFX6-NEXT: v_rcp_f32_e32 v7, v5 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 ; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc @@ -2611,12 +2611,12 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: s_and_b32 s6, s0, 0xffff ; GFX9-NEXT: s_lshr_b32 s2, s2, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX9-NEXT: v_rcp_f32_e32 v4, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 ; GFX9-NEXT: s_lshr_b32 s0, s0, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s0 ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 +; GFX9-NEXT: v_rcp_f32_e32 v5, v1 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: s_and_b32 s0, s3, 0xffff ; GFX9-NEXT: v_cvt_u32_f32_e32 v7, v4 @@ -2628,7 +2628,7 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v7, vcc ; GFX9-NEXT: v_trunc_f32_e32 v2, v5 ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4 +; GFX9-NEXT: v_rcp_f32_e32 v7, v4 ; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 ; GFX9-NEXT: s_lshr_b32 s0, s3, 16 @@ -2639,7 +2639,7 @@ 
define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: s_lshr_b32 s0, s1, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5 +; GFX9-NEXT: v_rcp_f32_e32 v8, v5 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 @@ -2769,10 +2769,10 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6-NEXT: s_and_b32 s4, s8, 0xffff ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX6-NEXT: v_rcp_f32_e32 v3, v0 ; GFX6-NEXT: s_lshr_b32 s4, s8, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 +; GFX6-NEXT: v_rcp_f32_e32 v5, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 ; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 @@ -2790,7 +2790,7 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s5 ; GFX6-NEXT: s_and_b32 s5, s9, 0xffff ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s5 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GFX6-NEXT: v_rcp_f32_e32 v4, v2 ; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s4, v1 ; GFX6-NEXT: s_lshr_b32 s4, s11, 16 ; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4 @@ -2798,7 +2798,7 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s10 ; GFX6-NEXT: s_lshr_b32 s5, s9, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s5 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v4 +; GFX6-NEXT: v_rcp_f32_e32 v7, v4 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 ; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v3 @@ -2836,10 +2836,10 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: s_lshr_b32 s2, s2, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s8 -; GFX9-NEXT: 
v_rcp_iflag_f32_e32 v4, v0 +; GFX9-NEXT: v_rcp_f32_e32 v4, v0 ; GFX9-NEXT: s_lshr_b32 s0, s0, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 +; GFX9-NEXT: v_rcp_f32_e32 v5, v1 ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: s_and_b32 s4, s3, 0xffff @@ -2854,7 +2854,7 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s5 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4 +; GFX9-NEXT: v_rcp_f32_e32 v7, v4 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc ; GFX9-NEXT: v_mul_f32_e32 v2, v5, v7 @@ -2866,7 +2866,7 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: s_lshr_b32 s1, s1, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, s1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v8, v5 +; GFX9-NEXT: v_rcp_f32_e32 v8, v5 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v4 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s9 ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc @@ -3006,7 +3006,7 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6-NEXT: s_sext_i32_i16 s5, s8 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_rcp_f32_e32 v2, v0 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -3021,7 +3021,7 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6-NEXT: v_add_i32_e32 v2, vcc, s4, v2 ; GFX6-NEXT: s_ashr_i32 s4, s8, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX6-NEXT: v_rcp_f32_e32 v3, v0 ; GFX6-NEXT: s_xor_b32 s4, s4, s5 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s6, s4, 1 @@ -3037,7 +3037,7 @@ define amdgpu_kernel 
void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6-NEXT: v_add_i32_e32 v3, vcc, s4, v3 ; GFX6-NEXT: s_sext_i32_i16 s4, s9 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX6-NEXT: v_rcp_f32_e32 v4, v0 ; GFX6-NEXT: s_xor_b32 s4, s4, s5 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s6, s4, 1 @@ -3053,7 +3053,7 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v4 ; GFX6-NEXT: s_ashr_i32 s4, s9, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v0 +; GFX6-NEXT: v_rcp_f32_e32 v5, v0 ; GFX6-NEXT: s_xor_b32 s4, s4, s5 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s6, s4, 1 @@ -3085,7 +3085,7 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: s_sext_i32_i16 s5, s0 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s5 ; GFX9-NEXT: s_xor_b32 s4, s5, s4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX9-NEXT: v_rcp_f32_e32 v3, v0 ; GFX9-NEXT: s_ashr_i32 s4, s4, 30 ; GFX9-NEXT: s_or_b32 s8, s4, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 @@ -3099,7 +3099,7 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: s_ashr_i32 s0, s0, 16 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s0 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX9-NEXT: v_rcp_f32_e32 v4, v0 ; GFX9-NEXT: s_xor_b32 s0, s0, s2 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: s_sext_i32_i16 s2, s3 @@ -3116,7 +3116,7 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_add_u32_e32 v4, s0, v4 ; GFX9-NEXT: s_sext_i32_i16 s0, s1 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0 +; GFX9-NEXT: v_rcp_f32_e32 v5, v0 ; GFX9-NEXT: s_xor_b32 s0, s0, s2 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: s_or_b32 s0, s0, 1 @@ -3132,7 +3132,7 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) 
%out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_add_u32_e32 v1, s0, v5 ; GFX9-NEXT: s_ashr_i32 s0, s1, 16 ; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v0 +; GFX9-NEXT: v_rcp_f32_e32 v6, v0 ; GFX9-NEXT: s_xor_b32 s0, s0, s2 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: s_or_b32 s2, s0, 1 @@ -3276,7 +3276,7 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6-NEXT: s_sext_i32_i16 s5, s8 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_rcp_f32_e32 v2, v0 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -3292,7 +3292,7 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6-NEXT: s_ashr_i32 s5, s8, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s5 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s10 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 +; GFX6-NEXT: v_rcp_f32_e32 v3, v1 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_lshr_b32 s6, s8, 16 @@ -3314,7 +3314,7 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s6, v1 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GFX6-NEXT: v_rcp_f32_e32 v4, v2 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -3331,7 +3331,7 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6-NEXT: s_ashr_i32 s5, s9, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s5 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 +; GFX6-NEXT: v_rcp_f32_e32 v5, v2 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_lshr_b32 s6, s9, 16 ; GFX6-NEXT: s_lshr_b32 s7, s11, 16 @@ -3367,7 +3367,7 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; 
GFX9-NEXT: s_sext_i32_i16 s9, s0 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s9 ; GFX9-NEXT: s_xor_b32 s4, s9, s8 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX9-NEXT: v_rcp_f32_e32 v3, v0 ; GFX9-NEXT: s_ashr_i32 s4, s4, 30 ; GFX9-NEXT: s_or_b32 s10, s4, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 @@ -3384,7 +3384,7 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: v_add_u32_e32 v1, s4, v3 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s10 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX9-NEXT: v_rcp_f32_e32 v4, v0 ; GFX9-NEXT: s_or_b32 s2, s2, 1 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8 ; GFX9-NEXT: s_sext_i32_i16 s8, s1 @@ -3401,7 +3401,7 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s8 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0 ; GFX9-NEXT: s_xor_b32 s0, s8, s2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v3 +; GFX9-NEXT: v_rcp_f32_e32 v5, v3 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: s_or_b32 s0, s0, 1 ; GFX9-NEXT: v_sub_u32_e32 v0, s10, v0 @@ -3418,7 +3418,7 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX9-NEXT: v_mul_lo_u32 v3, v3, s2 ; GFX9-NEXT: s_ashr_i32 s2, s1, 16 ; GFX9-NEXT: v_cvt_f32_i32_e32 v5, s2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GFX9-NEXT: v_rcp_f32_e32 v6, v4 ; GFX9-NEXT: s_xor_b32 s0, s2, s3 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: s_or_b32 s4, s0, 1 @@ -3475,7 +3475,7 @@ define amdgpu_kernel void @udiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_u32 s2, s6, 0x30008 ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX6-NEXT: v_rcp_f32_e32 v1, v0 ; GFX6-NEXT: s_and_b32 s4, s6, 7 ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s4 ; GFX6-NEXT: s_mov_b32 s2, -1 @@ -3497,7 +3497,7 @@ define amdgpu_kernel void @udiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: 
s_bfe_u32 s3, s2, 0x30008 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: s_and_b32 s2, s2, 7 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v3, s2 ; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 @@ -3545,7 +3545,7 @@ define amdgpu_kernel void @urem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_bfe_u32 s2, s6, 0x30008 ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v0, s2 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX6-NEXT: v_rcp_f32_e32 v1, v0 ; GFX6-NEXT: s_and_b32 s3, s6, 7 ; GFX6-NEXT: v_cvt_f32_ubyte0_e32 v2, s3 ; GFX6-NEXT: s_lshr_b32 s2, s6, 8 @@ -3569,7 +3569,7 @@ define amdgpu_kernel void @urem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_bfe_u32 s0, s2, 0x30008 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: s_and_b32 s1, s2, 7 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, s1 ; GFX9-NEXT: s_lshr_b32 s0, s2, 8 @@ -3630,7 +3630,7 @@ define amdgpu_kernel void @sdiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; GFX6-NEXT: s_bfe_i32 s5, s6, 0x30000 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_rcp_f32_e32 v2, v0 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -3656,7 +3656,7 @@ define amdgpu_kernel void @sdiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; GFX9-NEXT: s_bfe_i32 s2, s2, 0x30000 ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s2 ; GFX9-NEXT: s_xor_b32 s2, s2, s3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX9-NEXT: v_rcp_f32_e32 v3, v0 ; GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: s_or_b32 s4, s2, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 @@ -3713,7 +3713,7 @@ define amdgpu_kernel void @srem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; GFX6-NEXT: s_bfe_i32 s3, s6, 0x30000 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s3 ; 
GFX6-NEXT: s_xor_b32 s2, s3, s2 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_rcp_f32_e32 v2, v0 ; GFX6-NEXT: s_ashr_i32 s2, s2, 30 ; GFX6-NEXT: s_lshr_b32 s4, s6, 8 ; GFX6-NEXT: s_or_b32 s5, s2, 1 @@ -3742,7 +3742,7 @@ define amdgpu_kernel void @srem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) { ; GFX9-NEXT: s_bfe_i32 s1, s2, 0x30000 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s1 ; GFX9-NEXT: s_xor_b32 s0, s1, s0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX9-NEXT: v_rcp_f32_e32 v2, v0 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: s_lshr_b32 s3, s2, 8 ; GFX9-NEXT: s_or_b32 s6, s0, 1 @@ -3845,10 +3845,10 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX6-NEXT: s_and_b32 s4, s8, 0xffff ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX6-NEXT: v_rcp_f32_e32 v3, v0 ; GFX6-NEXT: s_lshr_b32 s4, s8, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 +; GFX6-NEXT: v_rcp_f32_e32 v5, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 ; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 @@ -3862,7 +3862,7 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX6-NEXT: s_and_b32 s4, s9, 0xffff ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GFX6-NEXT: v_rcp_f32_e32 v6, v4 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 @@ -3890,10 +3890,10 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: s_lshr_b32 s2, s2, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s6 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX9-NEXT: v_rcp_f32_e32 v4, v0 ; GFX9-NEXT: s_lshr_b32 s0, s0, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 +; GFX9-NEXT: 
v_rcp_f32_e32 v5, v1 ; GFX9-NEXT: v_mul_f32_e32 v4, v2, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 ; GFX9-NEXT: s_and_b32 s0, s3, 0xffff @@ -3908,7 +3908,7 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: v_mad_f32 v3, -v2, v1, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v4 +; GFX9-NEXT: v_rcp_f32_e32 v7, v4 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc ; GFX9-NEXT: v_mul_f32_e32 v2, v5, v7 @@ -4013,10 +4013,10 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX6-NEXT: s_and_b32 s4, s8, 0xffff ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5 ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX6-NEXT: v_rcp_f32_e32 v3, v0 ; GFX6-NEXT: s_lshr_b32 s4, s8, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 +; GFX6-NEXT: v_rcp_f32_e32 v5, v2 ; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 ; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 @@ -4031,7 +4031,7 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s10 ; GFX6-NEXT: s_and_b32 s6, s9, 0xffff ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s6 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GFX6-NEXT: v_rcp_f32_e32 v6, v4 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 @@ -4065,9 +4065,9 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: s_and_b32 s8, s0, 0xffff ; GFX9-NEXT: s_lshr_b32 s0, s0, 16 ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s8 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX9-NEXT: v_rcp_f32_e32 v4, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v1 +; GFX9-NEXT: v_rcp_f32_e32 v5, v1 ; GFX9-NEXT: s_and_b32 s3, s3, 0xffff ; GFX9-NEXT: v_mul_f32_e32 
v4, v2, v4 ; GFX9-NEXT: v_trunc_f32_e32 v4, v4 @@ -4082,7 +4082,7 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v6, vcc ; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v5 ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s1 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v3 +; GFX9-NEXT: v_rcp_f32_e32 v6, v3 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v4, vcc ; GFX9-NEXT: v_mul_f32_e32 v2, v5, v6 @@ -4197,7 +4197,7 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX6-NEXT: s_sext_i32_i16 s5, s8 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_rcp_f32_e32 v2, v0 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -4212,7 +4212,7 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v2 ; GFX6-NEXT: s_ashr_i32 s4, s8, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX6-NEXT: v_rcp_f32_e32 v3, v0 ; GFX6-NEXT: s_xor_b32 s4, s4, s5 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s6, s4, 1 @@ -4228,7 +4228,7 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX6-NEXT: v_add_i32_e32 v2, vcc, s4, v3 ; GFX6-NEXT: s_sext_i32_i16 s4, s9 ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX6-NEXT: v_rcp_f32_e32 v4, v0 ; GFX6-NEXT: s_xor_b32 s4, s4, s5 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s6, s4, 1 @@ -4258,7 +4258,7 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: s_sext_i32_i16 s5, s0 ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s5 ; GFX9-NEXT: s_xor_b32 s4, s5, s4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX9-NEXT: v_rcp_f32_e32 v3, v0 ; GFX9-NEXT: s_ashr_i32 s4, s4, 30 ; GFX9-NEXT: 
s_or_b32 s8, s4, 1 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 @@ -4273,7 +4273,7 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: s_ashr_i32 s0, s0, 16 ; GFX9-NEXT: v_add_u32_e32 v2, s4, v3 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX9-NEXT: v_rcp_f32_e32 v4, v0 ; GFX9-NEXT: s_xor_b32 s0, s0, s2 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: s_sext_i32_i16 s2, s3 @@ -4289,7 +4289,7 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: v_add_u32_e32 v3, s0, v4 ; GFX9-NEXT: s_sext_i32_i16 s0, s1 ; GFX9-NEXT: v_cvt_f32_i32_e32 v4, s0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0 +; GFX9-NEXT: v_rcp_f32_e32 v5, v0 ; GFX9-NEXT: s_xor_b32 s0, s0, s2 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: s_or_b32 s2, s0, 1 @@ -4406,7 +4406,7 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX6-NEXT: s_sext_i32_i16 s5, s8 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_rcp_f32_e32 v2, v0 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -4422,7 +4422,7 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX6-NEXT: s_ashr_i32 s5, s8, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s5 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s10 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 +; GFX6-NEXT: v_rcp_f32_e32 v3, v1 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_lshr_b32 s6, s8, 16 @@ -4442,7 +4442,7 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX6-NEXT: s_sext_i32_i16 s5, s9 ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s5 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GFX6-NEXT: v_rcp_f32_e32 v4, v2 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 ; GFX6-NEXT: 
s_or_b32 s7, s4, 1 @@ -4474,7 +4474,7 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: s_sext_i32_i16 s9, s0 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s9 ; GFX9-NEXT: s_xor_b32 s4, s9, s8 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX9-NEXT: v_rcp_f32_e32 v2, v0 ; GFX9-NEXT: s_ashr_i32 s4, s4, 30 ; GFX9-NEXT: s_or_b32 s10, s4, 1 ; GFX9-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -4491,7 +4491,7 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: v_add_u32_e32 v1, s4, v2 ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s10 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX9-NEXT: v_rcp_f32_e32 v3, v0 ; GFX9-NEXT: s_or_b32 s2, s2, 1 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s8 ; GFX9-NEXT: v_mul_f32_e32 v3, v2, v3 @@ -4507,7 +4507,7 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX9-NEXT: s_sext_i32_i16 s3, s1 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s3 ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GFX9-NEXT: v_rcp_f32_e32 v4, v2 ; GFX9-NEXT: s_xor_b32 s0, s3, s2 ; GFX9-NEXT: s_ashr_i32 s0, s0, 30 ; GFX9-NEXT: s_or_b32 s4, s0, 1 @@ -4614,7 +4614,7 @@ define amdgpu_kernel void @udiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: s_and_b32 s6, s10, 0x7fff ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s6 ; GFX6-NEXT: s_lshr_b64 s[6:7], s[4:5], 30 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_rcp_f32_e32 v2, v0 ; GFX6-NEXT: s_bfe_u32 s4, s4, 0xf000f ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s4 ; GFX6-NEXT: s_mov_b32 s1, s9 @@ -4622,7 +4622,7 @@ define amdgpu_kernel void @udiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: v_mul_f32_e32 v2, v1, v2 ; GFX6-NEXT: v_trunc_f32_e32 v2, v2 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s9 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 +; GFX6-NEXT: v_rcp_f32_e32 v5, v3 ; GFX6-NEXT: s_and_b32 s5, s6, 0x7fff ; GFX6-NEXT: v_mad_f32 v1, -v2, v0, v1 ; GFX6-NEXT: v_cmp_ge_f32_e64 
vcc, |v1|, v0 @@ -4633,7 +4633,7 @@ define amdgpu_kernel void @udiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: v_mad_f32 v4, -v0, v3, v4 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s8 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v1 +; GFX6-NEXT: v_rcp_f32_e32 v6, v1 ; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v3 ; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v0, vcc @@ -4668,13 +4668,13 @@ define amdgpu_kernel void @udiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9-NEXT: s_bfe_u32 s8, s2, 0xf000f ; GFX9-NEXT: s_lshr_b64 s[2:3], s[6:7], 30 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s5 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX9-NEXT: v_rcp_f32_e32 v3, v0 ; GFX9-NEXT: s_bfe_u32 s3, s6, 0xf000f ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s3 ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s8 ; GFX9-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX9-NEXT: v_trunc_f32_e32 v3, v3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GFX9-NEXT: v_rcp_f32_e32 v6, v4 ; GFX9-NEXT: s_and_b32 s2, s2, 0x7fff ; GFX9-NEXT: v_mad_f32 v1, -v3, v0, v1 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 @@ -4686,7 +4686,7 @@ define amdgpu_kernel void @udiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9-NEXT: v_mad_f32 v5, -v0, v4, v5 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v1 +; GFX9-NEXT: v_rcp_f32_e32 v7, v1 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v0, vcc @@ -4797,7 +4797,7 @@ define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: s_and_b32 s6, s10, 0x7fff ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s6 ; GFX6-NEXT: s_lshr_b64 s[6:7], s[4:5], 30 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_rcp_f32_e32 v2, v0 ; GFX6-NEXT: s_and_b32 s5, s6, 0x7fff ; GFX6-NEXT: s_bfe_u32 s6, s4, 0xf000f ; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s6 @@ -4810,7 
+4810,7 @@ define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s11 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v2, vcc ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v3 +; GFX6-NEXT: v_rcp_f32_e32 v2, v3 ; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s8 ; GFX6-NEXT: s_lshr_b32 s4, s4, 15 ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, s10, v0 @@ -4819,7 +4819,7 @@ define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: v_trunc_f32_e32 v0, v0 ; GFX6-NEXT: v_mad_f32 v1, -v0, v3, v1 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v2 +; GFX6-NEXT: v_rcp_f32_e32 v6, v2 ; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v3 ; GFX6-NEXT: s_mov_b32 s1, s9 ; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v0, vcc @@ -4858,7 +4858,7 @@ define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9-NEXT: s_and_b32 s3, s2, 0x7fff ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3 ; GFX9-NEXT: s_and_b32 s8, s4, 0x7fff -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX9-NEXT: v_rcp_f32_e32 v3, v0 ; GFX9-NEXT: s_lshr_b64 s[4:5], s[6:7], 30 ; GFX9-NEXT: s_bfe_u32 s5, s6, 0xf000f ; GFX9-NEXT: v_cvt_f32_u32_e32 v4, s5 @@ -4870,13 +4870,13 @@ define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9-NEXT: s_and_b32 s3, s4, 0x7fff ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v5, s9 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GFX9-NEXT: v_rcp_f32_e32 v6, v4 ; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v3, vcc ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, s3 ; GFX9-NEXT: v_mul_f32_e32 v1, v5, v6 ; GFX9-NEXT: v_cvt_f32_u32_e32 v6, s8 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v3 +; GFX9-NEXT: v_rcp_f32_e32 v7, v3 ; GFX9-NEXT: v_mad_f32 v5, -v1, v4, v5 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v5|, v4 @@ -5002,7 +5002,7 @@ define amdgpu_kernel void @sdiv_v3i15(ptr 
addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: s_lshr_b64 s[8:9], s[4:5], 30 ; GFX6-NEXT: s_bfe_i32 s5, s10, 0xf0000 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_rcp_f32_e32 v2, v0 ; GFX6-NEXT: s_xor_b32 s5, s5, s7 ; GFX6-NEXT: s_ashr_i32 s5, s5, 30 ; GFX6-NEXT: s_or_b32 s5, s5, 1 @@ -5018,7 +5018,7 @@ define amdgpu_kernel void @sdiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: v_add_i32_e32 v2, vcc, s5, v2 ; GFX6-NEXT: s_bfe_i32 s5, s10, 0xf000f ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX6-NEXT: v_rcp_f32_e32 v3, v0 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s7, s4, 1 @@ -5034,7 +5034,7 @@ define amdgpu_kernel void @sdiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: v_add_i32_e32 v3, vcc, s4, v3 ; GFX6-NEXT: s_bfe_i32 s4, s6, 0xf0000 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX6-NEXT: v_rcp_f32_e32 v4, v0 ; GFX6-NEXT: s_xor_b32 s4, s4, s5 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s6, s4, 1 @@ -5070,7 +5070,7 @@ define amdgpu_kernel void @sdiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9-NEXT: s_bfe_i32 s5, s2, 0xf0000 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s5 ; GFX9-NEXT: s_xor_b32 s3, s5, s3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v0 +; GFX9-NEXT: v_rcp_f32_e32 v3, v0 ; GFX9-NEXT: s_ashr_i32 s3, s3, 30 ; GFX9-NEXT: s_lshr_b64 s[8:9], s[6:7], 30 ; GFX9-NEXT: s_or_b32 s3, s3, 1 @@ -5085,7 +5085,7 @@ define amdgpu_kernel void @sdiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9-NEXT: s_bfe_i32 s2, s2, 0xf000f ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s2 ; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v0 +; GFX9-NEXT: v_rcp_f32_e32 v4, v0 ; GFX9-NEXT: s_xor_b32 s2, s2, s5 ; GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: v_add_u32_e32 v3, s3, v3 @@ -5102,7 +5102,7 @@ define amdgpu_kernel void @sdiv_v3i15(ptr 
addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9-NEXT: v_add_u32_e32 v4, s2, v4 ; GFX9-NEXT: s_bfe_i32 s2, s4, 0xf0000 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v5, v0 +; GFX9-NEXT: v_rcp_f32_e32 v5, v0 ; GFX9-NEXT: s_xor_b32 s2, s2, s3 ; GFX9-NEXT: s_ashr_i32 s2, s2, 30 ; GFX9-NEXT: s_or_b32 s4, s2, 1 @@ -5227,7 +5227,7 @@ define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: s_bfe_i32 s12, s10, 0xf0000 ; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s12 ; GFX6-NEXT: s_xor_b32 s5, s12, s5 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX6-NEXT: v_rcp_f32_e32 v2, v0 ; GFX6-NEXT: s_lshr_b64 s[6:7], s[10:11], 30 ; GFX6-NEXT: s_ashr_i32 s5, s5, 30 ; GFX6-NEXT: s_and_b32 s7, s6, 0x7fff @@ -5249,7 +5249,7 @@ define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: s_bfe_i32 s5, s10, 0xf000f ; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s10, v0 ; GFX6-NEXT: v_cvt_f32_i32_e32 v0, s5 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 +; GFX6-NEXT: v_rcp_f32_e32 v3, v1 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s10, s4, 1 @@ -5266,7 +5266,7 @@ define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX6-NEXT: s_bfe_i32 s5, s6, 0xf0000 ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s5 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 +; GFX6-NEXT: v_rcp_f32_e32 v4, v1 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s6, s4, 1 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s14 @@ -5304,7 +5304,7 @@ define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9-NEXT: s_lshr_b64 s[8:9], s[6:7], 30 ; GFX9-NEXT: s_bfe_i32 s7, s2, 0xf0000 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, s7 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GFX9-NEXT: v_rcp_f32_e32 v2, v0 ; GFX9-NEXT: s_xor_b32 s5, s7, s5 ; GFX9-NEXT: s_ashr_i32 s5, s5, 30 ; GFX9-NEXT: s_lshr_b32 s3, s2, 15 @@ -5325,7 +5325,7 @@ define amdgpu_kernel void 
@srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9-NEXT: v_mul_lo_u32 v0, v0, s6 ; GFX9-NEXT: s_bfe_i32 s6, s2, 0xf000f ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, s6 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v1 +; GFX9-NEXT: v_rcp_f32_e32 v3, v1 ; GFX9-NEXT: s_xor_b32 s5, s6, s5 ; GFX9-NEXT: s_ashr_i32 s5, s5, 30 ; GFX9-NEXT: s_or_b32 s5, s5, 1 @@ -5342,7 +5342,7 @@ define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x ; GFX9-NEXT: s_bfe_i32 s4, s4, 0xf0000 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, s4 ; GFX9-NEXT: s_xor_b32 s4, s4, s5 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GFX9-NEXT: v_rcp_f32_e32 v4, v2 ; GFX9-NEXT: s_ashr_i32 s4, s4, 30 ; GFX9-NEXT: s_or_b32 s6, s4, 1 ; GFX9-NEXT: v_mul_lo_u32 v1, v1, s13 @@ -5656,8 +5656,8 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_sub_i32 s1, 0, s0 ; GFX6-NEXT: s_lshl_b32 s2, 0x1000, s11 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s2 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s1, v0 @@ -5710,9 +5710,9 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s3 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_sub_i32 s4, 0, s7 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX9-NEXT: v_rcp_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 @@ -5994,8 +5994,8 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_sub_i32 s6, 0, s2 ; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s3 -; GFX6-NEXT: 
v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s6, v0 @@ -6043,9 +6043,9 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_lshl_b32 s6, 0x1000, s3 ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s6 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_sub_i32 s4, 0, s7 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX9-NEXT: v_rcp_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 @@ -6461,7 +6461,7 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX6-NEXT: s_sub_i32 s7, 0, s6 ; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s7, v0 @@ -6488,7 +6488,7 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_sub_i32 s6, 0, s2 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX6-NEXT: s_xor_b32 s3, s1, s3 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 ; GFX6-NEXT: s_abs_i32 s1, s1 ; GFX6-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 @@ -6530,7 +6530,7 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_lshl_b32 s7, 0x1000, s3 ; GFX9-NEXT: s_abs_i32 s3, s0 ; GFX9-NEXT: s_xor_b32 s0, s0, s2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_sub_i32 s2, 0, s6 ; GFX9-NEXT: s_ashr_i32 s0, s0, 31 ; GFX9-NEXT: v_mul_f32_e32 
v0, 0x4f7ffffe, v0 @@ -6555,7 +6555,7 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX9-NEXT: s_xor_b32 s5, s6, s0 ; GFX9-NEXT: s_sub_i32 s6, 0, s8 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_sub_i32 s0, s5, s0 ; GFX9-NEXT: s_xor_b32 s4, s1, s7 ; GFX9-NEXT: s_abs_i32 s1, s1 @@ -6907,7 +6907,7 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX6-NEXT: s_sub_i32 s6, 0, s2 ; GFX6-NEXT: s_lshl_b32 s3, 0x1000, s3 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s6, v0 @@ -6930,7 +6930,7 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX6-NEXT: s_sub_i32 s6, 0, s3 ; GFX6-NEXT: s_abs_i32 s8, s1 ; GFX6-NEXT: s_xor_b32 s2, s2, s0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: s_sub_i32 s0, s2, s0 ; GFX6-NEXT: s_ashr_i32 s1, s1, 31 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 @@ -6968,7 +6968,7 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_sub_i32 s7, 0, s2 ; GFX9-NEXT: s_ashr_i32 s6, s0, 31 ; GFX9-NEXT: s_abs_i32 s0, s0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_lshl_b32 s3, 0x1000, s3 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -6990,7 +6990,7 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x ; GFX9-NEXT: s_xor_b32 s0, s0, s6 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 ; GFX9-NEXT: s_sub_i32 s5, 0, s7 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_sub_i32 s0, s0, s6 ; GFX9-NEXT: s_ashr_i32 s4, s1, 31 ; GFX9-NEXT: s_abs_i32 s1, s1 diff 
--git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn-codegen.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn-codegen.ll index 5f6a38018be20..7af5b0e177b7f 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn-codegen.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn-codegen.ll @@ -17,7 +17,7 @@ define float @test_rootn_afn_f32(float %x, i32 %y) #0 { ; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 32, vcc ; CHECK-NEXT: v_ldexp_f32 v4, |v0|, v4 ; CHECK-NEXT: v_log_f32_e32 v4, v4 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; CHECK-NEXT: v_rcp_f32_e32 v2, v2 ; CHECK-NEXT: v_mov_b32_e32 v3, 0x42000000 ; CHECK-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc ; CHECK-NEXT: v_sub_f32_e32 v3, v4, v3 @@ -75,8 +75,8 @@ define <2 x float> @test_rootn_afn_v2f32(<2 x float> %x, <2 x i32> %y) #0 { ; CHECK-NEXT: v_log_f32_e32 v8, v8 ; CHECK-NEXT: v_ldexp_f32 v9, |v0|, v9 ; CHECK-NEXT: v_log_f32_e32 v9, v9 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v5, v5 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v4, v4 +; CHECK-NEXT: v_rcp_f32_e32 v5, v5 +; CHECK-NEXT: v_rcp_f32_e32 v4, v4 ; CHECK-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc ; CHECK-NEXT: v_sub_f32_e32 v7, v8, v7 ; CHECK-NEXT: v_sub_f32_e32 v6, v9, v6 diff --git a/llvm/test/CodeGen/AMDGPU/bypass-div.ll b/llvm/test/CodeGen/AMDGPU/bypass-div.ll index 25071eb767851..150d8cfe22cfd 100644 --- a/llvm/test/CodeGen/AMDGPU/bypass-div.ll +++ b/llvm/test/CodeGen/AMDGPU/bypass-div.ll @@ -577,7 +577,7 @@ define i32 @sdiv32(i32 %a, i32 %b) { ; GFX9-NEXT: v_sub_u32_e32 v4, 0, v2 ; GFX9-NEXT: v_sub_u32_e32 v5, 0, v0 ; GFX9-NEXT: v_max_i32_e32 v5, v5, v0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX9-NEXT: v_rcp_f32_e32 v3, v3 ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v0 ; GFX9-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 @@ -609,7 +609,7 @@ define i32 @udiv32(i32 %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v1 ; GFX9-NEXT: v_sub_u32_e32 v3, 0, v1 -; 
GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX9-NEXT: v_rcp_f32_e32 v2, v2 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_mul_lo_u32 v3, v3, v2 @@ -641,7 +641,7 @@ define i32 @srem32(i32 %a, i32 %b) { ; GFX9-NEXT: v_sub_u32_e32 v3, 0, v1 ; GFX9-NEXT: v_sub_u32_e32 v4, 0, v0 ; GFX9-NEXT: v_max_i32_e32 v4, v4, v0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX9-NEXT: v_rcp_f32_e32 v2, v2 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v0 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 @@ -670,7 +670,7 @@ define i32 @urem32(i32 %a, i32 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, v1 ; GFX9-NEXT: v_sub_u32_e32 v3, 0, v1 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX9-NEXT: v_rcp_f32_e32 v2, v2 ; GFX9-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GFX9-NEXT: v_mul_lo_u32 v3, v3, v2 @@ -1156,7 +1156,7 @@ define i64 @udiv64_known32(i64 %a, i64 %b) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, v2 ; GFX9-NEXT: v_sub_u32_e32 v3, 0, v2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX9-NEXT: v_rcp_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX9-NEXT: v_mul_lo_u32 v3, v3, v1 diff --git a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll index 6f9531ecfa1b8..1f751ecbe6645 100644 --- a/llvm/test/CodeGen/AMDGPU/idiv-licm.ll +++ b/llvm/test/CodeGen/AMDGPU/idiv-licm.ll @@ -12,7 +12,7 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX9-NEXT: s_sub_i32 s2, 0, s6 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -53,7 +53,7 @@ define amdgpu_kernel 
void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX10-NEXT: s_sub_i32 s3, 0, s6 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX10-NEXT: v_rcp_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s2, v0 @@ -97,7 +97,7 @@ define amdgpu_kernel void @udiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX11-NEXT: s_sub_i32 s3, 0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX11-NEXT: v_rcp_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0) ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -162,7 +162,7 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX9-NEXT: s_sub_i32 s2, 0, s6 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -201,7 +201,7 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX10-NEXT: s_sub_i32 s3, 0, s6 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX10-NEXT: v_rcp_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s2, v0 @@ -243,7 +243,7 @@ define amdgpu_kernel void @urem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX11-NEXT: s_sub_i32 s3, 0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX11-NEXT: v_rcp_f32_e32 v0, v0 ; 
GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0) ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -309,7 +309,7 @@ define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: s_ashr_i32 s4, s6, 31 ; GFX9-NEXT: s_sub_i32 s5, 0, s2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s6, v0 @@ -352,7 +352,7 @@ define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_ashr_i32 s3, s3, 31 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX10-NEXT: s_sub_i32 s4, 0, s2 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX10-NEXT: v_rcp_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s5, v0 @@ -398,7 +398,7 @@ define amdgpu_kernel void @sdiv32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX11-NEXT: s_sub_i32 s4, 0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX11-NEXT: v_rcp_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0) ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -464,7 +464,7 @@ define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: s_sub_i32 s4, 0, s2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s5, v0 @@ -502,7 +502,7 @@ define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 
0x24 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX10-NEXT: s_sub_i32 s3, 0, s2 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX10-NEXT: v_rcp_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s4, v0 @@ -543,7 +543,7 @@ define amdgpu_kernel void @srem32_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX11-NEXT: s_sub_i32 s3, 0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX11-NEXT: v_rcp_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0) ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -605,7 +605,7 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: s_and_b32 s0, s0, 0xffff ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: .LBB4_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_and_b32 s3, 0xffff, s2 @@ -636,7 +636,7 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_and_b32 s2, s2, 0xffff ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX10-NEXT: s_mov_b32 s2, 0 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: .LBB4_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_and_b32 s3, 0xffff, s2 @@ -667,7 +667,7 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX11-NEXT: s_mov_b32 s2, 0 -; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX11-NEXT: v_rcp_f32_e32 v1, v0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB4_1: ; %bb3 ; GFX11-NEXT: ; =>This 
Inner Loop Header: Depth=1 @@ -718,7 +718,7 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: s_and_b32 s2, s0, 0xffff ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: .LBB5_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_and_b32 s4, 0xffff, s3 @@ -751,7 +751,7 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_and_b32 s2, s2, 0xffff ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: .LBB5_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_and_b32 s4, 0xffff, s3 @@ -784,7 +784,7 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: s_and_b32 s2, s2, 0xffff ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s2 -; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX11-NEXT: v_rcp_f32_e32 v1, v0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB5_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -838,7 +838,7 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: s_sext_i32_i16 s2, s0 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: .LBB6_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_sext_i32_i16 s4, s3 @@ -875,7 +875,7 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sext_i32_i16 s2, s2 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX10-NEXT: v_rcp_f32_e32 
v1, v0 ; GFX10-NEXT: .LBB6_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_sext_i32_i16 s4, s3 @@ -912,7 +912,7 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: s_sext_i32_i16 s2, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX11-NEXT: v_rcp_f32_e32 v1, v0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB6_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -969,7 +969,7 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX9-NEXT: s_sext_i32_i16 s2, s0 ; GFX9-NEXT: v_cvt_f32_i32_e32 v0, s2 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX9-NEXT: v_rcp_f32_e32 v1, v0 ; GFX9-NEXT: .LBB7_1: ; %bb3 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_sext_i32_i16 s6, s3 @@ -1008,7 +1008,7 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_sext_i32_i16 s2, s2 ; GFX10-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX10-NEXT: v_rcp_f32_e32 v1, v0 ; GFX10-NEXT: .LBB7_1: ; %bb3 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-NEXT: s_sext_i32_i16 s4, s3 @@ -1047,7 +1047,7 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar ; GFX11-NEXT: s_sext_i32_i16 s2, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f32_i32_e32 v0, s2 -; GFX11-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GFX11-NEXT: v_rcp_f32_e32 v1, v0 ; GFX11-NEXT: .p2align 6 ; GFX11-NEXT: .LBB7_1: ; %bb3 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll index 
fe0892788ca84..33424b7f0d16e 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll @@ -426,7 +426,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX9-NEXT: s_sub_i32 s4, 0, s3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s5, v0 @@ -456,7 +456,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX90A-NEXT: v_mov_b32_e32 v1, 0 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX90A-NEXT: s_sub_i32 s4, 0, s3 -; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX90A-NEXT: v_rcp_f32_e32 v0, v0 ; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX90A-NEXT: v_readfirstlane_b32 s5, v0 @@ -485,7 +485,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX10-NEXT: s_sub_i32 s5, 0, s3 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX10-NEXT: v_rcp_f32_e32 v0, v0 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s4, v0 @@ -516,7 +516,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-FLATSCR-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX9-FLATSCR-NEXT: s_sub_i32 s4, 0, s3 -; GFX9-FLATSCR-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-FLATSCR-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-FLATSCR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX9-FLATSCR-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s5, v0 @@ -546,7 +546,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, 
s3 ; GFX11-NEXT: s_sub_i32 s5, 0, s3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX11-NEXT: v_rcp_f32_e32 v0, v0 ; GFX11-NEXT: s_waitcnt_depctr depctr_va_vdst(0) ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -582,8 +582,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) { ; GFX12-NEXT: s_cvt_f32_u32 s4, s3 ; GFX12-NEXT: s_sub_co_i32 s5, 0, s3 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX12-NEXT: v_rcp_iflag_f32_e32 v0, s4 -; GFX12-NEXT: v_readfirstlane_b32 s4, v0 +; GFX12-NEXT: v_s_rcp_f32 s4, s4 ; GFX12-NEXT: s_mul_f32 s4, s4, 0x4f7ffffe ; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_2) diff --git a/llvm/test/CodeGen/AMDGPU/med3-knownbits.ll b/llvm/test/CodeGen/AMDGPU/med3-knownbits.ll index 20b450d383df4..4adc317a66c8d 100644 --- a/llvm/test/CodeGen/AMDGPU/med3-knownbits.ll +++ b/llvm/test/CodeGen/AMDGPU/med3-knownbits.ll @@ -48,7 +48,7 @@ define i32 @v_known_signbits_smed3(i16 %a, i16 %b) { ; SI-SDAG-NEXT: s_movk_i32 s4, 0xffe0 ; SI-SDAG-NEXT: v_med3_i32 v0, v0, s4, 64 ; SI-SDAG-NEXT: v_cvt_f32_i32_e32 v3, v0 -; SI-SDAG-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; SI-SDAG-NEXT: v_rcp_f32_e32 v4, v2 ; SI-SDAG-NEXT: v_xor_b32_e32 v0, v0, v1 ; SI-SDAG-NEXT: v_ashrrev_i32_e32 v0, 30, v0 ; SI-SDAG-NEXT: v_or_b32_e32 v0, 1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll index c4322e260dece..9abd8110e5529 100644 --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -64,7 +64,7 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, ptr addrspace(3) ; GFX9-NEXT: v_add_u32_e32 v4, v4, v0 ; GFX9-NEXT: v_and_b32_e32 v5, 0xffffff, v6 ; GFX9-NEXT: v_lshl_add_u32 
v6, v4, 2, v3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v7 +; GFX9-NEXT: v_rcp_f32_e32 v7, v7 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 2, v2 ; GFX9-NEXT: v_add_u32_e32 v9, v17, v12 ; GFX9-NEXT: s_mov_b64 s[10:11], 0 diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll index d21cda572f5f4..b54d80a03abd0 100644 --- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll +++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll @@ -1503,12 +1503,12 @@ define hidden void @sdiv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v12, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v14, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v15, v1 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v16, v10 +; GFX10-NEXT: v_rcp_f32_e32 v15, v1 +; GFX10-NEXT: v_rcp_f32_e32 v16, v10 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v19, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v17, v12 +; GFX10-NEXT: v_rcp_f32_e32 v17, v12 ; GFX10-NEXT: v_xor_b32_sdwa v0, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v18, v14 +; GFX10-NEXT: v_rcp_f32_e32 v18, v14 ; GFX10-NEXT: v_xor_b32_sdwa v3, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 ; GFX10-NEXT: v_xor_b32_sdwa v11, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX10-NEXT: v_xor_b32_sdwa v13, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3 @@ -1581,10 +1581,10 @@ define hidden void @sdiv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX9-NEXT: v_cvt_f32_i32_sdwa v13, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 ; GFX9-NEXT: v_xor_b32_sdwa v14, sext(v4), sext(v4) 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v4, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v15, v2 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v16, v12 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v17, v13 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v18, v4 +; GFX9-NEXT: v_rcp_f32_e32 v15, v2 +; GFX9-NEXT: v_rcp_f32_e32 v16, v12 +; GFX9-NEXT: v_rcp_f32_e32 v17, v13 +; GFX9-NEXT: v_rcp_f32_e32 v18, v4 ; GFX9-NEXT: v_mul_f32_e32 v15, v3, v15 ; GFX9-NEXT: v_mul_f32_e32 v16, v11, v16 ; GFX9-NEXT: v_trunc_f32_e32 v15, v15 @@ -1856,11 +1856,11 @@ define hidden void @srem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_i32_sdwa v12, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v15, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v17, v2 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v18, v13 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v19, v3 +; GFX10-NEXT: v_rcp_f32_e32 v17, v2 +; GFX10-NEXT: v_rcp_f32_e32 v18, v13 +; GFX10-NEXT: v_rcp_f32_e32 v19, v3 ; GFX10-NEXT: v_xor_b32_sdwa v1, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v20, v15 +; GFX10-NEXT: v_rcp_f32_e32 v20, v15 ; GFX10-NEXT: v_xor_b32_sdwa v11, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v21, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 ; GFX10-NEXT: v_xor_b32_sdwa v14, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_2 @@ -1938,7 +1938,7 @@ define hidden void @srem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX9-NEXT: v_cvt_f32_i32_sdwa v14, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_i32_sdwa v13, sext(v9) 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v17, v3 +; GFX9-NEXT: v_rcp_f32_e32 v17, v3 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v16, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 ; GFX9-NEXT: v_xor_b32_sdwa v15, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_2 ; GFX9-NEXT: v_perm_b32 v1, v4, v9, s4 @@ -1946,8 +1946,8 @@ define hidden void @srem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX9-NEXT: v_trunc_f32_e32 v17, v17 ; GFX9-NEXT: v_mad_f32 v19, -v17, v3, v10 ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v19|, |v3| -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v14 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v19, v10 +; GFX9-NEXT: v_rcp_f32_e32 v3, v14 +; GFX9-NEXT: v_rcp_f32_e32 v19, v10 ; GFX9-NEXT: v_xor_b32_sdwa v2, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 ; GFX9-NEXT: v_xor_b32_sdwa v12, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 ; GFX9-NEXT: v_mul_f32_e32 v3, v13, v3 @@ -1960,7 +1960,7 @@ define hidden void @srem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX9-NEXT: v_mad_f32 v19, -v15, v10, v16 ; GFX9-NEXT: v_cvt_f32_i32_sdwa v13, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 ; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v19|, |v10| -; GFX9-NEXT: v_rcp_iflag_f32_e32 v10, v16 +; GFX9-NEXT: v_rcp_f32_e32 v10, v16 ; GFX9-NEXT: v_ashrrev_i32_e32 v2, 30, v2 ; GFX9-NEXT: v_xor_b32_sdwa v19, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_3 ; GFX9-NEXT: v_cvt_i32_f32_e32 v17, v17 @@ -2221,11 +2221,11 @@ define hidden void @udiv(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v14, v0 ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v4, v2 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v10, v1 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v11, v3 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v13, v9 +; GFX10-NEXT: 
v_rcp_f32_e32 v10, v1 +; GFX10-NEXT: v_rcp_f32_e32 v11, v3 +; GFX10-NEXT: v_rcp_f32_e32 v13, v9 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v15, v0 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v12, v4 +; GFX10-NEXT: v_rcp_f32_e32 v12, v4 ; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x40207 ; GFX10-NEXT: v_mul_f32_e32 v10, v14, v10 ; GFX10-NEXT: v_mul_f32_e32 v11, v4, v11 @@ -2274,18 +2274,18 @@ define hidden void @udiv(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ; GFX9-NEXT: s_mov_b32 s4, 0x40207 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v11, v2 +; GFX9-NEXT: v_rcp_f32_e32 v11, v2 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v3, v4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v12, v3 +; GFX9-NEXT: v_rcp_f32_e32 v12, v3 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v1, v9 ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v10, v4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v13, v10 +; GFX9-NEXT: v_rcp_f32_e32 v13, v10 ; GFX9-NEXT: v_mul_f32_e32 v11, v1, v11 ; GFX9-NEXT: v_perm_b32 v0, v9, v4, s4 ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v4, v4 ; GFX9-NEXT: v_trunc_f32_e32 v11, v11 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v14, v4 +; GFX9-NEXT: v_rcp_f32_e32 v14, v4 ; GFX9-NEXT: v_mul_f32_e32 v12, v10, v12 ; GFX9-NEXT: v_mad_f32 v1, -v11, v2, v1 ; GFX9-NEXT: v_cvt_u32_f32_e32 v11, v11 @@ -2409,10 +2409,10 @@ define hidden void @urem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v9, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v15, v0 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v10, v1 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v11, v3 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v12, v4 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v13, v9 +; GFX10-NEXT: v_rcp_f32_e32 v10, v1 +; GFX10-NEXT: v_rcp_f32_e32 v11, v3 +; GFX10-NEXT: v_rcp_f32_e32 v12, v4 +; GFX10-NEXT: v_rcp_f32_e32 v13, v9 ; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v16, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v17, 24, v2 @@ -2470,15 
+2470,15 @@ define hidden void @urem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX9-NEXT: s_mov_b32 s4, 0x2050505 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v15, v2 +; GFX9-NEXT: v_rcp_f32_e32 v15, v2 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v3, v4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v16, v3 +; GFX9-NEXT: v_rcp_f32_e32 v16, v3 ; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v11, v4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v17, v11 +; GFX9-NEXT: v_rcp_f32_e32 v17, v11 ; GFX9-NEXT: v_mul_f32_e32 v15, v3, v15 ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v14, v4 ; GFX9-NEXT: v_trunc_f32_e32 v15, v15 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v18, v14 +; GFX9-NEXT: v_rcp_f32_e32 v18, v14 ; GFX9-NEXT: v_mul_f32_e32 v16, v3, v16 ; GFX9-NEXT: v_mad_f32 v19, -v15, v2, v3 ; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v15 diff --git a/llvm/test/CodeGen/AMDGPU/pr155452.ll b/llvm/test/CodeGen/AMDGPU/pr155452.ll index 928997e9fecb8..b51b6ee07aea5 100644 --- a/llvm/test/CodeGen/AMDGPU/pr155452.ll +++ b/llvm/test/CodeGen/AMDGPU/pr155452.ll @@ -19,7 +19,7 @@ define amdgpu_kernel void @my_kernel(i64 %foo, i32 %bar) { ; CHECK-NEXT: s_abs_i32 s7, s0 ; CHECK-NEXT: v_cvt_f32_u32_e32 v2, s7 ; CHECK-NEXT: s_sub_i32 s0, 0, s7 -; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; CHECK-NEXT: v_rcp_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2 ; CHECK-NEXT: v_mul_lo_u32 v3, s0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll index 56bb3ce1742b8..6efd25289deec 100644 --- a/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll +++ b/llvm/test/CodeGen/AMDGPU/pseudo-scalar-transcendental.ll @@ -762,26 +762,17 @@ define amdgpu_cs half @srcmods_neg_f16(half inreg %src) { ret half %result } -; TODO: SelectionDAG should avoid generating v_rcp_iflag_f32. 
define amdgpu_cs float @fdiv_f32_i32(float inreg %a, i32 inreg %b) { -; GFX12-SDAG-LABEL: fdiv_f32_i32: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_cvt_f32_u32 s1, s1 -; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX12-SDAG-NEXT: v_rcp_iflag_f32_e32 v0, s1 -; GFX12-SDAG-NEXT: v_mul_f32_e32 v0, s0, v0 -; GFX12-SDAG-NEXT: ; return to shader part epilog -; -; GFX12-GISEL-LABEL: fdiv_f32_i32: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_cvt_f32_u32 s1, s1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX12-GISEL-NEXT: v_s_rcp_f32 s1, s1 -; GFX12-GISEL-NEXT: s_mul_f32 s0, s0, s1 -; GFX12-GISEL-NEXT: s_wait_alu depctr_sa_sdst(0) -; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX12-GISEL-NEXT: ; return to shader part epilog +; GFX12-LABEL: fdiv_f32_i32: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_cvt_f32_u32 s1, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX12-NEXT: v_s_rcp_f32 s1, s1 +; GFX12-NEXT: s_mul_f32 s0, s0, s1 +; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: ; return to shader part epilog ; ; GCN-GISEL-LABEL: fdiv_f32_i32: ; GCN-GISEL: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/rcp_iflag.ll b/llvm/test/CodeGen/AMDGPU/rcp_iflag.ll index db3c902ec2416..e941186541642 100644 --- a/llvm/test/CodeGen/AMDGPU/rcp_iflag.ll +++ b/llvm/test/CodeGen/AMDGPU/rcp_iflag.ll @@ -1,8 +1,21 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 ; RUN: llc -mtriple=amdgcn < %s | FileCheck --check-prefix=GCN %s -; GCN-LABEL: {{^}}rcp_uint: -; GCN: v_rcp_iflag_f32_e32 define amdgpu_kernel void @rcp_uint(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 { +; GCN-LABEL: rcp_uint: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], 
s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s4, s2 +; GCN-NEXT: s_mov_b32 s5, s3 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_cvt_f32_u32_e32 v0, s0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm %load = load i32, ptr addrspace(1) %in, align 4 %cvt = uitofp i32 %load to float %div = fdiv float 1.000000e+00, %cvt, !fpmath !0 @@ -10,9 +23,21 @@ define amdgpu_kernel void @rcp_uint(ptr addrspace(1) %in, ptr addrspace(1) %out) ret void } -; GCN-LABEL: {{^}}rcp_sint: -; GCN: v_rcp_iflag_f32_e32 define amdgpu_kernel void @rcp_sint(ptr addrspace(1) %in, ptr addrspace(1) %out) #0 { +; GCN-LABEL: rcp_sint: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s4, s2 +; GCN-NEXT: s_mov_b32 s5, s3 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm %load = load i32, ptr addrspace(1) %in, align 4 %cvt = sitofp i32 %load to float %div = fdiv float 1.000000e+00, %cvt, !fpmath !0 @@ -20,9 +45,31 @@ define amdgpu_kernel void @rcp_sint(ptr addrspace(1) %in, ptr addrspace(1) %out) ret void } -; GCN-LABEL: {{^}}rcp_uint_denorm: -; GCN-NOT: v_rcp_iflag_f32 define amdgpu_kernel void @rcp_uint_denorm(ptr addrspace(1) %in, ptr addrspace(1) %out) #1 { +; GCN-LABEL: rcp_uint_denorm: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s4, s2 +; GCN-NEXT: s_mov_b32 s5, s3 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: 
v_cvt_f32_u32_e32 v0, s0 +; GCN-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, 1.0 +; GCN-NEXT: v_rcp_f32_e32 v2, v1 +; GCN-NEXT: v_fma_f32 v3, -v1, v2, 1.0 +; GCN-NEXT: v_fma_f32 v2, v3, v2, v2 +; GCN-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GCN-NEXT: v_mul_f32_e32 v4, v3, v2 +; GCN-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GCN-NEXT: v_fma_f32 v4, v5, v2, v4 +; GCN-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GCN-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GCN-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm %load = load i32, ptr addrspace(1) %in, align 4 %cvt = uitofp i32 %load to float %div = fdiv float 1.000000e+00, %cvt @@ -30,9 +77,31 @@ define amdgpu_kernel void @rcp_uint_denorm(ptr addrspace(1) %in, ptr addrspace(1 ret void } -; GCN-LABEL: {{^}}rcp_sint_denorm: -; GCN-NOT: v_rcp_iflag_f32 define amdgpu_kernel void @rcp_sint_denorm(ptr addrspace(1) %in, ptr addrspace(1) %out) #1 { +; GCN-LABEL: rcp_sint_denorm: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s4, s2 +; GCN-NEXT: s_mov_b32 s5, s3 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_cvt_f32_i32_e32 v0, s0 +; GCN-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, 1.0 +; GCN-NEXT: v_rcp_f32_e32 v2, v1 +; GCN-NEXT: v_fma_f32 v3, -v1, v2, 1.0 +; GCN-NEXT: v_fma_f32 v2, v3, v2, v2 +; GCN-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0 +; GCN-NEXT: v_mul_f32_e32 v4, v3, v2 +; GCN-NEXT: v_fma_f32 v5, -v1, v4, v3 +; GCN-NEXT: v_fma_f32 v4, v5, v2, v4 +; GCN-NEXT: v_fma_f32 v1, -v1, v4, v3 +; GCN-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GCN-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm %load = load i32, ptr addrspace(1) %in, align 4 %cvt = sitofp i32 %load to float %div = fdiv float 1.000000e+00, %cvt diff --git 
a/llvm/test/CodeGen/AMDGPU/sdiv.ll b/llvm/test/CodeGen/AMDGPU/sdiv.ll index 441509ba01f64..b690879dab99b 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv.ll @@ -33,7 +33,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: v_cvt_f32_u32_e32 v3, v2 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v2 ; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GCN-NEXT: v_rcp_f32_e32 v3, v3 ; GCN-NEXT: v_max_i32_e32 v5, v0, v5 ; GCN-NEXT: v_xor_b32_e32 v0, v0, v1 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 31, v0 @@ -77,7 +77,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; TONGA-NEXT: v_cvt_f32_u32_e32 v3, v2 ; TONGA-NEXT: v_sub_u32_e32 v4, vcc, 0, v2 ; TONGA-NEXT: v_sub_u32_e32 v5, vcc, 0, v0 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; TONGA-NEXT: v_rcp_f32_e32 v3, v3 ; TONGA-NEXT: v_max_i32_e32 v5, v0, v5 ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v1 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 31, v0 @@ -122,7 +122,7 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GFX9-NEXT: v_readfirstlane_b32 s6, v0 ; GFX9-NEXT: s_sub_i32 s7, 0, s5 ; GFX9-NEXT: s_xor_b32 s4, s6, s4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX9-NEXT: v_rcp_f32_e32 v1, v1 ; GFX9-NEXT: s_abs_i32 s6, s6 ; GFX9-NEXT: s_ashr_i32 s4, s4, 31 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v1 @@ -412,9 +412,9 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v6, v2 ; GCN-NEXT: v_cvt_f32_u32_e32 v9, v3 ; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; GCN-NEXT: v_rcp_f32_e32 v6, v6 ; GCN-NEXT: v_max_i32_e32 v0, v0, v5 -; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v9 +; GCN-NEXT: v_rcp_f32_e32 v5, v9 ; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v2 ; GCN-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; GCN-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5 @@ -483,9 +483,9 @@ define amdgpu_kernel void 
@sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_cvt_f32_u32_e32 v6, v2 ; TONGA-NEXT: v_cvt_f32_u32_e32 v9, v3 ; TONGA-NEXT: v_sub_u32_e32 v5, vcc, 0, v0 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v6, v6 +; TONGA-NEXT: v_rcp_f32_e32 v6, v6 ; TONGA-NEXT: v_max_i32_e32 v0, v0, v5 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v5, v9 +; TONGA-NEXT: v_rcp_f32_e32 v5, v9 ; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v2 ; TONGA-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6 ; TONGA-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5 @@ -549,7 +549,7 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_readfirstlane_b32 s5, v0 ; GFX9-NEXT: s_xor_b32 s0, s5, s0 ; GFX9-NEXT: s_ashr_i32 s6, s0, 31 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX9-NEXT: v_rcp_f32_e32 v2, v2 ; GFX9-NEXT: s_sub_i32 s0, 0, s1 ; GFX9-NEXT: s_abs_i32 s5, s5 ; GFX9-NEXT: v_readfirstlane_b32 s4, v3 @@ -575,7 +575,7 @@ define amdgpu_kernel void @sdiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_xor_b32 s5, s5, s6 ; GFX9-NEXT: s_mov_b32 s1, s9 ; GFX9-NEXT: s_sub_i32 s9, 0, s7 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_sub_i32 s5, s5, s6 ; GFX9-NEXT: s_mov_b32 s0, s8 ; GFX9-NEXT: v_readfirstlane_b32 s8, v1 @@ -812,13 +812,13 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v10, v4 ; GCN-NEXT: v_sub_i32_e32 v13, vcc, 0, v5 ; GCN-NEXT: v_xor_b32_e32 v11, v1, v5 -; GCN-NEXT: v_rcp_iflag_f32_e32 v10, v10 +; GCN-NEXT: v_rcp_f32_e32 v10, v10 ; GCN-NEXT: v_max_i32_e32 v5, v5, v13 ; GCN-NEXT: v_cvt_f32_u32_e32 v13, v5 ; GCN-NEXT: v_sub_i32_e32 v16, vcc, 0, v4 ; GCN-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 ; GCN-NEXT: v_cvt_u32_f32_e32 v10, v10 -; GCN-NEXT: v_rcp_iflag_f32_e32 v13, v13 +; GCN-NEXT: v_rcp_f32_e32 v13, v13 ; GCN-NEXT: v_sub_i32_e32 v12, vcc, 0, v1 ; GCN-NEXT: v_mul_lo_u32 v16, v16, v10 ; GCN-NEXT: v_mul_f32_e32 v13, 0x4f7ffffe, v13 @@ -840,7 
+840,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_add_i32_e32 v12, vcc, v13, v12 ; GCN-NEXT: v_mul_lo_u32 v13, v10, v4 ; GCN-NEXT: v_mul_hi_u32 v12, v1, v12 -; GCN-NEXT: v_rcp_iflag_f32_e32 v9, v15 +; GCN-NEXT: v_rcp_f32_e32 v9, v15 ; GCN-NEXT: v_ashrrev_i32_e32 v14, 31, v14 ; GCN-NEXT: v_sub_i32_e32 v0, vcc, v0, v13 ; GCN-NEXT: v_add_i32_e32 v13, vcc, 1, v10 @@ -866,7 +866,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v5 ; GCN-NEXT: v_mul_hi_u32 v4, v9, v4 ; GCN-NEXT: v_add_i32_e32 v13, vcc, 1, v10 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v9, v4 ; GCN-NEXT: v_sub_i32_e32 v9, vcc, 0, v2 ; GCN-NEXT: v_max_i32_e32 v2, v2, v9 @@ -939,13 +939,13 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_cvt_f32_u32_e32 v10, v4 ; TONGA-NEXT: v_sub_u32_e32 v13, vcc, 0, v5 ; TONGA-NEXT: v_xor_b32_e32 v11, v1, v5 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v10, v10 +; TONGA-NEXT: v_rcp_f32_e32 v10, v10 ; TONGA-NEXT: v_max_i32_e32 v5, v5, v13 ; TONGA-NEXT: v_cvt_f32_u32_e32 v13, v5 ; TONGA-NEXT: v_sub_u32_e32 v16, vcc, 0, v4 ; TONGA-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 ; TONGA-NEXT: v_cvt_u32_f32_e32 v10, v10 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v13, v13 +; TONGA-NEXT: v_rcp_f32_e32 v13, v13 ; TONGA-NEXT: v_sub_u32_e32 v12, vcc, 0, v1 ; TONGA-NEXT: v_mul_lo_u32 v16, v16, v10 ; TONGA-NEXT: v_mul_f32_e32 v13, 0x4f7ffffe, v13 @@ -967,7 +967,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_add_u32_e32 v12, vcc, v13, v12 ; TONGA-NEXT: v_mul_lo_u32 v13, v10, v4 ; TONGA-NEXT: v_mul_hi_u32 v12, v1, v12 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v9, v15 +; TONGA-NEXT: v_rcp_f32_e32 v9, v15 ; TONGA-NEXT: v_ashrrev_i32_e32 v14, 31, v14 ; TONGA-NEXT: v_sub_u32_e32 v0, vcc, v0, v13 ; TONGA-NEXT: v_add_u32_e32 v13, 
vcc, 1, v10 @@ -993,7 +993,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_cvt_f32_u32_e32 v0, v5 ; TONGA-NEXT: v_mul_hi_u32 v4, v9, v4 ; TONGA-NEXT: v_add_u32_e32 v13, vcc, 1, v10 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; TONGA-NEXT: v_rcp_f32_e32 v0, v0 ; TONGA-NEXT: v_add_u32_e32 v4, vcc, v9, v4 ; TONGA-NEXT: v_sub_u32_e32 v9, vcc, 0, v2 ; TONGA-NEXT: v_max_i32_e32 v2, v2, v9 @@ -1063,7 +1063,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_readfirstlane_b32 s5, v4 ; GFX9-NEXT: s_xor_b32 s0, s5, s0 ; GFX9-NEXT: s_ashr_i32 s6, s0, 31 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_sub_i32 s0, 0, s1 ; GFX9-NEXT: s_abs_i32 s5, s5 ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 @@ -1089,7 +1089,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_xor_b32 s1, s1, s6 ; GFX9-NEXT: s_sub_i32 s10, 0, s5 ; GFX9-NEXT: s_sub_i32 s6, s1, s6 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: s_mov_b32 s0, s8 ; GFX9-NEXT: v_readfirstlane_b32 s8, v5 ; GFX9-NEXT: s_xor_b32 s4, s8, s4 @@ -1118,7 +1118,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_xor_b32 s5, s5, s4 ; GFX9-NEXT: s_sub_i32 s11, 0, s8 ; GFX9-NEXT: s_sub_i32 s4, s5, s4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX9-NEXT: v_rcp_f32_e32 v0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s10, v6 ; GFX9-NEXT: s_xor_b32 s7, s10, s7 ; GFX9-NEXT: s_abs_i32 s10, s10 @@ -1147,7 +1147,7 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_readfirstlane_b32 s10, v7 ; GFX9-NEXT: s_xor_b32 s5, s5, s7 ; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX9-NEXT: v_rcp_f32_e32 v2, v2 ; GFX9-NEXT: s_xor_b32 s4, s10, s9 ; GFX9-NEXT: s_sub_i32 s9, 0, s8 ; GFX9-NEXT: s_sub_i32 s5, s5, s7 @@ 
-1467,7 +1467,7 @@ define amdgpu_kernel void @v_sdiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; GCN-NEXT: v_cvt_f32_i32_e32 v3, v1 ; GCN-NEXT: v_xor_b32_e32 v0, v1, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 30, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GCN-NEXT: v_rcp_f32_e32 v4, v2 ; GCN-NEXT: v_or_b32_e32 v0, 1, v0 ; GCN-NEXT: v_mul_f32_e32 v1, v3, v4 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 @@ -1500,7 +1500,7 @@ define amdgpu_kernel void @v_sdiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; TONGA-NEXT: v_cvt_f32_i32_e32 v3, v1 ; TONGA-NEXT: v_xor_b32_e32 v0, v1, v0 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 30, v0 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; TONGA-NEXT: v_rcp_f32_e32 v4, v2 ; TONGA-NEXT: v_or_b32_e32 v0, 1, v0 ; TONGA-NEXT: v_mul_f32_e32 v1, v3, v4 ; TONGA-NEXT: v_trunc_f32_e32 v1, v1 @@ -1533,7 +1533,7 @@ define amdgpu_kernel void @v_sdiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v1 ; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GFX9-NEXT: v_rcp_f32_e32 v4, v2 ; GFX9-NEXT: v_or_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_mul_f32_e32 v1, v3, v4 ; GFX9-NEXT: v_trunc_f32_e32 v1, v1 @@ -1620,7 +1620,7 @@ define amdgpu_kernel void @v_sdiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_or_b32_e32 v0, v3, v0 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 23 ; GCN-NEXT: v_cvt_f32_i32_e32 v3, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GCN-NEXT: v_rcp_f32_e32 v4, v2 ; GCN-NEXT: v_xor_b32_e32 v0, v0, v1 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 30, v0 ; GCN-NEXT: v_or_b32_e32 v0, 1, v0 @@ -1663,7 +1663,7 @@ define amdgpu_kernel void @v_sdiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_or_b32_e32 v0, v3, v0 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 23 ; TONGA-NEXT: v_cvt_f32_i32_e32 v3, v0 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; TONGA-NEXT: v_rcp_f32_e32 v4, v2 ; TONGA-NEXT: v_xor_b32_e32 v0, v0, v1 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 30, v0 ; 
TONGA-NEXT: v_or_b32_e32 v0, 1, v0 @@ -1706,7 +1706,7 @@ define amdgpu_kernel void @v_sdiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_or_b32_e32 v0, v3, v0 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 23 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GFX9-NEXT: v_rcp_f32_e32 v4, v2 ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v0 ; GFX9-NEXT: v_or_b32_e32 v0, 1, v0 @@ -1807,7 +1807,7 @@ define amdgpu_kernel void @v_sdiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_or_b32_e32 v3, v3, v4 ; GCN-NEXT: v_cvt_f32_i32_e32 v3, v3 -; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v1 +; GCN-NEXT: v_rcp_f32_e32 v4, v1 ; GCN-NEXT: v_xor_b32_e32 v0, v2, v0 ; GCN-NEXT: v_ashrrev_i32_e32 v0, 30, v0 ; GCN-NEXT: v_or_b32_e32 v0, 1, v0 @@ -1848,7 +1848,7 @@ define amdgpu_kernel void @v_sdiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: s_waitcnt vmcnt(0) ; TONGA-NEXT: v_or_b32_e32 v3, v3, v4 ; TONGA-NEXT: v_cvt_f32_i32_e32 v3, v3 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v4, v1 +; TONGA-NEXT: v_rcp_f32_e32 v4, v1 ; TONGA-NEXT: v_xor_b32_e32 v0, v2, v0 ; TONGA-NEXT: v_ashrrev_i32_e32 v0, 30, v0 ; TONGA-NEXT: v_or_b32_e32 v0, 1, v0 @@ -1889,7 +1889,7 @@ define amdgpu_kernel void @v_sdiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_or_b32_e32 v3, v3, v4 ; GFX9-NEXT: v_cvt_f32_i32_e32 v3, v3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v4, v1 +; GFX9-NEXT: v_rcp_f32_e32 v4, v1 ; GFX9-NEXT: v_xor_b32_e32 v0, v2, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v0, 30, v0 ; GFX9-NEXT: v_or_b32_e32 v0, 1, v0 @@ -1980,7 +1980,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v3, v2 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v2 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 25 -; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GCN-NEXT: v_rcp_f32_e32 v3, v3 ; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0, v0 ; GCN-NEXT: 
v_max_i32_e32 v5, v0, v5 ; GCN-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 @@ -2027,7 +2027,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_cvt_f32_u32_e32 v3, v2 ; TONGA-NEXT: v_sub_u32_e32 v4, vcc, 0, v2 ; TONGA-NEXT: v_bfe_i32 v0, v0, 0, 25 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; TONGA-NEXT: v_rcp_f32_e32 v3, v3 ; TONGA-NEXT: v_sub_u32_e32 v5, vcc, 0, v0 ; TONGA-NEXT: v_max_i32_e32 v5, v0, v5 ; TONGA-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3 @@ -2074,7 +2074,7 @@ define amdgpu_kernel void @v_sdiv_i25(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX9-NEXT: v_readfirstlane_b32 s6, v0 ; GFX9-NEXT: s_mov_b32 s0, s8 ; GFX9-NEXT: s_sub_i32 s7, 0, s5 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX9-NEXT: v_rcp_f32_e32 v1, v1 ; GFX9-NEXT: s_bfe_i32 s6, s6, 0x190000 ; GFX9-NEXT: s_xor_b32 s4, s6, s4 ; GFX9-NEXT: s_abs_i32 s6, s6 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index 50f6acf3f85a2..68466abf31aa0 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -492,7 +492,7 @@ define amdgpu_kernel void @s_test_sdiv24_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_ashr_i32 s1, s3, 8 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, s1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-NEXT: s_xor_b32 s0, s1, s0 ; GCN-NEXT: s_ashr_i32 s0, s0, 30 ; GCN-NEXT: s_or_b32 s2, s0, 1 @@ -523,7 +523,7 @@ define amdgpu_kernel void @s_test_sdiv24_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: s_ashr_i32 s1, s3, 8 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-IR-NEXT: s_xor_b32 s0, s1, s0 ; GCN-IR-NEXT: s_ashr_i32 s0, s0, 30 ; GCN-IR-NEXT: s_or_b32 s2, s0, 1 @@ -554,7 +554,7 @@ define i64 @v_test_sdiv24_64(i64 %x, i64 %y) { ; GCN-NEXT: v_cvt_f32_u32_e32 v2, v0 ; GCN-NEXT: v_sub_i32_e32 v3, 
vcc, 0, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GCN-NEXT: v_rcp_f32_e32 v2, v2 ; GCN-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GCN-NEXT: v_mul_lo_u32 v3, v3, v2 @@ -581,7 +581,7 @@ define i64 @v_test_sdiv24_64(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v2, v0 ; GCN-IR-NEXT: v_sub_i32_e32 v3, vcc, 0, v0 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v1, 8, v1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GCN-IR-NEXT: v_rcp_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mul_lo_u32 v3, v3, v2 @@ -619,7 +619,7 @@ define amdgpu_kernel void @s_test_sdiv32_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_sub_i32 s2, 0, s9 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_abs_i32 s0, s3 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_xor_b32 s1, s3, s8 ; GCN-NEXT: s_ashr_i32 s1, s1, 31 @@ -660,7 +660,7 @@ define amdgpu_kernel void @s_test_sdiv32_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_sub_i32 s2, 0, s9 ; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: s_abs_i32 s0, s3 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: s_xor_b32 s1, s3, s8 ; GCN-IR-NEXT: s_ashr_i32 s1, s1, 31 @@ -708,7 +708,7 @@ define amdgpu_kernel void @s_test_sdiv31_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s2, 0, s9 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_ashr_i32 s0, s3, 1 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -752,7 +752,7 @@ define amdgpu_kernel void @s_test_sdiv31_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sub_i32 s2, 
0, s9 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: s_ashr_i32 s0, s3, 1 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -805,7 +805,7 @@ define amdgpu_kernel void @s_test_sdiv23_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_ashr_i32 s1, s3, 9 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, s1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-NEXT: s_xor_b32 s0, s1, s0 ; GCN-NEXT: s_ashr_i32 s0, s0, 30 ; GCN-NEXT: s_or_b32 s2, s0, 1 @@ -836,7 +836,7 @@ define amdgpu_kernel void @s_test_sdiv23_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: s_ashr_i32 s1, s3, 9 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-IR-NEXT: s_xor_b32 s0, s1, s0 ; GCN-IR-NEXT: s_ashr_i32 s0, s0, 30 ; GCN-IR-NEXT: s_or_b32 s2, s0, 1 @@ -872,7 +872,7 @@ define amdgpu_kernel void @s_test_sdiv25_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s2, 0, s9 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_ashr_i32 s0, s3, 7 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -916,7 +916,7 @@ define amdgpu_kernel void @s_test_sdiv25_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sub_i32 s2, 0, s9 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: s_ashr_i32 s0, s3, 7 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -967,7 +967,7 @@ define amdgpu_kernel void @s_test_sdiv24_v2i64(ptr addrspace(1) %out, <2 x i64> ; GCN-NEXT: s_ashr_i32 s5, s9, 8 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, s5 ; GCN-NEXT: s_xor_b32 s4, s5, s4 -; GCN-NEXT: 
v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-NEXT: s_ashr_i32 s4, s4, 30 ; GCN-NEXT: s_ashr_i32 s6, s11, 8 ; GCN-NEXT: s_ashr_i32 s7, s15, 8 @@ -984,7 +984,7 @@ define amdgpu_kernel void @s_test_sdiv24_v2i64(ptr addrspace(1) %out, <2 x i64> ; GCN-NEXT: v_cvt_f32_i32_e32 v3, s6 ; GCN-NEXT: s_xor_b32 s4, s6, s7 ; GCN-NEXT: s_ashr_i32 s4, s4, 30 -; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GCN-NEXT: v_rcp_f32_e32 v4, v2 ; GCN-NEXT: s_or_b32 s6, s4, 1 ; GCN-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -1013,7 +1013,7 @@ define amdgpu_kernel void @s_test_sdiv24_v2i64(ptr addrspace(1) %out, <2 x i64> ; GCN-IR-NEXT: s_ashr_i32 s5, s9, 8 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s5 ; GCN-IR-NEXT: s_xor_b32 s4, s5, s4 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-IR-NEXT: s_ashr_i32 s4, s4, 30 ; GCN-IR-NEXT: s_ashr_i32 s6, s11, 8 ; GCN-IR-NEXT: s_ashr_i32 s7, s15, 8 @@ -1030,7 +1030,7 @@ define amdgpu_kernel void @s_test_sdiv24_v2i64(ptr addrspace(1) %out, <2 x i64> ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v3, s6 ; GCN-IR-NEXT: s_xor_b32 s4, s6, s7 ; GCN-IR-NEXT: s_ashr_i32 s4, s4, 30 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GCN-IR-NEXT: v_rcp_f32_e32 v4, v2 ; GCN-IR-NEXT: s_or_b32 s6, s4, 1 ; GCN-IR-NEXT: v_bfe_i32 v0, v0, 0, 24 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v0 @@ -1069,7 +1069,7 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 % ; GCN-NEXT: s_sext_i32_i16 s3, s3 ; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], 24 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, s2 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-NEXT: s_xor_b32 s0, s2, s0 ; GCN-NEXT: s_ashr_i32 s0, s0, 30 ; GCN-NEXT: s_or_b32 s2, s0, 1 @@ -1102,7 +1102,7 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 % ; GCN-IR-NEXT: s_sext_i32_i16 s3, s3 ; GCN-IR-NEXT: s_lshr_b64 s[2:3], s[2:3], 24 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s2 -; GCN-IR-NEXT: 
v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-IR-NEXT: s_xor_b32 s0, s2, s0 ; GCN-IR-NEXT: s_ashr_i32 s0, s0, 30 ; GCN-IR-NEXT: s_or_b32 s2, s0, 1 @@ -1834,7 +1834,7 @@ define amdgpu_kernel void @s_test_sdiv24_k_num_i64(ptr addrspace(1) %out, i64 %x ; GCN-NEXT: s_mov_b32 s2, 0x41c00000 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_ashr_i32 s0, s3, 31 -; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-NEXT: v_rcp_f32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_or_b32 s3, s0, 1 ; GCN-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 @@ -1861,7 +1861,7 @@ define amdgpu_kernel void @s_test_sdiv24_k_num_i64(ptr addrspace(1) %out, i64 %x ; GCN-IR-NEXT: s_mov_b32 s2, 0x41c00000 ; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: s_ashr_i32 s0, s3, 31 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v1, v0 ; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: s_or_b32 s3, s0, 1 ; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 @@ -1949,7 +1949,7 @@ define i64 @v_test_sdiv24_k_num_i64(i64 %x) { ; GCN-NEXT: s_mov_b32 s4, 0x41c00000 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v1 ; GCN-NEXT: v_or_b32_e32 v1, 1, v1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-NEXT: v_mul_f32_e32 v2, 0x41c00000, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v3, -v2, v0, s4 @@ -1969,7 +1969,7 @@ define i64 @v_test_sdiv24_k_num_i64(i64 %x) { ; GCN-IR-NEXT: s_mov_b32 s4, 0x41c00000 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v1 ; GCN-IR-NEXT: v_or_b32_e32 v1, 1, v1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v2, 0x41c00000, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v3, -v2, v0, s4 @@ -1994,7 +1994,7 @@ define i64 @v_test_sdiv24_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: s_mov_b32 s4, 0x47000000 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v1 ; GCN-NEXT: v_or_b32_e32 v1, 1, v1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: v_rcp_f32_e32 v2, v0 ; 
GCN-NEXT: v_mul_f32_e32 v2, 0x47000000, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_mad_f32 v3, -v2, v0, s4 @@ -2014,7 +2014,7 @@ define i64 @v_test_sdiv24_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: s_mov_b32 s4, 0x47000000 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v1 ; GCN-IR-NEXT: v_or_b32_e32 v1, 1, v1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v2, 0x47000000, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mad_f32 v3, -v2, v0, s4 diff --git a/llvm/test/CodeGen/AMDGPU/sdivrem24.ll b/llvm/test/CodeGen/AMDGPU/sdivrem24.ll index cee4dac6afc27..abfb56a58221f 100644 --- a/llvm/test/CodeGen/AMDGPU/sdivrem24.ll +++ b/llvm/test/CodeGen/AMDGPU/sdivrem24.ll @@ -5,7 +5,7 @@ ; FUNC-LABEL: {{^}}sdiv24_i8: ; SI: v_cvt_f32_i32 ; SI: v_cvt_f32_i32 -; SI: v_rcp_iflag_f32 +; SI: v_rcp_f32 ; SI: v_cvt_i32_f32 ; EG: INT_TO_FLT @@ -24,7 +24,7 @@ define amdgpu_kernel void @sdiv24_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; FUNC-LABEL: {{^}}sdiv24_i16: ; SI: v_cvt_f32_i32 ; SI: v_cvt_f32_i32 -; SI: v_rcp_iflag_f32 +; SI: v_rcp_f32 ; SI: v_cvt_i32_f32 ; EG: INT_TO_FLT @@ -43,7 +43,7 @@ define amdgpu_kernel void @sdiv24_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; FUNC-LABEL: {{^}}sdiv24_i32: ; SI: v_cvt_f32_i32 ; SI: v_cvt_f32_i32 -; SI: v_rcp_iflag_f32 +; SI: v_rcp_f32 ; SI: v_cvt_i32_f32 ; EG: INT_TO_FLT @@ -65,7 +65,6 @@ define amdgpu_kernel void @sdiv24_i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; FUNC-LABEL: {{^}}sdiv25_i32: ; SI-NOT: v_cvt_f32_i32 -; SI-NOT: v_rcp_f32 ; EG-NOT: INT_TO_FLT ; EG-NOT: RECIP_IEEE @@ -84,7 +83,6 @@ define amdgpu_kernel void @sdiv25_i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; FUNC-LABEL: {{^}}test_no_sdiv24_i32_1: ; SI-NOT: v_cvt_f32_i32 -; SI-NOT: v_rcp_f32 ; EG-NOT: INT_TO_FLT ; EG-NOT: RECIP_IEEE @@ -103,7 +101,6 @@ define amdgpu_kernel void @test_no_sdiv24_i32_1(ptr addrspace(1) %out, ptr addrs ; FUNC-LABEL: {{^}}test_no_sdiv24_i32_2: ; SI-NOT: 
v_cvt_f32_i32 -; SI-NOT: v_rcp_f32 ; EG-NOT: INT_TO_FLT ; EG-NOT: RECIP_IEEE @@ -123,7 +120,7 @@ define amdgpu_kernel void @test_no_sdiv24_i32_2(ptr addrspace(1) %out, ptr addrs ; FUNC-LABEL: {{^}}srem24_i8: ; SI: v_cvt_f32_i32 ; SI: v_cvt_f32_i32 -; SI: v_rcp_iflag_f32 +; SI: v_rcp_f32 ; SI: v_cvt_i32_f32 ; EG: INT_TO_FLT @@ -142,7 +139,7 @@ define amdgpu_kernel void @srem24_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; FUNC-LABEL: {{^}}srem24_i16: ; SI: v_cvt_f32_i32 ; SI: v_cvt_f32_i32 -; SI: v_rcp_iflag_f32 +; SI: v_rcp_f32 ; SI: v_cvt_i32_f32 ; EG: INT_TO_FLT @@ -161,7 +158,7 @@ define amdgpu_kernel void @srem24_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; FUNC-LABEL: {{^}}srem24_i32: ; SI: v_cvt_f32_i32 ; SI: v_cvt_f32_i32 -; SI: v_rcp_iflag_f32 +; SI: v_rcp_f32 ; SI: v_cvt_i32_f32 ; EG: INT_TO_FLT @@ -183,7 +180,6 @@ define amdgpu_kernel void @srem24_i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; FUNC-LABEL: {{^}}no_srem25_i32: ; SI-NOT: v_cvt_f32_i32 -; SI-NOT: v_rcp_f32 ; EG-NOT: INT_TO_FLT ; EG-NOT: RECIP_IEEE @@ -202,7 +198,6 @@ define amdgpu_kernel void @no_srem25_i32(ptr addrspace(1) %out, ptr addrspace(1) ; FUNC-LABEL: {{^}}no_sdiv25_i24_i25_i32: ; SI-NOT: v_cvt_f32_i32 -; SI-NOT: v_rcp_f32 ; EG-NOT: INT_TO_FLT ; EG-NOT: RECIP_IEEE @@ -221,7 +216,6 @@ define amdgpu_kernel void @no_sdiv25_i24_i25_i32(ptr addrspace(1) %out, ptr addr ; FUNC-LABEL: {{^}}no_sdiv25_i25_i24_i32: ; SI-NOT: v_cvt_f32_i32 -; SI-NOT: v_rcp_f32 ; EG-NOT: INT_TO_FLT ; EG-NOT: RECIP_IEEE @@ -240,7 +234,6 @@ define amdgpu_kernel void @no_sdiv25_i25_i24_i32(ptr addrspace(1) %out, ptr addr ; FUNC-LABEL: {{^}}no_srem25_i24_i25_i32: ; SI-NOT: v_cvt_f32_i32 -; SI-NOT: v_rcp_f32 ; EG-NOT: INT_TO_FLT ; EG-NOT: RECIP_IEEE @@ -259,7 +252,6 @@ define amdgpu_kernel void @no_srem25_i24_i25_i32(ptr addrspace(1) %out, ptr addr ; FUNC-LABEL: {{^}}no_srem25_i25_i24_i32: ; SI-NOT: v_cvt_f32_i32 -; SI-NOT: v_rcp_f32 ; EG-NOT: INT_TO_FLT ; EG-NOT: RECIP_IEEE @@ -278,7 +270,7 @@ define 
amdgpu_kernel void @no_srem25_i25_i24_i32(ptr addrspace(1) %out, ptr addr ; FUNC-LABEL: {{^}}srem25_i24_i11_i32: ; SI: v_cvt_f32_i32 -; SI: v_rcp_iflag_f32 +; SI: v_rcp_f32 ; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 24 ; EG: INT_TO_FLT @@ -298,7 +290,7 @@ define amdgpu_kernel void @srem25_i24_i11_i32(ptr addrspace(1) %out, ptr addrspa ; FUNC-LABEL: {{^}}srem25_i11_i24_i32: ; SI: v_cvt_f32_i32 -; SI: v_rcp_iflag_f32 +; SI: v_rcp_f32 ; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 24 ; EG: INT_TO_FLT @@ -318,7 +310,7 @@ define amdgpu_kernel void @srem25_i11_i24_i32(ptr addrspace(1) %out, ptr addrspa ; FUNC-LABEL: {{^}}srem25_i17_i12_i32: ; SI: v_cvt_f32_i32 -; SI: v_rcp_iflag_f32 +; SI: v_rcp_f32 ; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 17 ; EG: INT_TO_FLT diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll index 90345993de473..c870d651eb1aa 100644 --- a/llvm/test/CodeGen/AMDGPU/srem.ll +++ b/llvm/test/CodeGen/AMDGPU/srem.ll @@ -124,7 +124,7 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: v_readfirstlane_b32 s3, v0 ; GCN-NEXT: s_sub_i32 s5, 0, s2 ; GCN-NEXT: s_ashr_i32 s4, s3, 31 -; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GCN-NEXT: v_rcp_f32_e32 v1, v1 ; GCN-NEXT: s_abs_i32 s3, s3 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -165,7 +165,7 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; TAHITI-NEXT: s_abs_i32 s2, s2 ; TAHITI-NEXT: v_cvt_f32_u32_e32 v1, s2 ; TAHITI-NEXT: s_sub_i32 s3, 0, s2 -; TAHITI-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; TAHITI-NEXT: v_rcp_f32_e32 v1, v1 ; TAHITI-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; TAHITI-NEXT: v_cvt_u32_f32_e32 v1, v1 ; TAHITI-NEXT: v_mul_lo_u32 v2, s3, v1 @@ -202,7 +202,7 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; TONGA-NEXT: s_abs_i32 s2, s2 ; TONGA-NEXT: v_cvt_f32_u32_e32 v1, s2 ; TONGA-NEXT: s_sub_i32 s3, 0, s2 -; TONGA-NEXT: 
v_rcp_iflag_f32_e32 v1, v1 +; TONGA-NEXT: v_rcp_f32_e32 v1, v1 ; TONGA-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; TONGA-NEXT: v_cvt_u32_f32_e32 v1, v1 ; TONGA-NEXT: v_mul_lo_u32 v2, s3, v1 @@ -470,7 +470,7 @@ define amdgpu_kernel void @srem_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_readfirstlane_b32 s4, v0 ; GCN-NEXT: s_sub_i32 s6, 0, s2 ; GCN-NEXT: s_ashr_i32 s5, s4, 31 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GCN-NEXT: v_rcp_f32_e32 v2, v2 ; GCN-NEXT: s_abs_i32 s4, s4 ; GCN-NEXT: v_readfirstlane_b32 s3, v3 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v2 @@ -493,7 +493,7 @@ define amdgpu_kernel void @srem_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_xor_b32 s2, s2, s5 ; GCN-NEXT: s_sub_i32 s7, 0, s3 ; GCN-NEXT: s_sub_i32 s2, s2, s5 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_readfirstlane_b32 s4, v1 ; GCN-NEXT: s_ashr_i32 s6, s4, 31 ; GCN-NEXT: s_abs_i32 s4, s4 @@ -536,7 +536,7 @@ define amdgpu_kernel void @srem_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; TAHITI-NEXT: v_cvt_f32_u32_e32 v2, s0 ; TAHITI-NEXT: s_sub_i32 s1, 0, s0 ; TAHITI-NEXT: v_readfirstlane_b32 s7, v3 -; TAHITI-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; TAHITI-NEXT: v_rcp_f32_e32 v2, v2 ; TAHITI-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; TAHITI-NEXT: v_cvt_u32_f32_e32 v2, v2 ; TAHITI-NEXT: v_mul_lo_u32 v4, s1, v2 @@ -560,7 +560,7 @@ define amdgpu_kernel void @srem_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; TAHITI-NEXT: s_sub_i32 s0, 0, s7 ; TAHITI-NEXT: s_mov_b32 s1, s5 ; TAHITI-NEXT: s_xor_b32 s6, s6, s8 -; TAHITI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; TAHITI-NEXT: v_rcp_f32_e32 v0, v0 ; TAHITI-NEXT: s_sub_i32 s6, s6, s8 ; TAHITI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; TAHITI-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -602,7 +602,7 @@ define amdgpu_kernel void @srem_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: s_sub_i32 s3, 0, s2 ; TONGA-NEXT: v_readfirstlane_b32 s5, v3 ; TONGA-NEXT: v_mov_b32_e32 v3, s1 
-; TONGA-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; TONGA-NEXT: v_rcp_f32_e32 v2, v2 ; TONGA-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; TONGA-NEXT: v_cvt_u32_f32_e32 v2, v2 ; TONGA-NEXT: v_mul_lo_u32 v4, s3, v2 @@ -628,7 +628,7 @@ define amdgpu_kernel void @srem_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: s_sub_i32 s5, 0, s4 ; TONGA-NEXT: s_abs_i32 s1, s0 ; TONGA-NEXT: s_xor_b32 s2, s2, s3 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; TONGA-NEXT: v_rcp_f32_e32 v0, v0 ; TONGA-NEXT: s_sub_i32 s2, s2, s3 ; TONGA-NEXT: s_ashr_i32 s0, s0, 31 ; TONGA-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -855,7 +855,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readfirstlane_b32 s4, v5 ; GCN-NEXT: s_ashr_i32 s5, s4, 31 -; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GCN-NEXT: v_rcp_f32_e32 v1, v1 ; GCN-NEXT: s_abs_i32 s4, s4 ; GCN-NEXT: v_readfirstlane_b32 s3, v2 ; GCN-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 @@ -878,7 +878,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_xor_b32 s2, s2, s5 ; GCN-NEXT: s_sub_i32 s8, 0, s3 ; GCN-NEXT: s_sub_i32 s2, s2, s5 -; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GCN-NEXT: v_rcp_f32_e32 v1, v1 ; GCN-NEXT: v_readfirstlane_b32 s6, v6 ; GCN-NEXT: s_ashr_i32 s7, s6, 31 ; GCN-NEXT: s_abs_i32 s6, s6 @@ -903,7 +903,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_xor_b32 s3, s3, s7 ; GCN-NEXT: s_sub_i32 s9, 0, s4 ; GCN-NEXT: s_sub_i32 s3, s3, s7 -; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GCN-NEXT: v_rcp_f32_e32 v1, v1 ; GCN-NEXT: v_readfirstlane_b32 s6, v7 ; GCN-NEXT: s_ashr_i32 s8, s6, 31 ; GCN-NEXT: s_abs_i32 s6, s6 @@ -928,7 +928,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_readfirstlane_b32 s6, v8 ; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: s_ashr_i32 s2, s6, 31 -; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v2 +; 
GCN-NEXT: v_rcp_f32_e32 v3, v2 ; GCN-NEXT: v_mov_b32_e32 v2, s3 ; GCN-NEXT: s_abs_i32 s3, s6 ; GCN-NEXT: s_sub_i32 s6, 0, s5 @@ -974,7 +974,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; TAHITI-NEXT: v_cvt_f32_u32_e32 v0, s0 ; TAHITI-NEXT: s_sub_i32 s1, 0, s0 ; TAHITI-NEXT: v_readfirstlane_b32 s7, v1 -; TAHITI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; TAHITI-NEXT: v_rcp_f32_e32 v0, v0 ; TAHITI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; TAHITI-NEXT: v_cvt_u32_f32_e32 v0, v0 ; TAHITI-NEXT: v_mul_lo_u32 v8, s1, v0 @@ -999,7 +999,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; TAHITI-NEXT: s_sub_i32 s7, 0, s6 ; TAHITI-NEXT: v_readfirstlane_b32 s8, v5 ; TAHITI-NEXT: s_abs_i32 s9, s8 -; TAHITI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; TAHITI-NEXT: v_rcp_f32_e32 v0, v0 ; TAHITI-NEXT: s_xor_b32 s0, s0, s1 ; TAHITI-NEXT: s_sub_i32 s10, s0, s1 ; TAHITI-NEXT: s_ashr_i32 s8, s8, 31 @@ -1024,7 +1024,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; TAHITI-NEXT: s_sub_i32 s6, 0, s1 ; TAHITI-NEXT: v_readfirstlane_b32 s7, v6 ; TAHITI-NEXT: s_abs_i32 s9, s7 -; TAHITI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; TAHITI-NEXT: v_rcp_f32_e32 v0, v0 ; TAHITI-NEXT: s_xor_b32 s0, s0, s8 ; TAHITI-NEXT: s_sub_i32 s8, s0, s8 ; TAHITI-NEXT: s_ashr_i32 s7, s7, 31 @@ -1049,7 +1049,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; TAHITI-NEXT: s_sub_i32 s1, 0, s6 ; TAHITI-NEXT: s_mov_b32 s0, s4 ; TAHITI-NEXT: v_readfirstlane_b32 s4, v7 -; TAHITI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; TAHITI-NEXT: v_rcp_f32_e32 v0, v0 ; TAHITI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; TAHITI-NEXT: v_cvt_u32_f32_e32 v2, v0 ; TAHITI-NEXT: v_mov_b32_e32 v0, s10 @@ -1097,7 +1097,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: v_cvt_f32_u32_e32 v0, s2 ; TONGA-NEXT: s_sub_i32 s3, 0, s2 ; TONGA-NEXT: v_readfirstlane_b32 s5, v1 -; TONGA-NEXT: 
v_rcp_iflag_f32_e32 v0, v0 +; TONGA-NEXT: v_rcp_f32_e32 v0, v0 ; TONGA-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; TONGA-NEXT: v_cvt_u32_f32_e32 v0, v0 ; TONGA-NEXT: v_mul_lo_u32 v8, s3, v0 @@ -1123,7 +1123,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: s_sub_i32 s5, 0, s4 ; TONGA-NEXT: v_readfirstlane_b32 s6, v5 ; TONGA-NEXT: s_abs_i32 s7, s6 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; TONGA-NEXT: v_rcp_f32_e32 v0, v0 ; TONGA-NEXT: s_xor_b32 s2, s2, s3 ; TONGA-NEXT: s_sub_i32 s2, s2, s3 ; TONGA-NEXT: s_ashr_i32 s6, s6, 31 @@ -1149,7 +1149,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: s_sub_i32 s5, 0, s4 ; TONGA-NEXT: v_readfirstlane_b32 s7, v6 ; TONGA-NEXT: s_abs_i32 s8, s7 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; TONGA-NEXT: v_rcp_f32_e32 v0, v0 ; TONGA-NEXT: s_xor_b32 s3, s3, s6 ; TONGA-NEXT: s_sub_i32 s3, s3, s6 ; TONGA-NEXT: s_ashr_i32 s7, s7, 31 @@ -1172,7 +1172,7 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; TONGA-NEXT: s_abs_i32 s5, s5 ; TONGA-NEXT: v_cvt_f32_u32_e32 v0, s5 ; TONGA-NEXT: s_sub_i32 s0, 0, s5 -; TONGA-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; TONGA-NEXT: v_rcp_f32_e32 v0, v0 ; TONGA-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; TONGA-NEXT: v_cvt_u32_f32_e32 v2, v0 ; TONGA-NEXT: v_mov_b32_e32 v0, s2 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index 8880bc9bb2057..82196b73b66e4 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -479,7 +479,7 @@ define amdgpu_kernel void @s_test_srem23_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: v_cvt_f32_i32_e32 v1, s3 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_xor_b32 s0, s3, s2 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-NEXT: s_ashr_i32 s0, s0, 30 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_or_b32 s8, s0, 1 @@ -515,7 +515,7 @@ define amdgpu_kernel 
void @s_test_srem23_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s3 ; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: s_xor_b32 s0, s3, s2 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-IR-NEXT: s_ashr_i32 s0, s0, 30 ; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: s_or_b32 s8, s0, 1 @@ -558,7 +558,7 @@ define amdgpu_kernel void @s_test_srem24_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: v_cvt_f32_i32_e32 v1, s3 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_xor_b32 s0, s3, s2 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-NEXT: s_ashr_i32 s0, s0, 30 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_or_b32 s8, s0, 1 @@ -594,7 +594,7 @@ define amdgpu_kernel void @s_test_srem24_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s3 ; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: s_xor_b32 s0, s3, s2 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-IR-NEXT: s_ashr_i32 s0, s0, 30 ; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: s_or_b32 s8, s0, 1 @@ -631,7 +631,7 @@ define i64 @v_test_srem24_64(i64 %x, i64 %y) { ; GCN-NEXT: v_ashrrev_i32_e32 v1, 8, v1 ; GCN-NEXT: v_cvt_f32_i32_e32 v3, v1 ; GCN-NEXT: v_xor_b32_e32 v5, v1, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GCN-NEXT: v_rcp_f32_e32 v4, v2 ; GCN-NEXT: v_ashrrev_i32_e32 v5, 30, v5 ; GCN-NEXT: v_or_b32_e32 v5, 1, v5 ; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 @@ -655,7 +655,7 @@ define i64 @v_test_srem24_64(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 8, v1 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v3, v1 ; GCN-IR-NEXT: v_xor_b32_e32 v5, v1, v0 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GCN-IR-NEXT: v_rcp_f32_e32 v4, v2 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v5, 30, v5 ; GCN-IR-NEXT: v_or_b32_e32 v5, 1, v5 ; GCN-IR-NEXT: v_mul_f32_e32 v4, v3, v4 @@ -689,7 +689,7 @@ define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s2, 0, s8 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -730,7 +730,7 @@ define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sub_i32 s2, 0, s8 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -778,7 +778,7 @@ define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_sub_i32 s2, 0, s8 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -819,7 +819,7 @@ define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_sub_i32 s2, 0, s8 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 @@ -865,7 +865,7 @@ define amdgpu_kernel void @s_test_srem32_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_abs_i32 s8, s0 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: s_sub_i32 s0, 0, s8 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_mul_lo_u32 v1, s0, v0 @@ -904,7 +904,7 @@ define amdgpu_kernel void @s_test_srem32_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_abs_i32 
s8, s0 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-IR-NEXT: s_sub_i32 s0, 0, s8 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v1, s0, v0 @@ -1216,7 +1216,7 @@ define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 % ; GCN-NEXT: s_sext_i32_i16 s3, s3 ; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], 24 ; GCN-NEXT: v_cvt_f32_i32_e32 v1, s2 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-NEXT: s_xor_b32 s3, s2, s4 ; GCN-NEXT: s_ashr_i32 s3, s3, 30 ; GCN-NEXT: s_or_b32 s3, s3, 1 @@ -1251,7 +1251,7 @@ define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 % ; GCN-IR-NEXT: s_sext_i32_i16 s3, s3 ; GCN-IR-NEXT: s_lshr_b64 s[2:3], s[2:3], 24 ; GCN-IR-NEXT: v_cvt_f32_i32_e32 v1, s2 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-IR-NEXT: s_xor_b32 s3, s2, s4 ; GCN-IR-NEXT: s_ashr_i32 s3, s3, 30 ; GCN-IR-NEXT: s_or_b32 s3, s3, 1 @@ -1991,7 +1991,7 @@ define amdgpu_kernel void @s_test_srem24_k_num_i64(ptr addrspace(1) %out, i64 %x ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_ashr_i32 s0, s3, 31 ; GCN-NEXT: s_mov_b32 s5, s1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-NEXT: v_rcp_f32_e32 v1, v0 ; GCN-NEXT: s_or_b32 s3, s0, 1 ; GCN-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 @@ -2023,7 +2023,7 @@ define amdgpu_kernel void @s_test_srem24_k_num_i64(ptr addrspace(1) %out, i64 %x ; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: s_ashr_i32 s0, s3, 31 ; GCN-IR-NEXT: s_mov_b32 s5, s1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v1, v0 ; GCN-IR-NEXT: s_or_b32 s3, s0, 1 ; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 @@ -2125,7 +2125,7 @@ define i64 @v_test_srem24_k_num_i64(i64 %x) { ; GCN-NEXT: s_mov_b32 s4, 0x41c00000 ; GCN-NEXT: 
v_ashrrev_i32_e32 v1, 31, v1 ; GCN-NEXT: v_or_b32_e32 v1, 1, v1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v2 +; GCN-NEXT: v_rcp_f32_e32 v3, v2 ; GCN-NEXT: v_mul_f32_e32 v3, 0x41c00000, v3 ; GCN-NEXT: v_trunc_f32_e32 v3, v3 ; GCN-NEXT: v_mad_f32 v4, -v3, v2, s4 @@ -2147,7 +2147,7 @@ define i64 @v_test_srem24_k_num_i64(i64 %x) { ; GCN-IR-NEXT: s_mov_b32 s4, 0x41c00000 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v1 ; GCN-IR-NEXT: v_or_b32_e32 v1, 1, v1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v3, v2 +; GCN-IR-NEXT: v_rcp_f32_e32 v3, v2 ; GCN-IR-NEXT: v_mul_f32_e32 v3, 0x41c00000, v3 ; GCN-IR-NEXT: v_trunc_f32_e32 v3, v3 ; GCN-IR-NEXT: v_mad_f32 v4, -v3, v2, s4 @@ -2174,7 +2174,7 @@ define i64 @v_test_srem24_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: s_mov_b32 s4, 0x47000000 ; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v1 ; GCN-NEXT: v_or_b32_e32 v1, 1, v1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v2 +; GCN-NEXT: v_rcp_f32_e32 v3, v2 ; GCN-NEXT: v_mul_f32_e32 v3, 0x47000000, v3 ; GCN-NEXT: v_trunc_f32_e32 v3, v3 ; GCN-NEXT: v_mad_f32 v4, -v3, v2, s4 @@ -2196,7 +2196,7 @@ define i64 @v_test_srem24_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: s_mov_b32 s4, 0x47000000 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v1, 31, v1 ; GCN-IR-NEXT: v_or_b32_e32 v1, 1, v1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v3, v2 +; GCN-IR-NEXT: v_rcp_f32_e32 v3, v2 ; GCN-IR-NEXT: v_mul_f32_e32 v3, 0x47000000, v3 ; GCN-IR-NEXT: v_trunc_f32_e32 v3, v3 ; GCN-IR-NEXT: v_mad_f32 v4, -v3, v2, s4 diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll index dd2acb8de6f41..c0918dd78be5f 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv.ll @@ -22,7 +22,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_u32_e32 v2, v1 ; SI-NEXT: v_sub_i32_e32 v3, vcc, 0, v1 -; SI-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; SI-NEXT: v_rcp_f32_e32 v2, v2 ; SI-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; SI-NEXT: v_cvt_u32_f32_e32 v2, v2 ; 
SI-NEXT: v_mul_lo_u32 v3, v3, v2 @@ -58,7 +58,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_u32_e32 v2, v1 ; VI-NEXT: v_sub_u32_e32 v3, vcc, 0, v1 -; VI-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; VI-NEXT: v_rcp_f32_e32 v2, v2 ; VI-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; VI-NEXT: v_cvt_u32_f32_e32 v2, v2 ; VI-NEXT: v_mul_lo_u32 v3, v3, v2 @@ -91,7 +91,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v2, v1 ; GCN-NEXT: v_sub_u32_e32 v3, vcc, 0, v1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GCN-NEXT: v_rcp_f32_e32 v2, v2 ; GCN-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v2 ; GCN-NEXT: v_mul_lo_u32 v3, v3, v2 @@ -124,7 +124,7 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GFX1030-NEXT: v_readfirstlane_b32 s5, v0 ; GFX1030-NEXT: v_cvt_f32_u32_e32 v1, s2 ; GFX1030-NEXT: s_sub_i32 s4, 0, s2 -; GFX1030-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX1030-NEXT: v_rcp_f32_e32 v1, v1 ; GFX1030-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX1030-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX1030-NEXT: v_readfirstlane_b32 s3, v1 @@ -194,7 +194,7 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; SI-NEXT: v_cvt_f32_u32_e32 v0, s5 ; SI-NEXT: s_sub_i32 s2, 0, s5 ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; SI-NEXT: v_rcp_f32_e32 v0, v0 ; SI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; SI-NEXT: v_cvt_u32_f32_e32 v0, v0 ; SI-NEXT: v_mul_lo_u32 v1, s2, v0 @@ -226,7 +226,7 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; VI-NEXT: v_cvt_f32_u32_e32 v0, s5 ; VI-NEXT: s_sub_i32 s2, 0, s5 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; VI-NEXT: v_rcp_f32_e32 v0, v0 ; VI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; VI-NEXT: v_cvt_u32_f32_e32 v0, v0 ; 
VI-NEXT: v_mul_lo_u32 v1, s2, v0 @@ -259,7 +259,7 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GCN-NEXT: s_sub_i32 s4, 0, s3 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_mul_lo_u32 v1, s4, v0 @@ -290,7 +290,7 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: v_cvt_f32_u32_e32 v0, s3 ; GFX1030-NEXT: s_sub_i32 s5, 0, s3 -; GFX1030-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX1030-NEXT: v_rcp_f32_e32 v0, v0 ; GFX1030-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX1030-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX1030-NEXT: v_readfirstlane_b32 s4, v0 @@ -365,8 +365,8 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_cvt_f32_u32_e32 v4, v2 ; SI-NEXT: v_cvt_f32_u32_e32 v5, v3 ; SI-NEXT: v_sub_i32_e32 v6, vcc, 0, v2 -; SI-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; SI-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; SI-NEXT: v_rcp_f32_e32 v4, v4 +; SI-NEXT: v_rcp_f32_e32 v5, v5 ; SI-NEXT: v_sub_i32_e32 v7, vcc, 0, v3 ; SI-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 ; SI-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5 @@ -420,8 +420,8 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cvt_f32_u32_e32 v4, v2 ; VI-NEXT: v_cvt_f32_u32_e32 v5, v3 ; VI-NEXT: v_sub_u32_e32 v6, vcc, 0, v2 -; VI-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; VI-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; VI-NEXT: v_rcp_f32_e32 v4, v4 +; VI-NEXT: v_rcp_f32_e32 v5, v5 ; VI-NEXT: v_sub_u32_e32 v7, vcc, 0, v3 ; VI-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 ; VI-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5 @@ -471,8 +471,8 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v4, v2 ; 
GCN-NEXT: v_cvt_f32_u32_e32 v5, v3 -; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v4 -; GCN-NEXT: v_rcp_iflag_f32_e32 v5, v5 +; GCN-NEXT: v_rcp_f32_e32 v4, v4 +; GCN-NEXT: v_rcp_f32_e32 v5, v5 ; GCN-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4 ; GCN-NEXT: v_cvt_u32_f32_e32 v6, v4 ; GCN-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5 @@ -525,8 +525,8 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1030-NEXT: v_cvt_f32_u32_e32 v2, s2 ; GFX1030-NEXT: v_cvt_f32_u32_e32 v3, s3 ; GFX1030-NEXT: s_sub_i32 s5, 0, s2 -; GFX1030-NEXT: v_rcp_iflag_f32_e32 v2, v2 -; GFX1030-NEXT: v_rcp_iflag_f32_e32 v3, v3 +; GFX1030-NEXT: v_rcp_f32_e32 v2, v2 +; GFX1030-NEXT: v_rcp_f32_e32 v3, v3 ; GFX1030-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2 ; GFX1030-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v3 ; GFX1030-NEXT: v_cvt_u32_f32_e32 v2, v2 @@ -643,10 +643,10 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_cvt_f32_u32_e32 v10, v1 ; SI-NEXT: v_cvt_f32_u32_e32 v12, v2 ; SI-NEXT: v_cvt_f32_u32_e32 v14, v3 -; SI-NEXT: v_rcp_iflag_f32_e32 v8, v8 -; SI-NEXT: v_rcp_iflag_f32_e32 v10, v10 -; SI-NEXT: v_rcp_iflag_f32_e32 v12, v12 -; SI-NEXT: v_rcp_iflag_f32_e32 v14, v14 +; SI-NEXT: v_rcp_f32_e32 v8, v8 +; SI-NEXT: v_rcp_f32_e32 v10, v10 +; SI-NEXT: v_rcp_f32_e32 v12, v12 +; SI-NEXT: v_rcp_f32_e32 v14, v14 ; SI-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 ; SI-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 ; SI-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 @@ -738,10 +738,10 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cvt_f32_u32_e32 v10, v1 ; VI-NEXT: v_cvt_f32_u32_e32 v12, v2 ; VI-NEXT: v_cvt_f32_u32_e32 v14, v3 -; VI-NEXT: v_rcp_iflag_f32_e32 v8, v8 -; VI-NEXT: v_rcp_iflag_f32_e32 v10, v10 -; VI-NEXT: v_rcp_iflag_f32_e32 v12, v12 -; VI-NEXT: v_rcp_iflag_f32_e32 v14, v14 +; VI-NEXT: v_rcp_f32_e32 v8, v8 +; VI-NEXT: v_rcp_f32_e32 v10, v10 +; VI-NEXT: v_rcp_f32_e32 v12, v12 +; VI-NEXT: v_rcp_f32_e32 v14, v14 ; 
VI-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8 ; VI-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 ; VI-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 @@ -836,10 +836,10 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_e32 v12, v1 ; GCN-NEXT: v_cvt_f32_u32_e32 v14, v2 ; GCN-NEXT: v_cvt_f32_u32_e32 v16, v3 -; GCN-NEXT: v_rcp_iflag_f32_e32 v10, v10 -; GCN-NEXT: v_rcp_iflag_f32_e32 v12, v12 -; GCN-NEXT: v_rcp_iflag_f32_e32 v14, v14 -; GCN-NEXT: v_rcp_iflag_f32_e32 v16, v16 +; GCN-NEXT: v_rcp_f32_e32 v10, v10 +; GCN-NEXT: v_rcp_f32_e32 v12, v12 +; GCN-NEXT: v_rcp_f32_e32 v14, v14 +; GCN-NEXT: v_rcp_f32_e32 v16, v16 ; GCN-NEXT: v_mul_f32_e32 v10, 0x4f7ffffe, v10 ; GCN-NEXT: v_mul_f32_e32 v12, 0x4f7ffffe, v12 ; GCN-NEXT: v_mul_f32_e32 v14, 0x4f7ffffe, v14 @@ -929,8 +929,8 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1030-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX1030-NEXT: v_cvt_f32_u32_e32 v1, s3 ; GFX1030-NEXT: s_sub_i32 s6, 0, s2 -; GFX1030-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX1030-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX1030-NEXT: v_rcp_f32_e32 v0, v0 +; GFX1030-NEXT: v_rcp_f32_e32 v1, v1 ; GFX1030-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX1030-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX1030-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -939,7 +939,7 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1030-NEXT: v_cvt_f32_u32_e32 v0, s5 ; GFX1030-NEXT: v_readfirstlane_b32 s9, v1 ; GFX1030-NEXT: s_mul_i32 s6, s6, s4 -; GFX1030-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX1030-NEXT: v_rcp_f32_e32 v0, v0 ; GFX1030-NEXT: s_mul_hi_u32 s6, s4, s6 ; GFX1030-NEXT: s_add_i32 s4, s4, s6 ; GFX1030-NEXT: s_mul_hi_u32 s4, s7, s4 @@ -965,7 +965,7 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1030-NEXT: s_mul_hi_u32 s6, s7, s9 ; GFX1030-NEXT: v_readfirstlane_b32 s10, v0 ; GFX1030-NEXT: s_mul_i32 s8, s6, s3 -; GFX1030-NEXT: 
v_rcp_iflag_f32_e32 v1, v1 +; GFX1030-NEXT: v_rcp_f32_e32 v1, v1 ; GFX1030-NEXT: s_sub_i32 s7, s7, s8 ; GFX1030-NEXT: s_add_i32 s8, s6, 1 ; GFX1030-NEXT: s_sub_i32 s9, s7, s3 @@ -1404,7 +1404,7 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; SI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; SI-NEXT: v_rcp_f32_e32 v2, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 ; SI-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -1433,7 +1433,7 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: v_rcp_f32_e32 v2, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 ; VI-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -1460,7 +1460,7 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cvt_f32_ubyte1_e32 v3, v2 -; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v3 +; GCN-NEXT: v_rcp_f32_e32 v4, v3 ; GCN-NEXT: v_cvt_f32_ubyte0_e32 v2, v2 ; GCN-NEXT: v_mul_f32_e32 v4, v2, v4 ; GCN-NEXT: v_trunc_f32_e32 v4, v4 @@ -1481,7 +1481,7 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: v_cvt_f32_ubyte1_e32 v2, v1 ; GFX1030-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 -; GFX1030-NEXT: v_rcp_iflag_f32_e32 v3, v2 +; GFX1030-NEXT: v_rcp_f32_e32 v3, v2 ; GFX1030-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX1030-NEXT: v_trunc_f32_e32 v3, v3 ; GFX1030-NEXT: v_fma_f32 v1, -v3, v2, v1 @@ -1549,7 +1549,7 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_cvt_f32_u32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_u32_e32 v1, v1 -; SI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; 
SI-NEXT: v_rcp_f32_e32 v2, v0 ; SI-NEXT: v_mul_f32_e32 v2, v1, v2 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -1578,7 +1578,7 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cvt_f32_u32_e32 v0, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_u32_e32 v1, v1 -; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: v_rcp_f32_e32 v2, v0 ; VI-NEXT: v_mul_f32_e32 v2, v1, v2 ; VI-NEXT: v_trunc_f32_e32 v2, v2 ; VI-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -1604,7 +1604,7 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: v_cvt_f32_u32_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GCN-NEXT: v_cvt_f32_u32_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GCN-NEXT: v_rcp_f32_e32 v4, v2 ; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 ; GCN-NEXT: v_trunc_f32_e32 v4, v4 ; GCN-NEXT: v_cvt_u32_f32_e32 v5, v4 @@ -1624,7 +1624,7 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: v_cvt_f32_u32_sdwa v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX1030-NEXT: v_cvt_f32_u32_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; GFX1030-NEXT: v_rcp_iflag_f32_e32 v3, v2 +; GFX1030-NEXT: v_rcp_f32_e32 v3, v2 ; GFX1030-NEXT: v_mul_f32_e32 v3, v1, v3 ; GFX1030-NEXT: v_trunc_f32_e32 v3, v3 ; GFX1030-NEXT: v_fma_f32 v1, -v3, v2, v1 @@ -1700,7 +1700,7 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_cvt_f32_u32_e32 v1, v1 -; SI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; SI-NEXT: v_rcp_f32_e32 v2, v0 ; SI-NEXT: v_mul_f32_e32 v2, v1, v2 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -1737,7 +1737,7 @@ define amdgpu_kernel void @v_udiv_i23(ptr 
addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_e32 v1, v3, v1 ; VI-NEXT: v_cvt_f32_u32_e32 v1, v1 -; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: v_rcp_f32_e32 v2, v0 ; VI-NEXT: v_mul_f32_e32 v2, v1, v2 ; VI-NEXT: v_trunc_f32_e32 v2, v2 ; VI-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -1783,7 +1783,7 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_or_b32_e32 v0, v1, v0 ; GCN-NEXT: v_cvt_f32_u32_e32 v3, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GCN-NEXT: v_rcp_f32_e32 v4, v2 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 @@ -1815,7 +1815,7 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1030-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX1030-NEXT: v_rcp_iflag_f32_e32 v3, v1 +; GFX1030-NEXT: v_rcp_f32_e32 v3, v1 ; GFX1030-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GFX1030-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX1030-NEXT: v_trunc_f32_e32 v3, v3 @@ -1900,7 +1900,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v1, v3, v1 ; SI-NEXT: v_cvt_f32_u32_e32 v1, v1 -; SI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; SI-NEXT: v_rcp_f32_e32 v2, v0 ; SI-NEXT: v_mul_f32_e32 v2, v1, v2 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -1937,7 +1937,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_or_b32_e32 v1, v3, v1 ; VI-NEXT: v_cvt_f32_u32_e32 v1, v1 -; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: v_rcp_f32_e32 v2, v0 ; VI-NEXT: v_mul_f32_e32 v2, v1, v2 ; VI-NEXT: v_trunc_f32_e32 v2, v2 ; VI-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -1983,7 +1983,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; 
GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_or_b32_e32 v0, v1, v0 ; GCN-NEXT: v_cvt_f32_u32_e32 v3, v0 -; GCN-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GCN-NEXT: v_rcp_f32_e32 v4, v2 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mul_f32_e32 v4, v3, v4 @@ -2015,7 +2015,7 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i ; GFX1030-NEXT: v_cvt_f32_u32_e32 v1, v1 ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: v_or_b32_e32 v2, v4, v2 -; GFX1030-NEXT: v_rcp_iflag_f32_e32 v3, v1 +; GFX1030-NEXT: v_rcp_f32_e32 v3, v1 ; GFX1030-NEXT: v_cvt_f32_u32_e32 v2, v2 ; GFX1030-NEXT: v_mul_f32_e32 v3, v2, v3 ; GFX1030-NEXT: v_trunc_f32_e32 v3, v3 @@ -2373,7 +2373,7 @@ define amdgpu_kernel void @fdiv_test_denormals(ptr addrspace(1) nocapture readon ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_i32_e32 v3, v1 ; SI-NEXT: v_xor_b32_e32 v0, v1, v0 -; SI-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; SI-NEXT: v_rcp_f32_e32 v4, v2 ; SI-NEXT: v_ashrrev_i32_e32 v0, 30, v0 ; SI-NEXT: v_or_b32_e32 v0, 1, v0 ; SI-NEXT: v_mul_f32_e32 v1, v3, v4 @@ -2402,7 +2402,7 @@ define amdgpu_kernel void @fdiv_test_denormals(ptr addrspace(1) nocapture readon ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_i32_e32 v3, v1 ; VI-NEXT: v_xor_b32_e32 v0, v1, v0 -; VI-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; VI-NEXT: v_rcp_f32_e32 v4, v2 ; VI-NEXT: v_ashrrev_i32_e32 v0, 30, v0 ; VI-NEXT: v_or_b32_e32 v0, 1, v0 ; VI-NEXT: v_mul_f32_e32 v1, v3, v4 @@ -2432,7 +2432,7 @@ define amdgpu_kernel void @fdiv_test_denormals(ptr addrspace(1) nocapture readon ; GCN-NEXT: v_cvt_f32_i32_e32 v4, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cvt_f32_i32_e32 v5, v3 -; GCN-NEXT: v_rcp_iflag_f32_e32 v6, v4 +; GCN-NEXT: v_rcp_f32_e32 v6, v4 ; GCN-NEXT: v_xor_b32_e32 v2, v3, v2 ; GCN-NEXT: v_ashrrev_i32_e32 v2, 30, v2 ; GCN-NEXT: v_or_b32_e32 v2, 1, v2 @@ -2457,7 +2457,7 @@ define amdgpu_kernel void @fdiv_test_denormals(ptr addrspace(1) nocapture readon ; GFX1030-NEXT: 
global_load_sbyte v3, v[0:1], off ; GFX1030-NEXT: s_waitcnt vmcnt(1) ; GFX1030-NEXT: v_cvt_f32_i32_e32 v4, v2 -; GFX1030-NEXT: v_rcp_iflag_f32_e32 v5, v4 +; GFX1030-NEXT: v_rcp_f32_e32 v5, v4 ; GFX1030-NEXT: s_waitcnt vmcnt(0) ; GFX1030-NEXT: v_cvt_f32_i32_e32 v6, v3 ; GFX1030-NEXT: v_xor_b32_e32 v2, v3, v2 diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index c2bd180fedcca..edd84a5f09e5e 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -405,7 +405,7 @@ define amdgpu_kernel void @s_test_udiv24_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-NEXT: s_lshr_b32 s2, s3, 8 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s2 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -430,7 +430,7 @@ define amdgpu_kernel void @s_test_udiv24_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_lshr_b32 s2, s3, 8 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s2 ; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -458,7 +458,7 @@ define i64 @v_test_udiv24_i64(i64 %x, i64 %y) { ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 8, v1 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -476,7 +476,7 @@ define i64 @v_test_udiv24_i64(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GCN-IR-NEXT: v_lshrrev_b32_e32 v1, 8, v1 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; 
GCN-IR-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -501,7 +501,7 @@ define amdgpu_kernel void @s_test_udiv32_i64(ptr addrspace(1) %out, i64 %x, i64 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: s_sub_i32 s0, 0, s8 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_mul_lo_u32 v1, s0, v0 @@ -537,7 +537,7 @@ define amdgpu_kernel void @s_test_udiv32_i64(ptr addrspace(1) %out, i64 %x, i64 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-IR-NEXT: s_sub_i32 s0, 0, s8 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v1, s0, v0 @@ -581,7 +581,7 @@ define amdgpu_kernel void @s_test_udiv31_i64(ptr addrspace(1) %out, i64 %x, i64 ; GCN-NEXT: s_lshr_b32 s8, s0, 1 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-NEXT: s_sub_i32 s0, 0, s8 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_mul_lo_u32 v1, s0, v0 @@ -619,7 +619,7 @@ define amdgpu_kernel void @s_test_udiv31_i64(ptr addrspace(1) %out, i64 %x, i64 ; GCN-IR-NEXT: s_lshr_b32 s8, s0, 1 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-IR-NEXT: s_sub_i32 s0, 0, s8 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v1, s0, v0 @@ -666,7 +666,7 @@ define amdgpu_kernel void @s_test_udiv23_i64(ptr addrspace(1) %out, i64 %x, i64 ; GCN-NEXT: s_lshr_b32 s2, s3, 9 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s2 ; GCN-NEXT: s_mov_b32 s6, -1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-NEXT: s_mov_b32 s4, s0 ; GCN-NEXT: s_mov_b32 s5, s1 ; 
GCN-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -691,7 +691,7 @@ define amdgpu_kernel void @s_test_udiv23_i64(ptr addrspace(1) %out, i64 %x, i64 ; GCN-IR-NEXT: s_lshr_b32 s2, s3, 9 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s2 ; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-IR-NEXT: s_mov_b32 s4, s0 ; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -726,7 +726,7 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 ; GCN-NEXT: s_and_b32 s2, s2, 0xff000000 ; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], 24 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -754,7 +754,7 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48 ; GCN-IR-NEXT: s_and_b32 s2, s2, 0xff000000 ; GCN-IR-NEXT: s_lshr_b64 s[2:3], s[2:3], 24 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s2 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -1446,7 +1446,7 @@ define amdgpu_kernel void @s_test_udiv24_k_num_i64(ptr addrspace(1) %out, i64 %x ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-NEXT: v_rcp_f32_e32 v1, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1 @@ -1467,7 +1467,7 @@ define amdgpu_kernel void @s_test_udiv24_k_num_i64(ptr addrspace(1) %out, i64 %x ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v1, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-IR-NEXT: v_trunc_f32_e32 
v1, v1 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v1 @@ -1541,7 +1541,7 @@ define i64 @v_test_udiv24_k_num_i64(i64 %x) { ; GCN-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GCN-NEXT: s_mov_b32 s4, 0x41c00000 -; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-NEXT: v_rcp_f32_e32 v1, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1 @@ -1558,7 +1558,7 @@ define i64 @v_test_udiv24_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GCN-IR-NEXT: s_mov_b32 s4, 0x41c00000 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v1, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v1 @@ -1580,7 +1580,7 @@ define i64 @v_test_udiv24_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GCN-NEXT: s_mov_b32 s4, 0x47000000 -; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-NEXT: v_rcp_f32_e32 v1, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x47000000, v1 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1 @@ -1597,7 +1597,7 @@ define i64 @v_test_udiv24_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GCN-IR-NEXT: s_mov_b32 s4, 0x47000000 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v1, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x47000000, v1 ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v1 diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll index eaab3531824c4..c6aec64d1692e 100644 --- a/llvm/test/CodeGen/AMDGPU/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll @@ -44,7 +44,7 @@ define amdgpu_kernel void @test_udivrem(ptr addrspace(1) %out0, [8 x i32], ptr a ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GFX6-NEXT: 
s_sub_i32 s0, 0, s8 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 @@ -82,7 +82,7 @@ define amdgpu_kernel void @test_udivrem(ptr addrspace(1) %out0, [8 x i32], ptr a ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s6 ; GFX8-NEXT: s_sub_i32 s0, 0, s6 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GFX8-NEXT: v_rcp_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 @@ -171,8 +171,8 @@ define amdgpu_kernel void @test_udivrem_v2(ptr addrspace(1) %out, <2 x i32> %x, ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX6-NEXT: s_sub_i32 s6, 0, s2 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s3 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s6, v0 @@ -218,8 +218,8 @@ define amdgpu_kernel void @test_udivrem_v2(ptr addrspace(1) %out, <2 x i32> %x, ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX8-NEXT: s_sub_i32 s6, 0, s2 ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s3 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX8-NEXT: v_rcp_f32_e32 v0, v0 +; GFX8-NEXT: v_rcp_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s6, v0 @@ -340,8 +340,8 @@ define amdgpu_kernel void @test_udivrem_v4(ptr addrspace(1) %out, <4 x i32> %x, ; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX6-NEXT: s_sub_i32 s0, 0, s12 ; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s13 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX6-NEXT: v_rcp_f32_e32 v0, v0 +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; 
GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX6-NEXT: v_mul_lo_u32 v1, s0, v0 @@ -362,7 +362,7 @@ define amdgpu_kernel void @test_udivrem_v4(ptr addrspace(1) %out, <4 x i32> %x, ; GFX6-NEXT: s_cselect_b32 s6, s1, s0 ; GFX6-NEXT: s_sub_i32 s0, 0, s13 ; GFX6-NEXT: v_mul_lo_u32 v0, s0, v1 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s9, v0 @@ -380,7 +380,7 @@ define amdgpu_kernel void @test_udivrem_v4(ptr addrspace(1) %out, <4 x i32> %x, ; GFX6-NEXT: s_cselect_b32 s7, s1, s0 ; GFX6-NEXT: s_sub_i32 s0, 0, s14 ; GFX6-NEXT: v_mul_lo_u32 v0, s0, v1 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX6-NEXT: v_rcp_f32_e32 v2, v2 ; GFX6-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; GFX6-NEXT: v_mul_hi_u32 v0, s10, v0 @@ -425,8 +425,8 @@ define amdgpu_kernel void @test_udivrem_v4(ptr addrspace(1) %out, <4 x i32> %x, ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s12 ; GFX8-NEXT: s_sub_i32 s0, 0, s12 ; GFX8-NEXT: v_cvt_f32_u32_e32 v2, s13 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX8-NEXT: v_rcp_f32_e32 v0, v0 +; GFX8-NEXT: v_rcp_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v0 @@ -447,7 +447,7 @@ define amdgpu_kernel void @test_udivrem_v4(ptr addrspace(1) %out, <4 x i32> %x, ; GFX8-NEXT: s_cselect_b32 s2, s1, s0 ; GFX8-NEXT: s_sub_i32 s0, 0, s13 ; GFX8-NEXT: v_mul_lo_u32 v0, s0, v1 -; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GFX8-NEXT: v_rcp_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s9, v0 @@ -465,7 +465,7 @@ define amdgpu_kernel void @test_udivrem_v4(ptr addrspace(1) %out, <4 x i32> %x, ; GFX8-NEXT: s_cselect_b32 s3, s1, s0 ; GFX8-NEXT: s_sub_i32 s0, 0, s14 ; GFX8-NEXT: v_mul_lo_u32 v0, s0, v1 -; GFX8-NEXT: 
v_rcp_iflag_f32_e32 v2, v2 +; GFX8-NEXT: v_rcp_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_hi_u32 v0, v1, v0 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; GFX8-NEXT: v_mul_hi_u32 v0, s10, v0 diff --git a/llvm/test/CodeGen/AMDGPU/udivrem24.ll b/llvm/test/CodeGen/AMDGPU/udivrem24.ll index 935a9bf23c9cb..4d9ece813c402 100644 --- a/llvm/test/CodeGen/AMDGPU/udivrem24.ll +++ b/llvm/test/CodeGen/AMDGPU/udivrem24.ll @@ -22,7 +22,7 @@ define amdgpu_kernel void @udiv24_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 -; SI-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; SI-NEXT: v_rcp_f32_e32 v2, v1 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 @@ -48,7 +48,7 @@ define amdgpu_kernel void @udiv24_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: v_rcp_f32_e32 v2, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 ; VI-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -125,7 +125,7 @@ define amdgpu_kernel void @udiv24_i8_denorm_flush_in_out(ptr addrspace(1) %out, ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 -; SI-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; SI-NEXT: v_rcp_f32_e32 v2, v1 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 @@ -151,7 +151,7 @@ define amdgpu_kernel void @udiv24_i8_denorm_flush_in_out(ptr addrspace(1) %out, ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: v_rcp_f32_e32 v2, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 ; VI-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -228,7 +228,7 @@ define amdgpu_kernel void @udiv24_i8_denorm_flush_in(ptr 
addrspace(1) %out, ptr ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 -; SI-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; SI-NEXT: v_rcp_f32_e32 v2, v1 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 @@ -254,7 +254,7 @@ define amdgpu_kernel void @udiv24_i8_denorm_flush_in(ptr addrspace(1) %out, ptr ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: v_rcp_f32_e32 v2, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 ; VI-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -331,7 +331,7 @@ define amdgpu_kernel void @udiv24_i8_denorm_flush_out(ptr addrspace(1) %out, ptr ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 -; SI-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; SI-NEXT: v_rcp_f32_e32 v2, v1 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 @@ -357,7 +357,7 @@ define amdgpu_kernel void @udiv24_i8_denorm_flush_out(ptr addrspace(1) %out, ptr ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 -; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: v_rcp_f32_e32 v2, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v1 ; VI-NEXT: v_mul_f32_e32 v2, v1, v2 @@ -434,7 +434,7 @@ define amdgpu_kernel void @udiv24_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_cvt_f32_u32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_u32_e32 v1, v1 -; SI-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; SI-NEXT: v_rcp_f32_e32 v2, v1 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 @@ -462,7 +462,7 @@ define amdgpu_kernel void @udiv24_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cvt_f32_u32_e32 v0, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; 
VI-NEXT: v_cvt_f32_u32_e32 v1, v1 -; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: v_rcp_f32_e32 v2, v0 ; VI-NEXT: v_mul_f32_e32 v2, v1, v2 ; VI-NEXT: v_trunc_f32_e32 v2, v2 ; VI-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -531,7 +531,7 @@ define amdgpu_kernel void @udiv23_i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_and_b32 s5, s5, 0x7fffff ; SI-NEXT: v_cvt_f32_u32_e32 v0, s4 ; SI-NEXT: v_cvt_f32_u32_e32 v1, s5 -; SI-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; SI-NEXT: v_rcp_f32_e32 v2, v1 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 @@ -553,7 +553,7 @@ define amdgpu_kernel void @udiv23_i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_and_b32 s2, s2, 0x7fffff ; VI-NEXT: v_cvt_f32_u32_e32 v1, s2 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: v_rcp_f32_e32 v2, v0 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mul_f32_e32 v2, v1, v2 ; VI-NEXT: v_trunc_f32_e32 v2, v2 @@ -622,7 +622,7 @@ define amdgpu_kernel void @udiv24_i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_and_b32 s5, s5, 0xffffff ; SI-NEXT: v_cvt_f32_u32_e32 v0, s4 ; SI-NEXT: v_cvt_f32_u32_e32 v1, s5 -; SI-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; SI-NEXT: v_rcp_f32_e32 v2, v1 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 @@ -644,7 +644,7 @@ define amdgpu_kernel void @udiv24_i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_and_b32 s2, s2, 0xffffff ; VI-NEXT: v_cvt_f32_u32_e32 v1, s2 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: v_rcp_f32_e32 v2, v0 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mul_f32_e32 v2, v1, v2 ; VI-NEXT: v_trunc_f32_e32 v2, v2 @@ -716,7 +716,7 @@ define amdgpu_kernel void @no_udiv24_u23_u24_i32(ptr addrspace(1) %out, ptr addr ; SI-NEXT: s_and_b32 s5, s5, 0xffffff ; SI-NEXT: v_cvt_f32_u32_e32 v0, s4 ; SI-NEXT: v_cvt_f32_u32_e32 v1, s5 -; SI-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; 
SI-NEXT: v_rcp_f32_e32 v2, v1 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 @@ -738,7 +738,7 @@ define amdgpu_kernel void @no_udiv24_u23_u24_i32(ptr addrspace(1) %out, ptr addr ; VI-NEXT: s_and_b32 s2, s2, 0x7fffff ; VI-NEXT: v_cvt_f32_u32_e32 v1, s2 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: v_rcp_f32_e32 v2, v0 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mul_f32_e32 v2, v1, v2 ; VI-NEXT: v_trunc_f32_e32 v2, v2 @@ -810,7 +810,7 @@ define amdgpu_kernel void @no_udiv24_u24_u23_i32(ptr addrspace(1) %out, ptr addr ; SI-NEXT: s_and_b32 s5, s5, 0x7fffff ; SI-NEXT: v_cvt_f32_u32_e32 v0, s4 ; SI-NEXT: v_cvt_f32_u32_e32 v1, s5 -; SI-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; SI-NEXT: v_rcp_f32_e32 v2, v1 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 @@ -832,7 +832,7 @@ define amdgpu_kernel void @no_udiv24_u24_u23_i32(ptr addrspace(1) %out, ptr addr ; VI-NEXT: s_and_b32 s2, s2, 0xffffff ; VI-NEXT: v_cvt_f32_u32_e32 v1, s2 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: v_rcp_f32_e32 v2, v0 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mul_f32_e32 v2, v1, v2 ; VI-NEXT: v_trunc_f32_e32 v2, v2 @@ -891,7 +891,6 @@ define amdgpu_kernel void @no_udiv24_u24_u23_i32(ptr addrspace(1) %out, ptr addr ret void } -; RCP_IFLAG is for URECIP in the full 32b alg define amdgpu_kernel void @udiv25_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: udiv25_i32: ; SI: ; %bb.0: @@ -905,7 +904,7 @@ define amdgpu_kernel void @udiv25_i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_and_b32 s5, s5, 0x1ffffff ; SI-NEXT: v_cvt_f32_u32_e32 v0, s5 ; SI-NEXT: s_sub_i32 s6, 0, s5 -; SI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; SI-NEXT: v_rcp_f32_e32 v0, v0 ; SI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; SI-NEXT: v_cvt_u32_f32_e32 v0, v0 ; SI-NEXT: v_mul_lo_u32 v1, s6, v0 @@ -939,7 +938,7 @@ define amdgpu_kernel 
void @udiv25_i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_sub_i32 s3, 0, s4 ; VI-NEXT: s_and_b32 s5, s2, 0x1ffffff ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; VI-NEXT: v_rcp_f32_e32 v0, v0 ; VI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; VI-NEXT: v_cvt_u32_f32_e32 v0, v0 ; VI-NEXT: v_mul_lo_u32 v1, s3, v0 @@ -1010,7 +1009,6 @@ define amdgpu_kernel void @udiv25_i32(ptr addrspace(1) %out, ptr addrspace(1) %i ret void } -; RCP_IFLAG is for URECIP in the full 32b alg define amdgpu_kernel void @test_no_udiv24_i32_1(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: test_no_udiv24_i32_1: ; SI: ; %bb.0: @@ -1024,7 +1022,7 @@ define amdgpu_kernel void @test_no_udiv24_i32_1(ptr addrspace(1) %out, ptr addrs ; SI-NEXT: s_and_b32 s5, s5, 0x1ffffff ; SI-NEXT: v_cvt_f32_u32_e32 v0, s5 ; SI-NEXT: s_sub_i32 s6, 0, s5 -; SI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; SI-NEXT: v_rcp_f32_e32 v0, v0 ; SI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; SI-NEXT: v_cvt_u32_f32_e32 v0, v0 ; SI-NEXT: v_mul_lo_u32 v1, s6, v0 @@ -1058,7 +1056,7 @@ define amdgpu_kernel void @test_no_udiv24_i32_1(ptr addrspace(1) %out, ptr addrs ; VI-NEXT: s_sub_i32 s3, 0, s4 ; VI-NEXT: s_and_b32 s5, s2, 0xffffff ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; VI-NEXT: v_rcp_f32_e32 v0, v0 ; VI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; VI-NEXT: v_cvt_u32_f32_e32 v0, v0 ; VI-NEXT: v_mul_lo_u32 v1, s3, v0 @@ -1129,7 +1127,6 @@ define amdgpu_kernel void @test_no_udiv24_i32_1(ptr addrspace(1) %out, ptr addrs ret void } -; RCP_IFLAG is for URECIP in the full 32b alg define amdgpu_kernel void @test_no_udiv24_i32_2(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: test_no_udiv24_i32_2: ; SI: ; %bb.0: @@ -1143,7 +1140,7 @@ define amdgpu_kernel void @test_no_udiv24_i32_2(ptr addrspace(1) %out, ptr addrs ; SI-NEXT: s_and_b32 s5, s5, 0xffffff ; SI-NEXT: v_cvt_f32_u32_e32 v0, s5 ; SI-NEXT: s_sub_i32 s6, 0, s5 -; SI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; SI-NEXT: 
v_rcp_f32_e32 v0, v0 ; SI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; SI-NEXT: v_cvt_u32_f32_e32 v0, v0 ; SI-NEXT: v_mul_lo_u32 v1, s6, v0 @@ -1177,7 +1174,7 @@ define amdgpu_kernel void @test_no_udiv24_i32_2(ptr addrspace(1) %out, ptr addrs ; VI-NEXT: s_sub_i32 s3, 0, s4 ; VI-NEXT: s_and_b32 s5, s2, 0x1ffffff ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; VI-NEXT: v_rcp_f32_e32 v0, v0 ; VI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; VI-NEXT: v_cvt_u32_f32_e32 v0, v0 ; VI-NEXT: v_mul_lo_u32 v1, s3, v0 @@ -1267,7 +1264,7 @@ define amdgpu_kernel void @urem24_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; SI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_ubyte0_e32 v3, v1 -; SI-NEXT: v_rcp_iflag_f32_e32 v4, v3 +; SI-NEXT: v_rcp_f32_e32 v4, v3 ; SI-NEXT: v_mul_f32_e32 v4, v2, v4 ; SI-NEXT: v_trunc_f32_e32 v4, v4 ; SI-NEXT: v_fma_f32 v2, -v4, v3, v2 @@ -1295,7 +1292,7 @@ define amdgpu_kernel void @urem24_i8(ptr addrspace(1) %out, ptr addrspace(1) %in ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 -; VI-NEXT: v_rcp_iflag_f32_e32 v3, v2 +; VI-NEXT: v_rcp_f32_e32 v3, v2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v4, v1 ; VI-NEXT: v_mul_f32_e32 v3, v4, v3 @@ -1376,7 +1373,7 @@ define amdgpu_kernel void @urem24_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_cvt_f32_u32_e32 v2, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_cvt_f32_u32_e32 v3, v1 -; SI-NEXT: v_rcp_iflag_f32_e32 v4, v3 +; SI-NEXT: v_rcp_f32_e32 v4, v3 ; SI-NEXT: v_mul_f32_e32 v4, v2, v4 ; SI-NEXT: v_trunc_f32_e32 v4, v4 ; SI-NEXT: v_fma_f32 v2, -v4, v3, v2 @@ -1406,7 +1403,7 @@ define amdgpu_kernel void @urem24_i16(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: v_cvt_f32_u32_e32 v2, v0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_u32_e32 v3, v1 -; VI-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; VI-NEXT: v_rcp_f32_e32 v4, v2 ; VI-NEXT: v_mul_f32_e32 v4, v3, v4 ; VI-NEXT: 
v_trunc_f32_e32 v4, v4 ; VI-NEXT: v_cvt_u32_f32_e32 v5, v4 @@ -1479,7 +1476,7 @@ define amdgpu_kernel void @urem24_i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_and_b32 s7, s5, 0xffffff ; SI-NEXT: v_cvt_f32_u32_e32 v0, s6 ; SI-NEXT: v_cvt_f32_u32_e32 v1, s7 -; SI-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; SI-NEXT: v_rcp_f32_e32 v2, v1 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 @@ -1504,7 +1501,7 @@ define amdgpu_kernel void @urem24_i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_and_b32 s2, s4, 0xffffff ; VI-NEXT: v_cvt_f32_u32_e32 v1, s2 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: v_rcp_f32_e32 v2, v0 ; VI-NEXT: v_mul_f32_e32 v2, v1, v2 ; VI-NEXT: v_trunc_f32_e32 v2, v2 ; VI-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -1562,7 +1559,6 @@ define amdgpu_kernel void @urem24_i32(ptr addrspace(1) %out, ptr addrspace(1) %i ret void } -; RCP_IFLAG is for URECIP in the full 32b alg define amdgpu_kernel void @urem25_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: urem25_i32: ; SI: ; %bb.0: @@ -1575,7 +1571,7 @@ define amdgpu_kernel void @urem25_i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: s_and_b32 s4, s5, 0x1ffffff ; SI-NEXT: v_cvt_f32_u32_e32 v0, s4 ; SI-NEXT: s_sub_i32 s5, 0, s4 -; SI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; SI-NEXT: v_rcp_f32_e32 v0, v0 ; SI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; SI-NEXT: v_cvt_u32_f32_e32 v0, v0 ; SI-NEXT: v_mul_lo_u32 v1, s5, v0 @@ -1607,7 +1603,7 @@ define amdgpu_kernel void @urem25_i32(ptr addrspace(1) %out, ptr addrspace(1) %i ; VI-NEXT: s_sub_i32 s3, 0, s4 ; VI-NEXT: s_and_b32 s5, s2, 0x1ffffff ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; VI-NEXT: v_rcp_f32_e32 v0, v0 ; VI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; VI-NEXT: v_cvt_u32_f32_e32 v0, v0 ; VI-NEXT: v_mul_lo_u32 v1, s3, v0 @@ -1673,7 +1669,6 @@ define amdgpu_kernel void @urem25_i32(ptr addrspace(1) %out, ptr 
addrspace(1) %i ret void } -; RCP_IFLAG is for URECIP in the full 32b alg define amdgpu_kernel void @test_no_urem24_i32_1(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: test_no_urem24_i32_1: ; SI: ; %bb.0: @@ -1686,7 +1681,7 @@ define amdgpu_kernel void @test_no_urem24_i32_1(ptr addrspace(1) %out, ptr addrs ; SI-NEXT: s_and_b32 s4, s5, 0x1ffffff ; SI-NEXT: v_cvt_f32_u32_e32 v0, s4 ; SI-NEXT: s_sub_i32 s5, 0, s4 -; SI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; SI-NEXT: v_rcp_f32_e32 v0, v0 ; SI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; SI-NEXT: v_cvt_u32_f32_e32 v0, v0 ; SI-NEXT: v_mul_lo_u32 v1, s5, v0 @@ -1718,7 +1713,7 @@ define amdgpu_kernel void @test_no_urem24_i32_1(ptr addrspace(1) %out, ptr addrs ; VI-NEXT: s_sub_i32 s3, 0, s4 ; VI-NEXT: s_and_b32 s5, s2, 0xffffff ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; VI-NEXT: v_rcp_f32_e32 v0, v0 ; VI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; VI-NEXT: v_cvt_u32_f32_e32 v0, v0 ; VI-NEXT: v_mul_lo_u32 v1, s3, v0 @@ -1784,7 +1779,6 @@ define amdgpu_kernel void @test_no_urem24_i32_1(ptr addrspace(1) %out, ptr addrs ret void } -; RCP_IFLAG is for URECIP in the full 32b alg define amdgpu_kernel void @test_no_urem24_i32_2(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; SI-LABEL: test_no_urem24_i32_2: ; SI: ; %bb.0: @@ -1797,7 +1791,7 @@ define amdgpu_kernel void @test_no_urem24_i32_2(ptr addrspace(1) %out, ptr addrs ; SI-NEXT: s_and_b32 s4, s5, 0xffffff ; SI-NEXT: v_cvt_f32_u32_e32 v0, s4 ; SI-NEXT: s_sub_i32 s5, 0, s4 -; SI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; SI-NEXT: v_rcp_f32_e32 v0, v0 ; SI-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; SI-NEXT: v_cvt_u32_f32_e32 v0, v0 ; SI-NEXT: v_mul_lo_u32 v1, s5, v0 @@ -1829,7 +1823,7 @@ define amdgpu_kernel void @test_no_urem24_i32_2(ptr addrspace(1) %out, ptr addrs ; VI-NEXT: s_sub_i32 s3, 0, s4 ; VI-NEXT: s_and_b32 s5, s2, 0x1ffffff ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; VI-NEXT: v_rcp_f32_e32 v0, v0 ; VI-NEXT: v_mul_f32_e32 
v0, 0x4f7ffffe, v0 ; VI-NEXT: v_cvt_u32_f32_e32 v0, v0 ; VI-NEXT: v_mul_lo_u32 v1, s3, v0 @@ -1908,7 +1902,7 @@ define amdgpu_kernel void @test_udiv24_u16_u23_i32(ptr addrspace(1) %out, ptr ad ; SI-NEXT: s_and_b32 s5, s5, 0x7fffff ; SI-NEXT: v_cvt_f32_u32_e32 v0, s4 ; SI-NEXT: v_cvt_f32_u32_e32 v1, s5 -; SI-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; SI-NEXT: v_rcp_f32_e32 v2, v1 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 @@ -1930,7 +1924,7 @@ define amdgpu_kernel void @test_udiv24_u16_u23_i32(ptr addrspace(1) %out, ptr ad ; VI-NEXT: s_and_b32 s2, s2, 0xffff ; VI-NEXT: v_cvt_f32_u32_e32 v1, s2 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: v_rcp_f32_e32 v2, v0 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mul_f32_e32 v2, v1, v2 ; VI-NEXT: v_trunc_f32_e32 v2, v2 @@ -1999,7 +1993,7 @@ define amdgpu_kernel void @test_udiv24_u23_u16_i32(ptr addrspace(1) %out, ptr ad ; SI-NEXT: s_and_b32 s5, s5, 0xffff ; SI-NEXT: v_cvt_f32_u32_e32 v0, s4 ; SI-NEXT: v_cvt_f32_u32_e32 v1, s5 -; SI-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; SI-NEXT: v_rcp_f32_e32 v2, v1 ; SI-NEXT: v_mul_f32_e32 v2, v0, v2 ; SI-NEXT: v_trunc_f32_e32 v2, v2 ; SI-NEXT: v_fma_f32 v0, -v2, v1, v0 @@ -2021,7 +2015,7 @@ define amdgpu_kernel void @test_udiv24_u23_u16_i32(ptr addrspace(1) %out, ptr ad ; VI-NEXT: s_and_b32 s2, s2, 0x7fffff ; VI-NEXT: v_cvt_f32_u32_e32 v1, s2 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; VI-NEXT: v_rcp_f32_e32 v2, v0 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: v_mul_f32_e32 v2, v1, v2 ; VI-NEXT: v_trunc_f32_e32 v2, v2 diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index bed95d73e9961..3bb489c654535 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -445,7 +445,7 @@ define amdgpu_kernel void @s_test_urem31_i64(ptr addrspace(1) %out, i64 %x, i64 ; GCN-NEXT: s_lshr_b32 s8, s0, 1 ; GCN-NEXT: v_cvt_f32_u32_e32 
v0, s8 ; GCN-NEXT: s_sub_i32 s0, 0, s8 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_mul_lo_u32 v1, s0, v0 @@ -480,7 +480,7 @@ define amdgpu_kernel void @s_test_urem31_i64(ptr addrspace(1) %out, i64 %x, i64 ; GCN-IR-NEXT: s_lshr_b32 s8, s0, 1 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s8 ; GCN-IR-NEXT: s_sub_i32 s0, 0, s8 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v1, s0, v0 @@ -523,9 +523,9 @@ define amdgpu_kernel void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64> ; GCN-NEXT: s_sub_i32 s1, 0, s0 ; GCN-NEXT: s_lshr_b32 s6, s15, 1 ; GCN-NEXT: v_cvt_f32_u32_e32 v2, s6 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: s_lshr_b32 s7, s11, 1 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GCN-NEXT: v_rcp_f32_e32 v2, v2 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-NEXT: v_mul_lo_u32 v1, s1, v0 @@ -578,9 +578,9 @@ define amdgpu_kernel void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64> ; GCN-IR-NEXT: s_sub_i32 s1, 0, s0 ; GCN-IR-NEXT: s_lshr_b32 s6, s15, 1 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v2, s6 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-IR-NEXT: s_lshr_b32 s7, s11, 1 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v2 +; GCN-IR-NEXT: v_rcp_f32_e32 v2, v2 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v1, s1, v0 @@ -641,7 +641,7 @@ define amdgpu_kernel void @s_test_urem24_i64(ptr addrspace(1) %out, i64 %x, i64 ; GCN-NEXT: s_lshr_b32 s5, s3, 8 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s5 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-NEXT: v_mul_f32_e32 v2, v1, 
v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -666,7 +666,7 @@ define amdgpu_kernel void @s_test_urem24_i64(ptr addrspace(1) %out, i64 %x, i64 ; GCN-IR-NEXT: s_lshr_b32 s5, s3, 8 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s5 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v2, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v2, v1, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -699,9 +699,9 @@ define amdgpu_kernel void @s_test_urem23_64_v2i64(ptr addrspace(1) %out, <2 x i6 ; GCN-NEXT: s_lshr_b32 s4, s15, 9 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, s4 ; GCN-NEXT: s_lshr_b32 s5, s11, 9 -; GCN-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-NEXT: v_cvt_f32_u32_e32 v2, s5 -; GCN-NEXT: v_rcp_iflag_f32_e32 v3, v1 +; GCN-NEXT: v_rcp_f32_e32 v3, v1 ; GCN-NEXT: s_sub_i32 s8, 0, s6 ; GCN-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -746,9 +746,9 @@ define amdgpu_kernel void @s_test_urem23_64_v2i64(ptr addrspace(1) %out, <2 x i6 ; GCN-IR-NEXT: s_lshr_b32 s4, s15, 9 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, s4 ; GCN-IR-NEXT: s_lshr_b32 s5, s11, 9 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v0, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v0, v0 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v2, s5 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v3, v1 +; GCN-IR-NEXT: v_rcp_f32_e32 v3, v1 ; GCN-IR-NEXT: s_sub_i32 s8, 0, s6 ; GCN-IR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v0, v0 @@ -1391,7 +1391,7 @@ define amdgpu_kernel void @s_test_urem24_k_num_i64(ptr addrspace(1) %out, i64 %x ; GCN-NEXT: s_lshr_b32 s4, s3, 8 ; GCN-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-NEXT: v_rcp_f32_e32 v1, v0 ; GCN-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-NEXT: v_cvt_u32_f32_e32 v2, v1 @@ -1414,7 +1414,7 @@ define amdgpu_kernel void 
@s_test_urem24_k_num_i64(ptr addrspace(1) %out, i64 %x ; GCN-IR-NEXT: s_lshr_b32 s4, s3, 8 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v0, s4 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v1, v0 +; GCN-IR-NEXT: v_rcp_f32_e32 v1, v0 ; GCN-IR-NEXT: v_mul_f32_e32 v1, 0x41c00000, v1 ; GCN-IR-NEXT: v_trunc_f32_e32 v1, v1 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v2, v1 @@ -1496,7 +1496,7 @@ define i64 @v_test_urem24_k_num_i64(i64 %x) { ; GCN-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s4, 0x41c00000 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-NEXT: v_rcp_f32_e32 v2, v1 ; GCN-NEXT: v_mul_f32_e32 v2, 0x41c00000, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -1515,7 +1515,7 @@ define i64 @v_test_urem24_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, v0 ; GCN-IR-NEXT: s_mov_b32 s4, 0x41c00000 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: v_rcp_f32_e32 v2, v1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, 0x41c00000, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -1539,7 +1539,7 @@ define i64 @v_test_urem24_pow2_k_num_i64(i64 %x) { ; GCN-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GCN-NEXT: v_cvt_f32_u32_e32 v1, v0 ; GCN-NEXT: s_mov_b32 s4, 0x47000000 -; GCN-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-NEXT: v_rcp_f32_e32 v2, v1 ; GCN-NEXT: v_mul_f32_e32 v2, 0x47000000, v2 ; GCN-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-NEXT: v_cvt_u32_f32_e32 v3, v2 @@ -1558,7 +1558,7 @@ define i64 @v_test_urem24_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GCN-IR-NEXT: v_cvt_f32_u32_e32 v1, v0 ; GCN-IR-NEXT: s_mov_b32 s4, 0x47000000 -; GCN-IR-NEXT: v_rcp_iflag_f32_e32 v2, v1 +; GCN-IR-NEXT: v_rcp_f32_e32 v2, v1 ; GCN-IR-NEXT: v_mul_f32_e32 v2, 0x47000000, v2 ; GCN-IR-NEXT: v_trunc_f32_e32 v2, v2 ; GCN-IR-NEXT: v_cvt_u32_f32_e32 v3, v2 diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll 
b/llvm/test/CodeGen/AMDGPU/wave32.ll index 34f4abeee405a..5c594c34f5816 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -2458,7 +2458,7 @@ define amdgpu_kernel void @icmp64(i32 %n, i32 %s) { ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GFX1032-NEXT: s_sub_i32 s1, 0, s0 -; GFX1032-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX1032-NEXT: v_rcp_f32_e32 v1, v1 ; GFX1032-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX1032-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX1032-NEXT: v_mul_lo_u32 v2, s1, v1 @@ -2492,7 +2492,7 @@ define amdgpu_kernel void @icmp64(i32 %n, i32 %s) { ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GFX1064-NEXT: s_sub_i32 s1, 0, s0 -; GFX1064-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX1064-NEXT: v_rcp_f32_e32 v1, v1 ; GFX1064-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX1064-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX1064-NEXT: v_mul_lo_u32 v2, s1, v1 @@ -2765,7 +2765,7 @@ define amdgpu_kernel void @icmp32(i32 %n, i32 %s) { ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GFX1032-NEXT: s_sub_i32 s1, 0, s0 -; GFX1032-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX1032-NEXT: v_rcp_f32_e32 v1, v1 ; GFX1032-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX1032-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX1032-NEXT: v_mul_lo_u32 v2, s1, v1 @@ -2799,7 +2799,7 @@ define amdgpu_kernel void @icmp32(i32 %n, i32 %s) { ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_cvt_f32_u32_e32 v1, s0 ; GFX1064-NEXT: s_sub_i32 s1, 0, s0 -; GFX1064-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX1064-NEXT: v_rcp_f32_e32 v1, v1 ; GFX1064-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1 ; GFX1064-NEXT: v_cvt_u32_f32_e32 v1, v1 ; GFX1064-NEXT: v_mul_lo_u32 v2, s1, v1 From 7ae25fb1f23f846d2816478de5ba71b764f64979 Mon Sep 17 00:00:00 2001 From: David Green Date: Thu, 14 May 2026 07:59:48 +0100 Subject: [PATCH 35/95] [AArch64] Keep MMO when converting gather lane to LDRSui. 
(#197522) We were losing the MMO when converting the load. Make sure we copy them over, which apparently alters codegen more than I expected and helps keep postinc generation after #196305. --- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 14 +- .../complex-deinterleaving-uniform-cases.ll | 112 +++--- llvm/test/CodeGen/AArch64/concat-vector.ll | 25 +- .../AArch64/fp-maximumnum-minimumnum.ll | 64 ++-- llvm/test/CodeGen/AArch64/fsh.ll | 140 +++---- llvm/test/CodeGen/AArch64/llvm.frexp.ll | 14 +- llvm/test/CodeGen/AArch64/neon-dotreduce.ll | 362 +++++++++--------- .../test/CodeGen/AArch64/nontemporal-store.ll | 50 +-- 8 files changed, 396 insertions(+), 385 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 4e4c7951b0dcc..46bb9649b12d7 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -8260,7 +8260,8 @@ generateGatherLanePattern(MachineInstr &Root, NewRegister) .addReg(SrcRegister) .addImm(Lane) - .addReg(OffsetRegister, getKillRegState(OffsetRegisterKillState)); + .addReg(OffsetRegister, getKillRegState(OffsetRegisterKillState)) + .setMemRefs(OriginalInstr->memoperands()); InstrIdxForVirtReg.insert(std::make_pair(NewRegister, InsInstrs.size())); InsInstrs.push_back(LoadIndexIntoRegister); return NewRegister; @@ -8268,9 +8269,9 @@ generateGatherLanePattern(MachineInstr &Root, // Helper to create load instruction based on the NumLanes in the NEON // register we are rewriting. 
- auto CreateLDRInstruction = [&](unsigned NumLanes, Register DestReg, - Register OffsetReg, - bool KillState) -> MachineInstrBuilder { + auto CreateLDRInstruction = + [&](unsigned NumLanes, Register DestReg, Register OffsetReg, + ArrayRef MMOs) -> MachineInstrBuilder { unsigned Opcode; switch (NumLanes) { case 4: @@ -8289,7 +8290,8 @@ generateGatherLanePattern(MachineInstr &Root, // Immediate offset load return BuildMI(MF, MIMetadata(Root), TII->get(Opcode), DestReg) .addReg(OffsetReg) - .addImm(0); + .addImm(0) + .setMemRefs(MMOs); }; // Load the remaining lanes into register 0. @@ -8319,7 +8321,7 @@ generateGatherLanePattern(MachineInstr &Root, MachineInstrBuilder MiddleIndexLoadInstr = CreateLDRInstruction(NumLanes, DestRegForMiddleIndex, OriginalSplitToLoadOffsetOperand.getReg(), - OriginalSplitToLoadOffsetOperand.isKill()); + OriginalSplitLoad->memoperands()); InstrIdxForVirtReg.insert( std::make_pair(DestRegForMiddleIndex, InsInstrs.size())); diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll index 09c2e481b0433..6eb7cf35c2a42 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll @@ -201,88 +201,88 @@ entry: define <12 x float> @abp90c12(<12 x float> %a, <12 x float> %b, <12 x float> %c) { ; CHECK-LABEL: abp90c12: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1 -; CHECK-NEXT: // kill: def $s3 killed $s3 def $q3 ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 +; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1 ; CHECK-NEXT: // kill: def $s2 killed $s2 def $q2 +; CHECK-NEXT: // kill: def $s3 killed $s3 def $q3 ; CHECK-NEXT: ldr s17, [sp, #32] -; CHECK-NEXT: // kill: def $s5 killed $s5 def $q5 -; CHECK-NEXT: add x9, sp, #48 -; CHECK-NEXT: add x10, sp, #64 -; CHECK-NEXT: mov v1.s[1], v3.s[0] -; CHECK-NEXT: mov v0.s[1], v2.s[0] ; CHECK-NEXT: 
// kill: def $s4 killed $s4 def $q4 -; CHECK-NEXT: add x11, sp, #72 -; CHECK-NEXT: ld1 { v17.s }[1], [x9] +; CHECK-NEXT: add x10, sp, #48 +; CHECK-NEXT: add x11, sp, #64 +; CHECK-NEXT: mov v0.s[1], v2.s[0] +; CHECK-NEXT: mov v1.s[1], v3.s[0] +; CHECK-NEXT: // kill: def $s5 killed $s5 def $q5 +; CHECK-NEXT: // kill: def $s6 killed $s6 def $q6 +; CHECK-NEXT: ldr s3, [sp, #96] +; CHECK-NEXT: ld1 { v17.s }[1], [x10] +; CHECK-NEXT: add x10, sp, #72 +; CHECK-NEXT: ldr s16, [sp, #8] ; CHECK-NEXT: ldr s18, [x10] -; CHECK-NEXT: add x9, sp, #80 ; CHECK-NEXT: add x10, sp, #56 -; CHECK-NEXT: // kill: def $s6 killed $s6 def $q6 ; CHECK-NEXT: // kill: def $s7 killed $s7 def $q7 -; CHECK-NEXT: ldr s16, [sp, #8] -; CHECK-NEXT: ldr s3, [sp, #96] -; CHECK-NEXT: ld1 { v18.s }[1], [x9] -; CHECK-NEXT: add x9, sp, #88 ; CHECK-NEXT: ldr s2, [sp] +; CHECK-NEXT: add x9, sp, #16 +; CHECK-NEXT: mov v0.s[2], v4.s[0] +; CHECK-NEXT: ldr s4, [x11] ; CHECK-NEXT: mov v1.s[2], v5.s[0] +; CHECK-NEXT: add x11, sp, #80 ; CHECK-NEXT: ldr s5, [sp, #40] -; CHECK-NEXT: mov v0.s[2], v4.s[0] +; CHECK-NEXT: ld1 { v2.s }[1], [x9] +; CHECK-NEXT: ld1 { v4.s }[1], [x11] +; CHECK-NEXT: add x11, sp, #88 +; CHECK-NEXT: add x9, sp, #168 ; CHECK-NEXT: ld1 { v5.s }[1], [x10] -; CHECK-NEXT: ldr s19, [x11] -; CHECK-NEXT: add x10, sp, #144 -; CHECK-NEXT: zip1 v4.2d, v17.2d, v18.2d -; CHECK-NEXT: add x11, sp, #160 -; CHECK-NEXT: ldr s18, [sp, #136] -; CHECK-NEXT: ld1 { v19.s }[1], [x9] +; CHECK-NEXT: ld1 { v18.s }[1], [x11] +; CHECK-NEXT: add x11, sp, #112 ; CHECK-NEXT: mov v0.s[3], v6.s[0] -; CHECK-NEXT: ldr s6, [sp, #128] ; CHECK-NEXT: mov v1.s[3], v7.s[0] -; CHECK-NEXT: add x9, sp, #24 -; CHECK-NEXT: ldr s7, [sp, #104] -; CHECK-NEXT: ld1 { v16.s }[1], [x9] -; CHECK-NEXT: add x9, sp, #112 -; CHECK-NEXT: ld1 { v6.s }[1], [x10] -; CHECK-NEXT: zip1 v5.2d, v5.2d, v19.2d -; CHECK-NEXT: add x10, sp, #120 -; CHECK-NEXT: ld1 { v3.s }[1], [x9] +; CHECK-NEXT: add x10, sp, #24 +; CHECK-NEXT: ld1 { v3.s }[1], [x11] +; CHECK-NEXT: 
zip1 v4.2d, v17.2d, v4.2d +; CHECK-NEXT: add x11, sp, #120 +; CHECK-NEXT: zip1 v6.2d, v5.2d, v18.2d +; CHECK-NEXT: ldr s5, [sp, #104] +; CHECK-NEXT: ld1 { v16.s }[1], [x10] +; CHECK-NEXT: add x10, sp, #160 +; CHECK-NEXT: ldr s7, [sp, #128] +; CHECK-NEXT: ldr s18, [sp, #192] +; CHECK-NEXT: ld1 { v5.s }[1], [x11] +; CHECK-NEXT: ldr s17, [x10] +; CHECK-NEXT: add x10, sp, #144 +; CHECK-NEXT: add x11, sp, #176 ; CHECK-NEXT: ld1 { v7.s }[1], [x10] -; CHECK-NEXT: ldr s17, [x11] -; CHECK-NEXT: add x9, sp, #176 -; CHECK-NEXT: add x10, sp, #16 -; CHECK-NEXT: add x11, sp, #168 -; CHECK-NEXT: ld1 { v17.s }[1], [x9] -; CHECK-NEXT: ld1 { v2.s }[1], [x10] -; CHECK-NEXT: add x9, sp, #152 -; CHECK-NEXT: fmul v19.4s, v5.4s, v1.4s -; CHECK-NEXT: fmul v20.4s, v7.4s, v16.4s -; CHECK-NEXT: fmul v16.4s, v3.4s, v16.4s +; CHECK-NEXT: ldr s21, [x9] +; CHECK-NEXT: ld1 { v17.s }[1], [x11] +; CHECK-NEXT: fmul v19.4s, v6.4s, v1.4s ; CHECK-NEXT: fmul v1.4s, v4.4s, v1.4s +; CHECK-NEXT: fmul v20.4s, v5.4s, v16.4s +; CHECK-NEXT: fmul v16.4s, v3.4s, v16.4s +; CHECK-NEXT: add x9, sp, #208 +; CHECK-NEXT: add x10, sp, #152 +; CHECK-NEXT: add x11, sp, #184 ; CHECK-NEXT: ld1 { v18.s }[1], [x9] -; CHECK-NEXT: ldr s21, [x11] -; CHECK-NEXT: zip1 v6.2d, v6.2d, v17.2d -; CHECK-NEXT: ldr s17, [sp, #192] -; CHECK-NEXT: add x9, sp, #184 -; CHECK-NEXT: add x10, sp, #208 -; CHECK-NEXT: ld1 { v21.s }[1], [x9] -; CHECK-NEXT: add x9, sp, #216 +; CHECK-NEXT: zip1 v7.2d, v7.2d, v17.2d +; CHECK-NEXT: ldr s17, [sp, #136] +; CHECK-NEXT: ld1 { v21.s }[1], [x11] ; CHECK-NEXT: fneg v19.4s, v19.4s +; CHECK-NEXT: fmla v1.4s, v0.4s, v6.4s +; CHECK-NEXT: add x9, sp, #216 ; CHECK-NEXT: fneg v20.4s, v20.4s -; CHECK-NEXT: fmla v16.4s, v2.4s, v7.4s -; CHECK-NEXT: fmla v1.4s, v0.4s, v5.4s +; CHECK-NEXT: fmla v16.4s, v2.4s, v5.4s ; CHECK-NEXT: ld1 { v17.s }[1], [x10] ; CHECK-NEXT: ldr s5, [sp, #200] -; CHECK-NEXT: zip1 v7.2d, v18.2d, v21.2d -; CHECK-NEXT: ld1 { v5.s }[1], [x9] +; CHECK-NEXT: zip1 v6.2d, v17.2d, v21.2d ; CHECK-NEXT: 
fmla v19.4s, v0.4s, v4.4s +; CHECK-NEXT: fsub v0.4s, v7.4s, v1.4s ; CHECK-NEXT: fmla v20.4s, v2.4s, v3.4s -; CHECK-NEXT: fsub v0.4s, v6.4s, v1.4s -; CHECK-NEXT: fsub v1.4s, v17.4s, v16.4s -; CHECK-NEXT: fadd v2.4s, v7.4s, v19.4s +; CHECK-NEXT: fsub v1.4s, v18.4s, v16.4s +; CHECK-NEXT: ld1 { v5.s }[1], [x9] +; CHECK-NEXT: fadd v2.4s, v6.4s, v19.4s ; CHECK-NEXT: fadd v3.4s, v5.4s, v20.4s ; CHECK-NEXT: ext v4.16b, v0.16b, v1.16b, #12 ; CHECK-NEXT: ext v5.16b, v2.16b, v3.16b, #12 -; CHECK-NEXT: trn2 v1.4s, v1.4s, v3.4s ; CHECK-NEXT: ext v4.16b, v0.16b, v4.16b, #12 +; CHECK-NEXT: trn2 v1.4s, v1.4s, v3.4s ; CHECK-NEXT: ext v5.16b, v2.16b, v5.16b, #8 ; CHECK-NEXT: rev64 v4.4s, v4.4s ; CHECK-NEXT: trn2 v3.4s, v4.4s, v5.4s diff --git a/llvm/test/CodeGen/AArch64/concat-vector.ll b/llvm/test/CodeGen/AArch64/concat-vector.ll index 385ec6710185b..884f786d1b973 100644 --- a/llvm/test/CodeGen/AArch64/concat-vector.ll +++ b/llvm/test/CodeGen/AArch64/concat-vector.ll @@ -180,14 +180,23 @@ define <16 x i8> @concat_v16s8_v4s8(ptr %ptr) { } define <16 x i8> @concat_v16s8_v4s8_load(ptr %ptrA, ptr %ptrB, ptr %ptrC, ptr %ptrD) { -; CHECK-LABEL: concat_v16s8_v4s8_load: -; CHECK: // %bb.0: -; CHECK-NEXT: ldr s0, [x0] -; CHECK-NEXT: ld1 { v0.s }[1], [x1] -; CHECK-NEXT: ldr s1, [x2] -; CHECK-NEXT: ld1 { v1.s }[1], [x3] -; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: concat_v16s8_v4s8_load: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldr s0, [x2] +; CHECK-SD-NEXT: ldr s1, [x0] +; CHECK-SD-NEXT: ld1 { v0.s }[1], [x3] +; CHECK-SD-NEXT: ld1 { v1.s }[1], [x1] +; CHECK-SD-NEXT: zip1 v0.2d, v1.2d, v0.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: concat_v16s8_v4s8_load: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr s0, [x0] +; CHECK-GI-NEXT: ldr s1, [x2] +; CHECK-GI-NEXT: ld1 { v0.s }[1], [x1] +; CHECK-GI-NEXT: ld1 { v1.s }[1], [x3] +; CHECK-GI-NEXT: zip1 v0.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: ret %A = load <4 x i8>, ptr %ptrA %B = load <4 x i8>, ptr %ptrB %C = load <4 x 
i8>, ptr %ptrC diff --git a/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll b/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll index a3a09839c54c4..a457fe01d309d 100644 --- a/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll +++ b/llvm/test/CodeGen/AArch64/fp-maximumnum-minimumnum.ll @@ -1683,42 +1683,42 @@ define <9 x half> @max_v9f16(<9 x half> %a, <9 x half> %b) { ; CHECK-FP16-NEXT: // kill: def $h0 killed $h0 def $q0 ; CHECK-FP16-NEXT: // kill: def $h1 killed $h1 def $q1 ; CHECK-FP16-NEXT: // kill: def $h2 killed $h2 def $q2 -; CHECK-FP16-NEXT: add x9, sp, #16 ; CHECK-FP16-NEXT: // kill: def $h3 killed $h3 def $q3 ; CHECK-FP16-NEXT: // kill: def $h4 killed $h4 def $q4 -; CHECK-FP16-NEXT: add x10, sp, #40 +; CHECK-FP16-NEXT: add x9, sp, #40 +; CHECK-FP16-NEXT: add x10, sp, #48 ; CHECK-FP16-NEXT: // kill: def $h5 killed $h5 def $q5 ; CHECK-FP16-NEXT: // kill: def $h6 killed $h6 def $q6 ; CHECK-FP16-NEXT: // kill: def $h7 killed $h7 def $q7 ; CHECK-FP16-NEXT: mov v0.h[1], v1.h[0] ; CHECK-FP16-NEXT: ldr h1, [sp, #8] +; CHECK-FP16-NEXT: mov v0.h[2], v2.h[0] +; CHECK-FP16-NEXT: ldr h2, [x9] +; CHECK-FP16-NEXT: add x9, sp, #16 ; CHECK-FP16-NEXT: ld1 { v1.h }[1], [x9] ; CHECK-FP16-NEXT: add x9, sp, #24 -; CHECK-FP16-NEXT: mov v0.h[2], v2.h[0] +; CHECK-FP16-NEXT: ld1 { v2.h }[1], [x10] +; CHECK-FP16-NEXT: add x10, sp, #56 +; CHECK-FP16-NEXT: mov v0.h[3], v3.h[0] ; CHECK-FP16-NEXT: ld1 { v1.h }[2], [x9] ; CHECK-FP16-NEXT: add x9, sp, #32 -; CHECK-FP16-NEXT: mov v0.h[3], v3.h[0] -; CHECK-FP16-NEXT: ld1 { v1.h }[3], [x9] -; CHECK-FP16-NEXT: ldr h2, [x10] -; CHECK-FP16-NEXT: add x9, sp, #48 +; CHECK-FP16-NEXT: ld1 { v2.h }[2], [x10] +; CHECK-FP16-NEXT: add x10, sp, #64 ; CHECK-FP16-NEXT: ldr h3, [sp, #72] -; CHECK-FP16-NEXT: ld1 { v2.h }[1], [x9] -; CHECK-FP16-NEXT: add x9, sp, #56 +; CHECK-FP16-NEXT: ld1 { v1.h }[3], [x9] ; CHECK-FP16-NEXT: fminnm v3.8h, v3.8h, v3.8h ; CHECK-FP16-NEXT: mov v0.h[4], v4.h[0] -; CHECK-FP16-NEXT: ld1 { v2.h }[2], [x9] -; 
CHECK-FP16-NEXT: add x9, sp, #64 -; CHECK-FP16-NEXT: mov v0.h[5], v5.h[0] -; CHECK-FP16-NEXT: ld1 { v2.h }[3], [x9] +; CHECK-FP16-NEXT: ld1 { v2.h }[3], [x10] ; CHECK-FP16-NEXT: zip1 v1.2d, v1.2d, v2.2d ; CHECK-FP16-NEXT: ldr h2, [sp] -; CHECK-FP16-NEXT: mov v0.h[6], v6.h[0] +; CHECK-FP16-NEXT: mov v0.h[5], v5.h[0] ; CHECK-FP16-NEXT: fminnm v2.8h, v2.8h, v2.8h ; CHECK-FP16-NEXT: fminnm v1.8h, v1.8h, v1.8h -; CHECK-FP16-NEXT: mov v0.h[7], v7.h[0] +; CHECK-FP16-NEXT: mov v0.h[6], v6.h[0] ; CHECK-FP16-NEXT: fmaxnm v2.8h, v2.8h, v3.8h -; CHECK-FP16-NEXT: fminnm v0.8h, v0.8h, v0.8h +; CHECK-FP16-NEXT: mov v0.h[7], v7.h[0] ; CHECK-FP16-NEXT: str h2, [x8, #16] +; CHECK-FP16-NEXT: fminnm v0.8h, v0.8h, v0.8h ; CHECK-FP16-NEXT: fmaxnm v0.8h, v0.8h, v1.8h ; CHECK-FP16-NEXT: str q0, [x8] ; CHECK-FP16-NEXT: ret @@ -2326,42 +2326,42 @@ define <9 x half> @min_v9f16(<9 x half> %a, <9 x half> %b) { ; CHECK-FP16-NEXT: // kill: def $h0 killed $h0 def $q0 ; CHECK-FP16-NEXT: // kill: def $h1 killed $h1 def $q1 ; CHECK-FP16-NEXT: // kill: def $h2 killed $h2 def $q2 -; CHECK-FP16-NEXT: add x9, sp, #16 ; CHECK-FP16-NEXT: // kill: def $h3 killed $h3 def $q3 ; CHECK-FP16-NEXT: // kill: def $h4 killed $h4 def $q4 -; CHECK-FP16-NEXT: add x10, sp, #40 +; CHECK-FP16-NEXT: add x9, sp, #40 +; CHECK-FP16-NEXT: add x10, sp, #48 ; CHECK-FP16-NEXT: // kill: def $h5 killed $h5 def $q5 ; CHECK-FP16-NEXT: // kill: def $h6 killed $h6 def $q6 ; CHECK-FP16-NEXT: // kill: def $h7 killed $h7 def $q7 ; CHECK-FP16-NEXT: mov v0.h[1], v1.h[0] ; CHECK-FP16-NEXT: ldr h1, [sp, #8] +; CHECK-FP16-NEXT: mov v0.h[2], v2.h[0] +; CHECK-FP16-NEXT: ldr h2, [x9] +; CHECK-FP16-NEXT: add x9, sp, #16 ; CHECK-FP16-NEXT: ld1 { v1.h }[1], [x9] ; CHECK-FP16-NEXT: add x9, sp, #24 -; CHECK-FP16-NEXT: mov v0.h[2], v2.h[0] +; CHECK-FP16-NEXT: ld1 { v2.h }[1], [x10] +; CHECK-FP16-NEXT: add x10, sp, #56 +; CHECK-FP16-NEXT: mov v0.h[3], v3.h[0] ; CHECK-FP16-NEXT: ld1 { v1.h }[2], [x9] ; CHECK-FP16-NEXT: add x9, sp, #32 -; 
CHECK-FP16-NEXT: mov v0.h[3], v3.h[0] -; CHECK-FP16-NEXT: ld1 { v1.h }[3], [x9] -; CHECK-FP16-NEXT: ldr h2, [x10] -; CHECK-FP16-NEXT: add x9, sp, #48 +; CHECK-FP16-NEXT: ld1 { v2.h }[2], [x10] +; CHECK-FP16-NEXT: add x10, sp, #64 ; CHECK-FP16-NEXT: ldr h3, [sp, #72] -; CHECK-FP16-NEXT: ld1 { v2.h }[1], [x9] -; CHECK-FP16-NEXT: add x9, sp, #56 +; CHECK-FP16-NEXT: ld1 { v1.h }[3], [x9] ; CHECK-FP16-NEXT: fminnm v3.8h, v3.8h, v3.8h ; CHECK-FP16-NEXT: mov v0.h[4], v4.h[0] -; CHECK-FP16-NEXT: ld1 { v2.h }[2], [x9] -; CHECK-FP16-NEXT: add x9, sp, #64 -; CHECK-FP16-NEXT: mov v0.h[5], v5.h[0] -; CHECK-FP16-NEXT: ld1 { v2.h }[3], [x9] +; CHECK-FP16-NEXT: ld1 { v2.h }[3], [x10] ; CHECK-FP16-NEXT: zip1 v1.2d, v1.2d, v2.2d ; CHECK-FP16-NEXT: ldr h2, [sp] -; CHECK-FP16-NEXT: mov v0.h[6], v6.h[0] +; CHECK-FP16-NEXT: mov v0.h[5], v5.h[0] ; CHECK-FP16-NEXT: fminnm v2.8h, v2.8h, v2.8h ; CHECK-FP16-NEXT: fminnm v1.8h, v1.8h, v1.8h -; CHECK-FP16-NEXT: mov v0.h[7], v7.h[0] +; CHECK-FP16-NEXT: mov v0.h[6], v6.h[0] ; CHECK-FP16-NEXT: fminnm v2.8h, v2.8h, v3.8h -; CHECK-FP16-NEXT: fminnm v0.8h, v0.8h, v0.8h +; CHECK-FP16-NEXT: mov v0.h[7], v7.h[0] ; CHECK-FP16-NEXT: str h2, [x8, #16] +; CHECK-FP16-NEXT: fminnm v0.8h, v0.8h, v0.8h ; CHECK-FP16-NEXT: fminnm v0.8h, v0.8h, v1.8h ; CHECK-FP16-NEXT: str q0, [x8] ; CHECK-FP16-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/fsh.ll b/llvm/test/CodeGen/AArch64/fsh.ll index 89109ba653f57..6a36a0f3fe5fd 100644 --- a/llvm/test/CodeGen/AArch64/fsh.ll +++ b/llvm/test/CodeGen/AArch64/fsh.ll @@ -2396,64 +2396,64 @@ define <7 x i32> @fshl_v7i32(<7 x i32> %a, <7 x i32> %b, <7 x i32> %c) { ; ; CHECK-GI-LABEL: fshl_v7i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: ldr s5, [sp, #80] -; CHECK-GI-NEXT: ldr s16, [sp, #88] -; CHECK-GI-NEXT: fmov s19, w0 ; CHECK-GI-NEXT: ldr s1, [sp, #48] ; CHECK-GI-NEXT: ldr s7, [sp, #56] +; CHECK-GI-NEXT: add x9, sp, #64 +; CHECK-GI-NEXT: ldr s5, [sp, #80] +; CHECK-GI-NEXT: ldr s16, [sp, #88] ; CHECK-GI-NEXT: add x8, sp, 
#56 +; CHECK-GI-NEXT: mov v1.s[1], v7.s[0] +; CHECK-GI-NEXT: ldr s6, [sp] +; CHECK-GI-NEXT: ldr s7, [sp, #64] ; CHECK-GI-NEXT: mov v5.s[1], v16.s[0] ; CHECK-GI-NEXT: fmov s16, w7 -; CHECK-GI-NEXT: ldr s6, [sp] -; CHECK-GI-NEXT: mov v1.s[1], v7.s[0] -; CHECK-GI-NEXT: ldr s17, [sp, #64] -; CHECK-GI-NEXT: ldr s7, [sp, #48] +; CHECK-GI-NEXT: ldr s17, [sp, #48] +; CHECK-GI-NEXT: ldr s18, [x9] +; CHECK-GI-NEXT: add x10, sp, #72 +; CHECK-GI-NEXT: ldr s19, [sp, #96] +; CHECK-GI-NEXT: ld1 { v17.s }[1], [x8] ; CHECK-GI-NEXT: ldr s4, [sp, #8] ; CHECK-GI-NEXT: ldr s0, [sp, #24] -; CHECK-GI-NEXT: ldr s3, [sp, #32] ; CHECK-GI-NEXT: mov v16.s[1], v6.s[0] -; CHECK-GI-NEXT: ldr s6, [sp, #96] -; CHECK-GI-NEXT: add x9, sp, #64 -; CHECK-GI-NEXT: ld1 { v7.s }[1], [x8] -; CHECK-GI-NEXT: ldr s18, [x9] -; CHECK-GI-NEXT: mov v19.s[1], w1 -; CHECK-GI-NEXT: mov v5.s[2], v6.s[0] -; CHECK-GI-NEXT: movi v6.2d, #0xffffffffffffffff -; CHECK-GI-NEXT: mov v1.s[2], v17.s[0] -; CHECK-GI-NEXT: ldr s17, [sp, #72] -; CHECK-GI-NEXT: add x8, sp, #72 +; CHECK-GI-NEXT: mov v1.s[2], v7.s[0] +; CHECK-GI-NEXT: fmov s7, w0 +; CHECK-GI-NEXT: ld1 { v18.s }[1], [x10] +; CHECK-GI-NEXT: ldr s3, [sp, #32] +; CHECK-GI-NEXT: ldr s6, [sp, #72] +; CHECK-GI-NEXT: mov v5.s[2], v19.s[0] +; CHECK-GI-NEXT: movi v19.2d, #0xffffffffffffffff ; CHECK-GI-NEXT: ldr s20, [sp, #80] -; CHECK-GI-NEXT: mov v16.s[2], v4.s[0] +; CHECK-GI-NEXT: mov v7.s[1], w1 ; CHECK-GI-NEXT: mov v0.s[1], v3.s[0] -; CHECK-GI-NEXT: ld1 { v18.s }[1], [x8] ; CHECK-GI-NEXT: add x8, sp, #88 +; CHECK-GI-NEXT: mov v16.s[2], v4.s[0] +; CHECK-GI-NEXT: mov v1.s[3], v6.s[0] +; CHECK-GI-NEXT: zip1 v6.2d, v17.2d, v18.2d +; CHECK-GI-NEXT: fmov s17, w4 ; CHECK-GI-NEXT: movi v4.4s, #31 ; CHECK-GI-NEXT: ldr s2, [sp, #16] -; CHECK-GI-NEXT: eor v5.16b, v5.16b, v6.16b -; CHECK-GI-NEXT: fmov s6, w4 -; CHECK-GI-NEXT: mov v1.s[3], v17.s[0] ; CHECK-GI-NEXT: ldr s3, [sp, #40] ; CHECK-GI-NEXT: ld1 { v20.s }[1], [x8] -; CHECK-GI-NEXT: mov v19.s[2], w2 -; CHECK-GI-NEXT: zip1 v7.2d, 
v7.2d, v18.2d -; CHECK-GI-NEXT: mov v16.s[3], v2.s[0] +; CHECK-GI-NEXT: eor v5.16b, v5.16b, v19.16b +; CHECK-GI-NEXT: mov v7.s[2], w2 ; CHECK-GI-NEXT: add x8, sp, #96 -; CHECK-GI-NEXT: mov v6.s[1], w5 +; CHECK-GI-NEXT: mov v17.s[1], w5 +; CHECK-GI-NEXT: mov v16.s[3], v2.s[0] ; CHECK-GI-NEXT: mov v0.s[2], v3.s[0] -; CHECK-GI-NEXT: and v2.16b, v5.16b, v4.16b ; CHECK-GI-NEXT: bic v1.16b, v4.16b, v1.16b ; CHECK-GI-NEXT: ld1 { v20.s }[2], [x8] -; CHECK-GI-NEXT: mov v19.s[3], w3 -; CHECK-GI-NEXT: and v3.16b, v7.16b, v4.16b +; CHECK-GI-NEXT: and v2.16b, v5.16b, v4.16b +; CHECK-GI-NEXT: and v3.16b, v6.16b, v4.16b +; CHECK-GI-NEXT: mov v7.s[3], w3 +; CHECK-GI-NEXT: mov v17.s[2], w6 ; CHECK-GI-NEXT: ushr v5.4s, v16.4s, #1 -; CHECK-GI-NEXT: neg v2.4s, v2.4s -; CHECK-GI-NEXT: mov v6.s[2], w6 ; CHECK-GI-NEXT: neg v1.4s, v1.4s ; CHECK-GI-NEXT: and v4.16b, v20.16b, v4.16b ; CHECK-GI-NEXT: ushr v0.4s, v0.4s, #1 -; CHECK-GI-NEXT: ushl v3.4s, v19.4s, v3.4s +; CHECK-GI-NEXT: neg v2.4s, v2.4s +; CHECK-GI-NEXT: ushl v3.4s, v7.4s, v3.4s ; CHECK-GI-NEXT: ushl v1.4s, v5.4s, v1.4s -; CHECK-GI-NEXT: ushl v4.4s, v6.4s, v4.4s +; CHECK-GI-NEXT: ushl v4.4s, v17.4s, v4.4s ; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v2.4s ; CHECK-GI-NEXT: orr v1.16b, v3.16b, v1.16b ; CHECK-GI-NEXT: orr v0.16b, v4.16b, v0.16b @@ -2536,62 +2536,62 @@ define <7 x i32> @fshr_v7i32(<7 x i32> %a, <7 x i32> %b, <7 x i32> %c) { ; CHECK-GI-LABEL: fshr_v7i32: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldr s1, [sp, #48] -; CHECK-GI-NEXT: ldr s16, [sp, #56] -; CHECK-GI-NEXT: add x8, sp, #56 -; CHECK-GI-NEXT: ldr s17, [sp, #80] -; CHECK-GI-NEXT: ldr s18, [sp, #88] +; CHECK-GI-NEXT: ldr s17, [sp, #56] ; CHECK-GI-NEXT: add x9, sp, #64 -; CHECK-GI-NEXT: mov v1.s[1], v16.s[0] -; CHECK-GI-NEXT: fmov s16, w0 -; CHECK-GI-NEXT: ldr s5, [sp, #48] -; CHECK-GI-NEXT: mov v17.s[1], v18.s[0] -; CHECK-GI-NEXT: fmov s18, w7 +; CHECK-GI-NEXT: ldr s18, [sp, #80] +; CHECK-GI-NEXT: ldr s19, [sp, #88] +; CHECK-GI-NEXT: add x8, sp, #56 +; 
CHECK-GI-NEXT: mov v1.s[1], v17.s[0] +; CHECK-GI-NEXT: fmov s17, w0 ; CHECK-GI-NEXT: ldr s4, [sp] -; CHECK-GI-NEXT: ld1 { v5.s }[1], [x8] -; CHECK-GI-NEXT: ldr s6, [x9] +; CHECK-GI-NEXT: mov v18.s[1], v19.s[0] +; CHECK-GI-NEXT: fmov s19, w7 +; CHECK-GI-NEXT: ldr s7, [sp, #48] +; CHECK-GI-NEXT: ldr s16, [x9] ; CHECK-GI-NEXT: add x10, sp, #72 -; CHECK-GI-NEXT: mov v16.s[1], w1 ; CHECK-GI-NEXT: ldr s20, [sp, #80] -; CHECK-GI-NEXT: ldr s7, [sp, #64] -; CHECK-GI-NEXT: mov v18.s[1], v4.s[0] +; CHECK-GI-NEXT: mov v17.s[1], w1 +; CHECK-GI-NEXT: ldr s6, [sp, #64] +; CHECK-GI-NEXT: add x9, sp, #88 +; CHECK-GI-NEXT: mov v19.s[1], v4.s[0] ; CHECK-GI-NEXT: fmov s4, w4 -; CHECK-GI-NEXT: add x8, sp, #88 -; CHECK-GI-NEXT: ld1 { v6.s }[1], [x10] +; CHECK-GI-NEXT: ld1 { v7.s }[1], [x8] +; CHECK-GI-NEXT: ld1 { v16.s }[1], [x10] ; CHECK-GI-NEXT: ldr s21, [sp, #96] -; CHECK-GI-NEXT: ld1 { v20.s }[1], [x8] -; CHECK-GI-NEXT: mov v1.s[2], v7.s[0] +; CHECK-GI-NEXT: ld1 { v20.s }[1], [x9] +; CHECK-GI-NEXT: mov v1.s[2], v6.s[0] ; CHECK-GI-NEXT: ldr s3, [sp, #8] ; CHECK-GI-NEXT: ldr s0, [sp, #24] -; CHECK-GI-NEXT: mov v16.s[2], w2 +; CHECK-GI-NEXT: mov v17.s[2], w2 ; CHECK-GI-NEXT: mov v4.s[1], w5 ; CHECK-GI-NEXT: ldr s2, [sp, #32] -; CHECK-GI-NEXT: zip1 v5.2d, v5.2d, v6.2d -; CHECK-GI-NEXT: movi v6.4s, #31 +; CHECK-GI-NEXT: zip1 v6.2d, v7.2d, v16.2d +; CHECK-GI-NEXT: movi v7.4s, #31 ; CHECK-GI-NEXT: add x8, sp, #96 -; CHECK-GI-NEXT: mov v17.s[2], v21.s[0] -; CHECK-GI-NEXT: movi v7.2d, #0xffffffffffffffff -; CHECK-GI-NEXT: ldr s19, [sp, #72] +; CHECK-GI-NEXT: mov v18.s[2], v21.s[0] +; CHECK-GI-NEXT: movi v16.2d, #0xffffffffffffffff +; CHECK-GI-NEXT: ldr s5, [sp, #72] ; CHECK-GI-NEXT: ld1 { v20.s }[2], [x8] -; CHECK-GI-NEXT: mov v18.s[2], v3.s[0] +; CHECK-GI-NEXT: mov v19.s[2], v3.s[0] ; CHECK-GI-NEXT: mov v0.s[1], v2.s[0] -; CHECK-GI-NEXT: mov v1.s[3], v19.s[0] -; CHECK-GI-NEXT: mov v16.s[3], w3 +; CHECK-GI-NEXT: mov v1.s[3], v5.s[0] +; CHECK-GI-NEXT: mov v17.s[3], w3 ; CHECK-GI-NEXT: mov 
v4.s[2], w6 ; CHECK-GI-NEXT: ldr s2, [sp, #16] -; CHECK-GI-NEXT: and v3.16b, v5.16b, v6.16b +; CHECK-GI-NEXT: and v3.16b, v6.16b, v7.16b ; CHECK-GI-NEXT: ldr s5, [sp, #40] -; CHECK-GI-NEXT: and v19.16b, v20.16b, v6.16b -; CHECK-GI-NEXT: eor v7.16b, v17.16b, v7.16b -; CHECK-GI-NEXT: mov v18.s[3], v2.s[0] +; CHECK-GI-NEXT: and v6.16b, v20.16b, v7.16b +; CHECK-GI-NEXT: eor v16.16b, v18.16b, v16.16b +; CHECK-GI-NEXT: mov v19.s[3], v2.s[0] ; CHECK-GI-NEXT: mov v0.s[2], v5.s[0] -; CHECK-GI-NEXT: bic v1.16b, v6.16b, v1.16b -; CHECK-GI-NEXT: shl v2.4s, v16.4s, #1 +; CHECK-GI-NEXT: bic v1.16b, v7.16b, v1.16b +; CHECK-GI-NEXT: shl v2.4s, v17.4s, #1 ; CHECK-GI-NEXT: neg v3.4s, v3.4s -; CHECK-GI-NEXT: and v5.16b, v7.16b, v6.16b +; CHECK-GI-NEXT: and v5.16b, v16.16b, v7.16b ; CHECK-GI-NEXT: shl v4.4s, v4.4s, #1 -; CHECK-GI-NEXT: neg v6.4s, v19.4s +; CHECK-GI-NEXT: neg v6.4s, v6.4s ; CHECK-GI-NEXT: ushl v1.4s, v2.4s, v1.4s -; CHECK-GI-NEXT: ushl v2.4s, v18.4s, v3.4s +; CHECK-GI-NEXT: ushl v2.4s, v19.4s, v3.4s ; CHECK-GI-NEXT: ushl v3.4s, v4.4s, v5.4s ; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v6.4s ; CHECK-GI-NEXT: orr v1.16b, v1.16b, v2.16b diff --git a/llvm/test/CodeGen/AArch64/llvm.frexp.ll b/llvm/test/CodeGen/AArch64/llvm.frexp.ll index 7fd4246cd4975..12534a1c0114a 100644 --- a/llvm/test/CodeGen/AArch64/llvm.frexp.ll +++ b/llvm/test/CodeGen/AArch64/llvm.frexp.ll @@ -697,17 +697,17 @@ define { <4 x float>, <4 x i32> } @test_frexp_v4f32_v4i32(<4 x float> %a) nounwi ; CHECK-NEXT: mov s0, v0.s[3] ; CHECK-NEXT: str q1, [sp] // 16-byte Spill ; CHECK-NEXT: bl frexpf +; CHECK-NEXT: ldr q3, [sp] // 16-byte Reload ; CHECK-NEXT: ldr s1, [sp, #44] -; CHECK-NEXT: ldr q2, [sp] // 16-byte Reload ; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0 -; CHECK-NEXT: mov v2.s[3], v0.s[0] +; CHECK-NEXT: ldr s2, [x20] +; CHECK-NEXT: mov v3.s[3], v0.s[0] ; CHECK-NEXT: ld1 { v1.s }[1], [x19] -; CHECK-NEXT: ldr s0, [x20] -; CHECK-NEXT: ld1 { v0.s }[1], [x21] +; CHECK-NEXT: ld1 { v2.s }[1], [x21] ; CHECK-NEXT: 
ldp x20, x19, [sp, #64] // 16-byte Folded Reload ; CHECK-NEXT: ldp x30, x21, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: zip1 v1.2d, v1.2d, v0.2d -; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: zip1 v1.2d, v1.2d, v2.2d +; CHECK-NEXT: mov v0.16b, v3.16b ; CHECK-NEXT: add sp, sp, #80 ; CHECK-NEXT: ret ; @@ -872,8 +872,8 @@ define <4 x i32> @test_frexp_v4f32_v4i32_only_use_exp(<4 x float> %a) nounwind { ; CHECK-NEXT: mov s0, v0.s[3] ; CHECK-NEXT: bl frexpf ; CHECK-NEXT: ldr s0, [sp, #28] -; CHECK-NEXT: ld1 { v0.s }[1], [x19] ; CHECK-NEXT: ldr s1, [x20] +; CHECK-NEXT: ld1 { v0.s }[1], [x19] ; CHECK-NEXT: ld1 { v1.s }[1], [x21] ; CHECK-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload ; CHECK-NEXT: ldp x30, x21, [sp, #32] // 16-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll index ff9c75cfd0c5e..e176cd3233d69 100644 --- a/llvm/test/CodeGen/AArch64/neon-dotreduce.ll +++ b/llvm/test/CodeGen/AArch64/neon-dotreduce.ll @@ -8048,200 +8048,200 @@ define i32 @test_sdot_v48i8_double_nomla(<48 x i8> %a, <48 x i8> %b, <48 x i8> % ; CHECK-SD-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-SD-NEXT: .cfi_def_cfa_offset 16 ; CHECK-SD-NEXT: .cfi_offset w29, -16 -; CHECK-SD-NEXT: ldr b0, [sp, #208] +; CHECK-SD-NEXT: add x8, sp, #272 +; CHECK-SD-NEXT: ldr b4, [sp, #208] +; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: ldr b5, [x8] ; CHECK-SD-NEXT: add x8, sp, #216 -; CHECK-SD-NEXT: add x9, sp, #272 -; CHECK-SD-NEXT: ldr b2, [sp, #80] -; CHECK-SD-NEXT: ldr b4, [sp, #976] -; CHECK-SD-NEXT: ldr b6, [sp, #720] -; CHECK-SD-NEXT: ld1 { v0.b }[1], [x8] +; CHECK-SD-NEXT: add x9, sp, #280 +; CHECK-SD-NEXT: ld1 { v4.b }[1], [x8] ; CHECK-SD-NEXT: add x8, sp, #224 -; CHECK-SD-NEXT: fmov s16, w0 -; CHECK-SD-NEXT: ldr b17, [sp, #848] -; CHECK-SD-NEXT: add x10, sp, #24 -; CHECK-SD-NEXT: movi v19.2d, #0000000000000000 -; CHECK-SD-NEXT: ld1 { v0.b }[2], [x8] +; CHECK-SD-NEXT: add x12, sp, #256 +; CHECK-SD-NEXT: mov v0.b[1], w1 +; CHECK-SD-NEXT: ld1 { v5.b }[1], [x9] +; CHECK-SD-NEXT: add x9, sp, #288 +; CHECK-SD-NEXT: ldr b6, [sp, #976] +; CHECK-SD-NEXT: add x13, sp, #984 +; CHECK-SD-NEXT: add x10, sp, #264 +; CHECK-SD-NEXT: ld1 { v4.b }[2], [x8] ; CHECK-SD-NEXT: add x8, sp, #232 -; CHECK-SD-NEXT: mov v16.b[1], w1 -; CHECK-SD-NEXT: ld1 { v0.b }[3], [x8] +; CHECK-SD-NEXT: ldr b7, [sp, #720] +; CHECK-SD-NEXT: ld1 { v5.b }[2], [x9] +; CHECK-SD-NEXT: add x9, sp, #296 +; CHECK-SD-NEXT: ld1 { v6.b }[1], [x13] +; CHECK-SD-NEXT: mov v0.b[2], w2 +; CHECK-SD-NEXT: add x13, sp, #784 +; CHECK-SD-NEXT: add x11, sp, #328 +; CHECK-SD-NEXT: ld1 { v4.b }[3], [x8] ; CHECK-SD-NEXT: add x8, sp, #240 -; CHECK-SD-NEXT: mov v16.b[2], w2 -; CHECK-SD-NEXT: ld1 { v0.b }[4], [x8] -; CHECK-SD-NEXT: add x8, sp, #248 -; CHECK-SD-NEXT: mov v16.b[3], w3 -; CHECK-SD-NEXT: ld1 { v0.b }[5], [x8] -; CHECK-SD-NEXT: add x8, sp, #256 -; CHECK-SD-NEXT: ld1 { v0.b }[6], [x8] -; CHECK-SD-NEXT: add x8, sp, #264 -; CHECK-SD-NEXT: mov v16.b[4], w4 -; CHECK-SD-NEXT: ld1 { v0.b }[7], [x8] -; CHECK-SD-NEXT: ldr b1, [x9] -; CHECK-SD-NEXT: add x8, sp, #280 -; CHECK-SD-NEXT: add x9, sp, #88 
-; CHECK-SD-NEXT: mov v16.b[5], w5 -; CHECK-SD-NEXT: ld1 { v1.b }[1], [x8] -; CHECK-SD-NEXT: add x8, sp, #288 -; CHECK-SD-NEXT: ld1 { v1.b }[2], [x8] -; CHECK-SD-NEXT: add x8, sp, #296 -; CHECK-SD-NEXT: mov v16.b[6], w6 -; CHECK-SD-NEXT: ld1 { v1.b }[3], [x8] -; CHECK-SD-NEXT: add x8, sp, #304 -; CHECK-SD-NEXT: mov v16.b[7], w7 -; CHECK-SD-NEXT: ld1 { v1.b }[4], [x8] -; CHECK-SD-NEXT: add x8, sp, #312 -; CHECK-SD-NEXT: ld1 { v1.b }[5], [x8] -; CHECK-SD-NEXT: add x8, sp, #320 -; CHECK-SD-NEXT: ld1 { v1.b }[6], [x8] -; CHECK-SD-NEXT: add x8, sp, #328 -; CHECK-SD-NEXT: ld1 { v1.b }[7], [x8] -; CHECK-SD-NEXT: ld1 { v2.b }[1], [x9] -; CHECK-SD-NEXT: add x8, sp, #96 -; CHECK-SD-NEXT: add x9, sp, #144 -; CHECK-SD-NEXT: ld1 { v2.b }[2], [x8] -; CHECK-SD-NEXT: add x8, sp, #104 -; CHECK-SD-NEXT: zip1 v0.2d, v0.2d, v1.2d +; CHECK-SD-NEXT: ldr b17, [x13] +; CHECK-SD-NEXT: ld1 { v5.b }[3], [x9] +; CHECK-SD-NEXT: add x9, sp, #304 +; CHECK-SD-NEXT: add x13, sp, #1064 +; CHECK-SD-NEXT: add x14, sp, #1080 ; CHECK-SD-NEXT: movi v1.16b, #1 -; CHECK-SD-NEXT: ld1 { v2.b }[3], [x8] -; CHECK-SD-NEXT: add x8, sp, #112 -; CHECK-SD-NEXT: ld1 { v2.b }[4], [x8] -; CHECK-SD-NEXT: add x8, sp, #120 -; CHECK-SD-NEXT: ld1 { v2.b }[5], [x8] -; CHECK-SD-NEXT: add x8, sp, #128 -; CHECK-SD-NEXT: ld1 { v2.b }[6], [x8] -; CHECK-SD-NEXT: add x8, sp, #136 -; CHECK-SD-NEXT: ld1 { v2.b }[7], [x8] -; CHECK-SD-NEXT: ldr b3, [x9] -; CHECK-SD-NEXT: add x8, sp, #152 -; CHECK-SD-NEXT: add x9, sp, #984 -; CHECK-SD-NEXT: ld1 { v3.b }[1], [x8] -; CHECK-SD-NEXT: add x8, sp, #160 -; CHECK-SD-NEXT: ld1 { v3.b }[2], [x8] -; CHECK-SD-NEXT: add x8, sp, #168 -; CHECK-SD-NEXT: ld1 { v3.b }[3], [x8] -; CHECK-SD-NEXT: add x8, sp, #176 -; CHECK-SD-NEXT: ld1 { v3.b }[4], [x8] -; CHECK-SD-NEXT: add x8, sp, #184 -; CHECK-SD-NEXT: ld1 { v3.b }[5], [x8] -; CHECK-SD-NEXT: add x8, sp, #192 -; CHECK-SD-NEXT: ld1 { v3.b }[6], [x8] -; CHECK-SD-NEXT: add x8, sp, #200 -; CHECK-SD-NEXT: ld1 { v3.b }[7], [x8] -; CHECK-SD-NEXT: ld1 { v4.b 
}[1], [x9] -; CHECK-SD-NEXT: add x8, sp, #992 -; CHECK-SD-NEXT: add x9, sp, #1040 -; CHECK-SD-NEXT: ld1 { v4.b }[2], [x8] -; CHECK-SD-NEXT: add x8, sp, #1000 -; CHECK-SD-NEXT: zip1 v2.2d, v2.2d, v3.2d -; CHECK-SD-NEXT: ld1 { v4.b }[3], [x8] -; CHECK-SD-NEXT: add x8, sp, #1008 +; CHECK-SD-NEXT: movi v2.2d, #0000000000000000 ; CHECK-SD-NEXT: ld1 { v4.b }[4], [x8] -; CHECK-SD-NEXT: add x8, sp, #1016 +; CHECK-SD-NEXT: add x8, sp, #248 +; CHECK-SD-NEXT: mov v0.b[3], w3 +; CHECK-SD-NEXT: ld1 { v5.b }[4], [x9] +; CHECK-SD-NEXT: add x9, sp, #312 +; CHECK-SD-NEXT: movi v3.2d, #0000000000000000 ; CHECK-SD-NEXT: ld1 { v4.b }[5], [x8] -; CHECK-SD-NEXT: add x8, sp, #1024 -; CHECK-SD-NEXT: ld1 { v4.b }[6], [x8] -; CHECK-SD-NEXT: add x8, sp, #1032 -; CHECK-SD-NEXT: ld1 { v4.b }[7], [x8] -; CHECK-SD-NEXT: ldr b5, [x9] -; CHECK-SD-NEXT: add x8, sp, #1048 -; CHECK-SD-NEXT: add x9, sp, #728 -; CHECK-SD-NEXT: ld1 { v5.b }[1], [x8] -; CHECK-SD-NEXT: add x8, sp, #1056 -; CHECK-SD-NEXT: ld1 { v5.b }[2], [x8] -; CHECK-SD-NEXT: add x8, sp, #1064 -; CHECK-SD-NEXT: ld1 { v5.b }[3], [x8] -; CHECK-SD-NEXT: add x8, sp, #1072 -; CHECK-SD-NEXT: ld1 { v5.b }[4], [x8] -; CHECK-SD-NEXT: add x8, sp, #1080 -; CHECK-SD-NEXT: ld1 { v5.b }[5], [x8] -; CHECK-SD-NEXT: add x8, sp, #1088 +; CHECK-SD-NEXT: add x8, sp, #320 +; CHECK-SD-NEXT: mov v0.b[4], w4 +; CHECK-SD-NEXT: ld1 { v5.b }[5], [x9] +; CHECK-SD-NEXT: add x9, sp, #16 +; CHECK-SD-NEXT: ld1 { v4.b }[6], [x12] +; CHECK-SD-NEXT: add x12, sp, #1040 +; CHECK-SD-NEXT: ldr b16, [x12] +; CHECK-SD-NEXT: add x12, sp, #1048 ; CHECK-SD-NEXT: ld1 { v5.b }[6], [x8] -; CHECK-SD-NEXT: add x8, sp, #1096 -; CHECK-SD-NEXT: ld1 { v5.b }[7], [x8] -; CHECK-SD-NEXT: ld1 { v6.b }[1], [x9] -; CHECK-SD-NEXT: add x8, sp, #736 -; CHECK-SD-NEXT: add x9, sp, #784 -; CHECK-SD-NEXT: ld1 { v6.b }[2], [x8] -; CHECK-SD-NEXT: add x8, sp, #744 -; CHECK-SD-NEXT: zip1 v4.2d, v4.2d, v5.2d -; CHECK-SD-NEXT: movi v5.2d, #0000000000000000 -; CHECK-SD-NEXT: ld1 { v6.b }[3], [x8] -; 
CHECK-SD-NEXT: add x8, sp, #752 -; CHECK-SD-NEXT: sdot v19.4s, v4.16b, v1.16b -; CHECK-SD-NEXT: sdot v5.4s, v0.16b, v1.16b -; CHECK-SD-NEXT: ld1 { v6.b }[4], [x8] -; CHECK-SD-NEXT: add x8, sp, #760 -; CHECK-SD-NEXT: ld1 { v6.b }[5], [x8] -; CHECK-SD-NEXT: add x8, sp, #768 -; CHECK-SD-NEXT: ld1 { v6.b }[6], [x8] -; CHECK-SD-NEXT: add x8, sp, #776 +; CHECK-SD-NEXT: mov v0.b[5], w5 +; CHECK-SD-NEXT: add x8, sp, #24 +; CHECK-SD-NEXT: ld1 { v16.b }[1], [x12] +; CHECK-SD-NEXT: ld1 { v4.b }[7], [x10] +; CHECK-SD-NEXT: add x10, sp, #992 +; CHECK-SD-NEXT: add x12, sp, #1056 +; CHECK-SD-NEXT: ld1 { v6.b }[2], [x10] +; CHECK-SD-NEXT: add x10, sp, #728 +; CHECK-SD-NEXT: ld1 { v7.b }[1], [x10] +; CHECK-SD-NEXT: ld1 { v5.b }[7], [x11] +; CHECK-SD-NEXT: add x11, sp, #1000 +; CHECK-SD-NEXT: ld1 { v16.b }[2], [x12] +; CHECK-SD-NEXT: add x12, sp, #792 +; CHECK-SD-NEXT: mov v0.b[6], w6 +; CHECK-SD-NEXT: ld1 { v17.b }[1], [x12] +; CHECK-SD-NEXT: ld1 { v6.b }[3], [x11] +; CHECK-SD-NEXT: add x12, sp, #736 +; CHECK-SD-NEXT: ld1 { v7.b }[2], [x12] +; CHECK-SD-NEXT: add x10, sp, #1008 +; CHECK-SD-NEXT: add x11, sp, #1072 +; CHECK-SD-NEXT: ld1 { v16.b }[3], [x13] +; CHECK-SD-NEXT: add x13, sp, #800 +; CHECK-SD-NEXT: zip1 v5.2d, v4.2d, v5.2d +; CHECK-SD-NEXT: ld1 { v17.b }[2], [x13] +; CHECK-SD-NEXT: ld1 { v6.b }[4], [x10] +; CHECK-SD-NEXT: add x13, sp, #808 +; CHECK-SD-NEXT: mov v0.b[7], w7 +; CHECK-SD-NEXT: add x10, sp, #1016 +; CHECK-SD-NEXT: add x12, sp, #32 +; CHECK-SD-NEXT: ld1 { v16.b }[4], [x11] +; CHECK-SD-NEXT: add x11, sp, #744 +; CHECK-SD-NEXT: ldr b4, [sp, #80] +; CHECK-SD-NEXT: ld1 { v7.b }[3], [x11] +; CHECK-SD-NEXT: ld1 { v17.b }[3], [x13] +; CHECK-SD-NEXT: ld1 { v6.b }[5], [x10] +; CHECK-SD-NEXT: add x10, sp, #752 +; CHECK-SD-NEXT: add x11, sp, #816 +; CHECK-SD-NEXT: add x13, sp, #1088 +; CHECK-SD-NEXT: ld1 { v16.b }[5], [x14] +; CHECK-SD-NEXT: ld1 { v0.b }[8], [x9] +; CHECK-SD-NEXT: add x9, sp, #1024 +; CHECK-SD-NEXT: ld1 { v7.b }[4], [x10] +; CHECK-SD-NEXT: ld1 { v17.b 
}[4], [x11] +; CHECK-SD-NEXT: ld1 { v6.b }[6], [x9] +; CHECK-SD-NEXT: add x9, sp, #760 +; CHECK-SD-NEXT: add x10, sp, #824 +; CHECK-SD-NEXT: add x11, sp, #1096 +; CHECK-SD-NEXT: ld1 { v16.b }[6], [x13] +; CHECK-SD-NEXT: ld1 { v0.b }[9], [x8] +; CHECK-SD-NEXT: add x8, sp, #1032 +; CHECK-SD-NEXT: ld1 { v7.b }[5], [x9] +; CHECK-SD-NEXT: ld1 { v17.b }[5], [x10] ; CHECK-SD-NEXT: ld1 { v6.b }[7], [x8] -; CHECK-SD-NEXT: ldr b7, [x9] -; CHECK-SD-NEXT: add x8, sp, #792 -; CHECK-SD-NEXT: add x9, sp, #856 -; CHECK-SD-NEXT: ld1 { v7.b }[1], [x8] -; CHECK-SD-NEXT: add x8, sp, #800 -; CHECK-SD-NEXT: ld1 { v7.b }[2], [x8] -; CHECK-SD-NEXT: add x8, sp, #808 -; CHECK-SD-NEXT: ld1 { v7.b }[3], [x8] -; CHECK-SD-NEXT: add x8, sp, #816 -; CHECK-SD-NEXT: ld1 { v7.b }[4], [x8] -; CHECK-SD-NEXT: add x8, sp, #824 -; CHECK-SD-NEXT: ld1 { v7.b }[5], [x8] -; CHECK-SD-NEXT: add x8, sp, #832 -; CHECK-SD-NEXT: ld1 { v7.b }[6], [x8] -; CHECK-SD-NEXT: add x8, sp, #840 -; CHECK-SD-NEXT: ld1 { v7.b }[7], [x8] -; CHECK-SD-NEXT: ld1 { v17.b }[1], [x9] -; CHECK-SD-NEXT: add x8, sp, #864 -; CHECK-SD-NEXT: add x9, sp, #16 -; CHECK-SD-NEXT: ld1 { v16.b }[8], [x9] -; CHECK-SD-NEXT: add x9, sp, #912 -; CHECK-SD-NEXT: ld1 { v17.b }[2], [x8] -; CHECK-SD-NEXT: add x8, sp, #872 -; CHECK-SD-NEXT: zip1 v0.2d, v6.2d, v7.2d -; CHECK-SD-NEXT: ld1 { v16.b }[9], [x10] -; CHECK-SD-NEXT: ld1 { v17.b }[3], [x8] -; CHECK-SD-NEXT: add x8, sp, #880 -; CHECK-SD-NEXT: sdot v19.4s, v0.16b, v1.16b -; CHECK-SD-NEXT: ld1 { v17.b }[4], [x8] -; CHECK-SD-NEXT: add x8, sp, #888 -; CHECK-SD-NEXT: ld1 { v17.b }[5], [x8] -; CHECK-SD-NEXT: add x8, sp, #896 -; CHECK-SD-NEXT: ld1 { v17.b }[6], [x8] -; CHECK-SD-NEXT: add x8, sp, #904 -; CHECK-SD-NEXT: ld1 { v17.b }[7], [x8] -; CHECK-SD-NEXT: ldr b18, [x9] -; CHECK-SD-NEXT: add x8, sp, #920 -; CHECK-SD-NEXT: ld1 { v18.b }[1], [x8] -; CHECK-SD-NEXT: add x8, sp, #32 -; CHECK-SD-NEXT: ld1 { v16.b }[10], [x8] -; CHECK-SD-NEXT: add x8, sp, #928 -; CHECK-SD-NEXT: ld1 { v18.b }[2], [x8] +; 
CHECK-SD-NEXT: add x9, sp, #768 +; CHECK-SD-NEXT: add x10, sp, #832 ; CHECK-SD-NEXT: add x8, sp, #40 -; CHECK-SD-NEXT: ld1 { v16.b }[11], [x8] -; CHECK-SD-NEXT: add x8, sp, #936 -; CHECK-SD-NEXT: ld1 { v18.b }[3], [x8] -; CHECK-SD-NEXT: add x8, sp, #48 -; CHECK-SD-NEXT: ld1 { v16.b }[12], [x8] -; CHECK-SD-NEXT: add x8, sp, #944 -; CHECK-SD-NEXT: ld1 { v18.b }[4], [x8] +; CHECK-SD-NEXT: ld1 { v16.b }[7], [x11] +; CHECK-SD-NEXT: ld1 { v0.b }[10], [x12] +; CHECK-SD-NEXT: add x12, sp, #144 +; CHECK-SD-NEXT: ld1 { v7.b }[6], [x9] +; CHECK-SD-NEXT: ld1 { v17.b }[6], [x10] +; CHECK-SD-NEXT: add x9, sp, #776 +; CHECK-SD-NEXT: add x10, sp, #840 +; CHECK-SD-NEXT: sdot v3.4s, v5.16b, v1.16b +; CHECK-SD-NEXT: ldr b5, [x12] +; CHECK-SD-NEXT: zip1 v6.2d, v6.2d, v16.2d +; CHECK-SD-NEXT: ld1 { v0.b }[11], [x8] +; CHECK-SD-NEXT: add x8, sp, #88 +; CHECK-SD-NEXT: ld1 { v7.b }[7], [x9] +; CHECK-SD-NEXT: ld1 { v17.b }[7], [x10] +; CHECK-SD-NEXT: add x10, sp, #912 +; CHECK-SD-NEXT: ldr b16, [x10] +; CHECK-SD-NEXT: add x9, sp, #152 +; CHECK-SD-NEXT: ld1 { v4.b }[1], [x8] +; CHECK-SD-NEXT: add x8, sp, #856 +; CHECK-SD-NEXT: add x10, sp, #920 +; CHECK-SD-NEXT: add x11, sp, #48 +; CHECK-SD-NEXT: sdot v2.4s, v6.16b, v1.16b +; CHECK-SD-NEXT: zip1 v6.2d, v7.2d, v17.2d +; CHECK-SD-NEXT: ldr b7, [sp, #848] +; CHECK-SD-NEXT: ld1 { v5.b }[1], [x9] +; CHECK-SD-NEXT: ld1 { v16.b }[1], [x10] +; CHECK-SD-NEXT: ld1 { v0.b }[12], [x11] +; CHECK-SD-NEXT: ld1 { v7.b }[1], [x8] +; CHECK-SD-NEXT: add x9, sp, #96 +; CHECK-SD-NEXT: add x10, sp, #160 +; CHECK-SD-NEXT: ld1 { v4.b }[2], [x9] +; CHECK-SD-NEXT: add x9, sp, #864 +; CHECK-SD-NEXT: add x11, sp, #928 ; CHECK-SD-NEXT: add x8, sp, #56 -; CHECK-SD-NEXT: ld1 { v16.b }[13], [x8] -; CHECK-SD-NEXT: add x8, sp, #952 -; CHECK-SD-NEXT: ld1 { v18.b }[5], [x8] +; CHECK-SD-NEXT: ld1 { v5.b }[2], [x10] +; CHECK-SD-NEXT: ld1 { v16.b }[2], [x11] +; CHECK-SD-NEXT: ld1 { v7.b }[2], [x9] +; CHECK-SD-NEXT: ld1 { v0.b }[13], [x8] +; CHECK-SD-NEXT: add x8, sp, #104 +; 
CHECK-SD-NEXT: add x9, sp, #168 +; CHECK-SD-NEXT: ld1 { v4.b }[3], [x8] +; CHECK-SD-NEXT: add x8, sp, #872 +; CHECK-SD-NEXT: add x10, sp, #936 +; CHECK-SD-NEXT: ld1 { v5.b }[3], [x9] +; CHECK-SD-NEXT: add x9, sp, #112 +; CHECK-SD-NEXT: ld1 { v7.b }[3], [x8] +; CHECK-SD-NEXT: ld1 { v16.b }[3], [x10] +; CHECK-SD-NEXT: add x10, sp, #176 +; CHECK-SD-NEXT: ld1 { v4.b }[4], [x9] +; CHECK-SD-NEXT: add x9, sp, #880 +; CHECK-SD-NEXT: add x11, sp, #944 ; CHECK-SD-NEXT: add x8, sp, #64 -; CHECK-SD-NEXT: ld1 { v16.b }[14], [x8] -; CHECK-SD-NEXT: add x8, sp, #960 -; CHECK-SD-NEXT: ld1 { v18.b }[6], [x8] +; CHECK-SD-NEXT: ld1 { v5.b }[4], [x10] +; CHECK-SD-NEXT: add x10, sp, #952 +; CHECK-SD-NEXT: ld1 { v7.b }[4], [x9] +; CHECK-SD-NEXT: ld1 { v16.b }[4], [x11] +; CHECK-SD-NEXT: ld1 { v0.b }[14], [x8] +; CHECK-SD-NEXT: add x8, sp, #120 +; CHECK-SD-NEXT: add x9, sp, #184 +; CHECK-SD-NEXT: add x11, sp, #960 +; CHECK-SD-NEXT: ld1 { v4.b }[5], [x8] +; CHECK-SD-NEXT: add x8, sp, #888 +; CHECK-SD-NEXT: ld1 { v5.b }[5], [x9] +; CHECK-SD-NEXT: ld1 { v7.b }[5], [x8] +; CHECK-SD-NEXT: ld1 { v16.b }[5], [x10] +; CHECK-SD-NEXT: add x9, sp, #128 +; CHECK-SD-NEXT: add x10, sp, #192 ; CHECK-SD-NEXT: add x8, sp, #72 -; CHECK-SD-NEXT: ld1 { v16.b }[15], [x8] -; CHECK-SD-NEXT: add x8, sp, #968 -; CHECK-SD-NEXT: ld1 { v18.b }[7], [x8] -; CHECK-SD-NEXT: sdot v5.4s, v16.16b, v1.16b -; CHECK-SD-NEXT: zip1 v0.2d, v17.2d, v18.2d -; CHECK-SD-NEXT: sdot v5.4s, v2.16b, v1.16b -; CHECK-SD-NEXT: sdot v19.4s, v0.16b, v1.16b -; CHECK-SD-NEXT: add v0.4s, v5.4s, v19.4s +; CHECK-SD-NEXT: sdot v2.4s, v6.16b, v1.16b +; CHECK-SD-NEXT: ld1 { v4.b }[6], [x9] +; CHECK-SD-NEXT: add x9, sp, #896 +; CHECK-SD-NEXT: ld1 { v5.b }[6], [x10] +; CHECK-SD-NEXT: ld1 { v7.b }[6], [x9] +; CHECK-SD-NEXT: ld1 { v16.b }[6], [x11] +; CHECK-SD-NEXT: ld1 { v0.b }[15], [x8] +; CHECK-SD-NEXT: add x8, sp, #136 +; CHECK-SD-NEXT: add x9, sp, #200 +; CHECK-SD-NEXT: add x10, sp, #968 +; CHECK-SD-NEXT: ld1 { v4.b }[7], [x8] +; CHECK-SD-NEXT: add 
x8, sp, #904 +; CHECK-SD-NEXT: ld1 { v5.b }[7], [x9] +; CHECK-SD-NEXT: ld1 { v7.b }[7], [x8] +; CHECK-SD-NEXT: ld1 { v16.b }[7], [x10] +; CHECK-SD-NEXT: sdot v3.4s, v0.16b, v1.16b +; CHECK-SD-NEXT: zip1 v0.2d, v4.2d, v5.2d +; CHECK-SD-NEXT: zip1 v4.2d, v7.2d, v16.2d +; CHECK-SD-NEXT: sdot v3.4s, v0.16b, v1.16b +; CHECK-SD-NEXT: sdot v2.4s, v4.16b, v1.16b +; CHECK-SD-NEXT: add v0.4s, v3.4s, v2.4s ; CHECK-SD-NEXT: addv s0, v0.4s ; CHECK-SD-NEXT: fmov w0, s0 ; CHECK-SD-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload diff --git a/llvm/test/CodeGen/AArch64/nontemporal-store.ll b/llvm/test/CodeGen/AArch64/nontemporal-store.ll index 677ae02417510..1ac7ec3d180c7 100644 --- a/llvm/test/CodeGen/AArch64/nontemporal-store.ll +++ b/llvm/test/CodeGen/AArch64/nontemporal-store.ll @@ -683,43 +683,43 @@ define void @test_stnp_v17f32(<17 x float> %v, ptr %ptr) { ; ; CHECK-BE-LABEL: test_stnp_v17f32: ; CHECK-BE: // %bb.0: // %entry -; CHECK-BE-NEXT: // kill: def $s1 killed $s1 def $q1 -; CHECK-BE-NEXT: // kill: def $s0 killed $s0 def $q0 ; CHECK-BE-NEXT: // kill: def $s4 killed $s4 def $q4 +; CHECK-BE-NEXT: // kill: def $s0 killed $s0 def $q0 ; CHECK-BE-NEXT: // kill: def $s5 killed $s5 def $q5 +; CHECK-BE-NEXT: // kill: def $s1 killed $s1 def $q1 +; CHECK-BE-NEXT: add x9, sp, #52 +; CHECK-BE-NEXT: add x10, sp, #20 +; CHECK-BE-NEXT: // kill: def $s6 killed $s6 def $q6 +; CHECK-BE-NEXT: // kill: def $s2 killed $s2 def $q2 ; CHECK-BE-NEXT: add x8, sp, #12 -; CHECK-BE-NEXT: add x9, sp, #20 -; CHECK-BE-NEXT: ldr s16, [sp, #36] ; CHECK-BE-NEXT: mov v0.s[1], v1.s[0] -; CHECK-BE-NEXT: ldr s1, [sp, #4] ; CHECK-BE-NEXT: mov v4.s[1], v5.s[0] -; CHECK-BE-NEXT: add x10, sp, #52 -; CHECK-BE-NEXT: // kill: def $s6 killed $s6 def $q6 -; CHECK-BE-NEXT: // kill: def $s2 killed $s2 def $q2 +; CHECK-BE-NEXT: ldr s5, [sp, #36] +; CHECK-BE-NEXT: ldr s16, [x9] +; CHECK-BE-NEXT: ldr s1, [sp, #4] +; CHECK-BE-NEXT: ldr s17, [x10] +; CHECK-BE-NEXT: add x9, sp, #44 +; CHECK-BE-NEXT: add x10, sp, #60 ; 
CHECK-BE-NEXT: // kill: def $s7 killed $s7 def $q7 ; CHECK-BE-NEXT: // kill: def $s3 killed $s3 def $q3 -; CHECK-BE-NEXT: ld1 { v1.s }[1], [x8] -; CHECK-BE-NEXT: ldr s5, [x9] -; CHECK-BE-NEXT: add x8, sp, #28 -; CHECK-BE-NEXT: add x9, sp, #44 -; CHECK-BE-NEXT: ld1 { v5.s }[1], [x8] -; CHECK-BE-NEXT: ld1 { v16.s }[1], [x9] -; CHECK-BE-NEXT: ldr s17, [x10] -; CHECK-BE-NEXT: add x8, sp, #60 +; CHECK-BE-NEXT: ld1 { v5.s }[1], [x9] +; CHECK-BE-NEXT: add x9, sp, #28 +; CHECK-BE-NEXT: ld1 { v16.s }[1], [x10] ; CHECK-BE-NEXT: mov v4.s[2], v6.s[0] ; CHECK-BE-NEXT: mov v0.s[2], v2.s[0] -; CHECK-BE-NEXT: ld1 { v17.s }[1], [x8] -; CHECK-BE-NEXT: ldr s2, [sp, #68] -; CHECK-BE-NEXT: add x8, x0, #32 -; CHECK-BE-NEXT: zip1 v1.2d, v1.2d, v5.2d -; CHECK-BE-NEXT: add x9, x0, #48 -; CHECK-BE-NEXT: str s2, [x0, #64] -; CHECK-BE-NEXT: zip1 v5.2d, v16.2d, v17.2d +; CHECK-BE-NEXT: ld1 { v1.s }[1], [x8] +; CHECK-BE-NEXT: ld1 { v17.s }[1], [x9] +; CHECK-BE-NEXT: add x8, x0, #48 +; CHECK-BE-NEXT: add x9, x0, #32 +; CHECK-BE-NEXT: zip1 v2.2d, v5.2d, v16.2d +; CHECK-BE-NEXT: ldr s5, [sp, #68] +; CHECK-BE-NEXT: zip1 v1.2d, v1.2d, v17.2d ; CHECK-BE-NEXT: mov v4.s[3], v7.s[0] ; CHECK-BE-NEXT: mov v0.s[3], v3.s[0] -; CHECK-BE-NEXT: st1 { v1.4s }, [x8] +; CHECK-BE-NEXT: str s5, [x0, #64] +; CHECK-BE-NEXT: st1 { v2.4s }, [x8] ; CHECK-BE-NEXT: add x8, x0, #16 -; CHECK-BE-NEXT: st1 { v5.4s }, [x9] +; CHECK-BE-NEXT: st1 { v1.4s }, [x9] ; CHECK-BE-NEXT: st1 { v4.4s }, [x8] ; CHECK-BE-NEXT: st1 { v0.4s }, [x0] ; CHECK-BE-NEXT: ret From 13a628735fdb0023f1fdc477fed00dc68d7fdf11 Mon Sep 17 00:00:00 2001 From: Thurston Dang Date: Thu, 14 May 2026 00:07:12 -0700 Subject: [PATCH 36/95] [AArch64][test] Fix use-after-scope in createInstrInfo (#197622) https://github.com/llvm/llvm-project/pull/183506 revealed a pre-existing use-after-scope in createInstrInfo (MSan bot: https://lab.llvm.org/buildbot/#/builders/164/builds/21562 [*]). 
This patch fixes the issue by changing the stack-allocated AArch64Subtarget (which goes out of scope once createInstrInfo() returns) into heap-allocated, allowing it to be safely stored in the returned AArch64InstrInfo. ----- [*] WARNING: MemorySanitizer: use-of-uninitialized-value #0 0x55555666fabd in llvm::AArch64InstrInfo::getInstSizeInBytes(llvm::MachineInstr const&) const /home/b/sanitizer-x86_64-linux-bootstrap-msan/build/llvm-project/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp:247:5 ... /home/b/sanitizer-x86_64-linux-bootstrap-msan/build/llvm-project/llvm/unittests/Target/AArch64/InstSizes.cpp:85:3 #9 0x555556508559 in InstSizes_MOVaddrTagged_Test::TestBody() /home/b/sanitizer-x86_64-linux-bootstrap-msan/build/llvm-project/llvm/unittests/Target/AArch64/InstSizes.cpp:301:3 ... Member fields were destroyed #0 0x555556498a1d in __sanitizer_dtor_callback_fields /home/b/sanitizer-x86_64-linux-bootstrap-msan/build/llvm-project/compiler-rt/lib/msan/msan_interceptors.cpp:1074:5 #1 0x5555564fbda6 in ~Triple /home/b/sanitizer-x86_64-linux-bootstrap-msan/build/llvm-project/llvm/include/llvm/TargetParser/Triple.h:348:12 #2 0x5555564fbda6 in ~Triple /home/b/sanitizer-x86_64-linux-bootstrap-msan/build/llvm-project/llvm/include/llvm/TargetParser/Triple.h:47:7 #3 0x5555564fbda6 in llvm::AArch64Subtarget::~AArch64Subtarget() /home/b/sanitizer-x86_64-linux-bootstrap-msan/build/llvm-project/llvm/lib/Target/AArch64/AArch64Subtarget.h:38:7 #4 0x555556503396 in (anonymous namespace)::createInstrInfo(llvm::TargetMachine*) /home/b/sanitizer-x86_64-linux-bootstrap-msan/build/llvm-project/llvm/unittests/Target/AArch64/InstSizes.cpp:38:1 #5 0x5555565084cb in InstSizes_MOVaddrTagged_Test::TestBody() /home/b/sanitizer-x86_64-linux-bootstrap-msan/build/llvm-project/llvm/unittests/Target/AArch64/InstSizes.cpp:299:42 --- llvm/unittests/Target/AArch64/InstSizes.cpp | 48 ++++++++++++--------- 1 file changed, 27 insertions(+), 21 deletions(-) diff --git 
a/llvm/unittests/Target/AArch64/InstSizes.cpp b/llvm/unittests/Target/AArch64/InstSizes.cpp index 9dffb6e600d62..898e3dae167b3 100644 --- a/llvm/unittests/Target/AArch64/InstSizes.cpp +++ b/llvm/unittests/Target/AArch64/InstSizes.cpp @@ -29,12 +29,18 @@ std::unique_ptr createTargetMachine() { std::nullopt, CodeGenOptLevel::Default)); } -std::unique_ptr createInstrInfo(TargetMachine *TM) { - AArch64Subtarget ST(TM->getTargetTriple(), std::string(TM->getTargetCPU()), - std::string(TM->getTargetCPU()), - std::string(TM->getTargetFeatureString()), *TM, - /* isLittle */ false); - return std::make_unique(ST); +std::pair, std::unique_ptr> +createInstrInfo(TargetMachine *TM) { + auto ST = std::make_unique( + TM->getTargetTriple(), std::string(TM->getTargetCPU()), + std::string(TM->getTargetCPU()), + std::string(TM->getTargetFeatureString()), *TM, + /* isLittle */ false); + // The AArch64InstrInfo constructor takes a const reference to *ST, hence we + // cannot stack allocate *ST. + auto II = std::make_unique(*ST); + + return {std::move(ST), std::move(II)}; } /// The \p InputIRSnippet is only needed for things that can't be expressed in @@ -90,7 +96,7 @@ void runChecks( TEST(InstSizes, Authenticated) { std::unique_ptr TM = createTargetMachine(); ASSERT_TRUE(TM); - std::unique_ptr II = createInstrInfo(TM.get()); + auto [ST, II] = createInstrInfo(TM.get()); auto isAuthInst = [](AArch64InstrInfo &II, MachineFunction &MF) { auto I = MF.begin()->begin(); @@ -122,7 +128,7 @@ TEST(InstSizes, Authenticated) { TEST(InstSizes, STACKMAP) { std::unique_ptr TM = createTargetMachine(); ASSERT_TRUE(TM); - std::unique_ptr II = createInstrInfo(TM.get()); + auto [ST, II] = createInstrInfo(TM.get()); runChecks(TM.get(), II.get(), "", " STACKMAP 0, 16\n" " STACKMAP 1, 32\n", @@ -136,7 +142,7 @@ TEST(InstSizes, STACKMAP) { TEST(InstSizes, PATCHPOINT) { std::unique_ptr TM = createTargetMachine(); - std::unique_ptr II = createInstrInfo(TM.get()); + auto [ST, II] = createInstrInfo(TM.get()); 
runChecks(TM.get(), II.get(), "", " PATCHPOINT 0, 16, 0, 0, 0, csr_aarch64_aapcs\n" @@ -151,7 +157,7 @@ TEST(InstSizes, PATCHPOINT) { TEST(InstSizes, STATEPOINT) { std::unique_ptr TM = createTargetMachine(); - std::unique_ptr II = createInstrInfo(TM.get()); + auto [ST, II] = createInstrInfo(TM.get()); runChecks(TM.get(), II.get(), "", " STATEPOINT 0, 0, 0, @sizes, 2, 0, 2, 0, 2, 0, 2, 1, 1, 8," @@ -164,7 +170,7 @@ TEST(InstSizes, STATEPOINT) { TEST(InstSizes, SPACE) { std::unique_ptr TM = createTargetMachine(); - std::unique_ptr II = createInstrInfo(TM.get()); + auto [ST, II] = createInstrInfo(TM.get()); runChecks(TM.get(), II.get(), "", " $xzr = SPACE 1024, undef $xzr\n" @@ -179,7 +185,7 @@ TEST(InstSizes, SPACE) { TEST(InstSizes, TLSDESC_CALLSEQ) { std::unique_ptr TM = createTargetMachine(); - std::unique_ptr II = createInstrInfo(TM.get()); + auto [ST, II] = createInstrInfo(TM.get()); runChecks( TM.get(), II.get(), @@ -193,7 +199,7 @@ TEST(InstSizes, TLSDESC_CALLSEQ) { TEST(InstSizes, StoreSwiftAsyncContext) { std::unique_ptr TM = createTargetMachine(); - std::unique_ptr II = createInstrInfo(TM.get()); + auto [ST, II] = createInstrInfo(TM.get()); runChecks( TM.get(), II.get(), "", @@ -207,7 +213,7 @@ TEST(InstSizes, StoreSwiftAsyncContext) { TEST(InstSizes, SpeculationBarrierISBDSBEndBB) { std::unique_ptr TM = createTargetMachine(); - std::unique_ptr II = createInstrInfo(TM.get()); + auto [ST, II] = createInstrInfo(TM.get()); runChecks( TM.get(), II.get(), "", @@ -221,7 +227,7 @@ TEST(InstSizes, SpeculationBarrierISBDSBEndBB) { TEST(InstSizes, SpeculationBarrierSBEndBB) { std::unique_ptr TM = createTargetMachine(); - std::unique_ptr II = createInstrInfo(TM.get()); + auto [ST, II] = createInstrInfo(TM.get()); runChecks( TM.get(), II.get(), "", @@ -235,7 +241,7 @@ TEST(InstSizes, SpeculationBarrierSBEndBB) { TEST(InstSizes, JumpTable) { std::unique_ptr TM = createTargetMachine(); - std::unique_ptr II = createInstrInfo(TM.get()); + auto [ST, II] = 
createInstrInfo(TM.get()); runChecks(TM.get(), II.get(), "", " $x10, $x11 = JumpTableDest32 $x9, $x8, %jump-table.0\n" @@ -253,7 +259,7 @@ TEST(InstSizes, JumpTable) { TEST(InstSizes, MOVaddr) { std::unique_ptr TM = createTargetMachine(); - std::unique_ptr II = createInstrInfo(TM.get()); + auto [ST, II] = createInstrInfo(TM.get()); auto Check8 = [](AArch64InstrInfo &II, MachineFunction &MF) { auto I = MF.begin()->begin(); @@ -296,7 +302,7 @@ TEST(InstSizes, MOVaddr) { TEST(InstSizes, MOVaddrTagged) { std::unique_ptr TM = createTargetMachine(); - std::unique_ptr II = createInstrInfo(TM.get()); + auto [ST, II] = createInstrInfo(TM.get()); runChecks(TM.get(), II.get(), " @g = external global i32\n", " $x0 = MOVaddr target-flags(aarch64-page, aarch64-tagged) @g, " @@ -309,7 +315,7 @@ TEST(InstSizes, MOVaddrTagged) { TEST(InstSizes, MOVi32imm) { std::unique_ptr TM = createTargetMachine(); - std::unique_ptr II = createInstrInfo(TM.get()); + auto [ST, II] = createInstrInfo(TM.get()); runChecks(TM.get(), II.get(), "", " $w0 = MOVi32imm 1\n" @@ -324,7 +330,7 @@ TEST(InstSizes, MOVi32imm) { TEST(InstSizes, MOVi64imm) { std::unique_ptr TM = createTargetMachine(); - std::unique_ptr II = createInstrInfo(TM.get()); + auto [ST, II] = createInstrInfo(TM.get()); runChecks(TM.get(), II.get(), "", " $x0 = MOVi64imm 1\n" @@ -342,7 +348,7 @@ TEST(InstSizes, MOVi64imm) { TEST(InstSizes, MOPSMemoryPseudos) { std::unique_ptr TM = createTargetMachine(); - std::unique_ptr II = createInstrInfo(TM.get()); + auto [ST, II] = createInstrInfo(TM.get()); runChecks(TM.get(), II.get(), "", " $x0, $x1, $x2 = MOPSMemoryMovePseudo $x0, $x1, $x2, " From a41d58e018283e7e505782a4faf9444533575e09 Mon Sep 17 00:00:00 2001 From: Jiahao Guo Date: Thu, 14 May 2026 15:57:13 +0800 Subject: [PATCH 37/95] [CIR][AArch64] Lower NEON vtrn1/2 intrinsics (#197112) ### Summary part of : https://github.com/llvm/llvm-project/issues/185382 Lower `vtrn1` and `vtrn2` intrinsics in 
https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#transpose-elements All the intrinsics are handled inline in llvm-project/build/lib/clang/23/include/arm_neon.h like: ``` #ifdef __LITTLE_ENDIAN__ __ai __attribute__((target("neon"))) int8x8x2_t vtrn_s8(int8x8_t __p0, int8x8_t __p1) { int8x8x2_t __ret; __builtin_neon_vtrn_v(&__ret, __builtin_bit_cast(int8x8_t, __p0), __builtin_bit_cast(int8x8_t, __p1), 0); return __ret; } #else __ai __attribute__((target("neon"))) int8x8x2_t vtrn_s8(int8x8_t __p0, int8x8_t __p1) { int8x8x2_t __ret; int8x8_t __rev0; __rev0 = __builtin_shufflevector(__p0, __p0, __lane_reverse_64_8); int8x8_t __rev1; __rev1 = __builtin_shufflevector(__p1, __p1, __lane_reverse_64_8); __builtin_neon_vtrn_v(&__ret, __builtin_bit_cast(int8x8_t, __rev0), __builtin_bit_cast(int8x8_t, __rev1), 0); __ret.val[0] = __builtin_shufflevector(__ret.val[0], __ret.val[0], __lane_reverse_64_8); __ret.val[1] = __builtin_shufflevector(__ret.val[1], __ret.val[1], __lane_reverse_64_8); return __ret; } #endif ``` So no additional special lowering logic is needed. 
--- .../fp8-intrinsics/acle_neon_fp8_untyped.c | 40 -- clang/test/CodeGen/AArch64/neon-perm.c | 422 +------------- clang/test/CodeGen/AArch64/neon/perm.c | 533 ++++++++++++++++++ clang/test/CodeGen/AArch64/poly64.c | 20 - 4 files changed, 534 insertions(+), 481 deletions(-) diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_untyped.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_untyped.c index e386d2cca2cb1..40635342b8949 100644 --- a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_untyped.c +++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_untyped.c @@ -611,46 +611,6 @@ mfloat8x16_t test_vdupq_laneq_mf8(mfloat8x16_t a) { return vdupq_laneq_mf8(a, 7); } -// CHECK-LABEL: define dso_local <8 x i8> @test_vtrn1_mf8( -// CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> -// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] -// -mfloat8x8_t test_vtrn1_mf8(mfloat8x8_t a, mfloat8x8_t b) { - return vtrn1_mf8(a, b); -} - -// CHECK-LABEL: define dso_local <16 x i8> @test_vtrn1q_mf8( -// CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> -// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] -// -mfloat8x16_t test_vtrn1q_mf8(mfloat8x16_t a, mfloat8x16_t b) { - return vtrn1q_mf8(a, b); -} - -// CHECK-LABEL: define dso_local <8 x i8> @test_vtrn2_mf8( -// CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> -// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] -// -mfloat8x8_t test_vtrn2_mf8(mfloat8x8_t a, mfloat8x8_t b) { - return vtrn2_mf8(a, b); -} - -// CHECK-LABEL: define dso_local <16 x i8> @test_vtrn2q_mf8( -// CHECK-SAME: <16 x i8> [[A:%.*]], <16 x 
i8> [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> -// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] -// -mfloat8x16_t test_vtrn2q_mf8(mfloat8x16_t a, mfloat8x16_t b) { - return vtrn2q_mf8(a, b); -} - // CHECK-LABEL: define dso_local <8 x i8> @test_vqtbl1_mf8( // CHECK-SAME: <16 x i8> [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] diff --git a/clang/test/CodeGen/AArch64/neon-perm.c b/clang/test/CodeGen/AArch64/neon-perm.c index 79cf97f10ae40..df8b526e47a1a 100644 --- a/clang/test/CodeGen/AArch64/neon-perm.c +++ b/clang/test/CodeGen/AArch64/neon-perm.c @@ -6,428 +6,8 @@ #include -// CHECK-LABEL: define dso_local <8 x i8> @test_vtrn1_s8( -// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> -// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] -// -int8x8_t test_vtrn1_s8(int8x8_t a, int8x8_t b) { - return vtrn1_s8(a, b); -} - -// CHECK-LABEL: define dso_local <16 x i8> @test_vtrn1q_s8( -// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> -// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] -// -int8x16_t test_vtrn1q_s8(int8x16_t a, int8x16_t b) { - return vtrn1q_s8(a, b); -} - -// CHECK-LABEL: define dso_local <4 x i16> @test_vtrn1_s16( -// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> -// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]] -// -int16x4_t test_vtrn1_s16(int16x4_t a, int16x4_t b) { - return vtrn1_s16(a, b); -} - -// CHECK-LABEL: define dso_local <8 x i16> @test_vtrn1q_s16( -// 
CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> -// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] -// -int16x8_t test_vtrn1q_s16(int16x8_t a, int16x8_t b) { - return vtrn1q_s16(a, b); -} - -// CHECK-LABEL: define dso_local <2 x i32> @test_vtrn1_s32( -// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> -// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]] -// -int32x2_t test_vtrn1_s32(int32x2_t a, int32x2_t b) { - return vtrn1_s32(a, b); -} - -// CHECK-LABEL: define dso_local <4 x i32> @test_vtrn1q_s32( -// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]] -// -int32x4_t test_vtrn1q_s32(int32x4_t a, int32x4_t b) { - return vtrn1q_s32(a, b); -} - -// CHECK-LABEL: define dso_local <2 x i64> @test_vtrn1q_s64( -// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> -// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]] -// -int64x2_t test_vtrn1q_s64(int64x2_t a, int64x2_t b) { - return vtrn1q_s64(a, b); -} - -// CHECK-LABEL: define dso_local <8 x i8> @test_vtrn1_u8( -// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> -// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] -// -uint8x8_t test_vtrn1_u8(uint8x8_t a, uint8x8_t b) { - return vtrn1_u8(a, b); -} - -// CHECK-LABEL: define dso_local <16 x 
i8> @test_vtrn1q_u8( -// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> -// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] -// -uint8x16_t test_vtrn1q_u8(uint8x16_t a, uint8x16_t b) { - return vtrn1q_u8(a, b); -} - -// CHECK-LABEL: define dso_local <4 x i16> @test_vtrn1_u16( -// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> -// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]] -// -uint16x4_t test_vtrn1_u16(uint16x4_t a, uint16x4_t b) { - return vtrn1_u16(a, b); -} - -// CHECK-LABEL: define dso_local <8 x i16> @test_vtrn1q_u16( -// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> -// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] -// -uint16x8_t test_vtrn1q_u16(uint16x8_t a, uint16x8_t b) { - return vtrn1q_u16(a, b); -} - -// CHECK-LABEL: define dso_local <2 x i32> @test_vtrn1_u32( -// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> -// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]] -// -uint32x2_t test_vtrn1_u32(uint32x2_t a, uint32x2_t b) { - return vtrn1_u32(a, b); -} - -// CHECK-LABEL: define dso_local <4 x i32> @test_vtrn1q_u32( -// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]] -// -uint32x4_t test_vtrn1q_u32(uint32x4_t a, uint32x4_t b) { - return vtrn1q_u32(a, 
b); -} - -// CHECK-LABEL: define dso_local <2 x i64> @test_vtrn1q_u64( -// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> -// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]] -// -uint64x2_t test_vtrn1q_u64(uint64x2_t a, uint64x2_t b) { - return vtrn1q_u64(a, b); -} - -// CHECK-LABEL: define dso_local <2 x float> @test_vtrn1_f32( -// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> -// CHECK-NEXT: ret <2 x float> [[SHUFFLE_I]] -// -float32x2_t test_vtrn1_f32(float32x2_t a, float32x2_t b) { - return vtrn1_f32(a, b); -} - -// CHECK-LABEL: define dso_local <4 x float> @test_vtrn1q_f32( -// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> -// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]] -// -float32x4_t test_vtrn1q_f32(float32x4_t a, float32x4_t b) { - return vtrn1q_f32(a, b); -} - -// CHECK-LABEL: define dso_local <2 x double> @test_vtrn1q_f64( -// CHECK-SAME: <2 x double> noundef [[A:%.*]], <2 x double> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> -// CHECK-NEXT: ret <2 x double> [[SHUFFLE_I]] -// -float64x2_t test_vtrn1q_f64(float64x2_t a, float64x2_t b) { - return vtrn1q_f64(a, b); -} - -// CHECK-LABEL: define dso_local <8 x i8> @test_vtrn1_p8( -// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> -// CHECK-NEXT: ret <8 x i8> 
[[SHUFFLE_I]] -// -poly8x8_t test_vtrn1_p8(poly8x8_t a, poly8x8_t b) { - return vtrn1_p8(a, b); -} - -// CHECK-LABEL: define dso_local <16 x i8> @test_vtrn1q_p8( -// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> -// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] -// -poly8x16_t test_vtrn1q_p8(poly8x16_t a, poly8x16_t b) { - return vtrn1q_p8(a, b); -} - -// CHECK-LABEL: define dso_local <4 x i16> @test_vtrn1_p16( -// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> -// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]] -// -poly16x4_t test_vtrn1_p16(poly16x4_t a, poly16x4_t b) { - return vtrn1_p16(a, b); -} - -// CHECK-LABEL: define dso_local <8 x i16> @test_vtrn1q_p16( -// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> -// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] -// -poly16x8_t test_vtrn1q_p16(poly16x8_t a, poly16x8_t b) { - return vtrn1q_p16(a, b); -} - -// CHECK-LABEL: define dso_local <8 x i8> @test_vtrn2_s8( -// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> -// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] -// -int8x8_t test_vtrn2_s8(int8x8_t a, int8x8_t b) { - return vtrn2_s8(a, b); -} - -// CHECK-LABEL: define dso_local <16 x i8> @test_vtrn2q_s8( -// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> -// 
CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] -// -int8x16_t test_vtrn2q_s8(int8x16_t a, int8x16_t b) { - return vtrn2q_s8(a, b); -} - -// CHECK-LABEL: define dso_local <4 x i16> @test_vtrn2_s16( -// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> -// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]] -// -int16x4_t test_vtrn2_s16(int16x4_t a, int16x4_t b) { - return vtrn2_s16(a, b); -} - -// CHECK-LABEL: define dso_local <8 x i16> @test_vtrn2q_s16( -// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> -// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] -// -int16x8_t test_vtrn2q_s16(int16x8_t a, int16x8_t b) { - return vtrn2q_s16(a, b); -} - -// CHECK-LABEL: define dso_local <2 x i32> @test_vtrn2_s32( -// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> -// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]] -// -int32x2_t test_vtrn2_s32(int32x2_t a, int32x2_t b) { - return vtrn2_s32(a, b); -} - -// CHECK-LABEL: define dso_local <4 x i32> @test_vtrn2q_s32( -// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]] -// -int32x4_t test_vtrn2q_s32(int32x4_t a, int32x4_t b) { - return vtrn2q_s32(a, b); -} - -// CHECK-LABEL: define dso_local <2 x i64> @test_vtrn2q_s64( -// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> 
[[A]], <2 x i64> [[B]], <2 x i32> -// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]] -// -int64x2_t test_vtrn2q_s64(int64x2_t a, int64x2_t b) { - return vtrn2q_s64(a, b); -} - -// CHECK-LABEL: define dso_local <8 x i8> @test_vtrn2_u8( -// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> -// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] -// -uint8x8_t test_vtrn2_u8(uint8x8_t a, uint8x8_t b) { - return vtrn2_u8(a, b); -} - -// CHECK-LABEL: define dso_local <16 x i8> @test_vtrn2q_u8( -// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> -// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] -// -uint8x16_t test_vtrn2q_u8(uint8x16_t a, uint8x16_t b) { - return vtrn2q_u8(a, b); -} - -// CHECK-LABEL: define dso_local <4 x i16> @test_vtrn2_u16( -// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> -// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]] -// -uint16x4_t test_vtrn2_u16(uint16x4_t a, uint16x4_t b) { - return vtrn2_u16(a, b); -} - -// CHECK-LABEL: define dso_local <8 x i16> @test_vtrn2q_u16( -// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> -// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] -// -uint16x8_t test_vtrn2q_u16(uint16x8_t a, uint16x8_t b) { - return vtrn2q_u16(a, b); -} - -// CHECK-LABEL: define dso_local <2 x i32> @test_vtrn2_u32( -// CHECK-SAME: <2 x i32> noundef [[A:%.*]], <2 x i32> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: 
[[SHUFFLE_I:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> -// CHECK-NEXT: ret <2 x i32> [[SHUFFLE_I]] -// -uint32x2_t test_vtrn2_u32(uint32x2_t a, uint32x2_t b) { - return vtrn2_u32(a, b); -} - -// CHECK-LABEL: define dso_local <4 x i32> @test_vtrn2q_u32( -// CHECK-SAME: <4 x i32> noundef [[A:%.*]], <4 x i32> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> -// CHECK-NEXT: ret <4 x i32> [[SHUFFLE_I]] -// -uint32x4_t test_vtrn2q_u32(uint32x4_t a, uint32x4_t b) { - return vtrn2q_u32(a, b); -} - -// CHECK-LABEL: define dso_local <2 x i64> @test_vtrn2q_u64( -// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> -// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]] -// -uint64x2_t test_vtrn2q_u64(uint64x2_t a, uint64x2_t b) { - return vtrn2q_u64(a, b); -} - -// CHECK-LABEL: define dso_local <2 x float> @test_vtrn2_f32( -// CHECK-SAME: <2 x float> noundef [[A:%.*]], <2 x float> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> -// CHECK-NEXT: ret <2 x float> [[SHUFFLE_I]] -// -float32x2_t test_vtrn2_f32(float32x2_t a, float32x2_t b) { - return vtrn2_f32(a, b); -} - -// CHECK-LABEL: define dso_local <4 x float> @test_vtrn2q_f32( -// CHECK-SAME: <4 x float> noundef [[A:%.*]], <4 x float> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> -// CHECK-NEXT: ret <4 x float> [[SHUFFLE_I]] -// -float32x4_t test_vtrn2q_f32(float32x4_t a, float32x4_t b) { - return vtrn2q_f32(a, b); -} - -// CHECK-LABEL: define dso_local <2 x double> @test_vtrn2q_f64( -// CHECK-SAME: <2 x double> noundef [[A:%.*]], 
<2 x double> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> -// CHECK-NEXT: ret <2 x double> [[SHUFFLE_I]] -// -float64x2_t test_vtrn2q_f64(float64x2_t a, float64x2_t b) { - return vtrn2q_f64(a, b); -} - -// CHECK-LABEL: define dso_local <8 x i8> @test_vtrn2_p8( -// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> -// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] -// -poly8x8_t test_vtrn2_p8(poly8x8_t a, poly8x8_t b) { - return vtrn2_p8(a, b); -} - -// CHECK-LABEL: define dso_local <16 x i8> @test_vtrn2q_p8( -// CHECK-SAME: <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> -// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] -// -poly8x16_t test_vtrn2q_p8(poly8x16_t a, poly8x16_t b) { - return vtrn2q_p8(a, b); -} - -// CHECK-LABEL: define dso_local <4 x i16> @test_vtrn2_p16( -// CHECK-SAME: <4 x i16> noundef [[A:%.*]], <4 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> -// CHECK-NEXT: ret <4 x i16> [[SHUFFLE_I]] -// -poly16x4_t test_vtrn2_p16(poly16x4_t a, poly16x4_t b) { - return vtrn2_p16(a, b); -} - -// CHECK-LABEL: define dso_local <8 x i16> @test_vtrn2q_p16( -// CHECK-SAME: <8 x i16> noundef [[A:%.*]], <8 x i16> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> -// CHECK-NEXT: ret <8 x i16> [[SHUFFLE_I]] -// -poly16x8_t test_vtrn2q_p16(poly16x8_t a, poly16x8_t b) { - return vtrn2q_p16(a, b); -} - // CHECK-LABEL: define dso_local %struct.int8x8x2_t 
@test_vtrn_s8( -// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-SAME: <8 x i8> noundef [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> // CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> diff --git a/clang/test/CodeGen/AArch64/neon/perm.c b/clang/test/CodeGen/AArch64/neon/perm.c index c90eb8290db55..419769ae3f0fa 100644 --- a/clang/test/CodeGen/AArch64/neon/perm.c +++ b/clang/test/CodeGen/AArch64/neon/perm.c @@ -1830,3 +1830,536 @@ uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) { // LLVM: ret %struct.uint16x8x2_t return vuzpq_u16(a, b); } + +//===------------------------------------------------------===// +// 2.1.9.12. Transpose elements +// https://arm-software.github.io/acle/neon_intrinsics/advsimd.html#transpose-elements +//===------------------------------------------------------===// + +// LLVM-LABEL: @test_vtrn1_s8( +// CIR-LABEL: @vtrn1_s8( +int8x8_t test_vtrn1_s8(int8x8_t a, int8x8_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !s8i>) [#cir.int<0> : !s64i, #cir.int<8> : !s64i, #cir.int<2> : !s64i, #cir.int<10> : !s64i, #cir.int<4> : !s64i, #cir.int<12> : !s64i, #cir.int<6> : !s64i, #cir.int<14> : !s64i] : !cir.vector<8 x !s8i> + +// LLVM-SAME: <8 x i8>{{.*}}[[A:%.*]], <8 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// LLVM: ret <8 x i8> [[SHUFFLE]] + return vtrn1_s8(a, b); +} + +// LLVM-LABEL: @test_vtrn1q_s8( +// CIR-LABEL: @vtrn1q_s8( +int8x16_t test_vtrn1q_s8(int8x16_t a, int8x16_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x !s8i>) [#cir.int<0> : !s64i, #cir.int<16> : !s64i, #cir.int<2> : !s64i, #cir.int<18> : !s64i, #cir.int<4> : !s64i, #cir.int<20> : !s64i, #cir.int<6> : !s64i, #cir.int<22> : !s64i, #cir.int<8> : !s64i, #cir.int<24> : 
!s64i, #cir.int<10> : !s64i, #cir.int<26> : !s64i, #cir.int<12> : !s64i, #cir.int<28> : !s64i, #cir.int<14> : !s64i, #cir.int<30> : !s64i] : !cir.vector<16 x !s8i> + +// LLVM-SAME: <16 x i8>{{.*}}[[A:%.*]], <16 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// LLVM: ret <16 x i8> [[SHUFFLE]] + return vtrn1q_s8(a, b); +} + +// LLVM-LABEL: @test_vtrn1_s16( +// CIR-LABEL: @vtrn1_s16( +int16x4_t test_vtrn1_s16(int16x4_t a, int16x4_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !s16i>) [#cir.int<0> : !s64i, #cir.int<4> : !s64i, #cir.int<2> : !s64i, #cir.int<6> : !s64i] : !cir.vector<4 x !s16i> + +// LLVM-SAME: <4 x i16>{{.*}}[[A:%.*]], <4 x i16>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// LLVM: ret <4 x i16> [[SHUFFLE]] + return vtrn1_s16(a, b); +} + +// LLVM-LABEL: @test_vtrn1q_s16( +// CIR-LABEL: @vtrn1q_s16( +int16x8_t test_vtrn1q_s16(int16x8_t a, int16x8_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !s16i>) [#cir.int<0> : !s64i, #cir.int<8> : !s64i, #cir.int<2> : !s64i, #cir.int<10> : !s64i, #cir.int<4> : !s64i, #cir.int<12> : !s64i, #cir.int<6> : !s64i, #cir.int<14> : !s64i] : !cir.vector<8 x !s16i> + +// LLVM-SAME: <8 x i16>{{.*}}[[A:%.*]], <8 x i16>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// LLVM: ret <8 x i16> [[SHUFFLE]] + return vtrn1q_s16(a, b); +} + +// LLVM-LABEL: @test_vtrn1_s32( +// CIR-LABEL: @vtrn1_s32( +int32x2_t test_vtrn1_s32(int32x2_t a, int32x2_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !s32i>) [#cir.int<0> : !s64i, #cir.int<2> : !s64i] : !cir.vector<2 x !s32i> + +// LLVM-SAME: <2 x i32>{{.*}}[[A:%.*]], <2 x i32>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> +// LLVM: ret <2 x i32> [[SHUFFLE]] + return vtrn1_s32(a, b); +} + +// LLVM-LABEL: 
@test_vtrn1q_s32( +// CIR-LABEL: @vtrn1q_s32( +int32x4_t test_vtrn1q_s32(int32x4_t a, int32x4_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !s32i>) [#cir.int<0> : !s64i, #cir.int<4> : !s64i, #cir.int<2> : !s64i, #cir.int<6> : !s64i] : !cir.vector<4 x !s32i> + +// LLVM-SAME: <4 x i32>{{.*}}[[A:%.*]], <4 x i32>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +// LLVM: ret <4 x i32> [[SHUFFLE]] + return vtrn1q_s32(a, b); +} + +// LLVM-LABEL: @test_vtrn1q_s64( +// CIR-LABEL: @vtrn1q_s64( +int64x2_t test_vtrn1q_s64(int64x2_t a, int64x2_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !s64i>) [#cir.int<0> : !s64i, #cir.int<2> : !s64i] : !cir.vector<2 x !s64i> + +// LLVM-SAME: <2 x i64>{{.*}}[[A:%.*]], <2 x i64>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> +// LLVM: ret <2 x i64> [[SHUFFLE]] + return vtrn1q_s64(a, b); +} + +// LLVM-LABEL: @test_vtrn1_u8( +// CIR-LABEL: @vtrn1_u8( +uint8x8_t test_vtrn1_u8(uint8x8_t a, uint8x8_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !u8i>) [#cir.int<0> : !s64i, #cir.int<8> : !s64i, #cir.int<2> : !s64i, #cir.int<10> : !s64i, #cir.int<4> : !s64i, #cir.int<12> : !s64i, #cir.int<6> : !s64i, #cir.int<14> : !s64i] : !cir.vector<8 x !u8i> + +// LLVM-SAME: <8 x i8>{{.*}}[[A:%.*]], <8 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// LLVM: ret <8 x i8> [[SHUFFLE]] + return vtrn1_u8(a, b); +} + +// LLVM-LABEL: @test_vtrn1q_u8( +// CIR-LABEL: @vtrn1q_u8( +uint8x16_t test_vtrn1q_u8(uint8x16_t a, uint8x16_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x !u8i>) [#cir.int<0> : !s64i, #cir.int<16> : !s64i, #cir.int<2> : !s64i, #cir.int<18> : !s64i, #cir.int<4> : !s64i, #cir.int<20> : !s64i, #cir.int<6> : !s64i, #cir.int<22> : !s64i, #cir.int<8> : !s64i, #cir.int<24> : !s64i, #cir.int<10> : !s64i, 
#cir.int<26> : !s64i, #cir.int<12> : !s64i, #cir.int<28> : !s64i, #cir.int<14> : !s64i, #cir.int<30> : !s64i] : !cir.vector<16 x !u8i> + +// LLVM-SAME: <16 x i8>{{.*}}[[A:%.*]], <16 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// LLVM: ret <16 x i8> [[SHUFFLE]] + return vtrn1q_u8(a, b); +} + +// LLVM-LABEL: @test_vtrn1_u16( +// CIR-LABEL: @vtrn1_u16( +uint16x4_t test_vtrn1_u16(uint16x4_t a, uint16x4_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !u16i>) [#cir.int<0> : !s64i, #cir.int<4> : !s64i, #cir.int<2> : !s64i, #cir.int<6> : !s64i] : !cir.vector<4 x !u16i> + +// LLVM-SAME: <4 x i16>{{.*}}[[A:%.*]], <4 x i16>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// LLVM: ret <4 x i16> [[SHUFFLE]] + return vtrn1_u16(a, b); +} + +// LLVM-LABEL: @test_vtrn1q_u16( +// CIR-LABEL: @vtrn1q_u16( +uint16x8_t test_vtrn1q_u16(uint16x8_t a, uint16x8_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !u16i>) [#cir.int<0> : !s64i, #cir.int<8> : !s64i, #cir.int<2> : !s64i, #cir.int<10> : !s64i, #cir.int<4> : !s64i, #cir.int<12> : !s64i, #cir.int<6> : !s64i, #cir.int<14> : !s64i] : !cir.vector<8 x !u16i> + +// LLVM-SAME: <8 x i16>{{.*}}[[A:%.*]], <8 x i16>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// LLVM: ret <8 x i16> [[SHUFFLE]] + return vtrn1q_u16(a, b); +} + +// LLVM-LABEL: @test_vtrn1_u32( +// CIR-LABEL: @vtrn1_u32( +uint32x2_t test_vtrn1_u32(uint32x2_t a, uint32x2_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !u32i>) [#cir.int<0> : !s64i, #cir.int<2> : !s64i] : !cir.vector<2 x !u32i> + +// LLVM-SAME: <2 x i32>{{.*}}[[A:%.*]], <2 x i32>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> +// LLVM: ret <2 x i32> [[SHUFFLE]] + return vtrn1_u32(a, b); +} + +// LLVM-LABEL: @test_vtrn1q_u32( +// 
CIR-LABEL: @vtrn1q_u32( +uint32x4_t test_vtrn1q_u32(uint32x4_t a, uint32x4_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !u32i>) [#cir.int<0> : !s64i, #cir.int<4> : !s64i, #cir.int<2> : !s64i, #cir.int<6> : !s64i] : !cir.vector<4 x !u32i> + +// LLVM-SAME: <4 x i32>{{.*}}[[A:%.*]], <4 x i32>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +// LLVM: ret <4 x i32> [[SHUFFLE]] + return vtrn1q_u32(a, b); +} + +// LLVM-LABEL: @test_vtrn1q_u64( +// CIR-LABEL: @vtrn1q_u64( +uint64x2_t test_vtrn1q_u64(uint64x2_t a, uint64x2_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !u64i>) [#cir.int<0> : !s64i, #cir.int<2> : !s64i] : !cir.vector<2 x !u64i> + +// LLVM-SAME: <2 x i64>{{.*}}[[A:%.*]], <2 x i64>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> +// LLVM: ret <2 x i64> [[SHUFFLE]] + return vtrn1q_u64(a, b); +} + +// LLVM-LABEL: @test_vtrn1q_p64( +// CIR-LABEL: @vtrn1q_p64( +poly64x2_t test_vtrn1q_p64(poly64x2_t a, poly64x2_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !u64i>) [#cir.int<0> : !s64i, #cir.int<2> : !s64i] : !cir.vector<2 x !u64i> + +// LLVM-SAME: <2 x i64>{{.*}}[[A:%.*]], <2 x i64>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> +// LLVM: ret <2 x i64> [[SHUFFLE]] + return vtrn1q_p64(a, b); +} + +// LLVM-LABEL: @test_vtrn1_f32( +// CIR-LABEL: @vtrn1_f32( +float32x2_t test_vtrn1_f32(float32x2_t a, float32x2_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !cir.float>) [#cir.int<0> : !s64i, #cir.int<2> : !s64i] : !cir.vector<2 x !cir.float> + +// LLVM-SAME: <2 x float>{{.*}}[[A:%.*]], <2 x float>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> +// LLVM: ret <2 x float> [[SHUFFLE]] + return vtrn1_f32(a, b); +} + +// LLVM-LABEL: @test_vtrn1q_f32( +// CIR-LABEL: 
@vtrn1q_f32( +float32x4_t test_vtrn1q_f32(float32x4_t a, float32x4_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !cir.float>) [#cir.int<0> : !s64i, #cir.int<4> : !s64i, #cir.int<2> : !s64i, #cir.int<6> : !s64i] : !cir.vector<4 x !cir.float> + +// LLVM-SAME: <4 x float>{{.*}}[[A:%.*]], <4 x float>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> +// LLVM: ret <4 x float> [[SHUFFLE]] + return vtrn1q_f32(a, b); +} + +// LLVM-LABEL: @test_vtrn1q_f64( +// CIR-LABEL: @vtrn1q_f64( +float64x2_t test_vtrn1q_f64(float64x2_t a, float64x2_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !cir.double>) [#cir.int<0> : !s64i, #cir.int<2> : !s64i] : !cir.vector<2 x !cir.double> + +// LLVM-SAME: <2 x double>{{.*}}[[A:%.*]], <2 x double>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> +// LLVM: ret <2 x double> [[SHUFFLE]] + return vtrn1q_f64(a, b); +} + +// LLVM-LABEL: @test_vtrn1_p8( +// CIR-LABEL: @vtrn1_p8( +poly8x8_t test_vtrn1_p8(poly8x8_t a, poly8x8_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !u8i>) [#cir.int<0> : !s64i, #cir.int<8> : !s64i, #cir.int<2> : !s64i, #cir.int<10> : !s64i, #cir.int<4> : !s64i, #cir.int<12> : !s64i, #cir.int<6> : !s64i, #cir.int<14> : !s64i] : !cir.vector<8 x !u8i> + +// LLVM-SAME: <8 x i8>{{.*}}[[A:%.*]], <8 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// LLVM: ret <8 x i8> [[SHUFFLE]] + return vtrn1_p8(a, b); +} + +// LLVM-LABEL: @test_vtrn1q_p8( +// CIR-LABEL: @vtrn1q_p8( +poly8x16_t test_vtrn1q_p8(poly8x16_t a, poly8x16_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x !u8i>) [#cir.int<0> : !s64i, #cir.int<16> : !s64i, #cir.int<2> : !s64i, #cir.int<18> : !s64i, #cir.int<4> : !s64i, #cir.int<20> : !s64i, #cir.int<6> : !s64i, #cir.int<22> : !s64i, #cir.int<8> : !s64i, #cir.int<24> : !s64i, 
#cir.int<10> : !s64i, #cir.int<26> : !s64i, #cir.int<12> : !s64i, #cir.int<28> : !s64i, #cir.int<14> : !s64i, #cir.int<30> : !s64i] : !cir.vector<16 x !u8i> + +// LLVM-SAME: <16 x i8>{{.*}}[[A:%.*]], <16 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// LLVM: ret <16 x i8> [[SHUFFLE]] + return vtrn1q_p8(a, b); +} + +// LLVM-LABEL: @test_vtrn1_p16( +// CIR-LABEL: @vtrn1_p16( +poly16x4_t test_vtrn1_p16(poly16x4_t a, poly16x4_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !u16i>) [#cir.int<0> : !s64i, #cir.int<4> : !s64i, #cir.int<2> : !s64i, #cir.int<6> : !s64i] : !cir.vector<4 x !u16i> + +// LLVM-SAME: <4 x i16>{{.*}}[[A:%.*]], <4 x i16>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// LLVM: ret <4 x i16> [[SHUFFLE]] + return vtrn1_p16(a, b); +} + +// LLVM-LABEL: @test_vtrn1q_p16( +// CIR-LABEL: @vtrn1q_p16( +poly16x8_t test_vtrn1q_p16(poly16x8_t a, poly16x8_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !u16i>) [#cir.int<0> : !s64i, #cir.int<8> : !s64i, #cir.int<2> : !s64i, #cir.int<10> : !s64i, #cir.int<4> : !s64i, #cir.int<12> : !s64i, #cir.int<6> : !s64i, #cir.int<14> : !s64i] : !cir.vector<8 x !u16i> + +// LLVM-SAME: <8 x i16>{{.*}}[[A:%.*]], <8 x i16>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// LLVM: ret <8 x i16> [[SHUFFLE]] + return vtrn1q_p16(a, b); +} + +// LLVM-LABEL: @test_vtrn1_mf8( +// CIR-LABEL: @vtrn1_mf8( +mfloat8x8_t test_vtrn1_mf8(mfloat8x8_t a, mfloat8x8_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !u8i>) [#cir.int<0> : !s64i, #cir.int<8> : !s64i, #cir.int<2> : !s64i, #cir.int<10> : !s64i, #cir.int<4> : !s64i, #cir.int<12> : !s64i, #cir.int<6> : !s64i, #cir.int<14> : !s64i] : !cir.vector<8 x !u8i> + +// LLVM-SAME: <8 x i8>{{.*}}[[A:%.*]], <8 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <8 
x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// LLVM: ret <8 x i8> [[SHUFFLE]] + return vtrn1_mf8(a, b); +} + +// LLVM-LABEL: @test_vtrn1q_mf8( +// CIR-LABEL: @vtrn1q_mf8( +mfloat8x16_t test_vtrn1q_mf8(mfloat8x16_t a, mfloat8x16_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x !u8i>) [#cir.int<0> : !s64i, #cir.int<16> : !s64i, #cir.int<2> : !s64i, #cir.int<18> : !s64i, #cir.int<4> : !s64i, #cir.int<20> : !s64i, #cir.int<6> : !s64i, #cir.int<22> : !s64i, #cir.int<8> : !s64i, #cir.int<24> : !s64i, #cir.int<10> : !s64i, #cir.int<26> : !s64i, #cir.int<12> : !s64i, #cir.int<28> : !s64i, #cir.int<14> : !s64i, #cir.int<30> : !s64i] : !cir.vector<16 x !u8i> + +// LLVM-SAME: <16 x i8>{{.*}}[[A:%.*]], <16 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// LLVM: ret <16 x i8> [[SHUFFLE]] + return vtrn1q_mf8(a, b); +} + +// LLVM-LABEL: @test_vtrn2_s8( +// CIR-LABEL: @vtrn2_s8( +int8x8_t test_vtrn2_s8(int8x8_t a, int8x8_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !s8i>) [#cir.int<1> : !s64i, #cir.int<9> : !s64i, #cir.int<3> : !s64i, #cir.int<11> : !s64i, #cir.int<5> : !s64i, #cir.int<13> : !s64i, #cir.int<7> : !s64i, #cir.int<15> : !s64i] : !cir.vector<8 x !s8i> + +// LLVM-SAME: <8 x i8>{{.*}}[[A:%.*]], <8 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// LLVM: ret <8 x i8> [[SHUFFLE]] + return vtrn2_s8(a, b); +} + +// LLVM-LABEL: @test_vtrn2q_s8( +// CIR-LABEL: @vtrn2q_s8( +int8x16_t test_vtrn2q_s8(int8x16_t a, int8x16_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x !s8i>) [#cir.int<1> : !s64i, #cir.int<17> : !s64i, #cir.int<3> : !s64i, #cir.int<19> : !s64i, #cir.int<5> : !s64i, #cir.int<21> : !s64i, #cir.int<7> : !s64i, #cir.int<23> : !s64i, #cir.int<9> : !s64i, #cir.int<25> : !s64i, #cir.int<11> : !s64i, #cir.int<27> : !s64i, #cir.int<13> : !s64i, #cir.int<29> : !s64i, #cir.int<15> : !s64i, 
#cir.int<31> : !s64i] : !cir.vector<16 x !s8i> + +// LLVM-SAME: <16 x i8>{{.*}}[[A:%.*]], <16 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// LLVM: ret <16 x i8> [[SHUFFLE]] + return vtrn2q_s8(a, b); +} + +// LLVM-LABEL: @test_vtrn2_s16( +// CIR-LABEL: @vtrn2_s16( +int16x4_t test_vtrn2_s16(int16x4_t a, int16x4_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !s16i>) [#cir.int<1> : !s64i, #cir.int<5> : !s64i, #cir.int<3> : !s64i, #cir.int<7> : !s64i] : !cir.vector<4 x !s16i> + +// LLVM-SAME: <4 x i16>{{.*}}[[A:%.*]], <4 x i16>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// LLVM: ret <4 x i16> [[SHUFFLE]] + return vtrn2_s16(a, b); +} + +// LLVM-LABEL: @test_vtrn2q_s16( +// CIR-LABEL: @vtrn2q_s16( +int16x8_t test_vtrn2q_s16(int16x8_t a, int16x8_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !s16i>) [#cir.int<1> : !s64i, #cir.int<9> : !s64i, #cir.int<3> : !s64i, #cir.int<11> : !s64i, #cir.int<5> : !s64i, #cir.int<13> : !s64i, #cir.int<7> : !s64i, #cir.int<15> : !s64i] : !cir.vector<8 x !s16i> + +// LLVM-SAME: <8 x i16>{{.*}}[[A:%.*]], <8 x i16>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// LLVM: ret <8 x i16> [[SHUFFLE]] + return vtrn2q_s16(a, b); +} + +// LLVM-LABEL: @test_vtrn2_s32( +// CIR-LABEL: @vtrn2_s32( +int32x2_t test_vtrn2_s32(int32x2_t a, int32x2_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !s32i>) [#cir.int<1> : !s64i, #cir.int<3> : !s64i] : !cir.vector<2 x !s32i> + +// LLVM-SAME: <2 x i32>{{.*}}[[A:%.*]], <2 x i32>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> +// LLVM: ret <2 x i32> [[SHUFFLE]] + return vtrn2_s32(a, b); +} + +// LLVM-LABEL: @test_vtrn2q_s32( +// CIR-LABEL: @vtrn2q_s32( +int32x4_t test_vtrn2q_s32(int32x4_t a, int32x4_t b) { +// CIR: 
cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !s32i>) [#cir.int<1> : !s64i, #cir.int<5> : !s64i, #cir.int<3> : !s64i, #cir.int<7> : !s64i] : !cir.vector<4 x !s32i> + +// LLVM-SAME: <4 x i32>{{.*}}[[A:%.*]], <4 x i32>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +// LLVM: ret <4 x i32> [[SHUFFLE]] + return vtrn2q_s32(a, b); +} + +// LLVM-LABEL: @test_vtrn2q_s64( +// CIR-LABEL: @vtrn2q_s64( +int64x2_t test_vtrn2q_s64(int64x2_t a, int64x2_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !s64i>) [#cir.int<1> : !s64i, #cir.int<3> : !s64i] : !cir.vector<2 x !s64i> + +// LLVM-SAME: <2 x i64>{{.*}}[[A:%.*]], <2 x i64>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> +// LLVM: ret <2 x i64> [[SHUFFLE]] + return vtrn2q_s64(a, b); +} + +// LLVM-LABEL: @test_vtrn2_u8( +// CIR-LABEL: @vtrn2_u8( +uint8x8_t test_vtrn2_u8(uint8x8_t a, uint8x8_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !u8i>) [#cir.int<1> : !s64i, #cir.int<9> : !s64i, #cir.int<3> : !s64i, #cir.int<11> : !s64i, #cir.int<5> : !s64i, #cir.int<13> : !s64i, #cir.int<7> : !s64i, #cir.int<15> : !s64i] : !cir.vector<8 x !u8i> + +// LLVM-SAME: <8 x i8>{{.*}}[[A:%.*]], <8 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// LLVM: ret <8 x i8> [[SHUFFLE]] + return vtrn2_u8(a, b); +} + +// LLVM-LABEL: @test_vtrn2q_u8( +// CIR-LABEL: @vtrn2q_u8( +uint8x16_t test_vtrn2q_u8(uint8x16_t a, uint8x16_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x !u8i>) [#cir.int<1> : !s64i, #cir.int<17> : !s64i, #cir.int<3> : !s64i, #cir.int<19> : !s64i, #cir.int<5> : !s64i, #cir.int<21> : !s64i, #cir.int<7> : !s64i, #cir.int<23> : !s64i, #cir.int<9> : !s64i, #cir.int<25> : !s64i, #cir.int<11> : !s64i, #cir.int<27> : !s64i, #cir.int<13> : !s64i, #cir.int<29> : !s64i, #cir.int<15> : !s64i, #cir.int<31> : !s64i] : 
!cir.vector<16 x !u8i> + +// LLVM-SAME: <16 x i8>{{.*}}[[A:%.*]], <16 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// LLVM: ret <16 x i8> [[SHUFFLE]] + return vtrn2q_u8(a, b); +} + +// LLVM-LABEL: @test_vtrn2_u16( +// CIR-LABEL: @vtrn2_u16( +uint16x4_t test_vtrn2_u16(uint16x4_t a, uint16x4_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !u16i>) [#cir.int<1> : !s64i, #cir.int<5> : !s64i, #cir.int<3> : !s64i, #cir.int<7> : !s64i] : !cir.vector<4 x !u16i> + +// LLVM-SAME: <4 x i16>{{.*}}[[A:%.*]], <4 x i16>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// LLVM: ret <4 x i16> [[SHUFFLE]] + return vtrn2_u16(a, b); +} + +// LLVM-LABEL: @test_vtrn2q_u16( +// CIR-LABEL: @vtrn2q_u16( +uint16x8_t test_vtrn2q_u16(uint16x8_t a, uint16x8_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !u16i>) [#cir.int<1> : !s64i, #cir.int<9> : !s64i, #cir.int<3> : !s64i, #cir.int<11> : !s64i, #cir.int<5> : !s64i, #cir.int<13> : !s64i, #cir.int<7> : !s64i, #cir.int<15> : !s64i] : !cir.vector<8 x !u16i> + +// LLVM-SAME: <8 x i16>{{.*}}[[A:%.*]], <8 x i16>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// LLVM: ret <8 x i16> [[SHUFFLE]] + return vtrn2q_u16(a, b); +} + +// LLVM-LABEL: @test_vtrn2_u32( +// CIR-LABEL: @vtrn2_u32( +uint32x2_t test_vtrn2_u32(uint32x2_t a, uint32x2_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !u32i>) [#cir.int<1> : !s64i, #cir.int<3> : !s64i] : !cir.vector<2 x !u32i> + +// LLVM-SAME: <2 x i32>{{.*}}[[A:%.*]], <2 x i32>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <2 x i32> [[A]], <2 x i32> [[B]], <2 x i32> +// LLVM: ret <2 x i32> [[SHUFFLE]] + return vtrn2_u32(a, b); +} + +// LLVM-LABEL: @test_vtrn2q_u32( +// CIR-LABEL: @vtrn2q_u32( +uint32x4_t test_vtrn2q_u32(uint32x4_t a, uint32x4_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, 
%{{.*}} : !cir.vector<4 x !u32i>) [#cir.int<1> : !s64i, #cir.int<5> : !s64i, #cir.int<3> : !s64i, #cir.int<7> : !s64i] : !cir.vector<4 x !u32i> + +// LLVM-SAME: <4 x i32>{{.*}}[[A:%.*]], <4 x i32>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> +// LLVM: ret <4 x i32> [[SHUFFLE]] + return vtrn2q_u32(a, b); +} + +// LLVM-LABEL: @test_vtrn2q_u64( +// CIR-LABEL: @vtrn2q_u64( +uint64x2_t test_vtrn2q_u64(uint64x2_t a, uint64x2_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !u64i>) [#cir.int<1> : !s64i, #cir.int<3> : !s64i] : !cir.vector<2 x !u64i> + +// LLVM-SAME: <2 x i64>{{.*}}[[A:%.*]], <2 x i64>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> +// LLVM: ret <2 x i64> [[SHUFFLE]] + return vtrn2q_u64(a, b); +} + +// LLVM-LABEL: @test_vtrn2q_p64( +// CIR-LABEL: @vtrn2q_p64( +poly64x2_t test_vtrn2q_p64(poly64x2_t a, poly64x2_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !u64i>) [#cir.int<1> : !s64i, #cir.int<3> : !s64i] : !cir.vector<2 x !u64i> + +// LLVM-SAME: <2 x i64>{{.*}}[[A:%.*]], <2 x i64>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> +// LLVM: ret <2 x i64> [[SHUFFLE]] + return vtrn2q_p64(a, b); +} + +// LLVM-LABEL: @test_vtrn2_f32( +// CIR-LABEL: @vtrn2_f32( +float32x2_t test_vtrn2_f32(float32x2_t a, float32x2_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !cir.float>) [#cir.int<1> : !s64i, #cir.int<3> : !s64i] : !cir.vector<2 x !cir.float> + +// LLVM-SAME: <2 x float>{{.*}}[[A:%.*]], <2 x float>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[A]], <2 x float> [[B]], <2 x i32> +// LLVM: ret <2 x float> [[SHUFFLE]] + return vtrn2_f32(a, b); +} + +// LLVM-LABEL: @test_vtrn2q_f32( +// CIR-LABEL: @vtrn2q_f32( +float32x4_t test_vtrn2q_f32(float32x4_t a, float32x4_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : 
!cir.vector<4 x !cir.float>) [#cir.int<1> : !s64i, #cir.int<5> : !s64i, #cir.int<3> : !s64i, #cir.int<7> : !s64i] : !cir.vector<4 x !cir.float> + +// LLVM-SAME: <4 x float>{{.*}}[[A:%.*]], <4 x float>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> +// LLVM: ret <4 x float> [[SHUFFLE]] + return vtrn2q_f32(a, b); +} + +// LLVM-LABEL: @test_vtrn2q_f64( +// CIR-LABEL: @vtrn2q_f64( +float64x2_t test_vtrn2q_f64(float64x2_t a, float64x2_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<2 x !cir.double>) [#cir.int<1> : !s64i, #cir.int<3> : !s64i] : !cir.vector<2 x !cir.double> + +// LLVM-SAME: <2 x double>{{.*}}[[A:%.*]], <2 x double>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> +// LLVM: ret <2 x double> [[SHUFFLE]] + return vtrn2q_f64(a, b); +} + +// LLVM-LABEL: @test_vtrn2_p8( +// CIR-LABEL: @vtrn2_p8( +poly8x8_t test_vtrn2_p8(poly8x8_t a, poly8x8_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !u8i>) [#cir.int<1> : !s64i, #cir.int<9> : !s64i, #cir.int<3> : !s64i, #cir.int<11> : !s64i, #cir.int<5> : !s64i, #cir.int<13> : !s64i, #cir.int<7> : !s64i, #cir.int<15> : !s64i] : !cir.vector<8 x !u8i> + +// LLVM-SAME: <8 x i8>{{.*}}[[A:%.*]], <8 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// LLVM: ret <8 x i8> [[SHUFFLE]] + return vtrn2_p8(a, b); +} + +// LLVM-LABEL: @test_vtrn2q_p8( +// CIR-LABEL: @vtrn2q_p8( +poly8x16_t test_vtrn2q_p8(poly8x16_t a, poly8x16_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x !u8i>) [#cir.int<1> : !s64i, #cir.int<17> : !s64i, #cir.int<3> : !s64i, #cir.int<19> : !s64i, #cir.int<5> : !s64i, #cir.int<21> : !s64i, #cir.int<7> : !s64i, #cir.int<23> : !s64i, #cir.int<9> : !s64i, #cir.int<25> : !s64i, #cir.int<11> : !s64i, #cir.int<27> : !s64i, #cir.int<13> : !s64i, #cir.int<29> : !s64i, #cir.int<15> : !s64i, #cir.int<31> 
: !s64i] : !cir.vector<16 x !u8i> + +// LLVM-SAME: <16 x i8>{{.*}}[[A:%.*]], <16 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// LLVM: ret <16 x i8> [[SHUFFLE]] + return vtrn2q_p8(a, b); +} + +// LLVM-LABEL: @test_vtrn2_p16( +// CIR-LABEL: @vtrn2_p16( +poly16x4_t test_vtrn2_p16(poly16x4_t a, poly16x4_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<4 x !u16i>) [#cir.int<1> : !s64i, #cir.int<5> : !s64i, #cir.int<3> : !s64i, #cir.int<7> : !s64i] : !cir.vector<4 x !u16i> + +// LLVM-SAME: <4 x i16>{{.*}}[[A:%.*]], <4 x i16>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> [[B]], <4 x i32> +// LLVM: ret <4 x i16> [[SHUFFLE]] + return vtrn2_p16(a, b); +} + +// LLVM-LABEL: @test_vtrn2q_p16( +// CIR-LABEL: @vtrn2q_p16( +poly16x8_t test_vtrn2q_p16(poly16x8_t a, poly16x8_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !u16i>) [#cir.int<1> : !s64i, #cir.int<9> : !s64i, #cir.int<3> : !s64i, #cir.int<11> : !s64i, #cir.int<5> : !s64i, #cir.int<13> : !s64i, #cir.int<7> : !s64i, #cir.int<15> : !s64i] : !cir.vector<8 x !u16i> + +// LLVM-SAME: <8 x i16>{{.*}}[[A:%.*]], <8 x i16>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> +// LLVM: ret <8 x i16> [[SHUFFLE]] + return vtrn2q_p16(a, b); +} + +// LLVM-LABEL: @test_vtrn2_mf8( +// CIR-LABEL: @vtrn2_mf8( +mfloat8x8_t test_vtrn2_mf8(mfloat8x8_t a, mfloat8x8_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<8 x !u8i>) [#cir.int<1> : !s64i, #cir.int<9> : !s64i, #cir.int<3> : !s64i, #cir.int<11> : !s64i, #cir.int<5> : !s64i, #cir.int<13> : !s64i, #cir.int<7> : !s64i, #cir.int<15> : !s64i] : !cir.vector<8 x !u8i> + +// LLVM-SAME: <8 x i8>{{.*}}[[A:%.*]], <8 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// LLVM: ret <8 x i8> [[SHUFFLE]] + return vtrn2_mf8(a, b); +} + +// LLVM-LABEL: 
@test_vtrn2q_mf8( +// CIR-LABEL: @vtrn2q_mf8( +mfloat8x16_t test_vtrn2q_mf8(mfloat8x16_t a, mfloat8x16_t b) { +// CIR: cir.vec.shuffle(%{{.*}}, %{{.*}} : !cir.vector<16 x !u8i>) [#cir.int<1> : !s64i, #cir.int<17> : !s64i, #cir.int<3> : !s64i, #cir.int<19> : !s64i, #cir.int<5> : !s64i, #cir.int<21> : !s64i, #cir.int<7> : !s64i, #cir.int<23> : !s64i, #cir.int<9> : !s64i, #cir.int<25> : !s64i, #cir.int<11> : !s64i, #cir.int<27> : !s64i, #cir.int<13> : !s64i, #cir.int<29> : !s64i, #cir.int<15> : !s64i, #cir.int<31> : !s64i] : !cir.vector<16 x !u8i> + +// LLVM-SAME: <16 x i8>{{.*}}[[A:%.*]], <16 x i8>{{.*}}[[B:%.*]]) +// LLVM: [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// LLVM: ret <16 x i8> [[SHUFFLE]] + return vtrn2q_mf8(a, b); +} diff --git a/clang/test/CodeGen/AArch64/poly64.c b/clang/test/CodeGen/AArch64/poly64.c index 1a7eceefa6a58..50617f531e6a1 100644 --- a/clang/test/CodeGen/AArch64/poly64.c +++ b/clang/test/CodeGen/AArch64/poly64.c @@ -515,26 +515,6 @@ poly64x2_t test_vextq_p64(poly64x2_t a, poly64x2_t b) { return vextq_p64(a, b, 1); } -// CHECK-LABEL: define dso_local <2 x i64> @test_vtrn1q_p64( -// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> -// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]] -// -poly64x2_t test_vtrn1q_p64(poly64x2_t a, poly64x2_t b) { - return vtrn1q_p64(a, b); -} - -// CHECK-LABEL: define dso_local <2 x i64> @test_vtrn2q_p64( -// CHECK-SAME: <2 x i64> noundef [[A:%.*]], <2 x i64> noundef [[B:%.*]]) #[[ATTR0]] { -// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <2 x i64> [[A]], <2 x i64> [[B]], <2 x i32> -// CHECK-NEXT: ret <2 x i64> [[SHUFFLE_I]] -// -poly64x2_t test_vtrn2q_p64(poly64x2_t a, poly64x2_t b) { - return vtrn2q_u64(a, b); -} - // CHECK-LABEL: define dso_local <1 x i64> @test_vsri_n_p64( // CHECK-SAME: <1 x i64> 
noundef [[A:%.*]], <1 x i64> noundef [[B:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] From f098a2272403ac08ffa630cc9b62f7f37c6191d7 Mon Sep 17 00:00:00 2001 From: Dmitrii Lebed Date: Thu, 14 May 2026 01:03:10 -0700 Subject: [PATCH 38/95] [clang-format] Add BreakBeforeReturnType option (#197268) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In certain codebases (e.g. embedded) — function declarations could accumulate a long prefix of specifiers and attributes (`static`, `inline`, `__attribute__((...))`, project-specific `AttributeMacros`, etc.) before the return type, which buries the core prototype and pushes parameters past the column limit. This patch adds a `BreakBeforeReturnType` style option that places that prefix on its own line(s): ```cpp __attribute__((always_inline)) static inline int do_thing(int a, int b, int c); ``` The recognized prefix tokens are function/storage specifiers (`static`, `extern`, `inline`, `virtual`, `constexpr`, `consteval`, `friend`, `export`, `_Noreturn`, `__forceinline`), C++11 attribute groups `[[...]]`, GNU/MSVC attribute groups `__attribute__((...))` / `__declspec(...)`, and identifiers configured via `AttributeMacros`. The new `BreakBeforeReturnTypeStyle` enum has values `None`, `All`, `TopLevel`, `AllDefinitions`, and `TopLevelDefinitions`. The default is `None`, preserving previous behavior. Constructors and destructors are not affected. The option composes with `BreakAfterReturnType`, `BreakAfterAttributes`, and `BreakTemplateDeclarations`. `ContinuationIndenter::getNewLineColumn` is adjusted so the wrapped return type is dedented to the line's base indent when the preceding token is a function/storage specifier keyword, matching the behavior already used after attribute groups. Adds tests in `FormatTest.cpp`. 
Assisted-by: Claude (claude-opus-4-7, Claude Code) --- clang/docs/ClangFormatStyleOptions.rst | 32 ++++ clang/docs/ReleaseNotes.rst | 2 + clang/include/clang/Format/Format.h | 26 ++++ clang/lib/Format/ContinuationIndenter.cpp | 4 +- clang/lib/Format/Format.cpp | 15 ++ clang/lib/Format/FormatToken.h | 9 ++ clang/lib/Format/TokenAnnotator.cpp | 72 +++++++++ clang/lib/Format/TokenAnnotator.h | 2 + clang/unittests/Format/ConfigParseTest.cpp | 12 ++ clang/unittests/Format/FormatTest.cpp | 166 +++++++++++++++++++++ 10 files changed, 339 insertions(+), 1 deletion(-) diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst index f852f76f5038c..7b1b7a7384b07 100644 --- a/clang/docs/ClangFormatStyleOptions.rst +++ b/clang/docs/ClangFormatStyleOptions.rst @@ -3694,6 +3694,38 @@ the configuration (without a prefix: ``Auto``). +.. _BreakBeforeReturnType: + +**BreakBeforeReturnType** (``BreakBeforeReturnTypeStyle``) :versionbadge:`clang-format 23` :ref:`¶ ` + The function declaration/definition return type breaking style to use. + Trailing return types (``auto f() -> T``) are not affected. To have + identifier macros (e.g. ``__always_inline``) treated as specifiers, + add them to ``AttributeMacros``. + + Possible values: + + * ``BBRTS_None`` (in configuration: ``None``) + Do not force a break before the return type. + + * ``BBRTS_All`` (in configuration: ``All``) + Always break before the return type. + + .. code-block:: c++ + + static inline + void f(); + + * ``BBRTS_TopLevel`` (in configuration: ``TopLevel``) + Break before the return type of top-level functions only. + + * ``BBRTS_AllDefinitions`` (in configuration: ``AllDefinitions``) + Break before the return type of function definitions only. + + * ``BBRTS_TopLevelDefinitions`` (in configuration: ``TopLevelDefinitions``) + Break before the return type of top-level definitions only. + + + .. 
_BreakBeforeTemplateCloser: **BreakBeforeTemplateCloser** (``Boolean``) :versionbadge:`clang-format 21` :ref:`¶ ` diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index b49286b35c6b0..a9884beee2710 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -769,6 +769,8 @@ clang-format declaration parameters. - Add ``EnumAssignments`` option to ``AlignConsecutiveAssignments`` for aligning enum assignments without affecting other assignments. +- Add ``BreakBeforeReturnType`` option to break before the function return + type. libclang -------- diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index eca3cc44c41b6..27b2d8f4a405b 100644 --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -2462,6 +2462,31 @@ struct FormatStyle { /// \version 16 BreakBeforeInlineASMColonStyle BreakBeforeInlineASMColon; + /// Different ways to break before the function return type. + enum BreakBeforeReturnTypeStyle : int8_t { + /// Do not force a break before the return type. + BBRTS_None, + /// Always break before the return type. + /// \code + /// static inline + /// void f(); + /// \endcode + BBRTS_All, + /// Break before the return type of top-level functions only. + BBRTS_TopLevel, + /// Break before the return type of function definitions only. + BBRTS_AllDefinitions, + /// Break before the return type of top-level definitions only. + BBRTS_TopLevelDefinitions, + }; + + /// The function declaration/definition return type breaking style to use. + /// Trailing return types (``auto f() -> T``) are not affected. To have + /// identifier macros (e.g. ``__always_inline``) treated as specifiers, + /// add them to ``AttributeMacros``. + /// \version 23 + BreakBeforeReturnTypeStyle BreakBeforeReturnType; + /// If ``true``, break before a template closing bracket (``>``) when there is /// a line break after the matching opening bracket (``<``). 
/// \code @@ -6092,6 +6117,7 @@ struct FormatStyle { BreakBeforeCloseBracketSwitch == R.BreakBeforeCloseBracketSwitch && BreakBeforeConceptDeclarations == R.BreakBeforeConceptDeclarations && BreakBeforeInlineASMColon == R.BreakBeforeInlineASMColon && + BreakBeforeReturnType == R.BreakBeforeReturnType && BreakBeforeTemplateCloser == R.BreakBeforeTemplateCloser && BreakBeforeTernaryOperators == R.BreakBeforeTernaryOperators && BreakBinaryOperations == R.BreakBinaryOperations && diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp index 338515ec6da21..361072127f8e1 100644 --- a/clang/lib/Format/ContinuationIndenter.cpp +++ b/clang/lib/Format/ContinuationIndenter.cpp @@ -1654,7 +1654,9 @@ ContinuationIndenter::getNewLineColumn(const LineState &State) { TT_JavaAnnotation, TT_LeadingJavaAnnotation))) || (!Style.IndentWrappedFunctionNames && - NextNonComment->isOneOf(tok::kw_operator, TT_FunctionDeclarationName))) { + NextNonComment->isOneOf(tok::kw_operator, TT_FunctionDeclarationName)) || + (State.Line->ReturnTypeWrapped && PreviousNonComment && + isReturnTypePrefixSpecifier(*PreviousNonComment))) { return std::max(IndentationAndAlignment(CurrentState.LastSpace), CurrentState.Indent); } diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index ec0ad98f37753..a29d62c99bb95 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -749,6 +749,19 @@ struct ScalarEnumerationTraits { } }; +template <> +struct ScalarEnumerationTraits { + static void enumeration(IO &IO, + FormatStyle::BreakBeforeReturnTypeStyle &Value) { + IO.enumCase(Value, "None", FormatStyle::BBRTS_None); + IO.enumCase(Value, "All", FormatStyle::BBRTS_All); + IO.enumCase(Value, "TopLevel", FormatStyle::BBRTS_TopLevel); + IO.enumCase(Value, "AllDefinitions", FormatStyle::BBRTS_AllDefinitions); + IO.enumCase(Value, "TopLevelDefinitions", + FormatStyle::BBRTS_TopLevelDefinitions); + } +}; + template <> struct 
ScalarEnumerationTraits { static void enumeration(IO &IO, FormatStyle::SeparateDefinitionStyle &Value) { @@ -1317,6 +1330,7 @@ template <> struct MappingTraits { IO.mapOptional("BreakBeforeBraces", Style.BreakBeforeBraces); IO.mapOptional("BreakBeforeInlineASMColon", Style.BreakBeforeInlineASMColon); + IO.mapOptional("BreakBeforeReturnType", Style.BreakBeforeReturnType); IO.mapOptional("BreakBeforeTemplateCloser", Style.BreakBeforeTemplateCloser); IO.mapOptional("BreakBeforeTernaryOperators", @@ -1889,6 +1903,7 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) { LLVMStyle.BreakBeforeCloseBracketSwitch = false; LLVMStyle.BreakBeforeConceptDeclarations = FormatStyle::BBCDS_Always; LLVMStyle.BreakBeforeInlineASMColon = FormatStyle::BBIAS_OnlyMultiline; + LLVMStyle.BreakBeforeReturnType = FormatStyle::BBRTS_None; LLVMStyle.BreakBeforeTemplateCloser = false; LLVMStyle.BreakBeforeTernaryOperators = true; LLVMStyle.BreakBinaryOperations = {FormatStyle::BBO_Never, {}}; diff --git a/clang/lib/Format/FormatToken.h b/clang/lib/Format/FormatToken.h index 1d8f0f1cfe412..7f6721a87877a 100644 --- a/clang/lib/Format/FormatToken.h +++ b/clang/lib/Format/FormatToken.h @@ -2132,6 +2132,15 @@ inline bool continuesLineComment(const FormatToken &FormatTok, // Returns \c true if \c Current starts a new parameter. bool startsNextParameter(const FormatToken &Current, const FormatStyle &Style); +// Returns \c true if \c Tok is a function/storage specifier that may appear +// before a function return type (e.g. ``static``, ``inline``, ``constexpr``). 
+inline bool isReturnTypePrefixSpecifier(const FormatToken &Tok) { + return Tok.isOneOf(tok::kw_static, tok::kw_extern, tok::kw_inline, + tok::kw_virtual, tok::kw_constexpr, tok::kw_consteval, + tok::kw_friend, tok::kw_export, tok::kw__Noreturn, + tok::kw___forceinline); +} + } // namespace format } // namespace clang diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index afdb59617fb2a..9181e0e9d5a2a 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -4062,6 +4062,67 @@ bool TokenAnnotator::mustBreakForReturnType(const AnnotatedLine &Line) const { return false; } +bool TokenAnnotator::mustBreakBeforeReturnType( + const AnnotatedLine &Line) const { + assert(Line.MightBeFunctionDecl); + + switch (Style.BreakBeforeReturnType) { + case FormatStyle::BBRTS_None: + return false; + case FormatStyle::BBRTS_All: + return true; + case FormatStyle::BBRTS_TopLevel: + return Line.Level == 0; + case FormatStyle::BBRTS_AllDefinitions: + return Line.mightBeFunctionDefinition(); + case FormatStyle::BBRTS_TopLevelDefinitions: + return Line.Level == 0 && Line.mightBeFunctionDefinition(); + } + + return false; +} + +static FormatToken *findReturnTypeStart(const AnnotatedLine &Line) { + auto *Tok = Line.getFirstNonComment(); + if (!Tok) + return nullptr; + + if (Tok->is(tok::kw_template)) { + auto *Opener = Tok->Next; + while (Opener && Opener->isNot(TT_TemplateOpener)) + Opener = Opener->Next; + if (!Opener || !Opener->MatchingParen) + return nullptr; + Tok = Opener->MatchingParen->Next; + } + + if (Tok && Tok->is(TT_RequiresClause)) { + while (Tok && !Tok->ClosesRequiresClause) + Tok = Tok->Next; + if (Tok) + Tok = Tok->Next; + } + + while (Tok) { + if (isReturnTypePrefixSpecifier(*Tok) || + Tok->isOneOf(tok::kw___attribute, tok::kw___declspec, + TT_AttributeMacro)) { + auto *Next = Tok->Next; + if (Next && Next->is(tok::l_paren) && Next->MatchingParen) + Tok = Next->MatchingParen->Next; + else + Tok = 
Next; + continue; + } + if (Tok->is(TT_AttributeLSquare) && Tok->MatchingParen) { + Tok = Tok->MatchingParen->Next; + continue; + } + break; + } + return Tok; +} + void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) const { if (Line.Computed) return; @@ -4180,6 +4241,17 @@ void TokenAnnotator::calculateFormattingInformation(AnnotatedLine &Line) const { } } + if (Line.MightBeFunctionDecl && LineIsFunctionDeclaration && + mustBreakBeforeReturnType(Line)) { + if (auto *ReturnTypeStart = findReturnTypeStart(Line); + ReturnTypeStart && ReturnTypeStart != FirstNonComment && + ReturnTypeStart->isNoneOf(TT_FunctionDeclarationName, + TT_CtorDtorDeclName, tok::tilde)) { + ReturnTypeStart->MustBreakBefore = true; + Line.ReturnTypeWrapped = true; + } + } + if (First->is(TT_ElseLBrace)) { First->CanBreakBefore = true; First->MustBreakBefore = true; diff --git a/clang/lib/Format/TokenAnnotator.h b/clang/lib/Format/TokenAnnotator.h index 33c7df9d0f949..52d6e5ca56915 100644 --- a/clang/lib/Format/TokenAnnotator.h +++ b/clang/lib/Format/TokenAnnotator.h @@ -256,6 +256,8 @@ class TokenAnnotator { bool mustBreakForReturnType(const AnnotatedLine &Line) const; + bool mustBreakBeforeReturnType(const AnnotatedLine &Line) const; + void printDebugInfo(const AnnotatedLine &Line) const; void calculateUnbreakableTailLengths(AnnotatedLine &Line) const; diff --git a/clang/unittests/Format/ConfigParseTest.cpp b/clang/unittests/Format/ConfigParseTest.cpp index f731922030777..eeaf5d3f66d96 100644 --- a/clang/unittests/Format/ConfigParseTest.cpp +++ b/clang/unittests/Format/ConfigParseTest.cpp @@ -927,6 +927,18 @@ TEST(ConfigParseTest, ParsesConfiguration) { CHECK_PARSE("AlwaysBreakAfterReturnType: TopLevelDefinitions", BreakAfterReturnType, FormatStyle::RTBS_TopLevelDefinitions); + Style.BreakBeforeReturnType = FormatStyle::BBRTS_All; + CHECK_PARSE("BreakBeforeReturnType: None", BreakBeforeReturnType, + FormatStyle::BBRTS_None); + CHECK_PARSE("BreakBeforeReturnType: All", 
BreakBeforeReturnType, + FormatStyle::BBRTS_All); + CHECK_PARSE("BreakBeforeReturnType: TopLevel", BreakBeforeReturnType, + FormatStyle::BBRTS_TopLevel); + CHECK_PARSE("BreakBeforeReturnType: AllDefinitions", BreakBeforeReturnType, + FormatStyle::BBRTS_AllDefinitions); + CHECK_PARSE("BreakBeforeReturnType: TopLevelDefinitions", + BreakBeforeReturnType, FormatStyle::BBRTS_TopLevelDefinitions); + Style.BreakTemplateDeclarations = FormatStyle::BTDS_Yes; CHECK_PARSE("BreakTemplateDeclarations: Leave", BreakTemplateDeclarations, FormatStyle::BTDS_Leave); diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index 4245bd1c58153..dbc8a00ad1c9b 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -10594,6 +10594,172 @@ TEST_F(FormatTest, ReturnTypeBreakingStyle) { verifyFormat("void foo (int a, int b);", Style); } +TEST_F(FormatTest, BreakBeforeReturnType) { + FormatStyle Style = getLLVMStyle(); + Style.BreakBeforeReturnType = FormatStyle::BBRTS_All; + + verifyFormat("static inline\n" + "void myfun(void);", + Style); + verifyFormat("static\n" + "int x(void);", + Style); + + verifyFormat("void f(void);", Style); + verifyFormat("int g(int a);", Style); + + // Constructors and destructors are not affected. 
+ verifyFormat("class C {\n" + " explicit C(int);\n" + " virtual ~C();\n" + "};", + Style); + + verifyFormat("__attribute__((always_inline)) static inline\n" + "void f(void);", + Style); + verifyFormat("static __forceinline\n" + "void f(void);", + Style); + verifyFormat("export\n" + "int f();", + Style); + verifyFormat( + "__attribute__((section(\".init\"), always_inline)) static inline\n" + "int boot(void);", + Style); + verifyFormat("[[nodiscard]] static constexpr\n" + "int f();", + Style); + verifyFormat("static\n" + "const struct foo *g(void);", + Style); + verifyFormat("class A {\n" + " friend\n" + " int f();\n" + "};", + Style); + + verifyFormat("static int x = 0;", Style); + verifyFormat("static const char *msg;", Style); + + verifyFormat("static\n" + "auto f() -> int;", + Style); + + Style.ColumnLimit = 50; + verifyFormat("__attribute__((always_inline)) static inline\n" + "int do_thing(int a, int b, int c);", + Style); + Style.ColumnLimit = 80; + + verifyFormat("static inline\n" + "int compute(int x) {\n" + " ++x;\n" + " return x;\n" + "}", + Style); + + Style.BreakAfterReturnType = FormatStyle::RTBS_All; + verifyFormat("static inline\n" + "void\n" + "f(void);", + Style); + Style.BreakAfterReturnType = FormatStyle::RTBS_None; + + Style.BreakAfterAttributes = FormatStyle::ABS_Always; + verifyFormat("[[nodiscard]]\n" + "static\n" + "int f();", + Style); + Style.BreakAfterAttributes = FormatStyle::ABS_Leave; + + Style.BreakTemplateDeclarations = FormatStyle::BTDS_Yes; + verifyFormat("template \n" + "static inline\n" + "T f();", + Style); + verifyFormat("template \n" + " requires Foo\n" + "static inline\n" + "T f();", + Style); + Style.BreakTemplateDeclarations = FormatStyle::BTDS_Leave; + + Style.BreakBeforeReturnType = FormatStyle::BBRTS_AllDefinitions; + verifyFormat("class A {\n" + " static inline int member();\n" + " static inline\n" + " int member_def() {\n" + " return 0;\n" + " }\n" + "};\n" + "static inline int top_decl();\n" + "static inline\n" + "int 
top_defn() {\n" + " ++x;\n" + " return 0;\n" + "}", + Style); + + Style.BreakBeforeReturnType = FormatStyle::BBRTS_TopLevel; + verifyFormat("class A {\n" + " static inline int member();\n" + " static inline int member_def() { return 0; }\n" + "};\n" + "static inline\n" + "int top_decl();\n" + "static inline\n" + "int top_defn() {\n" + " ++x;\n" + " return 0;\n" + "}", + Style); + + Style.BreakBeforeReturnType = FormatStyle::BBRTS_TopLevelDefinitions; + verifyFormat("class A {\n" + " static inline int member();\n" + " static inline int member_def() { return 0; }\n" + "};\n" + "static inline int top_decl();\n" + "static inline\n" + "int top_defn() {\n" + " ++x;\n" + " return 0;\n" + "}", + Style); + + Style.BreakBeforeReturnType = FormatStyle::BBRTS_All; + + Style.AttributeMacros = {"__always_inline"}; + verifyFormat("__always_inline\n" + "void f(void);", + Style); + + Style.AttributeMacros = {"__always_inline", "LIBC_INLINE"}; + verifyFormat("LIBC_INLINE static __always_inline\n" + "int compute(int x);", + Style); + + Style.AttributeMacros = {"ATTRIBUTE_PRINTF"}; + verifyFormat("ATTRIBUTE_PRINTF(1, 2) static\n" + "void log(const char *fmt, ...);", + Style); + + // Same identifier: unconfigured -> not a specifier; configured -> specifier. + Style.AttributeMacros = {}; + verifyFormat("FOO static void f(void);", Style); + Style.AttributeMacros = {"FOO"}; + verifyFormat("FOO static\n" + "void f(void);", + Style); + + Style.AttributeMacros = {"LIBC_INLINE"}; + verifyFormat("[[nodiscard]] __attribute__((pure)) LIBC_INLINE static\n" + "int hash(int k);", + Style); +} + TEST_F(FormatTest, AlwaysBreakBeforeMultilineStrings) { FormatStyle NoBreak = getLLVMStyle(); NoBreak.AlwaysBreakBeforeMultilineStrings = false; From 2045ee5f2922a2619e9c70f010df3bf32b8299ca Mon Sep 17 00:00:00 2001 From: Jeff Bailey Date: Thu, 14 May 2026 08:03:52 +0000 Subject: [PATCH 39/95] Add new libc GH team to CODEOWNERS (#197630) This auto-assigns PR reviewers, per the GitHub documentation. 
--- .github/CODEOWNERS | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index deac961a1b025..6599dea8a3b34 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -11,6 +11,7 @@ # See https://llvm.org/docs/DeveloperPolicy.html#maintainers as well as the # Maintainers.* files in the the respective subproject directories. +/libc/ @llvm/reviewers-libc /libcxx/ @llvm/reviewers-libcxx /libcxxabi/ @llvm/reviewers-libcxxabi /libunwind/ @llvm/reviewers-libunwind From e1135dc2bf175c975781629bdae5982fefbc51a9 Mon Sep 17 00:00:00 2001 From: Alex Duran  Date: Thu, 14 May 2026 10:07:19 +0200 Subject: [PATCH 40/95] [OFFLOAD][L0] Simplify kernel setGroups logic (#197411) This code path is not really used with upstream code generation. --- .../level_zero/include/L0Kernel.h | 51 ---- .../level_zero/src/L0Kernel.cpp | 232 +----------------- 2 files changed, 12 insertions(+), 271 deletions(-) diff --git a/offload/plugins-nextgen/level_zero/include/L0Kernel.h b/offload/plugins-nextgen/level_zero/include/L0Kernel.h index 4115213b00f49..686c038b33a7f 100644 --- a/offload/plugins-nextgen/level_zero/include/L0Kernel.h +++ b/offload/plugins-nextgen/level_zero/include/L0Kernel.h @@ -23,32 +23,6 @@ namespace llvm::omp::target::plugin { class L0DeviceTy; class L0ProgramTy; -/// Loop descriptor. -struct TgtLoopDescTy { - int64_t Lb = 0; // The lower bound of the i-th loop. - int64_t Ub = 0; // The upper bound of the i-th loop. - int64_t Stride = 0; // The stride of the i-th loop. - - bool operator==(const TgtLoopDescTy &other) const { - return Lb == other.Lb && Ub == other.Ub && Stride == other.Stride; - } -}; - -struct TgtNDRangeDescTy { - int32_t NumLoops = 0; // Number of loops/dimensions. - int32_t DistributeDim = 0; // Dimensions lower than this one - // must end up in one WG. - TgtLoopDescTy Levels[3]; // Up to 3 loops.
- - bool operator==(const TgtNDRangeDescTy &other) const { - return NumLoops == other.NumLoops && DistributeDim == other.DistributeDim && - std::equal(Levels, Levels + 3, other.Levels); - } - bool operator!=(const TgtNDRangeDescTy &other) const { - return !(*this == other); - } -}; - /// Forward declaration. struct L0LaunchEnvTy; @@ -59,26 +33,9 @@ struct KernelPropertiesTy { uint32_t MaxThreadGroupSize = 0; uint32_t NumKernelArgs = 0; std::unique_ptr ArgSizes; - - /// Cached input parameters used in the previous launch. - int32_t NumTeams = -1; - int32_t ThreadLimit = -1; - - /// Cached parameters used in the previous launch. ze_kernel_indirect_access_flags_t IndirectAccessFlags = std::numeric_limits::max(); - uint32_t GroupSizes[3] = {0, 0, 0}; - ze_group_count_t GroupCounts{0, 0, 0}; - std::mutex Mtx; - - /// Check if we can reuse group parameters. - bool reuseGroupParams(const int32_t NumTeamsIn, const int32_t ThreadLimitIn, - uint32_t *GroupSizesOut, L0LaunchEnvTy &KEnv) const; - - /// Update cached group parameters. - void cacheGroupParams(const int32_t NumTeamsIn, const int32_t ThreadLimitIn, - const uint32_t *GroupSizesIn, L0LaunchEnvTy &KEnv); }; struct L0LaunchEnvTy { @@ -102,10 +59,6 @@ class L0KernelTy : public GenericKernelTy { // Kernel Properties. 
mutable KernelPropertiesTy Properties; - void decideKernelGroupArguments(L0DeviceTy &Device, uint32_t NumTeams, - uint32_t ThreadLimit, uint32_t *GroupSizes, - L0LaunchEnvTy &KEnv) const; - Error buildKernel(L0ProgramTy &Program); Error readKernelProperties(L0ProgramTy &Program); @@ -143,10 +96,6 @@ class L0KernelTy : public GenericKernelTy { } ze_kernel_handle_t getZeKernel() const { return zeKernel; } - - Error getGroupsShape(L0DeviceTy &Device, int32_t NumTeams, - int32_t ThreadLimit, uint32_t *GroupSizes, - L0LaunchEnvTy &KEnv) const; }; } // namespace llvm::omp::target::plugin diff --git a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp index 4a13637d2f0ce..8c4766a1b46e0 100644 --- a/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp +++ b/offload/plugins-nextgen/level_zero/src/L0Kernel.cpp @@ -19,28 +19,6 @@ namespace llvm::omp::target::plugin { -bool KernelPropertiesTy::reuseGroupParams(const int32_t NumTeamsIn, - const int32_t ThreadLimitIn, - uint32_t *GroupSizesOut, - L0LaunchEnvTy &KEnv) const { - if (NumTeamsIn != NumTeams || ThreadLimitIn != ThreadLimit) - return false; - // Found matching input parameters. 
- std::copy_n(GroupSizes, 3, GroupSizesOut); - KEnv.GroupCounts = GroupCounts; - return true; -} - -void KernelPropertiesTy::cacheGroupParams(const int32_t NumTeamsIn, - const int32_t ThreadLimitIn, - const uint32_t *GroupSizesIn, - L0LaunchEnvTy &KEnv) { - NumTeams = NumTeamsIn; - ThreadLimit = ThreadLimitIn; - std::copy_n(GroupSizesIn, 3, GroupSizes); - GroupCounts = KEnv.GroupCounts; -} - Error L0KernelTy::readKernelProperties(L0ProgramTy &Program) { const auto &l0Device = L0DeviceTy::makeL0Device(Program.getDevice()); auto &KernelPR = getProperties(); @@ -107,167 +85,6 @@ Error L0KernelTy::initImpl(GenericDeviceTy &GenericDevice, return Plugin::success(); } -void L0KernelTy::decideKernelGroupArguments(L0DeviceTy &Device, - uint32_t NumTeams, - uint32_t ThreadLimit, - uint32_t *GroupSizes, - L0LaunchEnvTy &KEnv) const { - - const KernelPropertiesTy &KernelPR = getProperties(); - - const auto DeviceId = Device.getDeviceId(); - bool MaxGroupSizeForced = false; - bool MaxGroupCountForced = false; - uint32_t MaxGroupSize = Device.getMaxGroupSize(); - const auto &Option = Device.getPlugin().getOptions(); - const auto OptSubscRate = Option.SubscriptionRate; - auto &GroupCounts = KEnv.GroupCounts; - - uint32_t SIMDWidth = KernelPR.SIMDWidth; - uint32_t KernelWidth = KernelPR.Width; - uint32_t KernelMaxThreadGroupSize = KernelPR.MaxThreadGroupSize; - - if (KernelMaxThreadGroupSize < MaxGroupSize) { - MaxGroupSize = KernelMaxThreadGroupSize; - INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, - "Capping maximum team size to %" PRIu32 - " due to kernel constraints.\n", - MaxGroupSize); - } - - if (ThreadLimit > 0) { - MaxGroupSizeForced = true; - MaxGroupSize = ThreadLimit; - } - - uint32_t MaxGroupCount = 0; - if (NumTeams > 0) { - MaxGroupCount = NumTeams; - MaxGroupCountForced = true; - } - - if (MaxGroupCountForced) { - // If number of teams is specified by the user, then use KernelWidth. 
- // WIs per WG by default, so that it matches - // decideLoopKernelGroupArguments() behavior. - if (!MaxGroupSizeForced) { - MaxGroupSize = KernelWidth; - } - } else { - const uint32_t NumSubslices = Device.getNumSubslices(); - uint32_t NumThreadsPerSubslice = Device.getNumThreadsPerSubslice(); - if (KEnv.HalfNumThreads) - NumThreadsPerSubslice /= 2; - - MaxGroupCount = NumSubslices * NumThreadsPerSubslice; - if (MaxGroupSizeForced) { - // Set group size for the HW capacity. - uint32_t NumThreadsPerGroup = (MaxGroupSize + SIMDWidth - 1) / SIMDWidth; - uint32_t NumGroupsPerSubslice = - (NumThreadsPerSubslice + NumThreadsPerGroup - 1) / NumThreadsPerGroup; - MaxGroupCount = NumGroupsPerSubslice * NumSubslices; - } else { - assert(!MaxGroupSizeForced && !MaxGroupCountForced); - assert((MaxGroupSize <= KernelWidth || MaxGroupSize % KernelWidth == 0) && - "Invalid maxGroupSize"); - // Maximize group size. - while (MaxGroupSize >= KernelWidth) { - uint32_t NumThreadsPerGroup = - (MaxGroupSize + SIMDWidth - 1) / SIMDWidth; - - if (NumThreadsPerSubslice % NumThreadsPerGroup == 0) { - uint32_t NumGroupsPerSubslice = - NumThreadsPerSubslice / NumThreadsPerGroup; - MaxGroupCount = NumGroupsPerSubslice * NumSubslices; - break; - } - MaxGroupSize -= KernelWidth; - } - } - } - - uint32_t GRPCounts[3] = {MaxGroupCount, 1, 1}; - uint32_t GRPSizes[3] = {MaxGroupSize, 1, 1}; - if (!MaxGroupCountForced) { - GRPCounts[0] *= OptSubscRate; - } - GroupCounts.groupCountX = GRPCounts[0]; - GroupCounts.groupCountY = GRPCounts[1]; - GroupCounts.groupCountZ = GRPCounts[2]; - std::copy(GRPSizes, GRPSizes + 3, GroupSizes); -} - -Error L0KernelTy::getGroupsShape(L0DeviceTy &Device, int32_t NumTeams, - int32_t ThreadLimit, uint32_t *GroupSizes, - L0LaunchEnvTy &KEnv) const { - - const auto DeviceId = Device.getDeviceId(); - const auto &KernelPR = getProperties(); - - // Read the most recent global thread limit and max teams. 
- const int32_t NumTeamsICV = 0; - const int32_t ThreadLimitICV = 0; - - bool IsXeHPG = Device.isDeviceArch(DeviceArchTy::DeviceArch_XeHPG); - KEnv.HalfNumThreads = - Device.getPlugin().getOptions().ZeDebugEnabled && IsXeHPG; - uint32_t KernelWidth = KernelPR.Width; - uint32_t SIMDWidth = KernelPR.SIMDWidth; - INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, - "Assumed kernel SIMD width is %" PRIu32 "\n", SIMDWidth); - INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, - "Preferred team size is multiple of %" PRIu32 "\n", KernelWidth); - assert(SIMDWidth <= KernelWidth && "Invalid SIMD width."); - - if (ThreadLimit > 0) { - // use thread_limit clause value default. - ODBG(OLDT_Kernel) << "Max team size is set to " << ThreadLimit - << " (thread_limit clause)"; - } else if (ThreadLimitICV > 0) { - // else use thread-limit-var ICV. - ThreadLimit = ThreadLimitICV; - ODBG(OLDT_Kernel) << "Max team size is set to " << ThreadLimit - << " (thread-limit-icv)"; - } - - size_t MaxThreadLimit = Device.getMaxGroupSize(); - // Set correct max group size if the kernel was compiled with explicit SIMD. - if (SIMDWidth == 1) - MaxThreadLimit = Device.getNumThreadsPerSubslice(); - - if (KernelPR.MaxThreadGroupSize < MaxThreadLimit) { - MaxThreadLimit = KernelPR.MaxThreadGroupSize; - ODBG(OLDT_Kernel) << "Capping maximum team size to " << MaxThreadLimit - << " due to kernel constraints."; - } - - if (ThreadLimit > static_cast(MaxThreadLimit)) { - ThreadLimit = MaxThreadLimit; - ODBG(OLDT_Kernel) << "Max team size exceeds current maximum " - << MaxThreadLimit << ". Adjusted"; - } - // scope code to ease integration with downstream custom code. - { - if (NumTeams > 0) { - ODBG(OLDT_Kernel) << "Number of teams is set to " << NumTeams - << " (num_teams clause or no teams construct)"; - } else if (NumTeamsICV > 0) { - // OMP_NUM_TEAMS only matters, if num_teams() clause is absent. 
- INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, - "OMP_NUM_TEAMS(%" PRId32 ") is ignored\n", NumTeamsICV); - - NumTeams = NumTeamsICV; - ODBG(OLDT_Kernel) << "Max number of teams is set to " << NumTeams - << " (OMP_NUM_TEAMS)"; - } - - decideKernelGroupArguments(Device, (uint32_t)NumTeams, - (uint32_t)ThreadLimit, GroupSizes, KEnv); - } - - return Plugin::success(); -} - static Error launchKernelWithImmCmdList(L0DeviceTy &l0Device, ze_kernel_handle_t zeKernel, L0LaunchEnvTy &KEnv, @@ -379,41 +196,18 @@ static Error launchKernelWithCmdQueue(L0DeviceTy &l0Device, Error L0KernelTy::setKernelGroups(L0DeviceTy &l0Device, L0LaunchEnvTy &KEnv, uint32_t NumThreads[3], uint32_t NumBlocks[3]) const { - - bool HasUserDefinedGroups = NumThreads[0] != 0 && NumThreads[1] != 0 && - NumThreads[2] != 0 && NumBlocks[0] != 0 && - NumBlocks[1] != 0 && NumBlocks[2] != 0; + assert(NumThreads[0] > 0 && NumThreads[1] > 0 && NumThreads[2] > 0 && + "Pre-computed ThreadLimit values must be non-zero"); + assert(NumBlocks[0] > 0 && NumBlocks[1] > 0 && NumBlocks[2] > 0 && + "Pre-computed NumTeams values must be non-zero"); uint32_t GroupSizes[3]; - bool CanReuseParams = false; - - if (HasUserDefinedGroups) { - KEnv.GroupCounts = {NumBlocks[0], NumBlocks[1], NumBlocks[2]}; - // Respect max group size attribute in the kernel. - uint32_t MaxGroupSize = KEnv.KernelPR.MaxThreadGroupSize; - GroupSizes[0] = std::min(MaxGroupSize, NumThreads[0]); - GroupSizes[1] = std::min(MaxGroupSize, NumThreads[1]); - GroupSizes[2] = std::min(MaxGroupSize, NumThreads[2]); - } else { - int32_t NumTeams = NumBlocks[0]; - int32_t ThreadLimit = NumThreads[0]; - if (NumTeams < 0) - NumTeams = 0; - if (ThreadLimit < 0) - ThreadLimit = 0; - - auto &KernelPR = KEnv.KernelPR; - // Check if we can reuse previous group parameters. 
- CanReuseParams = - KernelPR.reuseGroupParams(NumTeams, ThreadLimit, GroupSizes, KEnv); - - if (!CanReuseParams) { - if (auto Err = - getGroupsShape(l0Device, NumTeams, ThreadLimit, GroupSizes, KEnv)) - return Err; - KernelPR.cacheGroupParams(NumTeams, ThreadLimit, GroupSizes, KEnv); - } - } + KEnv.GroupCounts = {NumBlocks[0], NumBlocks[1], NumBlocks[2]}; + // Respect max group size attribute in the kernel. + uint32_t MaxGroupSize = KEnv.KernelPR.MaxThreadGroupSize; + GroupSizes[0] = std::min(MaxGroupSize, NumThreads[0]); + GroupSizes[1] = std::min(MaxGroupSize, NumThreads[1]); + GroupSizes[2] = std::min(MaxGroupSize, NumThreads[2]); auto DeviceId = l0Device.getDeviceId(); INFO(OMP_INFOTYPE_PLUGIN_KERNEL, DeviceId, @@ -424,10 +218,8 @@ Error L0KernelTy::setKernelGroups(L0DeviceTy &l0Device, L0LaunchEnvTy &KEnv, KEnv.GroupCounts.groupCountX, KEnv.GroupCounts.groupCountY, KEnv.GroupCounts.groupCountZ); - if (!CanReuseParams) { - CALL_ZE_RET_ERROR(zeKernelSetGroupSize, getZeKernel(), GroupSizes[0], - GroupSizes[1], GroupSizes[2]); - } + CALL_ZE_RET_ERROR(zeKernelSetGroupSize, getZeKernel(), GroupSizes[0], + GroupSizes[1], GroupSizes[2]); return Plugin::success(); } From 0c539fc057b2e838dc30c7e3229110d0c0e168c8 Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Thu, 14 May 2026 09:24:33 +0100 Subject: [PATCH 41/95] [compiler-rt][ARM] Optimized double-precision FP mul/div (#179923) Optimized AArch32 implementations of `muldf3` and `divdf3` are provided. The division function is particularly tricky because its Newton-Raphson approximation strategy requires a rigorous error bound. In this version of the commit I've left out the full supporting machinery that validates the error bound via Gappa and Rocq, but full details are provided via links to the upstream version of this code in the Arm Optimized Routines repository, and to a pair of Arm Community blog posts. 
--- compiler-rt/lib/builtins/CMakeLists.txt | 2 + compiler-rt/lib/builtins/arm/divdf3.S | 646 +++++++++++++ compiler-rt/lib/builtins/arm/muldf3.S | 404 ++++++++ .../test/builtins/Unit/divdf3new_test.c | 862 ++++++++++++++++++ .../test/builtins/Unit/muldf3new_test.c | 832 +++++++++++++++++ 5 files changed, 2746 insertions(+) create mode 100644 compiler-rt/lib/builtins/arm/divdf3.S create mode 100644 compiler-rt/lib/builtins/arm/muldf3.S create mode 100644 compiler-rt/test/builtins/Unit/divdf3new_test.c create mode 100644 compiler-rt/test/builtins/Unit/muldf3new_test.c diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index f0c2f3ea5ee43..c7e50c714845a 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -469,6 +469,8 @@ if(COMPILER_RT_ARM_OPTIMIZED_FP AND BUILTIN_SUPPORTED_ARCH MATCHES "arm") arm/mulsf3.S arm/divsf3.S arm/adddf3.S + arm/muldf3.S + arm/divdf3.S ) set_source_files_properties(${assembly_files} PROPERTIES COMPILE_OPTIONS ${implicit_it_flag}) diff --git a/compiler-rt/lib/builtins/arm/divdf3.S b/compiler-rt/lib/builtins/arm/divdf3.S new file mode 100644 index 0000000000000..58a9e2690efd3 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/divdf3.S @@ -0,0 +1,646 @@ +//===-- divdf3.S - double-precision floating point division ---------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the __divdf3 function (double precision floating point +// division), with the IEEE-754 default rounding (to nearest, ties to even), +// for the Arm and Thumb2 ISAs. 
+// +//===----------------------------------------------------------------------===// + +#include "../assembly.h" +#include "crt_endian.h" + +// The basic strategy of this division code is to use Newton-Raphson iteration +// to calculate an approximation to 1/y, then multiply it by x. This procedure +// delivers a quotient with 10 extra bits of precision, but which isn't exact. +// We know an upper bound on its possible error, which gives an interval of +// possible values for the true quotient. So we can check the 10 extra bits to +// see whether a rounding boundary lies within the interval. If not, then we +// can round and return without worrying further; otherwise, we go to slower +// correction code that multiplies the approximate quotient back up by y and +// checks it against x. +// +// This strategy depends critically on the upper bound on the approximation +// error. Underestimating the error introduces a bug; overestimating it costs +// performance, by sending more cases than necessary to the slow path. +// +// To give high confidence of its correctness, the upper bound has been proved +// formally by Gappa. 
The Gappa proof and auxiliary code are not included in
+// this version, but they can be found in the Arm Optimized Routines repository
+//
+// https://github.com/ARM-software/optimized-routines/blob/bf3e44c3784dd3e18d3d5232e13b4d81f232310b/fp/at32/ddiv.S
+// https://github.com/ARM-software/optimized-routines/blob/bf3e44c3784dd3e18d3d5232e13b4d81f232310b/fp/auxiliary/ddiv-prove.py
+// https://github.com/ARM-software/optimized-routines/blob/bf3e44c3784dd3e18d3d5232e13b4d81f232310b/fp/auxiliary/ddiv-diagnostics.c
+//
+// and a pair of blog posts describing the concepts and procedure are here:
+//
+// https://developer.arm.com/community/arm-community-blogs/b/embedded-and-microcontrollers-blog/posts/formally-verifying-a-floating-point-division-routine-with-gappa-p1
+// https://developer.arm.com/community/arm-community-blogs/b/embedded-and-microcontrollers-blog/posts/formally-verifying-a-floating-point-division-routine-with-gappa-p2
+
+        .syntax unified
+        .text
+        .p2align 2
+
+#if __ARM_PCS_VFP
+// Hard-float ABI wrapper: move the double arguments from d0/d1 into core
+// registers, call the soft-float implementation below, and move the result
+// back into d0.
+DEFINE_COMPILERRT_FUNCTION(__divdf3)
+        push {r4, lr}
+        VMOV_FROM_DOUBLE(r0, r1, d0)
+        VMOV_FROM_DOUBLE(r2, r3, d1)
+        bl __aeabi_ddiv
+        VMOV_TO_DOUBLE(d0, r0, r1)
+        pop {r4, pc}
+#else
+DEFINE_COMPILERRT_FUNCTION_ALIAS(__divdf3, __aeabi_ddiv)
+#endif
+
+DEFINE_COMPILERRT_FUNCTION(__aeabi_ddiv)
+
+        push {r4,r5,r6,r7,r8,lr}
+
+        // Check if either input exponent is 7FF (infinity or NaN), and if so,
+        // branch out of line.
+        ldr r12, =0x07FF0000                 // mask for the 11-bit exponent field
+        bics r4, r12, xh, lsr #4             // test for Infs or NaNs
+        bicsne r4, r12, yh, lsr #4
+        beq LOCAL_LABEL(ddiv_naninf)
+
+        // Extract the exponents of the input values x and y into bits 16..26
+        // of r4 and r5 respectively, and in the process, check if either
+        // exponent is zero (so that one or both inputs are 0 or denormal). In
+        // order to combine the two tests, the second ANDS is performed
+        // conditionally, so that if x's exponent is zero then the out-of-line
+        // code at ddiv_zerodenorm might find y's exponent hasn't been set up
+        // yet.
+        //
+        // We also calculate the sign of the result, which will be needed
+        // whether or not we branch. This is saved in the low bit of r4.
+        ands r4, r12, xh, lsr #4             // get exponent of x, setting Z if it's 0
+        andsne r5, r12, yh, lsr #4           // if not, extract and test exponent of y
+        eor r6, xh, yh                       // XOR the input signs to get the result sign
+        orr r4, r4, r6, lsr #31              // save it in the low bit of r4
+        beq LOCAL_LABEL(ddiv_zerodenorm)     // branch out of line for zeroes or denormals
+
+        // Calculate the initial exponent of the result, by subtracting the two
+        // input exponents and adjusting for the IEEE exponent bias. This value
+        // may have to be adjusted by 1 later, depending on the quotient of the
+        // mantissas.
+        //
+        // If we branched to ddiv_zerodenorm above, and it found denormals but
+        // no zeroes, it may branch back here after renormalising them. We
+        // expect the out-of-line code to have left the exponent difference in
+        // the top half of r4 (still with the output sign in the low bit), but
+        // not yet to have applied the bias. So it branches back in immediately
+        // after the SUB.
+        //
+        // The exponent bias we want is either 0x3fe or 0x3ff, depending on
+        // whether we have to shift the output mantissa by 1 below. Neither of
+        // those values fits in the immediate field of an ADD instruction, so
+        // we must use two instructions.
+        sub r4, r4, r5
+LOCAL_LABEL(ddiv_normalised):                // denormal handler will come back to here
+        add r4, r4, #0x03FC0000              // add the 8 high bits of the bias 0x3FE
+        add r4, r4, #0x00020000              // add the remaining bit of the bias
+
+        // Shift both mantissas up to the top of their 64-bit register pair,
+        // and OR in the leading 1 bit, which will occupy the high bit of the
+        // high word in each case.
+        mov r5, #(1 << 31)                   // high bit for ORing in to both mantissas
+        orr xh, r5, xh, lsl #11              // shift up xh and OR in the high bit
+        orr yh, r5, yh, lsl #11              // same for yh
+        orr xh, xh, xl, lsr #21              // OR in the bits shifted out of xl into xh
+        orr yh, yh, yl, lsr #21              // same for yl and yh
+        lsl xl, xl, #11                      // shift up the rest of xl
+        lsl yl, yl, #11                      // same for yl
+
+        // Check if the two mantissas are exactly equal, so that the quotient
+        // is exactly a power of 2. If so, branch out of line to handle that
+        // case specially.
+        //
+        // This guarantees that when we examine the approximate quotient
+        // afterwards, we can't be confused about whether it needs to be
+        // renormalised, which would otherwise cost just as much effort as this
+        // check. Our reciprocal approximation is always an underestimate
+        // (that's in the nature of this particular Newton-Raphson iteration),
+        // so if x < y (meaning the mantissas rather than the whole floats)
+        // then even the true quotient will be less than 1, and the
+        // approximation even more so. On the other hand, if x > y, then the
+        // true quotient will be enough greater than 1 that even the largest
+        // possible error in the approximation can't make it look like less
+        // than 1.
+        //
+        // (Proof: regard x,y as normalised to the range [1,2). If x > y, then
+        // we have x ≥ y+ε, where ε is the machine epsilon. So x/y ≥ 1+ε/y >
+        // 1+ε/2. And the bound on the approximation error, given below, is far
+        // less than ε/2.)
+        cmp xh, yh
+        cmpeq xl, yl
+        beq LOCAL_LABEL(ddiv_result_is_power_of_2)
+
+        // Now we begin the actual calculation of the reciprocal approximation.
+        //
+        // We begin with our two input mantissas stored in xh:xl and yh:yl,
+        // each with its leading 1 explicit and shifted up to the top of the
+        // word. So they can be regarded as 64-bit integers with the high bit
+        // set and the bottom 11 bits clear.
+
+        // Obtain an 8-bit reciprocal approximation by using the topmost 8 bits
+        // of y as a lookup table index. The top bit of y is always set, so
+        // there are only 128 lookup table entries, not 256. The 8-bit value we
+        // load also has its top bit set.
+        lsr r5, yh, #24                      // r5 is the table index plus 0x80
+
+        // Get the address of reciptbl, in various ways depending on position-
+        // independence and Arm/Thumb state.
+        //
+        // Since the table index calculated above in r5 includes the high
+        // mantissa bit, an index of 0x80 refers to the first table entry and
+        // 0xFF the last. So we subtract 0x80 from the table address to
+        // compensate.
+#if defined __pic__ || defined __PIC__ || defined __ARM_ROPI
+        // In PIC or ROPI modes, we must construct the address in a pc-relative
+        // manner, by making a literal containing the offset from the current
+        // code. The reference point for that offset is the value of pc as read
+        // by the add instruction at get_reciptbl below, which will be 4 or 8
+        // bytes after it in Thumb or Arm state respectively.
+#if __thumb__
+        ldr r6, =(LOCAL_LABEL(reciptbl)-0x80) - (LOCAL_LABEL(get_reciptbl)+4)
+#else
+        ldr r6, =(LOCAL_LABEL(reciptbl)-0x80) - (LOCAL_LABEL(get_reciptbl)+8)
+#endif
+LOCAL_LABEL(get_reciptbl):
+        add r6, r6, pc
+#else
+        // If we're not building for position independence, we can just load
+        // the target address directly.
+        ldr r6, =(LOCAL_LABEL(reciptbl)-0x80)
+#endif
+
+        ldrb r6, [r6, r5]                    // and load the approximation into r6
+
+        // First Newton-Raphson iteration, which expands that 8-bit
+        // approximation to a 17-bit one, again with its top bit set. We use
+        // the top 16 bits of y for this, so that we can fit the
+        // multiplications into ordinary MUL rather than UMULL.
+        //
+        // The Newton-Raphson formula to turn an approximation r ≈ 1/y into a
+        // better one is r → r(2-yr). In this case we're scaling up to integers
+        // (informal fixed point), so the 2 becomes 2^24.
+        lsr r5, yh, #16                      // get top halfword of y
+        mul r7, r6, r5                       // multiply it by the approximation r
+        rsb r7, r7, #(1 << 24)               // subtract from 2 (scaled up appropriately)
+        mul r7, r6, r7                       // multiply again to make r(2-yr)
+        lsr r7, r7, #14                      // shift down to keep only 17 bits of it
+
+        // Second iteration, expanding into a 32-bit reciprocal, using the top
+        // 31 bits of y (i.e. yh shifted by 1). The first multiplication
+        // (making yr) is 32x32 → 64 bits, so we use a single UMULL; the second
+        // one making r(2-yr) is 32x64, which we do with a UMULL by the bottom
+        // half of yr and then MLA by the top half, so we only keep the low 64
+        // bits of the full answer.
+        //
+        // The subtraction from 2 (again scaled up, this time to 2^48) is done
+        // by RSBS+RSC, interleaved with the multiplications so as to use a
+        // delay slot on CPUs that have one.
+        lsr r12, yh, #1
+        umull r6, r8, r7, r12                // r8:r6 = yr
+        rsbs r6, r6, #0                      // low half of subtraction from 2
+        umull r12, lr, r7, r6                // multiply r by the low half of 2-yr
+#if __thumb__
+        // Thumb has no RSC, so simulate it by bitwise inversion and then ADC
+        mvn r8, r8
+        adc r8, r8, #(1 << 16)
+#else
+        rsc r8, r8, #(1 << 16)               // high half of subtraction from 2
+#endif
+        mla r6, r7, r8, lr                   // multiply r by the high half of 2-yr
+
+        // Third iteration, expanding into a 64-bit reciprocal, with the
+        // leading bit expected to end up in bit 62. Now the first
+        // multiplication to make yr is 32x64 → 96 bits, so we put the product
+        // in three registers lr:r12:r8. However, we're going to discard the
+        // low word r8 completely, because it makes negligible difference. So
+        // we'll treat the output yr as 64-bit.
+        umull r8, r12, r6, yl                // multiply r by bottom half of y
+        mov lr, #0                           // initialize high word to 0
+        umlal r12, lr, r6, yh                // multiply r by top half of y
+        // Subtract from a power of 2, as usual. But in this case the power of
+        // 2 we're subtracting from is 2^64, which is just off the top of the
+        // 64-bit value in lr:r12. So in fact we're just negating the whole
+        // thing!
+        //
+        // To preserve the invariant that the approximation error is always
+        // negative, we negate via one's complement rather than two's. (This
+        // would only make a difference if r8 had happened to be exactly 0.
+        // That in turn can occur when yl=0, so one of the test cases in
+        // ddiv-diagnostics.c deliberately uses such a value, so that the
+        // intermediate results can be checked against the reference Python.)
+        mvn r12, r12
+        mvn lr, lr
+        // Now lr:r12:r8 contains 2-yr. We discard the low word r8 to reduce
+        // that to 64 bits, and do another 32x64 → 96 bit multiplication.
+        umull r5, r8, r6, r12                // multiply r by bottom half of 2-yr
+        mov r7, #0                           // initialize high word to 0
+        umlal r8, r7, r6, lr                 // multiply r by top half of 2-yr
+
+        // That's the Newton-Raphson iteration done: we have a 64-bit
+        // approximation to 1/y. Multiply it by x to get the full approximate
+        // quotient.
+        //
+        // In principle, this would be a 64x64 → 128 bit multiplication,
+        // involving four long multiply instructions. But we only need the top
+        // 64 bits, and we're already prepared to tolerate some error in the
+        // calculations, so we cut corners: don't multiply the two low words
+        // together at all, and we discard the bottom half of each of the
+        // (low * high) partial products without bothering to propagate carries
+        // out of it.
+        //
+        // (All of these shortcuts are faithfully mimicked in the Python
+        // reference implementation which generates Gappa input, so they're all
+        // accounted for in the error analysis.)
+#if __ARM_FEATURE_DSP
+        umull r12, r6, xh, r8                // r6 = high word of x * low word of 1/y
+        umull r12, r5, xl, r7                // r5 = low word of x * high word of 1/y
+        umaal r6, r5, xh, r7                 // add those to the product of both high words
+#else
+        // Alternative instruction sequence using UMLAL, if UMAAL isn't
+        // available
+        umull r12, r6, xh, r8                // r6 = high word of x * low word of 1/y
+        umull r12, lr, xl, r7                // lr = low word of x * high word of 1/y
+        adds r6, r6, lr                      // add those together
+        mov r5, #0                           // set r5 to the carry out of that addition
+        adc r5, r5, #0
+        umlal r6, r5, xh, r7                 // add that to the product of both high words
+#endif
+        // Now r5:r6 is the completed approximate quotient, with its leading
+        // bit at position either 61 or 62.
+
+        // Normalize so that the leading bit is always in bit 62, by shifting
+        // left if it isn't there already, and adjusting the output exponent by
+        // 1 to compensate.
+        //
+        // We do the test in a slightly tricky way, by arranging to set the V
+        // flag if the leading bit is in bit 62. This allows us to do the left
+        // shift under the VC condition, which is convenient because the LSLS
+        // instruction that shifts the low word left moves the top bit into the
+        // C flag without affecting V.
+        //
+        // We also save the value written into lr by the initial ADDS
+        // instruction, because that contains enough information to tell us
+        // whether we renormalised here. The correction path for quotients too
+        // close to a rounding boundary will need to recover that information.
+        adds lr, r5, #0x40000000             // set V if bit 62 of the quotient
+                                             // (bit 30 of the high word r5) is set
+        subvc r4, r4, #(1 << 16)             // if not, correct the exponent by 1,
+        lslsvc r6, r6, #1                    // shift the low word of the quotient left
+        adcvc r5, r5, r5                     // and shift its top bit into the high word
+
+        // Now r5:r6 is the _normalised_ approximate quotient, with its leading
+        // bit reliably in bit 62. This is the final output of the calculation
+        // that the Gappa error-analysis proof applies to.
+
+        // That 64-bit output has bit 63 clear; the leading 1 bit of the output
+        // mantissa in bit 62, followed by 52 more mantissa bits; then 10 bits
+        // at the bottom which are used for determining rounding.
+        //
+        // Compute the _approximately_ rounded-to-nearest output mantissa, by
+        // adding half a ULP and shifting down. If we don't go to the slow
+        // path, this is the correct output mantissa. (See fdiv.S for the proof
+        // that the round-to-even tiebreaking case can't occur in
+        // floating-point division.)
+        //
+        // We keep the original version of r6, containing the ten rounding
+        // bits, so that we can test it to see if we need the slow path.
+        adds r7, r6, #(1 << 9)               // add half a ULP, copying low word into r7
+        adc r5, r5, #0                       // propagate carry into high word
+        lsr r7, r7, #10                      // shift low word right
+        orr r7, r7, r5, lsl #22              // combine with bits shifted out of high word
+        lsr r5, r5, #10                      // shift high word right
+
+        // Now test r6 to see whether this output mantissa can be relied on, or
+        // whether the approximation landed too close to a rounding boundary.
+        //
+        // The maximum possible error in the approximation, taking into account
+        // the initial error in each lookup table entry, the remaining
+        // mathematical error introduced by stopping after this many
+        // Newton-Raphson iterations, and every shortcut, right shift,
+        // truncation and discarding of a partial product in the algorithm
+        // above, is always negative, and less than 64 units in the last place
+        // of the 64-bit approximate quotient. That is, the true quotient lies
+        // somewhere between the 64-bit integer described as "final output of
+        // the calculation" above, and that plus 64.
+        //
+        // So if the bottom 10 bits of r6 have the value 2^9 or greater, we're
+        // safe, because the true value is _larger_ than the approximation, so
+        // if the approximation is already above the rounding boundary then so
+        // is the true value. And if those 10 bits are (2^9-64) or less then
+        // we're also safe, because even if the true value is greater by 63,
+        // it's still on the same side of the rounding boundary.
+        //
+        // We check the error by subtracting (2^9-64), so that the dangerous
+        // values of the bottom 10 bits are those in the range 0,...,63, i.e.
+        // precisely those with none of bits 6,7,8,9 set.
+        //
+        // We also combine this test with a check for underflow, because that
+        // also needs more careful handling (the mantissa must be re-rounded to
+        // a different bit position, which involves knowing whether it's
+        // exact). Underflow has happened if the exponent in the top half of r4
+        // is negative (it's off by 1 so that the leading mantissa bit will
+        // increment it), so we test by an ASR#31 (copying the top bit of r4
+        // into all of it) and negating. That way, the output value is zero on
+        // underflow, matching the flags from the other check.
+        sub r6, r6, #(1 << 9)-64
+        tst r6, #0x3C0                       // now EQ means we must go to the slow path
+        mvnsne r12, r4, asr #31              // also set EQ if underflow has happened
+        beq LOCAL_LABEL(ddiv_correction)     // branch out of line to do the hard bit
+
+        // If we do go to ddiv_correction, it branches back here after the
+        // correction code has finished. Either way, we expect that r5:r7 is
+        // the result mantissa, with the top bit set, already in the correct
+        // position in the word, and already rounded to nearest.
+LOCAL_LABEL(ddiv_corrected):
+        // Recombine the output mantissa with the sign and exponent.
+        add xh, r5, r4, lsl #31              // add sign bit to top word of mantissa
+        bic r12, r4, #1                      // isolate exponent in top half of r4
+        add xh, xh, r12, lsl #4              // add exponent to make the final high word
+        mov xl, r7                           // move low word into the right register
+
+        // If there's no overflow or underflow, we're done.
+        //
+        // We _identified_ underflow above when we went to the slow path, but
+        // having done that, the slow path came back here, so we must check for
+        // it again. (The only purpose of the detour was to obtain accurate
+        // information about whether the quotient is exact, or needed
+        // rounding.)
+        //
+        // The output exponent, offset downwards by 1, is in the top half of
+        // r4. If it's negative, there's an underflow; if it's too large,
+        // there's an overflow. We do an approximate test for both at once via
+        // an unsigned comparison against 0x7f0, using r12 (the register in
+        // which we already cleared the sign bit stored at the bottom). This
+        // identifies _most_ normal outputs as quickly as possible.
+        //
+        // 0x7f0 isn't the maximum possible known-safe exponent, but it's the
+        // largest one that fits in the immediate field of CMP. We deal with
+        // the remaining cases in the next few instructions.
+        cmp r12, #(0x7f0 << 16)
+        popls {r4,r5,r6,r7,r8,pc}
+
+        // Now check the remaining cases more carefully.
+        //
+        // If r12 < 0 then we definitely have underflow. We detect overflow
+        // precisely by seeing if the _final_ output exponent (in the output
+        // register xh) is 0x7ff or more, by incrementing it and seeing if the
+        // sign is opposite from the intended output sign.
+        add lr, xh, #(1 << 20)               // increment the output exponent field
+        teq lr, r4, lsl #31                  // set N if the sign now doesn't match r4[0]
+        tstpl r12, r12                       // otherwise, set N if underflow
+        poppl {r4,r5,r6,r7,r8,pc}            // if neither, we've finished
+
+        // If we still haven't returned, we really do have overflow or
+        // underflow, and the sign of r12 tells us which.
+        tst r12, r12
+        bmi LOCAL_LABEL(ddiv_underflow)
+        // For overflow, correct the sign by biasing the exponent downward, and
+        // go to code that constructs an infinite return value (shared with the
+        // division-by-zero handler).
+        sub xh, xh, #0x60000000
+        pop {r4,r5,r6,r7,r8,lr}              // ddiv_retinf expects no regs on the stack
+        b LOCAL_LABEL(ddiv_retinf)
+
+LOCAL_LABEL(ddiv_correction):
+        // The slow path, entered if the approximate quotient was too close to
+        // a rounding boundary to trust, and also if there's a chance of
+        // underflow (so that we can reliably determine the rounding direction,
+        // including whether the quotient was exact).
+        //
+        // Regarding the input mantissas x,y and our approximate quotient q as
+        // integers in [2^52,2^53), the quotient is an approximation to either
+        // x*2^52/y or x*2^53/y, depending on which of x,y was larger. We know
+        // that q is less than the true value of that quotient by at most a
+        // small fraction of a ULP. So the correct rounded quotient is either
+        // equal to q or to q+1, and we can decide which by multiplying back up
+        // by y: we want q - x*2^k/y to be in the range (-1/2,+1/2) (where
+        // k = 52 or 53), which is equivalent to asking if qy - x*2^k is in the
+        // range (-y/2,+y/2).
+        //
+        // That's a calculation we can do in integers using only addition and
+        // multiplication. And we know that if q itself doesn't have that
+        // property then q+1 will.
+
+        // The mantissa of y is currently right at the top of the word, which
+        // means that if the result of our check is greater than it, it will
+        // overflow. So we must start by shifting y downward. We'll put it back
+        // at the bottom of the word, where it was in the input float.
+        lsr yl, yl, #11                      // shift yl right
+        orr yl, yl, yh, lsl #21              // OR in the bits shifted out of yh
+        lsr yh, yh, #11                      // shift yh right
+
+        // Compute the integer qy-x. Because q is already very close to the
+        // right quotient, we expect this to be an integer at most twice the
+        // size of y, which easily fits in 64 bits. So we don't need to compute
+        // the full 128-bit product: the low 64 bits are enough.
+        umull r8, r6, r7, yl                 // 64-bit product of the low words
+        mla r6, r7, yh, r6                   // + (high word of y) * (low word of q)
+        mla r6, r5, yl, r6                   // + (high word of q) * (low word of y)
+
+        // Now we must subtract either x << 53 or x << 52. This will only
+        // affect the high word of the product we've just computed. Also the
+        // mantissa of x is already shifted left by 11. So we shift xl left by
+        // either (52-32-11) or (53-32-11), i.e. by 9 or by 10, and subtract
+        // from the high word of the product.
+        //
+        // To decide which, we consult the value left in lr by the original
+        // test for renormalization, which added 0x40000000 to the high word of
+        // the initial approximate quotient 'quot'. If that had bit 62 set (so
+        // no renormalization needed) then the addition carried into the sign
+        // bit; otherwise it didn't. So lr is positive if and only if we need
+        // to shift xl left by an extra bit.
+        tst lr, lr                           // did we renormalize?
+        subpl r6, r6, xl, lsl #10            // if so, subtract x<<53 from q*y
+        submi r6, r6, xl, lsl #9             // if not, subtract x<<52
+
+        // Now r6:r8 contains the residual value r = qy - x*2^k as described
+        // above. If this is between -y/2 and +y/2 then q is already the
+        // correctly rounded quotient. Otherwise, the correct quotient is q+1,
+        // so the value in r6:r8 will be too small (incrementing q would add y
+        // to it). So we need to check whether r < -y/2, or equivalently
+        // whether 2r < -y (avoiding having to worry about what happens when we
+        // halve y if it's odd).
+        //
+        // As mentioned above, division can't give an exact halfway case, so we
+        // don't need to worry about the case r = y/2.
+        adds r8, r8, r8                      // multiply the residual by 2
+        adc r6, r6, r6
+        adds lr, r8, yl                      // add y to it, discarding the result
+        adcs lr, r6, yh
+        bpl LOCAL_LABEL(ddiv_corrected)      // if the answer is positive, we're OK
+
+        // If we didn't take that branch, then the approximate quotient is too
+        // small by 1, so we must increment it. But also, we adjust the
+        // residual in r6:r8 to match. That residual is unused by the main
+        // epilogue code, but we also came here for any underflowing value, and
+        // the underflow handler will need the exact residual to determine the
+        // rounding direction.
+        //
+        // (We could re-test whether underflow had happened and use that to
+        // skip the update of r6:r8, but the test would cost as much effort as
+        // it saved!)
+        adds r7, r7, #1                      // increment the output quotient
+        adcs r5, r5, #0
+        adds r8, r8, yl                      // repeat the addition of y to the residual,
+        adcs r6, r6, yh                      // this time keeping the result in r6:r8
+        b LOCAL_LABEL(ddiv_corrected)        // finally we can rejoin the main code
+
+LOCAL_LABEL(ddiv_result_is_power_of_2):
+        // The special-case handler for the two input mantissas being equal, so
+        // that the result is an exact power of two. We set up all the output
+        // registers to the way the main code would have done it, and jump
+        // straight to ddiv_corrected. This includes setting r6:r8 to the
+        // 'residual' value computed by the slow path, in case this power-of-2
+        // output is also an underflow, which will depend on those registers.
+        mov r5, #0x00100000                  // high word of quotient mantissa = 1<<20
+        mov r7, #0                           // low word of quotient mantissa = 0
+        mov r6, #0                           // high word of residual = 0
+        mov r8, #0                           // low word of residual = 0
+        b LOCAL_LABEL(ddiv_corrected)
+
+LOCAL_LABEL(ddiv_underflow):
+        // We come here to handle underflow. The output double, constructed
+        // naïvely from the out-of-range exponent, is in xh:xl. We expect in
+        // this situation that we've _always_ come via either the
+        // ddiv_correction slow path or the ddiv_result_is_power_of_2 special
+        // case, both of which will have set up a residual value in r6:r8 equal
+        // to q*y - x*2^k (for appropriate k). This value is positive if the
+        // quotient is slightly above the true value (i.e. was rounded up), or
+        // negative if the quotient was rounded down. But we must also
+        // distinguish the third case of the residual being exactly zero.
+        add xh, xh, #0x60000000              // bias exponent back into range for __dunder
+        orrs r12, r6, r8                     // set r12=0 and Z=1 if quotient was exact
+        movne r12, #1                        // otherwise, set r12 = +1
+        orrne r12, r12, r6, asr #31          // and change to -1 if residual is negative
+        pop {r4,r5,r6,r7,r8,lr}              // pop all locally saved registers
+        b SYMBOL_NAME(__compiler_rt_dunder)  // and tailcall __dunder to finish
+
+LOCAL_LABEL(ddiv_zerodenorm):
+        // We come here if either input had exponent 0, so there's at least one
+        // zero or denormal. However, we know there are no infinities or NaNs,
+        // because those were checked first and will have gone to ddiv_naninf
+        // below.
+        //
+        // First we must repeat the instruction which extracted the exponent of
+        // y into r5, this time unconditionally, in case the setup code didn't
+        // do it.
+        and r5, r12, yh, lsr #4
+
+        // If either or both input is actually zero, the answer is easy.
+        orrs lr, xl, xh, lsl #1              // is x zero?
+        beq LOCAL_LABEL(ddiv_xzero)
+        orrs lr, yl, yh, lsl #1              // is y zero?
+        beq LOCAL_LABEL(ddiv_divbyzero)
+
+        // Otherwise, delegate to __dnorm2 to handle denormals, converting them
+        // into a normalised mantissa and an out-of-range exponent. __dnorm2
+        // expects the exponents at the bottom of their words instead of half
+        // way up, so shift down first.
+        lsr r4, r4, #16
+        lsr r5, r5, #16
+        push {r0, r1, r2, r3, r4, r5}        // create a 'struct dnorm2' on the stack
+        mov r0, sp                           // pass it by address
+        bl SYMBOL_NAME(__compiler_rt_dnorm2)
+        pop {r0, r1, r2, r3, r4, r5}
+
+        // Rejoin the main code, with the exponent difference in the top half
+        // of r4, and the output sign in the low bit of r4. (The original setup
+        // code did the latter, but we clobbered it while setting up for
+        // __dnorm2. r6 still holds xh XOR yh from the setup code: it is
+        // callee-saved, so the __dnorm2 call preserved it.)
+        subs r4, r4, r5                      // exponent difference, at the bottom of r4
+        lsls r4, r4, #16                     // move it up to the right place
+        orr r4, r4, r6, lsr #31              // recover output sign from top bit of r6
+        b LOCAL_LABEL(ddiv_normalised)       // rejoin the main code
+
+LOCAL_LABEL(ddiv_xzero):
+        // We come here if x=0. We return 0 (of the right sign) if y is not 0,
+        // and the default quiet NaN if both inputs are zero.
+        orrs lr, yl, yh, lsl #1              // is y zero?
+        beq LOCAL_LABEL(ddiv_ivo_pop)        // if so, pop registers and return a NaN
+        // We know xl=0 already, so we only need to reset xh to contain the
+        // right output sign. The setup code left that in the high bit of r6.
+        and xh, r6, #0x80000000
+        pop {r4,r5,r6,r7,r8,pc}
+
+LOCAL_LABEL(ddiv_divbyzero):
+        // We come here if y=0, but x is not 0 (or we'd have gone to ddiv_xzero
+        // above instead). So we're dividing a nonzero number by zero, and must
+        // return infinity.
+        pop {r4,r5,r6,r7,r8,lr}
+        eor xh, xh, yh                       // combine signs to get result sign
+        b LOCAL_LABEL(ddiv_retinf)
+
+LOCAL_LABEL(ddiv_naninf):
+        // We come here knowing that at least one operand is either NaN or
+        // infinity. If there's a NaN, we can tailcall __dnan2 to do the right
+        // thing. Pop our stacked registers first: we won't need that much
+        // spare space any more, and it makes the tailcall easier if we've
+        // already done it.
+        pop {r4,r5,r6,r7,r8,lr}
+
+        // A number is a NaN if its exponent is 0x7ff and at least one bit
+        // below that is set. The CMP + ADC pair here converts the two words
+        // xh:xl into a single word containing xh shifted up by one (throwing
+        // away the sign bit which makes no difference), with its low bit set
+        // if xl was nonzero. So if that is strictly greater than 0xffe00000,
+        // then x was a NaN.
+        cmp xl, #1
+        adc r12, xh, xh
+        cmp r12, #0xFFE00000
+        bhi SYMBOL_NAME(__compiler_rt_dnan2)
+        // Now check y in the same way.
+        cmp yl, #1
+        adc r12, yh, yh
+        cmp r12, #0xFFE00000
+        bhi SYMBOL_NAME(__compiler_rt_dnan2)
+
+        // Now we know there are no NaNs. Therefore there's at least one
+        // infinity. If both operands are infinity then we have inf / inf =
+        // invalid operation and must return a NaN. We detect this by XORing
+        // the inputs' exponent fields: knowing one of them is 7FF, they XOR to
+        // zero iff the other one is too.
+        eors r12, xh, yh                     // XOR entire top words of the inputs
+        lsl r12, r12, #1                     // shift left to discard the sign bit
+        lsrs r12, r12, #21                   // shift right again to discard mantissas
+        beq LOCAL_LABEL(ddiv_ivo)            // if what's left is 0, we have inf / inf
+
+        // Otherwise, there's exactly one infinity, so our answers are easy,
+        // but depend on which operand it is:
+        //     infinity / anything = infinity
+        //     anything / infinity = 0
+        //
+        // Determine if x is the infinity, by bitwise inverting the whole word
+        // and then shifting left and right to isolate its exponent bits.
+        mvn r12, xh, lsl #1                  // invert x, shift left to discard sign
+        lsrs r12, r12, #21                   // and shift right to discard mantissa
+        eor xh, xh, yh                       // calculate the output sign bit
+        beq LOCAL_LABEL(ddiv_retinf)         // if x = inf, return infinity of that sign
+        mov xl, #0                           // otherwise clear all bits of x
+        and xh, xh, #0x80000000              // other than the sign bit
+        bx lr                                // and return zero of the same sign
+LOCAL_LABEL(ddiv_retinf):
+        // Construct and return an infinity in xh:xl, with whatever sign bit is
+        // already in the top bit of xh.
+        mov xl, #0                           // clear low word
+        mvn xh, xh, lsr #31                  // shift xh[31] down to bit 0, inverted
+        mvn xh, xh, lsl #11                  // uninvert, and put exponent 0x7ff below it
+        lsl xh, xh, #20                      // shift back up to the top
+        bx lr
+
+        // Code to construct and return the default quiet NaN, for the cases
+        // inf/inf and 0/0. We provide two entry labels, one for callers who
+        // still need to pop all the registers this function pushed, and one
+        // for callers who have done that already.
+LOCAL_LABEL(ddiv_ivo_pop):
+        pop {r4,r5,r6,r7,r8,lr}
+LOCAL_LABEL(ddiv_ivo):
+        movw xh, #0x7ff8                     // top halfword of the default quiet NaN
+        lsls xh, xh, #16
+        mov xl, #0
+        bx lr
+
+END_COMPILERRT_FUNCTION(__aeabi_ddiv)
+
+        // Table of approximate reciprocals, indexed by the top 8 bits of y's
+        // mantissa (0x80..0xFF, leading bit included). Each entry is an 8-bit
+        // underestimate of the scaled reciprocal, with its top bit set.
+        .section .rodata
+LOCAL_LABEL(reciptbl):
+        .byte 0xFF,0xFD,0xFB,0xF9,0xF7,0xF5,0xF4,0xF2
+        .byte 0xF0,0xEE,0xED,0xEB,0xE9,0xE8,0xE6,0xE4
+        .byte 0xE3,0xE1,0xE0,0xDE,0xDD,0xDB,0xDA,0xD8
+        .byte 0xD7,0xD5,0xD4,0xD3,0xD1,0xD0,0xCF,0xCD
+        .byte 0xCC,0xCB,0xCA,0xC8,0xC7,0xC6,0xC5,0xC4
+        .byte 0xC2,0xC1,0xC0,0xBF,0xBE,0xBD,0xBC,0xBB
+        .byte 0xBA,0xB9,0xB8,0xB7,0xB6,0xB5,0xB4,0xB3
+        .byte 0xB2,0xB1,0xB0,0xAF,0xAE,0xAD,0xAC,0xAB
+        .byte 0xAA,0xA9,0xA8,0xA8,0xA7,0xA6,0xA5,0xA4
+        .byte 0xA3,0xA3,0xA2,0xA1,0xA0,0x9F,0x9F,0x9E
+        .byte 0x9D,0x9C,0x9C,0x9B,0x9A,0x99,0x99,0x98
+        .byte 0x97,0x97,0x96,0x95,0x95,0x94,0x93,0x93
+        .byte 0x92,0x91,0x91,0x90,0x8F,0x8F,0x8E,0x8E
+        .byte 0x8D,0x8C,0x8C,0x8B,0x8B,0x8A,0x89,0x89
+        .byte 0x88,0x88,0x87,0x87,0x86,0x85,0x85,0x84
+        .byte 0x84,0x83,0x83,0x82,0x82,0x81,0x81,0x80
+
+NO_EXEC_STACK_DIRECTIVE
diff --git a/compiler-rt/lib/builtins/arm/muldf3.S b/compiler-rt/lib/builtins/arm/muldf3.S
new file mode 100644
index 0000000000000..b73cd7580fbf2
--- /dev/null
+++ b/compiler-rt/lib/builtins/arm/muldf3.S
@@ -0,0 +1,404 @@
+//===-- muldf3.S - double-precision floating point multiplication ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the __muldf3 function (double precision floating point
+// multiplication), with the IEEE-754 default rounding (to nearest, ties to
+// even), for the Arm and Thumb2 ISAs.
+// +//===----------------------------------------------------------------------===// + +#include "../assembly.h" +#include "crt_endian.h" + + .syntax unified + .text + .p2align 2 + +#if __ARM_PCS_VFP +DEFINE_COMPILERRT_FUNCTION(__muldf3) + push {r4, lr} + VMOV_FROM_DOUBLE(r0, r1, d0) + VMOV_FROM_DOUBLE(r2, r3, d1) + bl __aeabi_dmul + VMOV_TO_DOUBLE(d0, r0, r1) + pop {r4, pc} +#else +DEFINE_COMPILERRT_FUNCTION_ALIAS(__muldf3, __aeabi_dmul) +#endif + +DEFINE_COMPILERRT_FUNCTION(__aeabi_dmul) + + push {r4,r5,r6,lr} + + // Check if either input exponent is 000 or 7FF (i.e. not a normalized + // number), and if so, branch out of line. If we don't branch out of line, + // then we've also extracted the exponents of the input values x and y into + // bits 16..26 of r14 and r5 respectively. But if we do, then that hasn't + // necessarily been done (because the second AND might have been skipped). + ldr r12, =0x07FF0000 + ands r14, r12, xh, lsr #4 // sets Z if exponent of x is 0 + andsne r5, r12, yh, lsr #4 // otherwise, sets Z if exponent of y is 0 + teqne r14, r12 // otherwise, sets Z if exponent of x is 7FF + teqne r5, r12 // otherwise, sets Z if exponent of y is 7FF + beq LOCAL_LABEL(uncommon) // branch out of line to handle inf/NaN/0/denorm + + // Calculate the sign of the result, and put it in an unused bit of r14. + eor r4, xh, yh // XOR the input signs to get the result sign + orr r14, r14, r4, lsr #31 // save it in the low bit of r14 + + // Clear the exponent and sign bits from the top word of each mantissa, and + // set the leading mantissa bit in each one, so that they're in the right + // form to be multiplied. + bic xh, xh, r12, lsl #5 // r12 = 0x07FF0000, so r12 << 5 = 0xFF800000 + bic yh, yh, r12, lsl #5 + orr xh, xh, #(1 << 20) + orr yh, yh, #(1 << 20) + + // Now we're ready to multiply mantissas. This is also the place we'll come + // back to after decoding denormal inputs. 
The denormal decoding will also + // have to set up the same register contents: + // - fractions in xh/xl and yh/yl, with leading bits at bit 20 of xh/yh + // - exponents in r14 and r5, starting at bit 16 + // - output sign in r14 bit 0 +LOCAL_LABEL(mul): + + // Multiply the two mantissas as if they were full 64-bit words, delivering a + // 128-bit output in four registers. We provide three different ways to do + // this, using different instructions. + // + // Interleaved with the multiplication code, we also compute the output + // exponent by adding the input exponents and rebiasing. This takes two + // instructions. We schedule each one after a multiplication, to use a delay + // slot from the multiplication on CPUs where there is one. + // + // We add r5 to r14, so that the output exponent is in the top half of r14, + // and r5 is freed up to be used in the multiplication. + // + // We rebias the exponent by subtracting 0x400, which is correct for one of + // the two places where the leading bit of the product could end up, and will + // need correcting by one in the other case. + // + // Exit conditions from the three-way #if: + // + // r4:r5:r6 are the top 96 bits of the 128-bit product, with the leading bit + // at either bit 8 or bit 9 of r4. The low bit of r6 is forced to 1 if any of + // the low 32 bits of the 128-bit product were set. + // + // The output sign is still in the low bit of r14; the top half contains the + // preliminary output exponent (yet to be adjusted depending on where the + // high bit of the product ended up). + +#if __ARM_FEATURE_DSP + // The UMAAL instruction, which computes a 64-bit product and adds two + // separate 32-bit values to it, makes this easy. 
+ umull r6, r4, xh, yl + add r14, r14, r5 // add exponents, freeing up r5 + umull r12, r5, xl, yl + sub r14, r14, #0x4000000 // initial rebiasing of exponent + umaal r6, r5, xl, yh + umaal r5, r4, xh, yh +#elif ARM_FP_DMUL_USE_UMLAL + // The UMLAL instruction computes a 64-bit product and adds a 64-bit value to + // it. But it doesn't write to the carry flag, so you can't tell if the + // addition wrapped. Therefore you have to use it in a way that means the + // addition never wraps. Here we do three of the four multiplications (xl*yl, + // xl*yh, xh*yh) in a chain, using UMLAL for the top two, in each case with + // the 64-bit accumulator consisting of the top half of the previous + // multiplication, and a high word set to zero before the UMLAL instruction. + // + // On Cortex-M3, this is not a win over just using UMULL and doing the + // additions by hand, because UMLAL takes two cycles longer than UMULL, and + // it also costs a cycle to initialise each of the two high accumulator words + // to zero. If the high word of the addend were not zero then those two + // cycles would be doing something useful, but as it is, they're wasted time. + // + // CPUs later than Cortex-M3 - in particular, Cortex-M4 - will do both UMLAL + // and UMULL much faster, so that this code is a win over the plain UMULL + // code below. But those CPUs typically have UMAAL anyway and will use the + // even faster version of the code above. So this code is provided in case + // it's useful, but won't be enabled unless you manually #define + // ARM_FP_DMUL_USE_UMLAL. + umull r12, r6, xl, yl + add r14, r14, r5 // add exponents, freeing up r5 + movs r5, #0 + umlal r6, r5, xl, yh + movs r4, #0 + umlal r5, r4, xh, yh + sub r14, r14, #0x4000000 // initial rebiasing of exponent + umull xl, yh, xh, yl + adds r6, r6, xl + adcs r5, r5, yh + adc r4, r4, #0 +#else + // Simplest approach, using plain UMULL to compute each 64-bit product, and + // separate ADD and ADC instructions to do the additions. 
On Cortex-M3 this + // wins over the UMLAL approach: it's one instruction longer, but three + // cycles quicker, since each use of UMLAL in the above version costs 2 + // cycles. + umull r4, r12, xh, yl + add r14, r14, r5 // add exponents, freeing up r5 + umull r6, r5, xl, yh + sub r14, r14, #0x4000000 // initial rebiasing of exponent + adds r6, r6, r4 + adcs r5, r5, r12 // carry from here is used below + + umull r4, r12, xh, yh // r12:r4 is top part + adc yh, r12, #0 // get carry from above addition + umull r12, xh, xl, yl // xh:r12 is bottom part + + adds r6, r6, xh + adcs r5, r5, r4 + adcs r4, yh, #0 +#endif + + // Now the full 128-bit product of the two mantissas occupies the four + // registers r4,r5,r6,r12 (in order from MSW to LSW). Since each input + // mantissa was in the range [2^52,2^53), the product is in the range + // [2^104,2^106), which means that the lowest-order word r12 is a long way + // below the round bit, so that it can only affect cases so close to a + // rounding boundary that you need to know if it's nonzero to tell whether + // you're rounding to even. Start by freeing up that register, ensuring the + // low bit of r6 is set if anything in r12 was nonzero. + tst r12, r12 + orrne r6, r6, #1 + + // Now we can regard the result as a 96-bit value in r4,r5,r6, with its + // leading bit in either bit 8 or 9 of r4. To move that bit up to its final + // position in bit 20, we must shift the whole thing left by either 11 or 12 + // bits. Find out which. + tst r4, #0x200 // is bit 9 set? + bne LOCAL_LABEL(shift11) // if so, only shift by 11 bits + + // In this branch, we're shifting left by 12 bits. Put the shifted result + // back into the output registers xh,xl, and the bits lower than the bottom + // mantissa bit into r4. 
+ lsls xh, r4, #12 // shift each input reg left 12 + lsls xl, r5, #12 + lsls r4, r6, #12 + orr xh, xh, r5, lsr #20 // and the top two right by 32-12 + orr xl, xl, r6, lsr #20 + + b LOCAL_LABEL(shifted) + +LOCAL_LABEL(shift11): + // In this branch, we're shifting left by 11 bits instead of 12, and we must + // adjust the exponent by 1 to compensate. + lsls xh, r4, #11 // shift each input reg left 11 + lsls xl, r5, #11 + lsls r4, r6, #11 + orr xh, xh, r5, lsr #21 // and the top two right by 32-11 + orr xl, xl, r6, lsr #21 + add r14, r14, #0x10000 // adjust the exponent + +LOCAL_LABEL(shifted): + // We've reconverged after shifting the mantissa, so that now the leading 1 + // bit of the mantissa is in bit 20 of xh, and r4 contains the bits lower + // than the bottom of xl. + + // Recombine the sign and exponent into the high bits of xh. If the exponent + // is over- or underflowed, this may not give a valid FP result, but because + // everything is put on by addition, it will be right "mod 2^64" so that we + // can bias the exponent back into range for underflow handling and that will + // recover the right sign. + // + // r14 still has the output sign in its low bit. To extract just the exponent + // for adding to xh, we could use BIC to clear that bit, or shift the value + // right. We do the latter, which saves a copy of the pre-rounding exponent + // in yl, to use later for overflow detection. The shift is ASR, so that if + // the exponent is negative due to underflow, it stays negative. + asr yl, r14, #16 // isolate the exponent + add xh, xh, yl, lsl #20 // shift it back up to add to xh + add xh, xh, r14, lsl #31 // then add the sign + + // If we have to handle an underflow, we'll need enough information to + // reconstruct the rounding direction. Our strategy is + // + // - save the LSW of the output before rounding: if that differs from the + // LSW after rounding then we rounded up + // - save the round word r4: if that is zero then we didn't round at all. 
+ // + // We're going to branch past the rounding code for a quicker exit in the + // case where we're exact. In that case we don't need to save the output LSW + // at all, because the zero round word will override whatever it would have + // been anyway. + movs r6, r4 // unconditionally save round word + beq LOCAL_LABEL(rounded) // branch past rounding code if exact + mov r5, xl // and if not, save output LSW too + + // Rounding: we shift r4 left to put the round bit into the carry flag so + // that ADCS+ADC will conditionally increment the mantissa. But before we do + // the additions, we also check the Z flag, which tells us whether the + // remaining 31 bits are all zero. If so, we're either in the round-to-even + // (RTE) halfway case, or the exact case - but the exact case never came + // through this code at all, so it must be RTE. + // + // If those 31 bits _aren't_ all zero, we clear the top bit of r4, leaving it + // set only in the round-to-even case. Then (r4 >> 31) can be used to clear + // the low bit to perform RTE. + lsls r12, r4, #1 // test round word + bicne r4, r4, #0x80000000 // make top bit of r4 into the RTE bit + adcs xl, xl, #0 // conditionally increment the mantissa + adc xh, xh, #0 // ... and carry into its high word + bic xl, xl, r4, lsr #31 // round to even if r4[31] != 0 + +LOCAL_LABEL(rounded): + // Now we've rounded the output. The last thing we must do is check for + // overflow and underflow: if neither has happened, we can return. + // + // yl contains the pre-rounding output exponent minus 1 (so that the leading + // mantissa bit incremented it to the right output value). If this is in the + // range [0,0x7fd] then the leading bit would have incremented it to + // [1,0x7fe], which are non-overflowed output exponents. So an unsigned check + // if yl >= 0x7fe detects both overflow and underflow at once. + movw r12, #0x7FE + cmp yl, r12 + poplo {r4,r5,r6,pc} + + // We have either an underflow or an overflow. 
We can tell which it is by + // doing a _signed_ comparison of yl with the same value again - and since we + // only just did the CMP instruction, we can reuse the same flags. + bge LOCAL_LABEL(overflow) + + // Now we're dealing with an underflow. Set r2 to the rounding direction, by + // first checking xl against r5 (where we saved its pre-rounding value) to + // see if we rounded up or down, and then overriding that by checking r6 + // (where we saved the round word) to see if we didn't round at all. In the + // latter case the comparison against r5 will deliver nonsense, but then we + // overwrite it, so it doesn't matter. + cmp xl, r5 // did we modify the LSW, i.e. round up? + movne r2, #-1 // if so, the true value is a bit smaller + moveq r2, #+1 // else it's a bit bigger + cmp r6, #0 // except maybe we didn't round at all + moveq r2, #0 // in which case the true value is exact. + + // Add the IEEE 754 exponent bias, and tail-call __dunder to handle the rest + // of the job. + add xh, xh, #0x60000000 + pop {r4,r5,r6,lr} + b SYMBOL_NAME(__compiler_rt_dunder) + +LOCAL_LABEL(overflow): + // Here, we overflowed, so we must return an infinity of the correct sign. + // Rebias the exponent, which corrects the sign bit. + sub xh, xh, #0x60000000 + + // And pop our scratch registers before falling through into dmul_retinf. + pop {r4,r5,r6,lr} + +LOCAL_LABEL(retinf): + // This is entered from the overflow handler and also from cases with + // infinite inputs. It constructs an infinity, with sign bit equal to the + // high bit of xh. + // + // On entry to here, we expect not to have a stack frame any more, because + // one of our callers will have popped it already in order to conditionally + // tailcall __dnan2. 
+ mov xl, #0 // clear low word + mvn xh, xh, lsr #31 // shift xh[31] down to bit 0, inverted + mvn xh, xh, lsl #11 // uninvert, and put exponent 0x7ff below it + lsl xh, xh, #20 // shift back up to the top + bx lr + +LOCAL_LABEL(uncommon): + // We come here from the entry point, if any input had exponent 0 or 0x7ff. + // First we must repeat the instruction from the entry point that sets up r5 + // with the exponent of y, this time unconditionally, so we know we have both + // exponents in the top halves of r14 and r5. + and r5, r12, yh, lsr #4 + + // Check if either exponent is 0x7ff, by comparing against the value left in + // r12 by the entry point. If so, branch away to handle NaNs and infinities. + teq r14, r12 + teqne r5, r12 + beq LOCAL_LABEL(naninf) + + // If we didn't branch, we're dealing with finite numbers, including a zero + // or a denormal or both. + // + // First save the output sign. + eor r6, xh, yh + + // Handle zeroes first, because if there's a zero we don't have to worry + // about denormals at all. + orrs r4, xl, xh, lsl #1 // is x zero? + orrsne r4, yl, yh, lsl #1 // or is y zero? + beq LOCAL_LABEL(retzero) // Return zero if so + + // Otherwise, delegate to __dnorm2 to handle denormals, converting them into + // a normalised mantissa and an out-of-range exponent. __dnorm2 expects the + // exponents at the bottom of their words instead of half way up, so shift + // down first, and back up again afterwards. + // + // This call clobbers r12, because we didn't bother to save it on the stack. + // That's fine, because we don't need the constant in it any more. When we go + // back to dmul_mul, that will use it as a scratch register. 
+ lsr r4, r14, #16 + lsr r5, r5, #16 + push {r0, r1, r2, r3, r4, r5} // create a 'struct dnorm2' on the stack + mov r0, sp // pass it by address + bl SYMBOL_NAME(__compiler_rt_dnorm2) + pop {r0, r1, r2, r3, r4, r5} + lsl r14, r4, #16 + lsls r5, r5, #16 + + // Put the output sign at the bottom of r14, the same place the fast path + // would have left it. Then rejoin the fast path. + orr r14, r14, r6, lsr #31 + b LOCAL_LABEL(mul) + +LOCAL_LABEL(retzero): + // Return an exact zero, with sign bit from the high bit of r6. + mov xl, #0 // low word is 0 + ands xh, r6, #0x80000000 // high word is 0 except for the sign + pop {r4,r5,r6,pc} + +LOCAL_LABEL(naninf): + // We come here knowing that at least one operand is either NaN or infinity. + // If there's a NaN, we can tailcall __dnan2 to do the right thing. Pop our + // stacked registers first: we won't need that much spare space any more, and + // it makes the tailcall easier if we've already done it. + pop {r4,r5,r6,lr} + + // A number is a NaN if its exponent is 0x7ff and at least one bit below that + // is set. The CMP + ADC pair here converts the two words xh:xl into a single + // word containing xh shifted up by one (throwing away the sign bit which + // makes no difference), with its low bit set if xl was nonzero. So if that + // is strictly greater than 0xffe00000, then x was a NaN. + cmp xl, #1 + adc r12, xh, xh + cmp r12, #0xFFE00000 + bhi SYMBOL_NAME(__compiler_rt_dnan2) + // Now check y in the same way. + cmp yl, #1 + adc r12, yh, yh + cmp r12, #0xFFE00000 + bhi SYMBOL_NAME(__compiler_rt_dnan2) + + // Now we know there are no NaNs. Therefore there's at least one infinity. If + // either operand is zero then we have inf * 0 = invalid operation and must + // return a NaN. + orrs r12, xl, xh, lsl #1 // are all bits of x zero except the sign? 
+ beq LOCAL_LABEL(retnan) // if so, x == 0, so y == inf + orrs r12, yl, yh, lsl #1 // same check the other way round + beq LOCAL_LABEL(retnan) + + // If we have an infinity and no NaN, then we just return an infinity of the + // correct sign. + eor xh, xh, yh + b LOCAL_LABEL(retinf) + +LOCAL_LABEL(retnan): + // Return the default NaN, in the case where the inputs were 0 and infinity. + movw xh, 0x7ff8 + lsls xh, xh, #16 + mov xl, #0 + bx lr + +END_COMPILERRT_FUNCTION(__aeabi_dmul) + +NO_EXEC_STACK_DIRECTIVE diff --git a/compiler-rt/test/builtins/Unit/divdf3new_test.c b/compiler-rt/test/builtins/Unit/divdf3new_test.c new file mode 100644 index 0000000000000..866c7cb08e519 --- /dev/null +++ b/compiler-rt/test/builtins/Unit/divdf3new_test.c @@ -0,0 +1,862 @@ +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// RUN: %clang_builtins %s %librt -o %t && %run %t +// REQUIRES: librt_has_divdf3 + +#include "int_lib.h" +#include +#include + +#include "fp_test.h" + +// By default this test uses compareResultD to check the returned floats, which +// accepts any returned NaN if the expected result is the canonical NaN value +// 0x7ff8000000000000. For the Arm optimized FP implementation, which commits +// to a more detailed handling of NaNs, we tighten up the check and include +// some extra test cases specific to that NaN policy. 
+#if COMPILER_RT_ARM_OPTIMIZED_FP +# define EXPECT_EXACT_RESULTS +# define ARM_NAN_HANDLING +#endif + +// Returns: a / b +COMPILER_RT_ABI double __divdf3(double a, double b); + +int test__divdf3(uint64_t a_rep, uint64_t b_rep, uint64_t expected_rep, + int line) { + double a = fromRep64(a_rep), b = fromRep64(b_rep); + double x = __divdf3(a, b); +#ifdef EXPECT_EXACT_RESULTS + int ret = toRep64(x) != expected_rep; +#else + int ret = compareResultD(x, expected_rep); +#endif + + if (ret) { + printf("error at line %d: __divdf3(%016" PRIx64 ", %016" PRIx64 + ") = %016" PRIx64 ", expected %016" PRIx64 "\n", + line, a_rep, b_rep, toRep64(x), expected_rep); + } + return ret; +} + +#define test__divdf3(a, b, x) test__divdf3(a, b, x, __LINE__) + +int main(void) { + int status = 0; + + status |= + test__divdf3(0x0000000000000000, 0x0000000000000001, 0x0000000000000000); + status |= + test__divdf3(0x0000000000000000, 0x000fffffffffffff, 0x0000000000000000); + status |= + test__divdf3(0x0000000000000000, 0x0010000000000000, 0x0000000000000000); + status |= + test__divdf3(0x0000000000000000, 0x001fffffffffffff, 0x0000000000000000); + status |= + test__divdf3(0x0000000000000000, 0x3ff0000000000000, 0x0000000000000000); + status |= + test__divdf3(0x0000000000000000, 0x4014000000000000, 0x0000000000000000); + status |= + test__divdf3(0x0000000000000000, 0x7fdfffffffffffff, 0x0000000000000000); + status |= + test__divdf3(0x0000000000000000, 0x7fe0000000000000, 0x0000000000000000); + status |= + test__divdf3(0x0000000000000000, 0x7ff0000000000000, 0x0000000000000000); + status |= + test__divdf3(0x0000000000000000, 0x8000000000000002, 0x8000000000000000); + status |= + test__divdf3(0x0000000000000000, 0x800fffffffffffff, 0x8000000000000000); + status |= + test__divdf3(0x0000000000000000, 0x8010000000000001, 0x8000000000000000); + status |= + test__divdf3(0x0000000000000000, 0x8020000000000000, 0x8000000000000000); + status |= + test__divdf3(0x0000000000000000, 0xc008000000000000, 
0x8000000000000000); + status |= + test__divdf3(0x0000000000000000, 0xc01c000000000000, 0x8000000000000000); + status |= + test__divdf3(0x0000000000000000, 0xffcfffffffffffff, 0x8000000000000000); + status |= + test__divdf3(0x0000000000000000, 0xffe0000000000000, 0x8000000000000000); + status |= + test__divdf3(0x0000000000000000, 0xfff0000000000000, 0x8000000000000000); + status |= + test__divdf3(0x0000000000000001, 0x0000000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0x0000000000000001, 0x3fc0000000000000, 0x0000000000000008); + status |= + test__divdf3(0x0000000000000001, 0x3fe0000000000000, 0x0000000000000002); + status |= + test__divdf3(0x0000000000000001, 0x4000000000000000, 0x0000000000000000); + status |= + test__divdf3(0x0000000000000001, 0x7fefffffffffffff, 0x0000000000000000); + status |= + test__divdf3(0x0000000000000001, 0x7ff0000000000000, 0x0000000000000000); + status |= + test__divdf3(0x0000000000000001, 0xc000000000000000, 0x8000000000000000); + status |= + test__divdf3(0x0000000000000001, 0xffefffffffffffff, 0x8000000000000000); + status |= + test__divdf3(0x0000000000000002, 0x8000000000000000, 0xfff0000000000000); + status |= + test__divdf3(0x0000000000000002, 0xfff0000000000000, 0x8000000000000000); + status |= + test__divdf3(0x0000000000000009, 0x4022000000000000, 0x0000000000000001); + status |= + test__divdf3(0x0000000000000009, 0xc022000000000000, 0x8000000000000001); + status |= + test__divdf3(0x000ffffffffffff7, 0x3feffffffffffffe, 0x000ffffffffffff8); + status |= + test__divdf3(0x000ffffffffffffe, 0x3feffffffffffffe, 0x000fffffffffffff); + status |= + test__divdf3(0x000fffffffffffff, 0x0000000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0x000fffffffffffff, 0x3f60000000000000, 0x009ffffffffffffe); + status |= + test__divdf3(0x000fffffffffffff, 0x3fe0000000000000, 0x001ffffffffffffe); + status |= + test__divdf3(0x000fffffffffffff, 0x3ff0000000000000, 0x000fffffffffffff); + status |= + 
test__divdf3(0x000fffffffffffff, 0x3ff0000000000002, 0x000ffffffffffffd); + status |= + test__divdf3(0x000fffffffffffff, 0x7ff0000000000000, 0x0000000000000000); + status |= + test__divdf3(0x000fffffffffffff, 0x8000000000000000, 0xfff0000000000000); + status |= + test__divdf3(0x000fffffffffffff, 0xbff0000000000000, 0x800fffffffffffff); + status |= + test__divdf3(0x000fffffffffffff, 0xfff0000000000000, 0x8000000000000000); + status |= + test__divdf3(0x0010000000000000, 0x0000000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0x0010000000000000, 0x3ff0000000000001, 0x000fffffffffffff); + status |= + test__divdf3(0x0010000000000000, 0x7ff0000000000000, 0x0000000000000000); + status |= + test__divdf3(0x0010000000000001, 0x3ff0000000000002, 0x000fffffffffffff); + status |= + test__divdf3(0x0010000000000001, 0x8000000000000000, 0xfff0000000000000); + status |= + test__divdf3(0x0010000000000001, 0xfff0000000000000, 0x8000000000000000); + status |= + test__divdf3(0x0010000000000002, 0x3ff0000000000006, 0x000ffffffffffffc); + status |= + test__divdf3(0x001ffffffffffffe, 0x4000000000000000, 0x000fffffffffffff); + status |= + test__divdf3(0x001fffffffffffff, 0x0000000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0x001fffffffffffff, 0x4000000000000000, 0x0010000000000000); + status |= + test__divdf3(0x001fffffffffffff, 0x7ff0000000000000, 0x0000000000000000); + status |= + test__divdf3(0x0020000000000000, 0x0010000000000000, 0x4000000000000000); + status |= + test__divdf3(0x0020000000000000, 0x8000000000000000, 0xfff0000000000000); + status |= + test__divdf3(0x0020000000000000, 0xc000000000000000, 0x8010000000000000); + status |= + test__divdf3(0x0020000000000000, 0xfff0000000000000, 0x8000000000000000); + status |= + test__divdf3(0x0020000000000001, 0x0010000000000001, 0x4000000000000000); + status |= + test__divdf3(0x0020000000000001, 0xc000000000000000, 0x8010000000000001); + status |= + test__divdf3(0x0020000000000003, 0x8010000000000003, 
0xc000000000000000); + status |= + test__divdf3(0x0020000000000003, 0xc000000000000000, 0x8010000000000003); + status |= + test__divdf3(0x3feffffffffffff7, 0x3feffffffffffffb, 0x3feffffffffffffc); + status |= + test__divdf3(0x3feffffffffffff7, 0x3feffffffffffffe, 0x3feffffffffffff9); + status |= + test__divdf3(0x3feffffffffffff8, 0x3feffffffffffffc, 0x3feffffffffffffc); + status |= + test__divdf3(0x3feffffffffffff8, 0x3feffffffffffffd, 0x3feffffffffffffb); + status |= + test__divdf3(0x3feffffffffffffa, 0x3feffffffffffff9, 0x3ff0000000000001); + status |= + test__divdf3(0x3feffffffffffffb, 0x3feffffffffffff9, 0x3ff0000000000001); + status |= + test__divdf3(0x3feffffffffffffc, 0x3feffffffffffff9, 0x3ff0000000000002); + status |= + test__divdf3(0x3feffffffffffffc, 0x3feffffffffffffd, 0x3fefffffffffffff); + status |= + test__divdf3(0x3feffffffffffffc, 0x3feffffffffffffe, 0x3feffffffffffffe); + status |= + test__divdf3(0x3feffffffffffffc, 0x3fefffffffffffff, 0x3feffffffffffffd); + status |= + test__divdf3(0x3feffffffffffffc, 0x3ff0000000000001, 0x3feffffffffffffa); + status |= + test__divdf3(0x3feffffffffffffd, 0x3feffffffffffff9, 0x3ff0000000000002); + status |= + test__divdf3(0x3feffffffffffffd, 0x3feffffffffffffc, 0x3ff0000000000001); + status |= + test__divdf3(0x3feffffffffffffd, 0x3feffffffffffffe, 0x3fefffffffffffff); + status |= + test__divdf3(0x3feffffffffffffd, 0x3fefffffffffffff, 0x3feffffffffffffe); + status |= + test__divdf3(0x3feffffffffffffd, 0x3ff0000000000001, 0x3feffffffffffffb); + status |= + test__divdf3(0x3feffffffffffffd, 0x3ff0000000000002, 0x3feffffffffffff9); + status |= + test__divdf3(0x3feffffffffffffe, 0x3feffffffffffff9, 0x3ff0000000000003); + status |= + test__divdf3(0x3feffffffffffffe, 0x3feffffffffffffc, 0x3ff0000000000001); + status |= + test__divdf3(0x3feffffffffffffe, 0x3feffffffffffffd, 0x3ff0000000000001); + status |= + test__divdf3(0x3feffffffffffffe, 0x3fefffffffffffff, 0x3fefffffffffffff); + status |= + 
test__divdf3(0x3feffffffffffffe, 0x3ff0000000000001, 0x3feffffffffffffc); + status |= + test__divdf3(0x3feffffffffffffe, 0x3ff0000000000002, 0x3feffffffffffffa); + status |= + test__divdf3(0x3feffffffffffffe, 0x3ff0000000000003, 0x3feffffffffffff8); + status |= + test__divdf3(0x3fefffffffffffff, 0x3feffffffffffff9, 0x3ff0000000000003); + status |= + test__divdf3(0x3fefffffffffffff, 0x3feffffffffffffc, 0x3ff0000000000002); + status |= + test__divdf3(0x3fefffffffffffff, 0x3feffffffffffffd, 0x3ff0000000000001); + status |= + test__divdf3(0x3fefffffffffffff, 0x3feffffffffffffe, 0x3ff0000000000001); + status |= + test__divdf3(0x3fefffffffffffff, 0x3ff0000000000001, 0x3feffffffffffffd); + status |= + test__divdf3(0x3fefffffffffffff, 0x3ff0000000000002, 0x3feffffffffffffb); + status |= + test__divdf3(0x3fefffffffffffff, 0x3ff0000000000003, 0x3feffffffffffff9); + status |= + test__divdf3(0x3fefffffffffffff, 0x3ff0000000000004, 0x3feffffffffffff7); + status |= + test__divdf3(0x3ff0000000000000, 0x0000000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0x3ff0000000000000, 0x3feffffffffffff7, 0x3ff0000000000005); + status |= + test__divdf3(0x3ff0000000000000, 0x3feffffffffffff8, 0x3ff0000000000004); + status |= + test__divdf3(0x3ff0000000000000, 0x3feffffffffffffb, 0x3ff0000000000003); + status |= + test__divdf3(0x3ff0000000000000, 0x3feffffffffffffc, 0x3ff0000000000002); + status |= + test__divdf3(0x3ff0000000000000, 0x3feffffffffffffd, 0x3ff0000000000002); + status |= + test__divdf3(0x3ff0000000000000, 0x3feffffffffffffe, 0x3ff0000000000001); + status |= + test__divdf3(0x3ff0000000000000, 0x3fefffffffffffff, 0x3ff0000000000001); + status |= + test__divdf3(0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000); + status |= + test__divdf3(0x3ff0000000000000, 0x3ff0000000000001, 0x3feffffffffffffe); + status |= + test__divdf3(0x3ff0000000000000, 0x3ff0000000000002, 0x3feffffffffffffc); + status |= + test__divdf3(0x3ff0000000000000, 0x3ff0000000000003, 
0x3feffffffffffffa); + status |= + test__divdf3(0x3ff0000000000000, 0x3ff0000000000004, 0x3feffffffffffff8); + status |= + test__divdf3(0x3ff0000000000000, 0x7ff0000000000000, 0x0000000000000000); + status |= + test__divdf3(0x3ff0000000000001, 0x3feffffffffffffb, 0x3ff0000000000004); + status |= + test__divdf3(0x3ff0000000000001, 0x3feffffffffffffd, 0x3ff0000000000003); + status |= + test__divdf3(0x3ff0000000000001, 0x3feffffffffffffe, 0x3ff0000000000002); + status |= + test__divdf3(0x3ff0000000000001, 0x3fefffffffffffff, 0x3ff0000000000002); + status |= + test__divdf3(0x3ff0000000000001, 0x3ff0000000000002, 0x3feffffffffffffe); + status |= + test__divdf3(0x3ff0000000000001, 0x3ff0000000000003, 0x3feffffffffffffc); + status |= + test__divdf3(0x3ff0000000000002, 0x3feffffffffffffc, 0x3ff0000000000004); + status |= + test__divdf3(0x3ff0000000000002, 0x3feffffffffffffd, 0x3ff0000000000004); + status |= + test__divdf3(0x3ff0000000000002, 0x3feffffffffffffe, 0x3ff0000000000003); + status |= + test__divdf3(0x3ff0000000000002, 0x3fefffffffffffff, 0x3ff0000000000003); + status |= + test__divdf3(0x3ff0000000000002, 0x3ff0000000000001, 0x3ff0000000000001); + status |= + test__divdf3(0x3ff0000000000002, 0x3ff0000000000003, 0x3feffffffffffffe); + status |= + test__divdf3(0x3ff0000000000003, 0x3feffffffffffffd, 0x3ff0000000000005); + status |= + test__divdf3(0x3ff0000000000003, 0x3feffffffffffffe, 0x3ff0000000000004); + status |= + test__divdf3(0x3ff0000000000003, 0x3fefffffffffffff, 0x3ff0000000000004); + status |= + test__divdf3(0x3ff0000000000003, 0x3ff0000000000001, 0x3ff0000000000002); + status |= + test__divdf3(0x3ff0000000000004, 0x3feffffffffffffe, 0x3ff0000000000005); + status |= + test__divdf3(0x3ff0000000000004, 0x3ff0000000000001, 0x3ff0000000000003); + status |= + test__divdf3(0x3ff0000000000004, 0x3ff0000000000007, 0x3feffffffffffffa); + status |= + test__divdf3(0x3ff0000000000005, 0x3fefffffffffffff, 0x3ff0000000000006); + status |= + 
test__divdf3(0x3ff0000000000006, 0x3ff0000000000008, 0x3feffffffffffffc); + status |= + test__divdf3(0x3ff0000000000007, 0x3ff0000000000002, 0x3ff0000000000005); + status |= + test__divdf3(0x3ff0000000000009, 0x3ff0000000000008, 0x3ff0000000000001); + status |= + test__divdf3(0x3ff199999999999a, 0x3ff3333333333333, 0x3fed555555555556); + status |= + test__divdf3(0x4000000000000000, 0x3ff0000000000000, 0x4000000000000000); + status |= + test__divdf3(0x4000000000000000, 0xbff0000000000000, 0xc000000000000000); + status |= + test__divdf3(0x4008000000000000, 0x8000000000000000, 0xfff0000000000000); + status |= + test__divdf3(0x4008000000000000, 0xc008000000000000, 0xbff0000000000000); + status |= + test__divdf3(0x4008000000000000, 0xfff0000000000000, 0x8000000000000000); + status |= + test__divdf3(0x4014000000000000, 0x0000000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0x4014000000000000, 0x4014000000000000, 0x3ff0000000000000); + status |= + test__divdf3(0x4014000000000000, 0x7ff0000000000000, 0x0000000000000000); + status |= + test__divdf3(0x401c000000000000, 0x8000000000000000, 0xfff0000000000000); + status |= + test__divdf3(0x401c000000000000, 0xfff0000000000000, 0x8000000000000000); + status |= + test__divdf3(0x4020000000000000, 0x4000000000000000, 0x4010000000000000); + status |= + test__divdf3(0x4022000000000000, 0x4008000000000000, 0x4008000000000000); + status |= + test__divdf3(0x7f60000000000000, 0x00a0000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0x7fcfffffffffffff, 0x8000000000000000, 0xfff0000000000000); + status |= + test__divdf3(0x7fdffffffffffffd, 0xc000000000000000, 0xffcffffffffffffd); + status |= + test__divdf3(0x7fdfffffffffffff, 0x0000000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0x7fdfffffffffffff, 0x7ff0000000000000, 0x0000000000000000); + status |= + test__divdf3(0x7fe0000000000000, 0x0000000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0x7fe0000000000000, 0x000fffffffffffff, 
0x7ff0000000000000); + status |= + test__divdf3(0x7fe0000000000000, 0x3fe0000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0x7fe0000000000000, 0x4000000000000000, 0x7fd0000000000000); + status |= + test__divdf3(0x7fe0000000000000, 0x7ff0000000000000, 0x0000000000000000); + status |= + test__divdf3(0x7fe0000000000000, 0x8000000000000000, 0xfff0000000000000); + status |= + test__divdf3(0x7fe0000000000000, 0xbfe0000000000000, 0xfff0000000000000); + status |= + test__divdf3(0x7fe0000000000000, 0xc000000000000000, 0xffd0000000000000); + status |= + test__divdf3(0x7fe0000000000000, 0xfff0000000000000, 0x8000000000000000); + status |= + test__divdf3(0x7fe0000000000003, 0xffd0000000000003, 0xc000000000000000); + status |= + test__divdf3(0x7feffffffffffffd, 0x4010000000000000, 0x7fcffffffffffffd); + status |= + test__divdf3(0x7feffffffffffffd, 0xc010000000000000, 0xffcffffffffffffd); + status |= + test__divdf3(0x7fefffffffffffff, 0x0000000000000001, 0x7ff0000000000000); + status |= + test__divdf3(0x7fefffffffffffff, 0x3fefffffffffffff, 0x7ff0000000000000); + status |= + test__divdf3(0x7fefffffffffffff, 0x7fcfffffffffffff, 0x4010000000000000); + status |= + test__divdf3(0x7fefffffffffffff, 0x7fdfffffffffffff, 0x4000000000000000); + status |= + test__divdf3(0x7fefffffffffffff, 0xc000000000000000, 0xffdfffffffffffff); + status |= + test__divdf3(0x7fefffffffffffff, 0xffcfffffffffffff, 0xc010000000000000); + status |= + test__divdf3(0x7fefffffffffffff, 0xfff0000000000000, 0x8000000000000000); + status |= + test__divdf3(0x7ff0000000000000, 0x0000000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0x7ff0000000000000, 0x0000000000000001, 0x7ff0000000000000); + status |= + test__divdf3(0x7ff0000000000000, 0x000fffffffffffff, 0x7ff0000000000000); + status |= + test__divdf3(0x7ff0000000000000, 0x0010000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0x7ff0000000000000, 0x001fffffffffffff, 0x7ff0000000000000); + status |= + 
test__divdf3(0x7ff0000000000000, 0x3ff0000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0x7ff0000000000000, 0x4014000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0x7ff0000000000000, 0x7fdfffffffffffff, 0x7ff0000000000000); + status |= + test__divdf3(0x7ff0000000000000, 0x7fe0000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0x7ff0000000000000, 0x8000000000000000, 0xfff0000000000000); + status |= + test__divdf3(0x7ff0000000000000, 0x8000000000000002, 0xfff0000000000000); + status |= + test__divdf3(0x7ff0000000000000, 0x800fffffffffffff, 0xfff0000000000000); + status |= + test__divdf3(0x7ff0000000000000, 0x8010000000000001, 0xfff0000000000000); + status |= + test__divdf3(0x7ff0000000000000, 0x8020000000000000, 0xfff0000000000000); + status |= + test__divdf3(0x7ff0000000000000, 0xc008000000000000, 0xfff0000000000000); + status |= + test__divdf3(0x7ff0000000000000, 0xc01c000000000000, 0xfff0000000000000); + status |= + test__divdf3(0x7ff0000000000000, 0xffcfffffffffffff, 0xfff0000000000000); + status |= + test__divdf3(0x7ff0000000000000, 0xffe0000000000000, 0xfff0000000000000); + status |= + test__divdf3(0x7ff0000000000000, 0xffefffffffffffff, 0xfff0000000000000); + status |= + test__divdf3(0x8000000000000000, 0x0000000000000003, 0x8000000000000000); + status |= + test__divdf3(0x8000000000000000, 0x000fffffffffffff, 0x8000000000000000); + status |= + test__divdf3(0x8000000000000000, 0x0010000000000001, 0x8000000000000000); + status |= + test__divdf3(0x8000000000000000, 0x0020000000000000, 0x8000000000000000); + status |= + test__divdf3(0x8000000000000000, 0x4000000000000000, 0x8000000000000000); + status |= + test__divdf3(0x8000000000000000, 0x4018000000000000, 0x8000000000000000); + status |= + test__divdf3(0x8000000000000000, 0x7fcfffffffffffff, 0x8000000000000000); + status |= + test__divdf3(0x8000000000000000, 0x7fd0000000000000, 0x8000000000000000); + status |= + test__divdf3(0x8000000000000000, 0x7ff0000000000000, 
0x8000000000000000); + status |= + test__divdf3(0x8000000000000000, 0x8000000000000004, 0x0000000000000000); + status |= + test__divdf3(0x8000000000000000, 0x800fffffffffffff, 0x0000000000000000); + status |= + test__divdf3(0x8000000000000000, 0x8010000000000000, 0x0000000000000000); + status |= + test__divdf3(0x8000000000000000, 0x801fffffffffffff, 0x0000000000000000); + status |= + test__divdf3(0x8000000000000000, 0xc010000000000000, 0x0000000000000000); + status |= + test__divdf3(0x8000000000000000, 0xc020000000000000, 0x0000000000000000); + status |= + test__divdf3(0x8000000000000000, 0xffd0000000000000, 0x0000000000000000); + status |= + test__divdf3(0x8000000000000000, 0xffdfffffffffffff, 0x0000000000000000); + status |= + test__divdf3(0x8000000000000000, 0xfff0000000000000, 0x0000000000000000); + status |= + test__divdf3(0x8000000000000001, 0x3fe0000000000000, 0x8000000000000002); + status |= + test__divdf3(0x8000000000000001, 0x4000000000000000, 0x8000000000000000); + status |= + test__divdf3(0x8000000000000001, 0x7fefffffffffffff, 0x8000000000000000); + status |= + test__divdf3(0x8000000000000001, 0xc000000000000000, 0x0000000000000000); + status |= + test__divdf3(0x8000000000000001, 0xffefffffffffffff, 0x0000000000000000); + status |= + test__divdf3(0x8000000000000003, 0x0000000000000000, 0xfff0000000000000); + status |= + test__divdf3(0x8000000000000003, 0x7ff0000000000000, 0x8000000000000000); + status |= + test__divdf3(0x8000000000000004, 0x8000000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0x8000000000000004, 0xfff0000000000000, 0x0000000000000000); + status |= + test__divdf3(0x800ffffffffffff8, 0x3feffffffffffffe, 0x800ffffffffffff9); + status |= + test__divdf3(0x800fffffffffffff, 0x0000000000000000, 0xfff0000000000000); + status |= + test__divdf3(0x800fffffffffffff, 0x7ff0000000000000, 0x8000000000000000); + status |= + test__divdf3(0x800fffffffffffff, 0x8000000000000000, 0x7ff0000000000000); + status |= + 
test__divdf3(0x800fffffffffffff, 0xfff0000000000000, 0x0000000000000000); + status |= + test__divdf3(0x8010000000000000, 0x3ff0000000000001, 0x800fffffffffffff); + status |= + test__divdf3(0x8010000000000000, 0x8000000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0x8010000000000000, 0xfff0000000000000, 0x0000000000000000); + status |= + test__divdf3(0x8010000000000001, 0x0000000000000000, 0xfff0000000000000); + status |= + test__divdf3(0x8010000000000001, 0x7ff0000000000000, 0x8000000000000000); + status |= + test__divdf3(0x801fffffffffffff, 0x8000000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0x801fffffffffffff, 0xfff0000000000000, 0x0000000000000000); + status |= + test__divdf3(0x8020000000000000, 0x0000000000000000, 0xfff0000000000000); + status |= + test__divdf3(0x8020000000000000, 0x7ff0000000000000, 0x8000000000000000); + status |= + test__divdf3(0x8020000000000001, 0x0010000000000001, 0xc000000000000000); + status |= + test__divdf3(0x8020000000000005, 0x0010000000000005, 0xc000000000000000); + status |= + test__divdf3(0xbff0000000000000, 0x3ff0000000000000, 0xbff0000000000000); + status |= + test__divdf3(0xbff0000000000000, 0xbff0000000000000, 0x3ff0000000000000); + status |= + test__divdf3(0xc000000000000000, 0x0000000000000000, 0xfff0000000000000); + status |= + test__divdf3(0xc000000000000000, 0x3ff0000000000000, 0xc000000000000000); + status |= + test__divdf3(0xc000000000000000, 0x7ff0000000000000, 0x8000000000000000); + status |= + test__divdf3(0xc000000000000000, 0xbff0000000000000, 0x4000000000000000); + status |= + test__divdf3(0xc010000000000000, 0x8000000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0xc010000000000000, 0xfff0000000000000, 0x0000000000000000); + status |= + test__divdf3(0xc018000000000000, 0x0000000000000000, 0xfff0000000000000); + status |= + test__divdf3(0xc018000000000000, 0x7ff0000000000000, 0x8000000000000000); + status |= + test__divdf3(0xc018000000000000, 0xc008000000000000, 
0x4000000000000000); + status |= + test__divdf3(0xc01c000000000000, 0x401c000000000000, 0xbff0000000000000); + status |= + test__divdf3(0xc020000000000000, 0x4000000000000000, 0xc010000000000000); + status |= + test__divdf3(0xc020000000000000, 0x8000000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0xc020000000000000, 0xfff0000000000000, 0x0000000000000000); + status |= + test__divdf3(0xc022000000000000, 0xc008000000000000, 0x4008000000000000); + status |= + test__divdf3(0xffcfffffffffffff, 0x0000000000000000, 0xfff0000000000000); + status |= + test__divdf3(0xffcfffffffffffff, 0x7ff0000000000000, 0x8000000000000000); + status |= + test__divdf3(0xffd0000000000000, 0x0000000000000000, 0xfff0000000000000); + status |= + test__divdf3(0xffd0000000000000, 0x7ff0000000000000, 0x8000000000000000); + status |= + test__divdf3(0xffd0000000000000, 0x8000000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0xffd0000000000000, 0xfff0000000000000, 0x0000000000000000); + status |= + test__divdf3(0xffdfffffffffffff, 0x4000000000000000, 0xffcfffffffffffff); + status |= + test__divdf3(0xffdfffffffffffff, 0x8000000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0xffe0000000000000, 0x3fe0000000000000, 0xfff0000000000000); + status |= + test__divdf3(0xffe0000000000000, 0xbfe0000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0xffe0000000000001, 0x7fd0000000000001, 0xc000000000000000); + status |= + test__divdf3(0xffeffffffffffffd, 0x4010000000000000, 0xffcffffffffffffd); + status |= + test__divdf3(0xffeffffffffffffd, 0xc010000000000000, 0x7fcffffffffffffd); + status |= + test__divdf3(0xffefffffffffffff, 0x7fcfffffffffffff, 0xc010000000000000); + status |= + test__divdf3(0xffefffffffffffff, 0xffcfffffffffffff, 0x4010000000000000); + status |= + test__divdf3(0xffefffffffffffff, 0xfff0000000000000, 0x0000000000000000); + status |= + test__divdf3(0xfff0000000000000, 0x0000000000000000, 0xfff0000000000000); + status |= + 
test__divdf3(0xfff0000000000000, 0x0000000000000003, 0xfff0000000000000); + status |= + test__divdf3(0xfff0000000000000, 0x000fffffffffffff, 0xfff0000000000000); + status |= + test__divdf3(0xfff0000000000000, 0x0010000000000001, 0xfff0000000000000); + status |= + test__divdf3(0xfff0000000000000, 0x0020000000000000, 0xfff0000000000000); + status |= + test__divdf3(0xfff0000000000000, 0x4000000000000000, 0xfff0000000000000); + status |= + test__divdf3(0xfff0000000000000, 0x4018000000000000, 0xfff0000000000000); + status |= + test__divdf3(0xfff0000000000000, 0x7fd0000000000000, 0xfff0000000000000); + status |= + test__divdf3(0xfff0000000000000, 0x8000000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0xfff0000000000000, 0x8000000000000004, 0x7ff0000000000000); + status |= + test__divdf3(0xfff0000000000000, 0x800fffffffffffff, 0x7ff0000000000000); + status |= + test__divdf3(0xfff0000000000000, 0x8010000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0xfff0000000000000, 0x801fffffffffffff, 0x7ff0000000000000); + status |= + test__divdf3(0xfff0000000000000, 0xc010000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0xfff0000000000000, 0xc020000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0xfff0000000000000, 0xffd0000000000000, 0x7ff0000000000000); + status |= + test__divdf3(0xfff0000000000000, 0xffefffffffffffff, 0x7ff0000000000000); + status |= + test__divdf3(0x800ffffffdffffff, 0xc00fff8000000000, 0x0004000fffbfff00); + status |= + test__divdf3(0xb7fbffffffffffff, 0xffe0000000000007, 0x0000000000000000); + status |= + test__divdf3(0x3ff660beb3029ffd, 0x3ff52e22fb7ace43, 0x3ff0e79e59ccb735); + status |= + test__divdf3(0x3ff73ddbc621eb00, 0x3ffb8224c030d747, 0x3feb095d4073d13b); + status |= + test__divdf3(0x3ff9a3b1ff2bf973, 0x3ff42fdf35d2d3bd, 0x3ff452508f203fca); + status |= + test__divdf3(0x3ffa2f42f2a01655, 0x3ff01310ba9f33d1, 0x3ffa103474220298); + status |= + test__divdf3(0x3ffa6b3e65d68478, 0x3ff773ca580800a9, 
0x3ff206204bf651cc); + status |= + test__divdf3(0x3ffae840ed05aaad, 0x3ff374c8afa6bd73, 0x3ff620a0b38357dd); + status |= + test__divdf3(0x3ffc9bff90e124f7, 0x3ff19678d03f31b9, 0x3ffa06ce5731c244); + status |= + test__divdf3(0x3ff716518068f63e, 0x3ffea080001fffff, 0x3fe81f4927e2f813); + status |= + test__divdf3(0x3ff30b70c9e177b3, 0x3ffdc1dbcddeaaf7, 0x3fe47ae453d79b63); + status |= + test__divdf3(0x3ff690a0c1cf289e, 0x3ffdd0e4ec596ead, 0x3fe837c35c721292); + status |= + test__divdf3(0x3ff9a9f18698d1c5, 0x3ffdcf214b672807, 0x3feb8cd196d1e2db); + status |= + test__divdf3(0x3ffc412def95e9f2, 0x3ffe09fd73e44afb, 0x3fee195e4c411819); + status |= + test__divdf3(0x3ffab674f26df917, 0x3ffe55a80dfd623d, 0x3fec2de561fb628a); + status |= + test__divdf3(0x3ff15bb10851a33b, 0x3ffe770229894d4f, 0x3fe23b9bdf3ad4d7); + status |= + test__divdf3(0x3ff6ce035de00c24, 0x3fff04076d288c95, 0x3fe7874738e5ef5e); + status |= + test__divdf3(0x3ffb0e73f83fd2b4, 0x3fff01150ca4f6e3, 0x3febece97e64ff65); + status |= + test__divdf3(0x3ff53fff6c6d7043, 0x3fffb55c0bf15be1, 0x3fe57204f8441410); + status |= + test__divdf3(0x3ffa8aa3bbff7c4b, 0x3fffd530fa74cc5f, 0x3feaae55281a47cf); + status |= + test__divdf3(0x3ff3004b0d901379, 0x3ffe470662686931, 0x3fe41508eef9d818); + status |= + test__divdf3(0x3ffac10f29e80b25, 0x3ffe2fba9d423c9d, 0x3fec5c8a8148eb26); + status |= + test__divdf3(0x3ff8a3e14fe0651f, 0x3ffdeeae50e07679, 0x3fea579ce7a3f61c); + status |= + test__divdf3(0x3ff168321760dd0d, 0x3ffd382a2b3c2c27, 0x3fe31042c5fcbe35); + status |= + test__divdf3(0x3ff208350f930e99, 0x3ffc80beeab6d9ed, 0x3fe43e9486314a0e); + status |= + test__divdf3(0x3ff46a9470b46af6, 0x3ffc2e13c9335b3f, 0x3fe72f150e86f5a1); + status |= + test__divdf3(0x3ffaf26f45d21562, 0x3ffbe6d631b290e7, 0x3feee7b30b353e95); + status |= + test__divdf3(0x3ff5cda6f52381df, 0x3ffbe2a5bce4483f, 0x3fe90542a0e62c21); + status |= + test__divdf3(0x3ff92aeb8209bb69, 0x3ffb57a0bdf7af6f, 0x3fed74754022b839); + status |= + 
test__divdf3(0x3ff627c9c1a1903d, 0x3ffb3c161457a7e1, 0x3fea082feee891f0); + status |= + test__divdf3(0x3ffa5fef91208fd5, 0x3ff68928392cf5e7, 0x3ff2b9c16cd0a6eb); + status |= + test__divdf3(0x3ffdc6825d6a2ad2, 0x3ff69bb9ca89cd3f, 0x3ff5127c1399515f); + status |= + test__divdf3(0x3ffd62dbb1150699, 0x3ff6e12d3daf7823, 0x3ff48cd52e787bc5); + status |= + test__divdf3(0x3ffb9f0e3f946dd2, 0x3ff75a51f01f688b, 0x3ff2ecadebdfdf91); + status |= + test__divdf3(0x3ffdf21fc13ef609, 0x3ff77a80c8098ae1, 0x3ff46843217c9c90); + status |= + test__divdf3(0x3ff83f6d28924d31, 0x3ff7cb607bcc758f, 0x3ff04e08e26c84b7); + status |= + test__divdf3(0x3ffef8774307cea5, 0x3ff849124d13461d, 0x3ff467851369d61a); + status |= + test__divdf3(0x3ffd7c2259068fa2, 0x3ffa9e9faf8d6845, 0x3ff1b8e24ddeb546); + status |= + test__divdf3(0x3fffb10b35d3977b, 0x3ffb57a0bdf7af6f, 0x3ff28b8abfdd47c7); + status |= + test__divdf3(0x3ffdcfa4097387f1, 0x3ffbe6d631b290e7, 0x3ff1184cf4cac16b); + status |= + test__divdf3(0x3ffcb6231a615d02, 0x3ffb98faef6f9417, 0x3ff0a552a67a8e2d); + status |= + test__divdf3(0x3ffba5443a5d0a42, 0x3ffb3a5c10922a9d, 0x3ff03ed2622d2a26); + status |= + test__divdf3(0x3fff3144ae86b33e, 0x3ffa58948417f235, 0x3ff2f17912d557f2); + status |= + test__divdf3(0x3ffd68635bf6605a, 0x3ff945fce3a79f3f, 0x3ff29e0c7d6617a1); + status |= + test__divdf3(0x3ff97e6030354676, 0x3ff906f78f460697, 0x3ff04c56a5f3136d); + status |= + test__divdf3(0x3ffe86f743594e95, 0x3ff8444d7946422d, 0x3ff420b1e63f512e); + status |= + test__divdf3(0x3fff12a6c5539a9a, 0x3ff7cad48079af09, 0x3ff4e564f736b864); + status |= + test__divdf3(0x3ffa5371fe989251, 0x3ff6fc5272dc36d1, 0x3ff2533d7a4d0ee8); + status |= + test__divdf3(0x3ffe18c0547f65d2, 0x3ff6fc9e8dd915ed, 0x3ff4f2e7f917b80e); + status |= + test__divdf3(0x3ffd7aea8a297055, 0x3ff64eb95d608cd9, 0x3ff52500dc28664c); + + // Test that the result of an operation is a NaN at all when it should be. 
+ // + // In most configurations these tests' results are checked compared using + // compareResultD, so we set all the answers to the canonical NaN + // 0x7ff8000000000000, which causes compareResultF to accept any NaN + // encoding. We also use the same value as the input NaN in tests that have + // one, so that even in EXPECT_EXACT_RESULTS mode these tests should pass, + // because 0x7ff8000000000000 is still the exact expected NaN. + status |= + test__divdf3(0x0000000000000000, 0x0000000000000000, 0x7ff8000000000000); + status |= + test__divdf3(0x0000000000000000, 0x8000000000000000, 0x7ff8000000000000); + status |= + test__divdf3(0x7ff0000000000000, 0x7ff0000000000000, 0x7ff8000000000000); + status |= + test__divdf3(0x7ff0000000000000, 0xfff0000000000000, 0x7ff8000000000000); + status |= + test__divdf3(0x8000000000000000, 0x0000000000000000, 0x7ff8000000000000); + status |= + test__divdf3(0x8000000000000000, 0x8000000000000000, 0x7ff8000000000000); + status |= + test__divdf3(0xfff0000000000000, 0x7ff0000000000000, 0x7ff8000000000000); + status |= + test__divdf3(0xfff0000000000000, 0xfff0000000000000, 0x7ff8000000000000); + status |= + test__divdf3(0x3ff0000000000000, 0x7ff8000000000000, 0x7ff8000000000000); + status |= + test__divdf3(0x7ff8000000000000, 0x3ff0000000000000, 0x7ff8000000000000); + status |= + test__divdf3(0x7ff8000000000000, 0x7ff8000000000000, 0x7ff8000000000000); + +#ifdef ARM_NAN_HANDLING + // Tests specific to the NaN handling of Arm hardware, mimicked by + // arm/divdf3.S: + // + // - a quiet NaN is distinguished by the top mantissa bit being 1 + // + // - if a signalling NaN appears in the input, the output quiet NaN is + // obtained by setting its top mantissa bit and leaving everything else + // unchanged + // + // - if both operands are signalling NaNs then the output NaN is derived + // from the first operand + // + // - if both operands are quiet NaNs then the output NaN is the first + // operand + // + // - invalid operations not 
involving an input NaN return the quiet + // NaN with fewest bits set, 0x7ff8000000000000. + status |= + test__divdf3(0x0000000000000000, 0x7ff3758244400801, 0x7ffb758244400801); + status |= + test__divdf3(0x0000000000000000, 0x7fff44d3f65148af, 0x7fff44d3f65148af); + status |= + test__divdf3(0x0000000000000001, 0x7ff48607b4b37057, 0x7ffc8607b4b37057); + status |= + test__divdf3(0x0000000000000001, 0x7ff855f2d435b33d, 0x7ff855f2d435b33d); + status |= + test__divdf3(0x000fffffffffffff, 0x7ff169269a674e13, 0x7ff969269a674e13); + status |= + test__divdf3(0x000fffffffffffff, 0x7ffc80978b2ef0da, 0x7ffc80978b2ef0da); + status |= + test__divdf3(0x3ff0000000000000, 0x7ff3458ad034593d, 0x7ffb458ad034593d); + status |= + test__divdf3(0x3ff0000000000000, 0x7ffdd8bb98c9f13a, 0x7ffdd8bb98c9f13a); + status |= + test__divdf3(0x7fefffffffffffff, 0x7ff79a8b96250a98, 0x7fff9a8b96250a98); + status |= + test__divdf3(0x7fefffffffffffff, 0x7ffdcc675b63bb94, 0x7ffdcc675b63bb94); + status |= + test__divdf3(0x7ff0000000000000, 0x7ff018cfaf4d0fff, 0x7ff818cfaf4d0fff); + status |= + test__divdf3(0x7ff0000000000000, 0x7ff83ad1ab4dfd24, 0x7ff83ad1ab4dfd24); + status |= + test__divdf3(0x7ff48ce6c0cdd5ac, 0x0000000000000000, 0x7ffc8ce6c0cdd5ac); + status |= + test__divdf3(0x7ff08a34f3d5385b, 0x0000000000000001, 0x7ff88a34f3d5385b); + status |= + test__divdf3(0x7ff0a264c1c96281, 0x000fffffffffffff, 0x7ff8a264c1c96281); + status |= + test__divdf3(0x7ff77ce629e61f0e, 0x3ff0000000000000, 0x7fff7ce629e61f0e); + status |= + test__divdf3(0x7ff715e2d147fd76, 0x7fefffffffffffff, 0x7fff15e2d147fd76); + status |= + test__divdf3(0x7ff689a2031f1781, 0x7ff0000000000000, 0x7ffe89a2031f1781); + status |= + test__divdf3(0x7ff5dfb4a0c8cd05, 0x7ff11c1fe9793a33, 0x7ffddfb4a0c8cd05); + status |= + test__divdf3(0x7ff5826283ffb5d7, 0x7fff609b83884e81, 0x7ffd826283ffb5d7); + status |= + test__divdf3(0x7ff7cb03f2e61d42, 0x8000000000000000, 0x7fffcb03f2e61d42); + status |= + test__divdf3(0x7ff2adc8dfe72c96, 
0x8000000000000001, 0x7ffaadc8dfe72c96); + status |= + test__divdf3(0x7ff4fc0bacc707f2, 0x800fffffffffffff, 0x7ffcfc0bacc707f2); + status |= + test__divdf3(0x7ff76248c8c9a619, 0xbff0000000000000, 0x7fff6248c8c9a619); + status |= + test__divdf3(0x7ff367972fce131b, 0xffefffffffffffff, 0x7ffb67972fce131b); + status |= + test__divdf3(0x7ff188f5ac284e92, 0xfff0000000000000, 0x7ff988f5ac284e92); + status |= + test__divdf3(0x7ffed4c22e4e569d, 0x0000000000000000, 0x7ffed4c22e4e569d); + status |= + test__divdf3(0x7ffe95105fa3f339, 0x0000000000000001, 0x7ffe95105fa3f339); + status |= + test__divdf3(0x7ffb8d33dbb9ecfb, 0x000fffffffffffff, 0x7ffb8d33dbb9ecfb); + status |= + test__divdf3(0x7ff874e41dc63e07, 0x3ff0000000000000, 0x7ff874e41dc63e07); + status |= + test__divdf3(0x7ffe27594515ecdf, 0x7fefffffffffffff, 0x7ffe27594515ecdf); + status |= + test__divdf3(0x7ffeac86d5c69bdf, 0x7ff0000000000000, 0x7ffeac86d5c69bdf); + status |= + test__divdf3(0x7ff97d657b99f76f, 0x7ff7e4149862a796, 0x7fffe4149862a796); + status |= + test__divdf3(0x7ffad17c6aa33fad, 0x7ffd898893ad4d28, 0x7ffad17c6aa33fad); + status |= + test__divdf3(0x7ff96e04e9c3d173, 0x8000000000000000, 0x7ff96e04e9c3d173); + status |= + test__divdf3(0x7ffec01ad8da3abb, 0x8000000000000001, 0x7ffec01ad8da3abb); + status |= + test__divdf3(0x7ffd1d565c495941, 0x800fffffffffffff, 0x7ffd1d565c495941); + status |= + test__divdf3(0x7ffe3d24f1e474a7, 0xbff0000000000000, 0x7ffe3d24f1e474a7); + status |= + test__divdf3(0x7ffc206f2bb8c8ce, 0xffefffffffffffff, 0x7ffc206f2bb8c8ce); + status |= + test__divdf3(0x7ff93efdecfb7d3b, 0xfff0000000000000, 0x7ff93efdecfb7d3b); + status |= + test__divdf3(0x8000000000000000, 0x7ff2ee725d143ac5, 0x7ffaee725d143ac5); + status |= + test__divdf3(0x8000000000000000, 0x7ffbba26e5c5fe98, 0x7ffbba26e5c5fe98); + status |= + test__divdf3(0x8000000000000001, 0x7ff7818a1cd26df9, 0x7fff818a1cd26df9); + status |= + test__divdf3(0x8000000000000001, 0x7ffaee6cc63b5292, 0x7ffaee6cc63b5292); + status |= + 
test__divdf3(0x800fffffffffffff, 0x7ff401096edaf79d, 0x7ffc01096edaf79d); + status |= + test__divdf3(0x800fffffffffffff, 0x7ffbf1778c7a2e59, 0x7ffbf1778c7a2e59); + status |= + test__divdf3(0xbff0000000000000, 0x7ff2e8fb0201c496, 0x7ffae8fb0201c496); + status |= + test__divdf3(0xbff0000000000000, 0x7ffcb6a5adb2e154, 0x7ffcb6a5adb2e154); + status |= + test__divdf3(0xffefffffffffffff, 0x7ff1ea1bfc15d71d, 0x7ff9ea1bfc15d71d); + status |= + test__divdf3(0xffefffffffffffff, 0x7ffae0766e21efc0, 0x7ffae0766e21efc0); + status |= + test__divdf3(0xfff0000000000000, 0x7ff3b364cffbdfe6, 0x7ffbb364cffbdfe6); + status |= + test__divdf3(0xfff0000000000000, 0x7ffd0d3223334ae3, 0x7ffd0d3223334ae3); + +#endif // ARM_NAN_HANDLING + + return status; +} diff --git a/compiler-rt/test/builtins/Unit/muldf3new_test.c b/compiler-rt/test/builtins/Unit/muldf3new_test.c new file mode 100644 index 0000000000000..b8a5c64460696 --- /dev/null +++ b/compiler-rt/test/builtins/Unit/muldf3new_test.c @@ -0,0 +1,832 @@ +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// RUN: %clang_builtins %s %librt -o %t && %run %t +// REQUIRES: librt_has_muldf3 + +#include "int_lib.h" +#include +#include + +#include "fp_test.h" + +// By default this test uses compareResultD to check the returned floats, which +// accepts any returned NaN if the expected result is the canonical NaN value +// 0x7ff8000000000000. For the Arm optimized FP implementation, which commits +// to a more detailed handling of NaNs, we tighten up the check and include +// some extra test cases specific to that NaN policy. 
+#if COMPILER_RT_ARM_OPTIMIZED_FP +# define EXPECT_EXACT_RESULTS +# define ARM_NAN_HANDLING +#endif + +// Returns: a * b +COMPILER_RT_ABI double __muldf3(double a, double b); + +int test__muldf3(uint64_t a_rep, uint64_t b_rep, uint64_t expected_rep, + int line) { + double a = fromRep64(a_rep), b = fromRep64(b_rep); + double x = __muldf3(a, b); +#ifdef EXPECT_EXACT_RESULTS + int ret = toRep64(x) != expected_rep; +#else + int ret = compareResultD(x, expected_rep); +#endif + + if (ret) { + printf("error at line %d: __muldf3(%016" PRIx64 ", %016" PRIx64 + ") = %016" PRIx64 ", expected %016" PRIx64 "\n", + line, a_rep, b_rep, toRep64(x), expected_rep); + } + return ret; +} + +#define test__muldf3(a, b, x) test__muldf3(a, b, x, __LINE__) + +int main(void) { + int status = 0; + + status |= + test__muldf3(0x0000000000000000, 0x0000000000000000, 0x0000000000000000); + status |= + test__muldf3(0x0000000000000000, 0x000fffffffffffff, 0x0000000000000000); + status |= + test__muldf3(0x0000000000000000, 0x001fffffffffffff, 0x0000000000000000); + status |= + test__muldf3(0x0000000000000000, 0x3ff0000000000000, 0x0000000000000000); + status |= + test__muldf3(0x0000000000000000, 0x7fdfffffffffffff, 0x0000000000000000); + status |= + test__muldf3(0x0000000000000000, 0x8000000000000000, 0x8000000000000000); + status |= + test__muldf3(0x0000000000000000, 0x8000000000000002, 0x8000000000000000); + status |= + test__muldf3(0x0000000000000000, 0x800fffffffffffff, 0x8000000000000000); + status |= + test__muldf3(0x0000000000000000, 0x8010000000000001, 0x8000000000000000); + status |= + test__muldf3(0x0000000000000000, 0x8020000000000000, 0x8000000000000000); + status |= + test__muldf3(0x0000000000000000, 0xc008000000000000, 0x8000000000000000); + status |= + test__muldf3(0x0000000000000000, 0xffcfffffffffffff, 0x8000000000000000); + status |= + test__muldf3(0x0000000000000000, 0xffe0000000000000, 0x8000000000000000); + status |= + test__muldf3(0x0000000000000000, 0xffefffffffffffff, 
0x8000000000000000); + status |= + test__muldf3(0x0000000000000001, 0x0000000000000000, 0x0000000000000000); + status |= + test__muldf3(0x0000000000000001, 0x0000000000000001, 0x0000000000000000); + status |= + test__muldf3(0x0000000000000001, 0x3fe0000000000000, 0x0000000000000000); + status |= + test__muldf3(0x0000000000000001, 0x3fefffffffffffff, 0x0000000000000001); + status |= + test__muldf3(0x0000000000000001, 0x3ff0000000000000, 0x0000000000000001); + status |= + test__muldf3(0x0000000000000001, 0x4000000000000000, 0x0000000000000002); + status |= + test__muldf3(0x0000000000000001, 0x7ff0000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0x0000000000000001, 0xbfefffffffffffff, 0x8000000000000001); + status |= + test__muldf3(0x0000000000000006, 0x3fe0000000000000, 0x0000000000000003); + status |= + test__muldf3(0x0000000000000006, 0xbfe0000000000000, 0x8000000000000003); + status |= + test__muldf3(0x0000000000000008, 0x3fc0000000000000, 0x0000000000000001); + status |= + test__muldf3(0x000ffffffffffff7, 0x8020000000000003, 0x8000000000000000); + status |= + test__muldf3(0x000ffffffffffff8, 0x3ff0000000000001, 0x000ffffffffffff9); + status |= + test__muldf3(0x000ffffffffffff8, 0x3ff0000000000008, 0x0010000000000000); + status |= + test__muldf3(0x000ffffffffffff8, 0xbff0000000000001, 0x800ffffffffffff9); + status |= + test__muldf3(0x000ffffffffffff8, 0xbff0000000000008, 0x8010000000000000); + status |= + test__muldf3(0x000ffffffffffffc, 0x4000000000000000, 0x001ffffffffffff8); + status |= + test__muldf3(0x000ffffffffffffe, 0x3feffffffffffffc, 0x000ffffffffffffc); + status |= + test__muldf3(0x000ffffffffffffe, 0x3ff0000000000001, 0x000fffffffffffff); + status |= + test__muldf3(0x000ffffffffffffe, 0xbff0000000000001, 0x800fffffffffffff); + status |= + test__muldf3(0x000fffffffffffff, 0x000ffffffffffffe, 0x0000000000000000); + status |= + test__muldf3(0x000fffffffffffff, 0x3cb0000000000001, 0x0000000000000001); + status |= + 
test__muldf3(0x000fffffffffffff, 0x3fe0000000000001, 0x0008000000000000); + status |= + test__muldf3(0x000fffffffffffff, 0x3ff0000000000001, 0x0010000000000000); + status |= + test__muldf3(0x000fffffffffffff, 0x4000000000000000, 0x001ffffffffffffe); + status |= + test__muldf3(0x0010000000000000, 0x0000000000000000, 0x0000000000000000); + status |= + test__muldf3(0x0010000000000000, 0x0010000000000000, 0x0000000000000000); + status |= + test__muldf3(0x0010000000000000, 0x3feffffffffffffe, 0x000fffffffffffff); + status |= + test__muldf3(0x0010000000000000, 0x7ff0000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0x0010000000000000, 0x8010000000000000, 0x8000000000000000); + status |= + test__muldf3(0x0010000000000000, 0xc000000000000000, 0x8020000000000000); + status |= + test__muldf3(0x0010000000000001, 0x3feffffffffffffa, 0x000ffffffffffffe); + status |= + test__muldf3(0x0010000000000001, 0x3feffffffffffffe, 0x0010000000000000); + status |= + test__muldf3(0x0010000000000001, 0xc000000000000000, 0x8020000000000001); + status |= + test__muldf3(0x0010000000000002, 0x3feffffffffffffc, 0x0010000000000000); + status |= + test__muldf3(0x001ffffffffffff8, 0x3fe0000000000000, 0x000ffffffffffffc); + status |= + test__muldf3(0x001ffffffffffffe, 0x3fe0000000000000, 0x000fffffffffffff); + status |= + test__muldf3(0x001ffffffffffffe, 0xbfe0000000000000, 0x800fffffffffffff); + status |= + test__muldf3(0x001fffffffffffff, 0x3fe0000000000000, 0x0010000000000000); + status |= + test__muldf3(0x001fffffffffffff, 0xbfe0000000000000, 0x8010000000000000); + status |= + test__muldf3(0x3fe0000000000000, 0x8000000000000001, 0x8000000000000000); + status |= + test__muldf3(0x3ff0000000000000, 0x000ffffffffffffd, 0x000ffffffffffffd); + status |= + test__muldf3(0x3ff0000000000000, 0x0020000000000003, 0x0020000000000003); + status |= + test__muldf3(0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000); + status |= + test__muldf3(0x3ff0000000000000, 0x4000000000000000, 
0x4000000000000000); + status |= + test__muldf3(0x3ff0000000000000, 0x8000000000000001, 0x8000000000000001); + status |= + test__muldf3(0x3ff0000000000000, 0x8000000000000009, 0x8000000000000009); + status |= + test__muldf3(0x3ff0000000000001, 0x3ff0000000000001, 0x3ff0000000000002); + status |= + test__muldf3(0x3ff0000000000001, 0xbff0000000000001, 0xbff0000000000002); + status |= + test__muldf3(0x3ff0000000000001, 0xbff0000000000002, 0xbff0000000000003); + status |= + test__muldf3(0x3ff0000000000002, 0x3ff0000000000001, 0x3ff0000000000003); + status |= + test__muldf3(0x3ff0000000000002, 0x7feffffffffffffe, 0x7ff0000000000000); + status |= + test__muldf3(0x3ff0000000000001, 0x7feffffffffffffe, 0x7ff0000000000000); + status |= + test__muldf3(0x4000000000000000, 0x0010000000000000, 0x0020000000000000); + status |= + test__muldf3(0x4000000000000000, 0x0010000000000001, 0x0020000000000001); + status |= + test__muldf3(0x4000000000000000, 0x3ff0000000000000, 0x4000000000000000); + status |= + test__muldf3(0x4000000000000000, 0x4008000000000000, 0x4018000000000000); + status |= + test__muldf3(0x4000000000000000, 0x7fd0000000000000, 0x7fe0000000000000); + status |= + test__muldf3(0x4000000000000000, 0x7fdfffffffffffff, 0x7fefffffffffffff); + status |= + test__muldf3(0x4000000000000000, 0x800ffffffffffffd, 0x801ffffffffffffa); + status |= + test__muldf3(0x4000000000000000, 0x8010000000000003, 0x8020000000000003); + status |= + test__muldf3(0x4000000000000000, 0x8010000000000005, 0x8020000000000005); + status |= + test__muldf3(0x4000000000000000, 0xbff0000000000000, 0xc000000000000000); + status |= + test__muldf3(0x4000000000000000, 0xffcffffffffffffd, 0xffdffffffffffffd); + status |= + test__muldf3(0x4000000000000000, 0xffd0000000000003, 0xffe0000000000003); + status |= + test__muldf3(0x4007ffffffffffff, 0x3feffffffffffffd, 0x4007fffffffffffd); + status |= + test__muldf3(0x4007ffffffffffff, 0x3feffffffffffffe, 0x4007fffffffffffe); + status |= + 
test__muldf3(0x4007ffffffffffff, 0x3fefffffffffffff, 0x4007fffffffffffe); + status |= + test__muldf3(0x4007ffffffffffff, 0xbfeffffffffffffd, 0xc007fffffffffffd); + status |= + test__muldf3(0x4008000000000000, 0x0000000000000002, 0x0000000000000006); + status |= + test__muldf3(0x4008000000000000, 0x4000000000000000, 0x4018000000000000); + status |= + test__muldf3(0x4008000000000000, 0x4008000000000000, 0x4022000000000000); + status |= + test__muldf3(0x4008000000000000, 0xc000000000000000, 0xc018000000000000); + status |= + test__muldf3(0x4008000000000001, 0x3ff0000000000001, 0x4008000000000003); + status |= + test__muldf3(0x4008000000000001, 0x3ff0000000000003, 0x4008000000000006); + status |= + test__muldf3(0x4008000000000001, 0xbff0000000000003, 0xc008000000000006); + status |= + test__muldf3(0x4010000000000000, 0x0000000000000002, 0x0000000000000008); + status |= + test__muldf3(0x4010000000000000, 0x7fcfffffffffffff, 0x7fefffffffffffff); + status |= + test__muldf3(0x4010000000000000, 0xffcfffffffffffff, 0xffefffffffffffff); + status |= + test__muldf3(0x4013ffffffffffff, 0x3fefffffffffffff, 0x4013fffffffffffe); + status |= + test__muldf3(0x4014000000000000, 0x0000000000000000, 0x0000000000000000); + status |= + test__muldf3(0x4014000000000000, 0x7ff0000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0x4014000000000001, 0x3ff0000000000001, 0x4014000000000002); + status |= + test__muldf3(0x401bffffffffffff, 0x3feffffffffffffc, 0x401bfffffffffffc); + status |= + test__muldf3(0x401bffffffffffff, 0x3fefffffffffffff, 0x401bfffffffffffe); + status |= + test__muldf3(0x401c000000000000, 0x8000000000000000, 0x8000000000000000); + status |= + test__muldf3(0x401c000000000000, 0xfff0000000000000, 0xfff0000000000000); + status |= + test__muldf3(0x401c000000000001, 0x3ff0000000000001, 0x401c000000000003); + status |= + test__muldf3(0x7fcffffffffffffd, 0x4010000000000000, 0x7feffffffffffffd); + status |= + test__muldf3(0x7fcffffffffffffd, 0xc010000000000000, 
0xffeffffffffffffd); + status |= + test__muldf3(0x7fd0000000000000, 0xc000000000000000, 0xffe0000000000000); + status |= + test__muldf3(0x7fdffffffffffffd, 0xc000000000000008, 0xfff0000000000000); + status |= + test__muldf3(0x7fdfffffffffffff, 0xc000000000000000, 0xffefffffffffffff); + status |= + test__muldf3(0x7fe0000000000000, 0x0000000000000000, 0x0000000000000000); + status |= + test__muldf3(0x7fe0000000000000, 0x4000000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0x7fe0000000000000, 0x7fe0000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0x7fe0000000000000, 0x7feffffffffffffe, 0x7ff0000000000000); + status |= + test__muldf3(0x7fe0000000000000, 0x7ff0000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0x7fe0000000000000, 0xffd0000000000000, 0xfff0000000000000); + status |= + test__muldf3(0x7fe0000000000000, 0xffd0000000000004, 0xfff0000000000000); + status |= + test__muldf3(0x7fe0000000000000, 0xffe0000000000000, 0xfff0000000000000); + status |= + test__muldf3(0x7fe0000000000009, 0x7feffffffffffffa, 0x7ff0000000000000); + status |= + test__muldf3(0x7fe0000000000009, 0xc018000000000002, 0xfff0000000000000); + status |= + test__muldf3(0x7fefffffffffffff, 0x0000000000000000, 0x0000000000000000); + status |= + test__muldf3(0x7ff0000000000000, 0x000fffffffffffff, 0x7ff0000000000000); + status |= + test__muldf3(0x7ff0000000000000, 0x001fffffffffffff, 0x7ff0000000000000); + status |= + test__muldf3(0x7ff0000000000000, 0x3ff0000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0x7ff0000000000000, 0x7fdfffffffffffff, 0x7ff0000000000000); + status |= + test__muldf3(0x7ff0000000000000, 0x7ff0000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0x7ff0000000000000, 0x8000000000000002, 0xfff0000000000000); + status |= + test__muldf3(0x7ff0000000000000, 0x800fffffffffffff, 0xfff0000000000000); + status |= + test__muldf3(0x7ff0000000000000, 0x8010000000000001, 0xfff0000000000000); + status |= + 
test__muldf3(0x7ff0000000000000, 0x8020000000000000, 0xfff0000000000000); + status |= + test__muldf3(0x7ff0000000000000, 0xc008000000000000, 0xfff0000000000000); + status |= + test__muldf3(0x7ff0000000000000, 0xffe0000000000000, 0xfff0000000000000); + status |= + test__muldf3(0x7ff0000000000000, 0xffefffffffffffff, 0xfff0000000000000); + status |= + test__muldf3(0x7ff0000000000000, 0xfff0000000000000, 0xfff0000000000000); + status |= + test__muldf3(0x8000000000000000, 0x0000000000000000, 0x8000000000000000); + status |= + test__muldf3(0x8000000000000000, 0x4018000000000000, 0x8000000000000000); + status |= + test__muldf3(0x8000000000000000, 0x7fefffffffffffff, 0x8000000000000000); + status |= + test__muldf3(0x8000000000000000, 0x8000000000000000, 0x0000000000000000); + status |= + test__muldf3(0x8000000000000000, 0x8000000000000004, 0x0000000000000000); + status |= + test__muldf3(0x8000000000000000, 0x8010000000000000, 0x0000000000000000); + status |= + test__muldf3(0x8000000000000000, 0xc020000000000000, 0x0000000000000000); + status |= + test__muldf3(0x8000000000000000, 0xffd0000000000000, 0x0000000000000000); + status |= + test__muldf3(0x8000000000000001, 0x0000000000000001, 0x8000000000000000); + status |= + test__muldf3(0x8000000000000001, 0x4014000000000000, 0x8000000000000005); + status |= + test__muldf3(0x8000000000000002, 0x3ff0000000000000, 0x8000000000000002); + status |= + test__muldf3(0x8000000000000003, 0x0000000000000000, 0x8000000000000000); + status |= + test__muldf3(0x8000000000000003, 0x7ff0000000000000, 0xfff0000000000000); + status |= + test__muldf3(0x8000000000000004, 0xbff0000000000000, 0x0000000000000004); + status |= + test__muldf3(0x8000000000000008, 0x3fc0000000000000, 0x8000000000000001); + status |= + test__muldf3(0x800ffffffffffff7, 0x0020000000000003, 0x8000000000000000); + status |= + test__muldf3(0x800ffffffffffff7, 0x3ff0000000000001, 0x800ffffffffffff8); + status |= + test__muldf3(0x800ffffffffffffd, 0xc000000000000000, 
0x001ffffffffffffa); + status |= + test__muldf3(0x800fffffffffffff, 0x0000000000000000, 0x8000000000000000); + status |= + test__muldf3(0x800fffffffffffff, 0x3ff0000000000001, 0x8010000000000000); + status |= + test__muldf3(0x800fffffffffffff, 0x7ff0000000000000, 0xfff0000000000000); + status |= + test__muldf3(0x800fffffffffffff, 0x8000000000000000, 0x0000000000000000); + status |= + test__muldf3(0x800fffffffffffff, 0x800ffffffffffffe, 0x0000000000000000); + status |= + test__muldf3(0x800fffffffffffff, 0xbff0000000000000, 0x000fffffffffffff); + status |= + test__muldf3(0x800fffffffffffff, 0xfff0000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0x8010000000000000, 0x0010000000000000, 0x8000000000000000); + status |= + test__muldf3(0x8010000000000000, 0x8010000000000000, 0x0000000000000000); + status |= + test__muldf3(0x8010000000000001, 0x0000000000000000, 0x8000000000000000); + status |= + test__muldf3(0x8010000000000001, 0x7ff0000000000000, 0xfff0000000000000); + status |= + test__muldf3(0x8010000000000001, 0xbff0000000000000, 0x0010000000000001); + status |= + test__muldf3(0x801ffffffffffffc, 0x3fe0000000000000, 0x800ffffffffffffe); + status |= + test__muldf3(0x801ffffffffffffc, 0xbfe0000000000000, 0x000ffffffffffffe); + status |= + test__muldf3(0x801ffffffffffffe, 0x3ff0000000000000, 0x801ffffffffffffe); + status |= + test__muldf3(0x801fffffffffffff, 0x8000000000000000, 0x0000000000000000); + status |= + test__muldf3(0x801fffffffffffff, 0xfff0000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0x8020000000000000, 0x0000000000000000, 0x8000000000000000); + status |= + test__muldf3(0x8020000000000000, 0x7ff0000000000000, 0xfff0000000000000); + status |= + test__muldf3(0xbfefffffffffffff, 0xffefffffffffffff, 0x7feffffffffffffe); + status |= + test__muldf3(0xbff0000000000000, 0x0000000000000009, 0x8000000000000009); + status |= + test__muldf3(0xbff0000000000000, 0x0010000000000009, 0x8010000000000009); + status |= + 
test__muldf3(0xbff0000000000000, 0x3ff0000000000000, 0xbff0000000000000); + status |= + test__muldf3(0xbff0000000000000, 0x4000000000000000, 0xc000000000000000); + status |= + test__muldf3(0xbff0000000000000, 0xbff0000000000000, 0x3ff0000000000000); + status |= + test__muldf3(0xbff0000000000000, 0xc000000000000000, 0x4000000000000000); + status |= + test__muldf3(0xbff0000000000001, 0x3ff0000000000001, 0xbff0000000000002); + status |= + test__muldf3(0xbff0000000000001, 0xbff0000000000001, 0x3ff0000000000002); + status |= + test__muldf3(0xbff0000000000001, 0xbff0000000000002, 0x3ff0000000000003); + status |= + test__muldf3(0xbff0000000000002, 0x3ff0000000000001, 0xbff0000000000003); + status |= + test__muldf3(0xbff0000000000002, 0xbff0000000000001, 0x3ff0000000000003); + status |= + test__muldf3(0xc000000000000000, 0x0000000000000000, 0x8000000000000000); + status |= + test__muldf3(0xc000000000000000, 0x000ffffffffffffd, 0x801ffffffffffffa); + status |= + test__muldf3(0xc000000000000000, 0x0010000000000001, 0x8020000000000001); + status |= + test__muldf3(0xc000000000000000, 0x0010000000000005, 0x8020000000000005); + status |= + test__muldf3(0xc000000000000000, 0x0010000000000009, 0x8020000000000009); + status |= + test__muldf3(0xc000000000000000, 0x4008000000000000, 0xc018000000000000); + status |= + test__muldf3(0xc000000000000000, 0x7fcfffffffffffff, 0xffdfffffffffffff); + status |= + test__muldf3(0xc000000000000000, 0x7fd0000000000001, 0xffe0000000000001); + status |= + test__muldf3(0xc000000000000000, 0x7ff0000000000000, 0xfff0000000000000); + status |= + test__muldf3(0xc000000000000000, 0xbff0000000000000, 0x4000000000000000); + status |= + test__muldf3(0xc000000000000000, 0xc008000000000000, 0x4018000000000000); + status |= + test__muldf3(0xc007fffffffffffe, 0x7fe0000000000000, 0xfff0000000000000); + status |= + test__muldf3(0xc007ffffffffffff, 0x3fefffffffffffff, 0xc007fffffffffffe); + status |= + test__muldf3(0xc008000000000000, 0x4008000000000000, 
0xc022000000000000); + status |= + test__muldf3(0xc008000000000000, 0xc000000000000000, 0x4018000000000000); + status |= + test__muldf3(0xc008000000000000, 0xc008000000000000, 0x4022000000000000); + status |= + test__muldf3(0xc008000000000000, 0xffe0000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0xc008000000000001, 0x3ff0000000000001, 0xc008000000000003); + status |= + test__muldf3(0xc010000000000000, 0x7fcfffffffffffff, 0xffefffffffffffff); + status |= + test__muldf3(0xc010000000000000, 0x8000000000000000, 0x0000000000000000); + status |= + test__muldf3(0xc010000000000000, 0xffcfffffffffffff, 0x7fefffffffffffff); + status |= + test__muldf3(0xc010000000000000, 0xfff0000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0xc013fffffffffffe, 0xffe0000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0xc013ffffffffffff, 0xbfefffffffffffff, 0x4013fffffffffffe); + status |= + test__muldf3(0xc014000000000001, 0xbff0000000000001, 0x4014000000000002); + status |= + test__muldf3(0xc01bfffffffffff9, 0x7fe0000000000000, 0xfff0000000000000); + status |= + test__muldf3(0xc022000000000000, 0x7fe0000000000000, 0xfff0000000000000); + status |= + test__muldf3(0xc022000000000001, 0xffe0000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0xffcffffffffffff9, 0x7fe0000000000000, 0xfff0000000000000); + status |= + test__muldf3(0xffcffffffffffff9, 0xc00fffffffffffff, 0x7feffffffffffff8); + status |= + test__muldf3(0xffcffffffffffffd, 0x4010000000000000, 0xffeffffffffffffd); + status |= + test__muldf3(0xffcffffffffffffd, 0xc010000000000000, 0x7feffffffffffffd); + status |= + test__muldf3(0xffcfffffffffffff, 0x0000000000000000, 0x8000000000000000); + status |= + test__muldf3(0xffcfffffffffffff, 0x4000000000000001, 0xffe0000000000000); + status |= + test__muldf3(0xffcfffffffffffff, 0x7ff0000000000000, 0xfff0000000000000); + status |= + test__muldf3(0xffd0000000000000, 0x0000000000000000, 0x8000000000000000); + status |= + 
test__muldf3(0xffd0000000000000, 0x7ff0000000000000, 0xfff0000000000000); + status |= + test__muldf3(0xffdffffffffffff7, 0x7fd0000000000001, 0xfff0000000000000); + status |= + test__muldf3(0xffdfffffffffffff, 0x3ff0000000000001, 0xffe0000000000000); + status |= + test__muldf3(0xffdfffffffffffff, 0x8000000000000000, 0x0000000000000000); + status |= + test__muldf3(0xffe0000000000005, 0xffe0000000000001, 0x7ff0000000000000); + status |= + test__muldf3(0xffeffffffffffffd, 0x7fe0000000000000, 0xfff0000000000000); + status |= + test__muldf3(0xffeffffffffffffd, 0xc008000000000001, 0x7ff0000000000000); + status |= + test__muldf3(0xffeffffffffffffd, 0xffe0000000000001, 0x7ff0000000000000); + status |= + test__muldf3(0xffefffffffffffff, 0x8000000000000000, 0x0000000000000000); + status |= + test__muldf3(0xffefffffffffffff, 0xffefffffffffffff, 0x7ff0000000000000); + status |= + test__muldf3(0xffefffffffffffff, 0xfff0000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0xfff0000000000000, 0x4018000000000000, 0xfff0000000000000); + status |= + test__muldf3(0xfff0000000000000, 0x7ff0000000000000, 0xfff0000000000000); + status |= + test__muldf3(0xfff0000000000000, 0x8000000000000004, 0x7ff0000000000000); + status |= + test__muldf3(0xfff0000000000000, 0x8010000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0xfff0000000000000, 0xc020000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0xfff0000000000000, 0xffd0000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0xfff0000000000000, 0xfff0000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0x002ffffffe000000, 0x3fcffffffffffffd, 0x000ffffffeffffff); + status |= + test__muldf3(0xbfeffeffffffffff, 0x8010000000000100, 0x000fff80000000ff); + status |= + test__muldf3(0x802ffffffe000000, 0x3fcffffffffffffd, 0x800ffffffeffffff); + status |= + test__muldf3(0xbfeffeffffffffff, 0x0010000000000100, 0x800fff80000000ff); + status |= + test__muldf3(0xbf9e8325a5aa6c8d, 0xbf9e8325a5aa6c8d, 
0x3f4d180013083955); + status |= + test__muldf3(0x3ffd25d7ea4fa2d4, 0x3fe4000000000000, 0x3ff237a6f271c5c4); + status |= + test__muldf3(0x6ffd25d7ea4fa2d4, 0x4fe4000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0x201d25d7ea4fa2d4, 0x1fd4000000000000, 0x00091bd37938e2e2); + status |= + test__muldf3(0x3ffd25d7ea4fa2d4, 0x3fe8000000000000, 0x3ff5dc61efbbba1f); + status |= + test__muldf3(0x6ffd25d7ea4fa2d4, 0x4fe8000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0x201d25d7ea4fa2d4, 0x1fd8000000000000, 0x000aee30f7dddd10); + status |= + test__muldf3(0x3ffd25d7ea4fa2d4, 0x3fec000000000000, 0x3ff9811ced05ae7a); + status |= + test__muldf3(0x6ffd25d7ea4fa2d4, 0x4fec000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0x201d25d7ea4fa2d4, 0x1fdc000000000000, 0x000cc08e7682d73d); + status |= + test__muldf3(0x3ff265f139b6c87c, 0x3ff7000000000000, 0x3ffa728ac2f6c032); + status |= + test__muldf3(0x6ff265f139b6c87c, 0x4ff7000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0x201265f139b6c87c, 0x1fe7000000000000, 0x000d3945617b6019); + status |= + test__muldf3(0x3ff265f139b6c87c, 0x3ff5000000000000, 0x3ff825cc9bbfe723); + status |= + test__muldf3(0x6ff265f139b6c87c, 0x4ff5000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0x201265f139b6c87c, 0x1fe5000000000000, 0x000c12e64ddff391); + status |= + test__muldf3(0x3ffe5ab1dc9f12f9, 0x3ff0c1a10c80f0b7, 0x3fffca09666ab16e); + status |= + test__muldf3(0x6ffe5ab1dc9f12f9, 0x4ff0c1a10c80f0b7, 0x7ff0000000000000); + status |= + test__muldf3(0x201e5ab1dc9f12f9, 0x1fe0c1a10c80f0b7, 0x000fe504b33558b7); + status |= + test__muldf3(0x3ffe5ab1dc9f12f9, 0x3fe73e5ef37f0f49, 0x3ff60c59a0917f00); + status |= + test__muldf3(0x6ffe5ab1dc9f12f9, 0x4fe73e5ef37f0f49, 0x7ff0000000000000); + status |= + test__muldf3(0x201e5ab1dc9f12f9, 0x1fd73e5ef37f0f49, 0x000b062cd048bf80); + status |= + test__muldf3(0x3ffe5ab1dc9f12f9, 0x3fe8c1a10c80f0b7, 0x3ff77bb12a5d1d75); + status |= + 
test__muldf3(0x6ffe5ab1dc9f12f9, 0x4fe8c1a10c80f0b7, 0x7ff0000000000000); + status |= + test__muldf3(0x201e5ab1dc9f12f9, 0x1fd8c1a10c80f0b7, 0x000bbdd8952e8ebb); + status |= + test__muldf3(0x3ffc6be665de3b1d, 0x3fe52d156619a0cb, 0x3ff2ced9f056fba8); + status |= + test__muldf3(0x6ffc6be665de3b1d, 0x4fe52d156619a0cb, 0x7ff0000000000000); + status |= + test__muldf3(0x201c6be665de3b1d, 0x1fd52d156619a0cb, 0x0009676cf82b7dd4); + status |= + test__muldf3(0x3ffc6be665de3b1d, 0x3fead2ea99e65f35, 0x3ff7d2ffa8765d03); + status |= + test__muldf3(0x6ffc6be665de3b1d, 0x4fead2ea99e65f35, 0x7ff0000000000000); + status |= + test__muldf3(0x201c6be665de3b1d, 0x1fdad2ea99e65f35, 0x000be97fd43b2e82); + status |= + test__muldf3(0x3ff1c0635d3cd39d, 0x3ff5c9b956d0b54b, 0x3ff82c50eb71ac34); + status |= + test__muldf3(0x6ff1c0635d3cd39d, 0x4ff5c9b956d0b54b, 0x7ff0000000000000); + status |= + test__muldf3(0x2011c0635d3cd39d, 0x1fe5c9b956d0b54b, 0x000c162875b8d61a); + status |= + test__muldf3(0x3ff1c0635d3cd39d, 0x3ff23646a92f4ab5, 0x3ff434a77da664d4); + status |= + test__muldf3(0x6ff1c0635d3cd39d, 0x4ff23646a92f4ab5, 0x7ff0000000000000); + status |= + test__muldf3(0x2011c0635d3cd39d, 0x1fe23646a92f4ab5, 0x000a1a53bed3326a); + status |= + test__muldf3(0x3ff1c0635d3cd39d, 0x3ffa3646a92f4ab5, 0x3ffd14d92c44cea3); + status |= + test__muldf3(0x6ff1c0635d3cd39d, 0x4ffa3646a92f4ab5, 0x7ff0000000000000); + status |= + test__muldf3(0x2011c0635d3cd39d, 0x1fea3646a92f4ab5, 0x000e8a6c96226751); + status |= + test__muldf3(0x3ff1c0635d3cd39d, 0x3ff1c9b956d0b54b, 0x3ff3bc381422774d); + status |= + test__muldf3(0x6ff1c0635d3cd39d, 0x4ff1c9b956d0b54b, 0x7ff0000000000000); + status |= + test__muldf3(0x2011c0635d3cd39d, 0x1fe1c9b956d0b54b, 0x0009de1c0a113ba6); + status |= + test__muldf3(0x3ff907065fd11389, 0x3fe46bad37af52b9, 0x3feff135e5756ec7); + status |= + test__muldf3(0x6ff907065fd11389, 0x4fe46bad37af52b9, 0x7feff135e5756ec7); + status |= + test__muldf3(0x201907065fd11389, 0x1fd46bad37af52b9, 
0x0007fc4d795d5bb2); + status |= + test__muldf3(0x3ff907065fd11389, 0x3feb9452c850ad47, 0x3ff591ee9cfee5ea); + status |= + test__muldf3(0x6ff907065fd11389, 0x4feb9452c850ad47, 0x7ff0000000000000); + status |= + test__muldf3(0x201907065fd11389, 0x1fdb9452c850ad47, 0x000ac8f74e7f72f5); + status |= + test__muldf3(0x3ff761c03e198df7, 0x3fe7f47c731d43c7, 0x3ff180e675617e83); + status |= + test__muldf3(0x6ff761c03e198df7, 0x4fe7f47c731d43c7, 0x7ff0000000000000); + status |= + test__muldf3(0x201761c03e198df7, 0x1fd7f47c731d43c7, 0x0008c0733ab0bf41); + status |= + test__muldf3(0x3ffce6d1246c46fb, 0x3ff0b3469ded2bcd, 0x3ffe2aa6f74c0ffd); + status |= + test__muldf3(0x6ffce6d1246c46fb, 0x4ff0b3469ded2bcd, 0x7ff0000000000000); + status |= + test__muldf3(0x201ce6d1246c46fb, 0x1fe0b3469ded2bcd, 0x000f15537ba607fe); + status |= + test__muldf3(0x3ffd5701100ec79d, 0x3fee654fee13094b, 0x3ffbde74e37bb583); + status |= + test__muldf3(0x6ffd5701100ec79d, 0x4fee654fee13094b, 0x7ff0000000000000); + status |= + test__muldf3(0x201d5701100ec79d, 0x1fde654fee13094b, 0x000def3a71bddac1); + status |= + test__muldf3(0x3ffce1a06e8bcfd3, 0x3ff01c54436a605b, 0x3ffd14c361885d61); + status |= + test__muldf3(0x6ffce1a06e8bcfd3, 0x4ff01c54436a605b, 0x7ff0000000000000); + status |= + test__muldf3(0x201ce1a06e8bcfd3, 0x1fe01c54436a605b, 0x000e8a61b0c42eb0); + status |= + test__muldf3(0x3ff21d1a5ca518a5, 0x3ff29f0ce1150f2d, 0x3ff514cd72d743f2); + status |= + test__muldf3(0x6ff21d1a5ca518a5, 0x4ff29f0ce1150f2d, 0x7ff0000000000000); + status |= + test__muldf3(0x20121d1a5ca518a5, 0x1fe29f0ce1150f2d, 0x000a8a66b96ba1f9); + status |= + test__muldf3(0x3ff031a98dbf97ba, 0x3ff4000000000000, 0x3ff43e13f12f7da8); + status |= + test__muldf3(0x6ff031a98dbf97ba, 0x4ff4000000000000, 0x7ff0000000000000); + status |= + test__muldf3(0x201031a98dbf97ba, 0x1fe4000000000000, 0x000a1f09f897bed4); + status |= + test__muldf3(0x0000000000000003, 0xc00fffffffffffff, 0x800000000000000c); + status |= + 
test__muldf3(0x0000000000000003, 0x400fffffffffffff, 0x000000000000000c); + status |= + test__muldf3(0x8000000000000003, 0xc00fffffffffffff, 0x000000000000000c); + status |= + test__muldf3(0x8000000000000003, 0x400fffffffffffff, 0x800000000000000c); + status |= + test__muldf3(0x0000000000000003, 0xc00ffffffffffffd, 0x800000000000000c); + status |= + test__muldf3(0x0000000000000003, 0x400ffffffffffffd, 0x000000000000000c); + status |= + test__muldf3(0x8000000000000003, 0xc00ffffffffffffd, 0x000000000000000c); + status |= + test__muldf3(0x8000000000000003, 0x400ffffffffffffd, 0x800000000000000c); + status |= + test__muldf3(0x1e51f703ee090000, 0x1e5c8000e4000000, 0x0000000000000001); + status |= + test__muldf3(0x1e561ed9745fdb21, 0x1e57255ca25b68e1, 0x0000000000000001); + status |= + test__muldf3(0x7feffffffff00000, 0xc000000000080000, 0xfff0000000000000); + + // Test that the result of an operation is a NaN at all when it should be. + // + // In most configurations these tests' results are checked using + // compareResultD, so we set all the answers to the canonical NaN + // 0x7ff8000000000000, which causes compareResultD to accept any NaN + // encoding. We also use the same value as the input NaN in tests that have + // one, so that even in EXPECT_EXACT_RESULTS mode these tests should pass, + // because 0x7ff8000000000000 is still the exact expected NaN. 
+ status |= + test__muldf3(0x7ff0000000000000, 0x0000000000000000, 0x7ff8000000000000); + status |= + test__muldf3(0x7ff0000000000000, 0x8000000000000000, 0x7ff8000000000000); + status |= + test__muldf3(0x8000000000000000, 0x7ff0000000000000, 0x7ff8000000000000); + status |= + test__muldf3(0x8000000000000000, 0xfff0000000000000, 0x7ff8000000000000); + status |= + test__muldf3(0x3ff0000000000000, 0x7ff8000000000000, 0x7ff8000000000000); + status |= + test__muldf3(0x7ff8000000000000, 0x3ff0000000000000, 0x7ff8000000000000); + status |= + test__muldf3(0x7ff8000000000000, 0x7ff8000000000000, 0x7ff8000000000000); + +#ifdef ARM_NAN_HANDLING + // Tests specific to the NaN handling of Arm hardware, mimicked by + // arm/muldf3.S: + // + // - a quiet NaN is distinguished by the top mantissa bit being 1 + // + // - if a signalling NaN appears in the input, the output quiet NaN is + // obtained by setting its top mantissa bit and leaving everything else + // unchanged + // + // - if both operands are signalling NaNs then the output NaN is derived + // from the first operand + // + // - if both operands are quiet NaNs then the output NaN is the first + // operand + // + // - invalid operations not involving an input NaN return the quiet + // NaN with fewest bits set, 0x7ff8000000000000. 
+ status |= + test__muldf3(0x0000000000000000, 0x7ff3758244400801, 0x7ffb758244400801); + status |= + test__muldf3(0x0000000000000000, 0x7fff44d3f65148af, 0x7fff44d3f65148af); + status |= + test__muldf3(0x0000000000000001, 0x7ff48607b4b37057, 0x7ffc8607b4b37057); + status |= + test__muldf3(0x0000000000000001, 0x7ff855f2d435b33d, 0x7ff855f2d435b33d); + status |= + test__muldf3(0x000fffffffffffff, 0x7ff169269a674e13, 0x7ff969269a674e13); + status |= + test__muldf3(0x000fffffffffffff, 0x7ffc80978b2ef0da, 0x7ffc80978b2ef0da); + status |= + test__muldf3(0x3ff0000000000000, 0x7ff3458ad034593d, 0x7ffb458ad034593d); + status |= + test__muldf3(0x3ff0000000000000, 0x7ffdd8bb98c9f13a, 0x7ffdd8bb98c9f13a); + status |= + test__muldf3(0x7fefffffffffffff, 0x7ff79a8b96250a98, 0x7fff9a8b96250a98); + status |= + test__muldf3(0x7fefffffffffffff, 0x7ffdcc675b63bb94, 0x7ffdcc675b63bb94); + status |= + test__muldf3(0x7ff0000000000000, 0x7ff018cfaf4d0fff, 0x7ff818cfaf4d0fff); + status |= + test__muldf3(0x7ff0000000000000, 0x7ff83ad1ab4dfd24, 0x7ff83ad1ab4dfd24); + status |= + test__muldf3(0x7ff48ce6c0cdd5ac, 0x0000000000000000, 0x7ffc8ce6c0cdd5ac); + status |= + test__muldf3(0x7ff08a34f3d5385b, 0x0000000000000001, 0x7ff88a34f3d5385b); + status |= + test__muldf3(0x7ff0a264c1c96281, 0x000fffffffffffff, 0x7ff8a264c1c96281); + status |= + test__muldf3(0x7ff77ce629e61f0e, 0x3ff0000000000000, 0x7fff7ce629e61f0e); + status |= + test__muldf3(0x7ff715e2d147fd76, 0x7fefffffffffffff, 0x7fff15e2d147fd76); + status |= + test__muldf3(0x7ff689a2031f1781, 0x7ff0000000000000, 0x7ffe89a2031f1781); + status |= + test__muldf3(0x7ff5dfb4a0c8cd05, 0x7ff11c1fe9793a33, 0x7ffddfb4a0c8cd05); + status |= + test__muldf3(0x7ff5826283ffb5d7, 0x7fff609b83884e81, 0x7ffd826283ffb5d7); + status |= + test__muldf3(0x7ff7cb03f2e61d42, 0x8000000000000000, 0x7fffcb03f2e61d42); + status |= + test__muldf3(0x7ff2adc8dfe72c96, 0x8000000000000001, 0x7ffaadc8dfe72c96); + status |= + test__muldf3(0x7ff4fc0bacc707f2, 
0x800fffffffffffff, 0x7ffcfc0bacc707f2); + status |= + test__muldf3(0x7ff76248c8c9a619, 0xbff0000000000000, 0x7fff6248c8c9a619); + status |= + test__muldf3(0x7ff367972fce131b, 0xffefffffffffffff, 0x7ffb67972fce131b); + status |= + test__muldf3(0x7ff188f5ac284e92, 0xfff0000000000000, 0x7ff988f5ac284e92); + status |= + test__muldf3(0x7ffed4c22e4e569d, 0x0000000000000000, 0x7ffed4c22e4e569d); + status |= + test__muldf3(0x7ffe95105fa3f339, 0x0000000000000001, 0x7ffe95105fa3f339); + status |= + test__muldf3(0x7ffb8d33dbb9ecfb, 0x000fffffffffffff, 0x7ffb8d33dbb9ecfb); + status |= + test__muldf3(0x7ff874e41dc63e07, 0x3ff0000000000000, 0x7ff874e41dc63e07); + status |= + test__muldf3(0x7ffe27594515ecdf, 0x7fefffffffffffff, 0x7ffe27594515ecdf); + status |= + test__muldf3(0x7ffeac86d5c69bdf, 0x7ff0000000000000, 0x7ffeac86d5c69bdf); + status |= + test__muldf3(0x7ff97d657b99f76f, 0x7ff7e4149862a796, 0x7fffe4149862a796); + status |= + test__muldf3(0x7ffad17c6aa33fad, 0x7ffd898893ad4d28, 0x7ffad17c6aa33fad); + status |= + test__muldf3(0x7ff96e04e9c3d173, 0x8000000000000000, 0x7ff96e04e9c3d173); + status |= + test__muldf3(0x7ffec01ad8da3abb, 0x8000000000000001, 0x7ffec01ad8da3abb); + status |= + test__muldf3(0x7ffd1d565c495941, 0x800fffffffffffff, 0x7ffd1d565c495941); + status |= + test__muldf3(0x7ffe3d24f1e474a7, 0xbff0000000000000, 0x7ffe3d24f1e474a7); + status |= + test__muldf3(0x7ffc206f2bb8c8ce, 0xffefffffffffffff, 0x7ffc206f2bb8c8ce); + status |= + test__muldf3(0x7ff93efdecfb7d3b, 0xfff0000000000000, 0x7ff93efdecfb7d3b); + status |= + test__muldf3(0x8000000000000000, 0x7ff2ee725d143ac5, 0x7ffaee725d143ac5); + status |= + test__muldf3(0x8000000000000000, 0x7ffbba26e5c5fe98, 0x7ffbba26e5c5fe98); + status |= + test__muldf3(0x8000000000000001, 0x7ff7818a1cd26df9, 0x7fff818a1cd26df9); + status |= + test__muldf3(0x8000000000000001, 0x7ffaee6cc63b5292, 0x7ffaee6cc63b5292); + status |= + test__muldf3(0x800fffffffffffff, 0x7ff401096edaf79d, 0x7ffc01096edaf79d); + status |= + 
test__muldf3(0x800fffffffffffff, 0x7ffbf1778c7a2e59, 0x7ffbf1778c7a2e59); + status |= + test__muldf3(0xbff0000000000000, 0x7ff2e8fb0201c496, 0x7ffae8fb0201c496); + status |= + test__muldf3(0xbff0000000000000, 0x7ffcb6a5adb2e154, 0x7ffcb6a5adb2e154); + status |= + test__muldf3(0xffefffffffffffff, 0x7ff1ea1bfc15d71d, 0x7ff9ea1bfc15d71d); + status |= + test__muldf3(0xffefffffffffffff, 0x7ffae0766e21efc0, 0x7ffae0766e21efc0); + status |= + test__muldf3(0xfff0000000000000, 0x7ff3b364cffbdfe6, 0x7ffbb364cffbdfe6); + status |= + test__muldf3(0xfff0000000000000, 0x7ffd0d3223334ae3, 0x7ffd0d3223334ae3); + +#endif // ARM_NAN_HANDLING + + return status; +} From 20f4289db0cde1c4e9347d953f7ebcf702ddf3b1 Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Thu, 14 May 2026 09:44:18 +0100 Subject: [PATCH 42/95] [LV][NFC] Generate full CHECK lines for reduction-small-size.ll (#197632) --- .../AArch64/reduction-small-size.ll | 199 ++++++++++++++---- 1 file changed, 161 insertions(+), 38 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll index 89a69e64e0a88..00d73a7ab6825 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll @@ -1,9 +1,9 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "scalar.ph:" --version 6 ; RUN: opt < %s -passes=loop-vectorize,instcombine -force-vector-interleave=1 -S | FileCheck %s target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-gnu" -; CHECK-LABEL: @reduction_i8 ; ; char reduction_i8(char *a, char *b, int n) { ; char sum = 0; @@ -12,18 +12,69 @@ target triple = "aarch64--linux-gnu" ; return sum; ; } ; -; CHECK: vector.body: -; CHECK: phi <16 x i8> -; CHECK: load <16 x i8> -; CHECK: load <16 x i8> -; CHECK: add <16 x i8> -; CHECK: add <16 
x i8> -; -; CHECK: middle.block: -; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> -; CHECK: zext i8 [[Rdx]] to i32 -; define i8 @reduction_i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %n) { +; CHECK-LABEL: define i8 @reduction_i8( +; CHECK-SAME: ptr readonly captures(none) [[A:%.*]], ptr readonly captures(none) [[B:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP_12:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP_12]], label %[[ITER_CHECK:.*]], [[FOR_COND_CLEANUP:label %.*]] +; CHECK: [[ITER_CHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] +; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[N]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[TMP0]], 12 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483632 +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i8> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = add <16 x i8> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP4]] = add <16 x i8> [[TMP3]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP5:%.*]] 
= icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP6:%.*]] = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> [[TMP4]]) +; CHECK-NEXT: [[TMP7:%.*]] = zext i8 [[TMP6]] to i32 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] +; CHECK-NEXT: br i1 [[CMP_N]], [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] +; CHECK: [[VEC_EPILOG_ITER_CHECK]]: +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] +; CHECK: [[VEC_EPILOG_PH]]: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_VEC4:%.*]] = and i64 [[TMP0]], 2147483644 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> , i32 [[BC_MERGE_RDX]], i64 0 +; CHECK-NEXT: [[TMP9:%.*]] = trunc nuw <4 x i32> [[TMP8]] to <4 x i8> +; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] +; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i8> [ [[TMP9]], %[[VEC_EPILOG_PH]] ], [ [[TMP13:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX5]] +; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX5]] +; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1 +; CHECK-NEXT: [[TMP12:%.*]] = add <4 x i8> [[VEC_PHI6]], 
[[WIDE_LOAD7]] +; CHECK-NEXT: [[TMP13]] = add <4 x i8> [[TMP12]], [[WIDE_LOAD8]] +; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX5]], 4 +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC4]] +; CHECK-NEXT: br i1 [[TMP14]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP15:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[TMP13]]) +; CHECK-NEXT: [[TMP16:%.*]] = zext i8 [[TMP15]] to i32 +; CHECK-NEXT: [[CMP_N10:%.*]] = icmp eq i64 [[N_VEC4]], [[TMP0]] +; CHECK-NEXT: br i1 [[CMP_N10]], [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE]], label %[[VEC_EPILOG_SCALAR_PH]] +; CHECK: [[VEC_EPILOG_SCALAR_PH]]: +; entry: %cmp.12 = icmp sgt i32 %n, 0 br i1 %cmp.12, label %for.body.preheader, label %for.cond.cleanup @@ -58,7 +109,6 @@ for.body: br i1 %exitcond, label %for.cond.for.cond.cleanup_crit_edge, label %for.body } -; CHECK-LABEL: @reduction_i16_1 ; ; short reduction_i16_1(short *a, short *b, int n) { ; short sum = 0; @@ -67,18 +117,38 @@ for.body: ; return sum; ; } ; -; CHECK: vector.body: -; CHECK: phi <8 x i16> -; CHECK: load <8 x i16> -; CHECK: load <8 x i16> -; CHECK: add <8 x i16> -; CHECK: add <8 x i16> -; -; CHECK: middle.block: -; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> -; CHECK: zext i16 [[Rdx]] to i32 -; define i16 @reduction_i16_1(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %n) { +; CHECK-LABEL: define i16 @reduction_i16_1( +; CHECK-SAME: ptr readonly captures(none) [[A:%.*]], ptr readonly captures(none) [[B:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP_16:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP_16]], label %[[FOR_BODY_PREHEADER:.*]], [[FOR_COND_CLEANUP:label %.*]] +; CHECK: [[FOR_BODY_PREHEADER]]: +; CHECK-NEXT: [[TMP0:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 8 +; CHECK-NEXT: br 
i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483640 +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i16> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [2 x i8], ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8], ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i16>, ptr [[TMP2]], align 2 +; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i16> [[VEC_PHI]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP4]] = add <8 x i16> [[TMP3]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP6:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP4]]) +; CHECK-NEXT: [[TMP7:%.*]] = zext i16 [[TMP6]] to i32 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] +; CHECK-NEXT: br i1 [[CMP_N]], [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:label %.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; entry: %cmp.16 = icmp sgt i32 %n, 0 br i1 %cmp.16, label %for.body.preheader, label %for.cond.cleanup @@ -113,7 +183,6 @@ for.body: br i1 %exitcond, label %for.cond.for.cond.cleanup_crit_edge, label %for.body } -; CHECK-LABEL: @reduction_i16_2 ; ; short reduction_i16_2(char *a, char *b, int n) { ; short sum = 0; @@ -122,20 +191,74 @@ for.body: ; return sum; ; } ; -; CHECK: vector.body: -; CHECK: phi <16 x i16> -; CHECK: [[Ld1:%[a-zA-Z0-9.]+]] = load <16 x i8> -; CHECK: zext <16 x 
i8> [[Ld1]] to <16 x i16> -; CHECK: [[Ld2:%[a-zA-Z0-9.]+]] = load <16 x i8> -; CHECK: zext <16 x i8> [[Ld2]] to <16 x i16> -; CHECK: add <16 x i16> -; CHECK: add <16 x i16> -; -; CHECK: middle.block: -; CHECK: [[Rdx:%[a-zA-Z0-9.]+]] = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> -; CHECK: zext i16 [[Rdx]] to i32 -; define i16 @reduction_i16_2(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %n) { +; CHECK-LABEL: define i16 @reduction_i16_2( +; CHECK-SAME: ptr readonly captures(none) [[A:%.*]], ptr readonly captures(none) [[B:%.*]], i32 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[CMP_14:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP_14]], label %[[ITER_CHECK:.*]], [[FOR_COND_CLEANUP:label %.*]] +; CHECK: [[ITER_CHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = zext nneg i32 [[N]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] +; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[N]], 16 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[TMP0]], 12 +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483632 +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i16> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i16> +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load 
<16 x i8>, ptr [[TMP3]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = add <16 x i16> [[VEC_PHI]], [[TMP2]] +; CHECK-NEXT: [[TMP6]] = add <16 x i16> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP8:%.*]] = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> [[TMP6]]) +; CHECK-NEXT: [[TMP9:%.*]] = zext i16 [[TMP8]] to i32 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] +; CHECK-NEXT: br i1 [[CMP_N]], [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] +; CHECK: [[VEC_EPILOG_ITER_CHECK]]: +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3]] +; CHECK: [[VEC_EPILOG_PH]]: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_VEC4:%.*]] = and i64 [[TMP0]], 2147483644 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> , i32 [[BC_MERGE_RDX]], i64 0 +; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] +; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ [[TMP10]], %[[VEC_EPILOG_PH]] ], [ [[TMP17:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX5]] +; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load 
<4 x i8>, ptr [[TMP11]], align 1 +; CHECK-NEXT: [[TMP12:%.*]] = zext <4 x i8> [[WIDE_LOAD7]] to <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX5]] +; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i8>, ptr [[TMP13]], align 1 +; CHECK-NEXT: [[TMP14:%.*]] = zext <4 x i8> [[WIDE_LOAD8]] to <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[VEC_PHI6]], [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = add nsw <4 x i32> [[TMP15]], [[TMP14]] +; CHECK-NEXT: [[TMP17]] = and <4 x i32> [[TMP16]], splat (i32 65535) +; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX5]], 4 +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC4]] +; CHECK-NEXT: br i1 [[TMP18]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP19:%.*]] = trunc <4 x i32> [[TMP16]] to <4 x i16> +; CHECK-NEXT: [[TMP20:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[TMP19]]) +; CHECK-NEXT: [[TMP21:%.*]] = zext i16 [[TMP20]] to i32 +; CHECK-NEXT: [[CMP_N10:%.*]] = icmp eq i64 [[N_VEC4]], [[TMP0]] +; CHECK-NEXT: br i1 [[CMP_N10]], [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE]], label %[[VEC_EPILOG_SCALAR_PH]] +; CHECK: [[VEC_EPILOG_SCALAR_PH]]: +; entry: %cmp.14 = icmp sgt i32 %n, 0 br i1 %cmp.14, label %for.body.preheader, label %for.cond.cleanup From 7b09a4efbff5002e9f81ad7e6f49d6e90b976d49 Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Thu, 14 May 2026 09:46:55 +0100 Subject: [PATCH 43/95] [LV] Fix the cost model for freeze instructions (#197188) While working on a PR to add a cost model for VPDerivedIV recipes I noticed that a loop in or_reduction_with_freeze: test/Transforms/LoopVectorize/AArch64/reduction-cost.ll stopped vectorising because the cost model decided it was no longer worth it. However, the main cause of this was the incredibly high cost (14) of freeze for VF=2. 
We were using the cost of a vector mul instruction as a proxy for the freeze cost, which is incredibly bad for an AArch64 target without SVE since the operation needs scalarising. As far as I understand, the freeze instruction does not lead to any actual code being generated and acts merely as a barrier to potentially unsafe optimisations. As such, I've updated the cost model to return 0 instead. --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 2 ++ llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 9 ++++++--- .../Transforms/LoopVectorize/AArch64/reduction-cost.ll | 4 +--- .../LoopVectorize/X86/CostModel/vpinstruction-cost.ll | 6 +++--- 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 00d378adf9c5d..7213d4ae795ec 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5523,6 +5523,8 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, if (VF.isScalable()) return InstructionCost::getInvalid(); return TTI.getArithmeticInstrCost(Instruction::Mul, RetTy, Config.CostKind); + case Instruction::Freeze: + return TTI::TCC_Free; default: // This opcode is unknown. Assume that it is the same as 'mul'. return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 27569f9966de0..397366bae9af9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1000,9 +1000,12 @@ InstructionCost VPRecipeWithIRFlags::getCostForRecipeWithOpcode( RHSInfo, Operands, CtxI, &Ctx.TLI); } case Instruction::Freeze: - // This opcode is unknown. Assume that it is the same as 'mul'. 
- return Ctx.TTI.getArithmeticInstrCost(Instruction::Mul, ResultTy, - Ctx.CostKind); + // NOTE: The only way to ask for the cost is via getInstructionCost, which + // requires the actual vector instruction. Instead, both here and in the + // LoopVectorizationCostModel::getInstructionCost the costs mirror the + // current behaviour in llvm/Analysis/TargetTransformInfoImpl.h to keep + // them in sync. + return TTI::TCC_Free; case Instruction::ExtractValue: return Ctx.TTI.getInsertExtractValueCost(Instruction::ExtractValue, Ctx.CostKind); diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-cost.ll index 8c8348cf5700f..4bb6bf39413d0 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-cost.ll @@ -60,8 +60,6 @@ exit: ret i64 %res } -; The scalar cost of this loop must include the freeze's cost, otherwise VF=2 -; is incorrectly rejected as unprofitable. 
define i32 @or_reduction_with_freeze(ptr %dst, ptr %src) { ; CHECK-LABEL: define i32 @or_reduction_with_freeze( ; CHECK-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]]) { @@ -75,7 +73,7 @@ define i32 @or_reduction_with_freeze(ptr %dst, ptr %src) { ; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[DST6]], [[SRC7]] ; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 3 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 18 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 10 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] ; CHECK: [[VECTOR_SCEVCHECK]]: ; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[DST1]] to i3 diff --git a/llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll index 12d32872e1453..aeb222e677e63 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/CostModel/vpinstruction-cost.ll @@ -104,7 +104,7 @@ define void @test_vpinstruction_freeze_cost(ptr %src, ptr noalias %dst) { ; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] ; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %g.src = getelementptr inbounds i64, ptr %src, i64 %iv ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %l = load i64, ptr %g.src, align 8 -; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %fr = freeze i64 %l +; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %fr = freeze i64 %l ; CHECK: LV: Found an estimated cost of 0 for VF 1 For instruction: %g.dst = getelementptr inbounds i64, ptr %dst, i64 %iv ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: store i64 %fr, ptr %g.dst, align 8 ; CHECK: LV: Found an estimated cost of 1 for VF 1 For instruction: %iv.next = add nuw 
nsw i64 %iv, 1 @@ -117,7 +117,7 @@ define void @test_vpinstruction_freeze_cost(ptr %src, ptr noalias %dst) { ; CHECK: Cost of 0 for VF 2: CLONE ir<%g.src> = getelementptr inbounds ir<%src>, vp<[[VP4]]> ; CHECK: Cost of 0 for VF 2: vp<[[VP5:%[0-9]+]]> = vector-pointer inbounds ir<%g.src> ; CHECK: Cost of 1 for VF 2: WIDEN ir<%l> = load vp<[[VP5]]> -; CHECK: Cost of 2 for VF 2: WIDEN ir<%fr> = freeze ir<%l> +; CHECK: Cost of 0 for VF 2: WIDEN ir<%fr> = freeze ir<%l> ; CHECK: Cost of 0 for VF 2: CLONE ir<%g.dst> = getelementptr inbounds ir<%dst>, vp<[[VP4]]> ; CHECK: Cost of 0 for VF 2: vp<[[VP6:%[0-9]+]]> = vector-pointer inbounds ir<%g.dst> ; CHECK: Cost of 1 for VF 2: WIDEN store vp<[[VP6]]>, ir<%fr> @@ -142,7 +142,7 @@ define void @test_vpinstruction_freeze_cost(ptr %src, ptr noalias %dst) { ; CHECK: Cost of 0 for VF 4: CLONE ir<%g.src> = getelementptr inbounds ir<%src>, vp<[[VP4]]> ; CHECK: Cost of 0 for VF 4: vp<[[VP5]]> = vector-pointer inbounds ir<%g.src> ; CHECK: Cost of 1 for VF 4: WIDEN ir<%l> = load vp<[[VP5]]> -; CHECK: Cost of 2 for VF 4: WIDEN ir<%fr> = freeze ir<%l> +; CHECK: Cost of 0 for VF 4: WIDEN ir<%fr> = freeze ir<%l> ; CHECK: Cost of 0 for VF 4: CLONE ir<%g.dst> = getelementptr inbounds ir<%dst>, vp<[[VP4]]> ; CHECK: Cost of 0 for VF 4: vp<[[VP6]]> = vector-pointer inbounds ir<%g.dst> ; CHECK: Cost of 1 for VF 4: WIDEN store vp<[[VP6]]>, ir<%fr> From d2af73c9fe5ebf5c2671c127dfdd86877465740c Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Thu, 14 May 2026 10:03:32 +0100 Subject: [PATCH 44/95] [compiler-rt][ARM] Optimized double-precision FP comparisons (#179924) The structure of these comparison functions consists of a header file containing the main code, and several `.S` files that include that header with different macro definitions, so that they can use the same procedure to determine the logical comparison result and then just translate it into a return value in different ways. 
--- compiler-rt/lib/builtins/CMakeLists.txt | 8 + compiler-rt/lib/builtins/arm/cmpdf2.S | 64 ++ compiler-rt/lib/builtins/arm/dcmp.h | 212 ++++++ compiler-rt/lib/builtins/arm/gedf2.S | 61 ++ compiler-rt/lib/builtins/arm/thumb1/cmpdf2.S | 61 ++ compiler-rt/lib/builtins/arm/thumb1/dcmp.h | 231 +++++++ compiler-rt/lib/builtins/arm/thumb1/gedf2.S | 60 ++ .../lib/builtins/arm/thumb1/unorddf2.S | 60 ++ compiler-rt/lib/builtins/arm/unorddf2.S | 71 ++ .../test/builtins/Unit/comparedf2new_test.c | 619 ++++++++++++++++++ 10 files changed, 1447 insertions(+) create mode 100644 compiler-rt/lib/builtins/arm/cmpdf2.S create mode 100644 compiler-rt/lib/builtins/arm/dcmp.h create mode 100644 compiler-rt/lib/builtins/arm/gedf2.S create mode 100644 compiler-rt/lib/builtins/arm/thumb1/cmpdf2.S create mode 100644 compiler-rt/lib/builtins/arm/thumb1/dcmp.h create mode 100644 compiler-rt/lib/builtins/arm/thumb1/gedf2.S create mode 100644 compiler-rt/lib/builtins/arm/thumb1/unorddf2.S create mode 100644 compiler-rt/lib/builtins/arm/unorddf2.S create mode 100644 compiler-rt/test/builtins/Unit/comparedf2new_test.c diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index c7e50c714845a..d5034cd4d286a 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -471,6 +471,9 @@ if(COMPILER_RT_ARM_OPTIMIZED_FP AND BUILTIN_SUPPORTED_ARCH MATCHES "arm") arm/adddf3.S arm/muldf3.S arm/divdf3.S + arm/cmpdf2.S + arm/gedf2.S + arm/unorddf2.S ) set_source_files_properties(${assembly_files} PROPERTIES COMPILE_OPTIONS ${implicit_it_flag}) @@ -523,11 +526,16 @@ set_special_properties(arm/adddf3.S SUPERSEDES subdf3.c PROVIDES subdf3) if(COMPILER_RT_ARM_OPTIMIZED_FP) set(thumb1_base_SOURCES arm/thumb1/mulsf3.S + arm/thumb1/cmpdf2.S + arm/thumb1/gedf2.S + arm/thumb1/unorddf2.S arm/fnan2.c arm/fnorm2.c arm/funder.c ${thumb1_base_SOURCES} ) + set_special_properties(arm/thumb1/cmpdf2.S + SUPERSEDES comparedf2.c PROVIDES 
comparedf2) endif() set(arm_EABI_RT_SOURCES diff --git a/compiler-rt/lib/builtins/arm/cmpdf2.S b/compiler-rt/lib/builtins/arm/cmpdf2.S new file mode 100644 index 0000000000000..fa6db64e8c1f7 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/cmpdf2.S @@ -0,0 +1,64 @@ +//===-- cmpdf2.S - double-precision floating point comparison -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This function has the semantics of GNU __cmpdf2: it's a three-way compare +// which returns <0 if x0 if x>y. If the result is +// unordered (i.e. x or y or both is NaN) then it returns >0. +// +// This also makes it suitable for use as all of __eqdf2, __nedf2, __ltdf2 or +// __ledf2. +// +//===----------------------------------------------------------------------===// + +#include "../assembly.h" +#include "crt_endian.h" + + .syntax unified + .text + .p2align 2 + + +op0h .req xh +op0l .req xl +op1h .req yh +op1l .req yl +.macro SetReturnRegister + mov r0, #0 + movhi r0, #1 + movlo r0, #-1 +.endm +.macro SetReturnRegisterNE + movne r0, #-1 + movhi r0, #1 +.endm + +#if __ARM_PCS_VFP +DEFINE_COMPILERRT_FUNCTION(__cmpdf2) + push {r4, lr} + VMOV_FROM_DOUBLE(r0, r1, d0) + VMOV_FROM_DOUBLE(r2, r3, d1) + bl __compiler_rt_softfp_cmpdf2 + pop {r4, pc} +#else +DEFINE_COMPILERRT_FUNCTION_ALIAS(__cmpdf2, __compiler_rt_softfp_cmpdf2) +#endif +DEFINE_COMPILERRT_FUNCTION_ALIAS(__ledf2, __cmpdf2) +DEFINE_COMPILERRT_FUNCTION_ALIAS(__ltdf2, __cmpdf2) +DEFINE_COMPILERRT_FUNCTION_ALIAS(__eqdf2, __cmpdf2) +DEFINE_COMPILERRT_FUNCTION_ALIAS(__nedf2, __cmpdf2) + +DEFINE_COMPILERRT_FUNCTION(__compiler_rt_softfp_cmpdf2) + #include "dcmp.h" + +LOCAL_LABEL(NaN): + mov r0, #+1 + bx lr + +END_COMPILERRT_FUNCTION(__compiler_rt_softfp_cmpdf2) + 
+NO_EXEC_STACK_DIRECTIVE diff --git a/compiler-rt/lib/builtins/arm/dcmp.h b/compiler-rt/lib/builtins/arm/dcmp.h new file mode 100644 index 0000000000000..2dd7a36e857f7 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/dcmp.h @@ -0,0 +1,212 @@ +//===-- dcmp.h - shared code for double-precision FP comparison functions -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This code is the skeleton of a double-precision FP compare, with two details +// left out: which input value is in which register, and how to make the return +// value. It allows the main comparison logic to be shared between (for +// example) __ledf2 and __gedf2, varying only those details. +// +//===----------------------------------------------------------------------===// + +// How to use this header file: +// +// This header file is expected to be #included from inside a function +// definition in a .S file. The source file including this header should +// provide the following: +// +// op0h, op0l, op1h, op1l: register aliases (via .req) for the registers +// containing the input operands. +// - For most comparisons, op0h,op0l will correspond to xh,xl, and op1h,op1l +// to yh,yl (as defined in turn in crt_endian.h). +// - But a function with the reversed semantics of __aeabi_cdrcmple wil define +// them the other way round. +// +// SetReturnRegister: an assembly macro that looks at the PSR flags and sets up +// an appropriate return value in r0, for the cases that do *not* involve NaN. +// - On entry to this macro, the condition codes LO, EQ and HI indicate that +// op0 < op1, op0 == op1 or op0 > op1 respectively. 
+// - For functions that return a result in the flags, this macro can be empty, +// because those are the correct flags to return anyway. +// - Functions that return a boolean in r0 should set it up by checking the +// flags. +// +// SetReturnRegisterNE: a macro that does the same thing as SetReturnRegister, +// except that if the Z flag is set, it instead does nothing at all. (This +// macro must not assume that the flags were set by a single CMP: in +// particular, C=0 but Z=1 is possible on entry to this macro, so you must not +// use the LO condition code and assume it is mutually exclusive with EQ.) +// +// LOCAL_LABEL(NaN): a label defined within the compare function, after the +// #include of this header. Called when at least one input is a NaN, and sets +// up the appropriate return value for that case. + +// -------------------------------------------------- +// The actual entry point of the compare function. +// +// The basic plan is to start by ORing together the two inputs. This tells us +// two things: +// - the top bit of the output tells us whether both inputs are positive, or +// whether at least one is negative +// - if the 11 exponent bits of the output are not all 1, then there are +// definitely no NaNs, so a fast path can handle most non-NaN cases. + +// clang-format off + + // First diverge control for the negative-numbers case. + orrs r12, op0h, op1h + bmi LOCAL_LABEL(negative) // high bit set => at least one negative input + + // Here, both inputs are positive. Try adding 1<<20 to their bitwise OR in + // r12. This will carry all the way into the top bit, setting the N flag, if + // all 11 exponent bits were set. + cmn r12, #1 << 20 + bmi LOCAL_LABEL(NaNInf_check_positive) // need to look harder for NaNs + + // The fastest fast path: both inputs positive and we could easily tell there + // were no NaNs. So we just compare op0 and op1 as unsigned integers. 
+ cmp op0h, op1h + SetReturnRegisterNE + bxne lr + cmp op0l, op1l + SetReturnRegister + bx lr + +LOCAL_LABEL(NaNInf_check_positive): + // Second tier for positive numbers. We come here if both inputs are + // positive, but our fast initial check didn't manage to rule out a NaN. But + // it's not guaranteed that there _is_ a NaN, for two reasons: + // + // 1. An input with exponent 0x7FF might be an infinity instead. Those + // behave normally under comparison. + // + // 2. There might not even _be_ an input with exponent 0x7FF. All we know so + // far is that the two inputs ORed together had all the exponent bits + // set. So each of those bits is set in _at least one_ of the inputs, but + // not necessarily all in the _same_ input. + // + // Test each exponent individually for 0x7FF, using the same CMN idiom as + // above. If neither one carries into the sign bit then we have no NaNs _or_ + // infinities and can compare the registers and return again. + cmn op0h, #1 << 20 + cmnpl op1h, #1 << 20 + bmi LOCAL_LABEL(NaN_check_positive) + + // Second-tier return path, now we've ruled out anything difficult. By this + // time we know that the two operands have different exponents (because the + // exponents' bitwise OR is 0x7FF but neither one is 0x7FF by itself, so each + // must have a set bit not present in the other). So we only need to compare + // the high words. + cmp op0h, op1h + SetReturnRegister + bx lr + +LOCAL_LABEL(NaN_check_positive): + // Third tier for positive numbers. Here we know that at least one of the + // inputs has exponent 0x7FF. But they might still be infinities rather than + // NaNs. So now we must check whether there's an actual NaN. + // + // We do this by shifting the high word of each input left to get rid of the + // sign bit, shifting a bit in at the bottom which is 1 if any bit is set in + // the low word. 
Then we check if the result is _greater_ than 0xFFE00000 + // (but not equal), via adding 0x00200000 to it and testing for the HI + // condition (carry flag set, but Z clear). + // + // We could have skipped the second-tier check and done this more rigorous + // test immediately. But that would cost an extra instruction in the case + // where there are no infinities or NaNs, and we assume that that is so much + // more common that it's worth optimizing for. + cmp op0l, #1 // set C if op0l is nonzero + adc op0h, op0h, op0h // shift op0h left, bringing in the C bit + cmp op1l, #1 // set C if op1l is nonzero + adc op1h, op1h, op1h // shift op1h left, bringing in the C bit + cmn op0h, #1 << 21 // if HI, then op0 is a NaN + cmnls op1h, #1 << 21 // if not HI, then do the same check for op1 + bhi LOCAL_LABEL(NaN) // now, if HI, there's definitely a NaN + + // Now we've finally ruled out NaNs! And we still know both inputs are + // positive. So the third-tier return path can just compare the top words + // again. (The fact that we've just shifted them left doesn't make a + // difference.) + cmp op0h, op1h + SetReturnRegister + bx lr + +LOCAL_LABEL(negative): + // We come here if at least one operand is negative. We haven't checked for + // NaNs at all yet (the sign check came first), so repeat the first-tier + // check strategy of seeing if all exponent bits are set in r12. + // + // On this path, the sign bit in r12 is set, so if adding 1 to the low + // exponent bit carries all the way through into the sign bit, it will + // _clear_ the sign bit rather than setting it. So we expect MI to be the + // "definitely no NaNs" result, where it was PL on the positive branch. + cmn r12, #1 << 20 + bpl LOCAL_LABEL(NaNInf_check_negative) + + // Now we have no NaNs, but at least one negative number. This gives us two + // complications: + // + // 1. 
Floating-point numbers are sign/magnitude, not two's complement, so we + // have to consider separately the cases of "both negative" and "one of + // each sign". + // + // 2. -0 and +0 are required to compare equal. + // + // But problem #1 is not as hard as it sounds! If both operands are negative, + // then we can get the result we want by comparing them as unsigned integers + // the opposite way round, because the input with the smaller value (as an + // integer) is the larger number in an FP ordering sense. And if one operand + // is negative and the other is positive, the _same_ reversed comparison + // works, because the positive number (with zero sign bit) will always + // compare less than the negative one in an unsigned-integers sense. + // + // So we only have to worry about problem #2, signed zeroes. This only + // affects the answer if _both_ operands are zero. So we check that by + // testing all bits of both operands apart from the sign bit. + orrs r12, op0l, op0h, LSL #1 // EQ if op0 is zero + orrseq r12, op1l, op1h, LSL #1 // now only EQ if both are zero + cmpne op1h, op0h // otherwise, compare them backwards + SetReturnRegisterNE + bxne lr + cmp op1l, op0l + SetReturnRegister + bx lr + +LOCAL_LABEL(NaNInf_check_negative): + // Second tier for negative numbers: we know the OR of the exponents is 0x7FF, + // but again, we might not have either _actual_ exponent 0x7FF, and also, an + // exponent 0x7FF might be an infinity instead of a NaN. + // + // On this path we've already branched twice (once for negative numbers and + // once for the first-tier NaN check), so we'll just go straight to the + // precise check for NaNs. + // + // Like the NaNInf_check_positive case, we do each NaN check by making a + // word consisting of (high word << 1) OR (1 if low word is nonzero).
But + // unlike the positive case, we can't make those words _in place_, + // overwriting op0h and op1h themselves, because that would shift the sign + // bits off the top, and we still need the sign bits to get the comparison + // right. (In the positive case, we knew both sign bits were 0, enabling a + // shortcut.) + cmp op0l, #1 // set C if op0l is nonzero + adc r12, op0h, op0h // shift op0h left, bringing in the C bit + cmn r12, #1 << 21 // if HI, then op0 is a NaN + bhi LOCAL_LABEL(NaN) + cmp op1l, #1 // set C if op1l is nonzero + adc r12, op1h, op1h // shift op1h left, bringing in the C bit + cmn r12, #1 << 21 // if HI, then op1 is a NaN + bhi LOCAL_LABEL(NaN) + + // Now we've ruled out NaNs, so we can just compare the two input registers + // and return. On this path we _don't_ need to check for the special case of + // comparing two zeroes, because we only came here if the bitwise OR of the + // exponent fields was 0x7FF, which means the exponents can't both have been + // zero! So we can _just_ do the reversed CMP and finish. + cmp op1h, op0h + SetReturnRegister + bx lr diff --git a/compiler-rt/lib/builtins/arm/gedf2.S b/compiler-rt/lib/builtins/arm/gedf2.S new file mode 100644 index 0000000000000..18d99a312b00d --- /dev/null +++ b/compiler-rt/lib/builtins/arm/gedf2.S @@ -0,0 +1,61 @@ +//===-- gedf2.S - double-precision floating point comparison --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This function has the semantics of GNU __cmpdf2, except for its NaN +// handling. It's a three-way compare which returns <0 if x<y, 0 if x==y, or +// >0 if x>y. If the result is unordered (i.e. x or y or both is NaN) then it +// returns <0, where __cmpdf2 would return >0.
+// +// This also makes it suitable for use as __gtdf2 or __gedf2 (or __eqdf2 or +// __nedf2). +// +//===----------------------------------------------------------------------===// + +#include "../assembly.h" +#include "crt_endian.h" + + .syntax unified + .text + .p2align 2 + +op0h .req xh +op0l .req xl +op1h .req yh +op1l .req yl +.macro SetReturnRegister + mov r0, #0 + movhi r0, #1 + movlo r0, #-1 +.endm +.macro SetReturnRegisterNE + movne r0, #-1 + movhi r0, #1 +.endm + +#if __ARM_PCS_VFP +DEFINE_COMPILERRT_FUNCTION(__gedf2) + push {r4, lr} + VMOV_FROM_DOUBLE(r0, r1, d0) + VMOV_FROM_DOUBLE(r2, r3, d1) + bl __compiler_rt_softfp_gedf2 + pop {r4, pc} +#else +DEFINE_COMPILERRT_FUNCTION_ALIAS(__gedf2, __compiler_rt_softfp_gedf2) +#endif +DEFINE_COMPILERRT_FUNCTION_ALIAS(__gtdf2, __gedf2) + +DEFINE_COMPILERRT_FUNCTION(__compiler_rt_softfp_gedf2) + #include "dcmp.h" + +LOCAL_LABEL(NaN): + mov r0, #-1 + bx lr + +END_COMPILERRT_FUNCTION(__compiler_rt_softfp_gedf2) + +NO_EXEC_STACK_DIRECTIVE diff --git a/compiler-rt/lib/builtins/arm/thumb1/cmpdf2.S b/compiler-rt/lib/builtins/arm/thumb1/cmpdf2.S new file mode 100644 index 0000000000000..3047a6f22e2ce --- /dev/null +++ b/compiler-rt/lib/builtins/arm/thumb1/cmpdf2.S @@ -0,0 +1,61 @@ +//===-- cmpdf2.S - double-precision floating point comparison -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This function has the semantics of GNU __cmpdf2: it's a three-way compare +// which returns <0 if x0 if x>y. If the result is +// unordered (i.e. x or y or both is NaN) then it returns >0. +// +// This also makes it suitable for use as all of __eqdf2, __nedf2, __ltdf2 or +// __ledf2. 
+// +//===----------------------------------------------------------------------===// + +#include "../../assembly.h" +#include "../crt_endian.h" + + .syntax unified + .text + .p2align 2 + +op0h .req xh +op0l .req xl +op1h .req yh +op1l .req yl +.macro ReturnResult + bhi 0f + blo 1f + movs r0, #0 + // This macro is always called immediately before returning from the + // function, so it's safe to use the same return instruction here, instead of + // wasting time branching forward to the end of the macro. + pop {r4,r5,r6,pc} +0: + movs r0, #1 + pop {r4,r5,r6,pc} +1: + movs r0, #1 + rsbs r0, r0, #0 + pop {r4,r5,r6,pc} +.endm + +DEFINE_COMPILERRT_FUNCTION_ALIAS(__cmpdf2, __compiler_rt_softfp_cmpdf2) +DEFINE_COMPILERRT_FUNCTION_ALIAS(__ledf2, __cmpdf2) +DEFINE_COMPILERRT_FUNCTION_ALIAS(__ltdf2, __cmpdf2) +DEFINE_COMPILERRT_FUNCTION_ALIAS(__eqdf2, __cmpdf2) +DEFINE_COMPILERRT_FUNCTION_ALIAS(__nedf2, __cmpdf2) + +DEFINE_COMPILERRT_THUMB_FUNCTION(__compiler_rt_softfp_cmpdf2) + #include "dcmp.h" + +LOCAL_LABEL(NaN): + movs r0, #1 + pop {r4,r5,r6,pc} + +END_COMPILERRT_FUNCTION(__compiler_rt_softfp_cmpdf2) + +NO_EXEC_STACK_DIRECTIVE diff --git a/compiler-rt/lib/builtins/arm/thumb1/dcmp.h b/compiler-rt/lib/builtins/arm/thumb1/dcmp.h new file mode 100644 index 0000000000000..d0c1e2ddcb489 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/thumb1/dcmp.h @@ -0,0 +1,231 @@ +//===-- dcmp.h - shared code for double-precision FP comparison functions -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This code is the skeleton of a double-precision FP compare, with two details +// left out: which input value is in which register, and how to make the return +// value. 
It allows the main comparison logic to be shared between (for +// example) __ledf2 and __gedf2, varying only those details. +// +//===----------------------------------------------------------------------===// + +// How to use this header file: +// +// This header file is expected to be #included from inside a function +// definition in a .S file. The source file including this header should +// provide the following: +// +// op0h, op0l, op1h, op1l: register aliases (via .req) for the registers +// containing the input operands. +// - For most comparisons, op0h,op0l will correspond to xh,xl, and op1h,op1l +// to yh,yl (as defined in turn in crt_endian.h). +// - But a function with the reversed semantics of __aeabi_cdrcmple wil define +// them the other way round. +// +// ReturnResult: an assembly macro that looks at the PSR flags, sets up an +// appropriate return value in r0, and returns it, for the cases that do *not* +// involve NaN. +// - On entry to this macro, the condition codes LO, EQ and HI indicate that +// op0 < op1, op0 == op1 or op0 > op1 respectively. +// - For functions that return a result in the flags, this macro can just +// return immediately, because those are the correct flags to return anyway. +// - Functions that return a boolean in r0 should set it up by checking the +// flags. +// +// LOCAL_LABEL(NaN): a label defined within the compare function, after the +// #include of this header. Called when at least one input is a NaN, and sets +// up the appropriate return value for that case. + +// -------------------------------------------------- +// The actual entry point of the compare function. +// +// The basic plan is to start by ORing together the two inputs. 
This tells us +// two things: +// - the top bit of the output tells us whether both inputs are positive, or +// whether at least one is negative +// - if the 11 exponent bits of the output are not all 1, then there are +// definitely no NaNs, so a fast path can handle most non-NaN cases. + +// clang-format off + + push {r4,r5,r6,lr} + + // Set up the constant 1 << 20 in a register, which we'll need on all + // branches. + movs r5, #1 + lsls r5, r5, #20 + + // First diverge control for the negative-numbers case. + movs r4, op0h + orrs r4, r4, op1h + bmi LOCAL_LABEL(negative) // high bit set => at least one negative input + + // Here, both inputs are positive. Try adding 1<<20 to their bitwise OR in + // r4. This will carry all the way into the top bit, setting the N flag, if + // all 11 exponent bits were set. + cmn r4, r5 + bmi LOCAL_LABEL(NaNInf_check_positive) // need to look harder for NaNs + + // The fastest fast path: both inputs positive and we could easily tell there + // were no NaNs. So we just compare op0 and op1 as unsigned integers. + cmp op0h, op1h + beq LOCAL_LABEL(low_word_positive) + ReturnResult +LOCAL_LABEL(low_word_positive): + cmp op0l, op1l + ReturnResult + +LOCAL_LABEL(NaNInf_check_positive): + // Second tier for positive numbers. We come here if both inputs are + // positive, but our fast initial check didn't manage to rule out a NaN. But + // it's not guaranteed that there _is_ a NaN, for two reasons: + // + // 1. An input with exponent 0x7FF might be an infinity instead. Those + // behave normally under comparison. + // + // 2. There might not even _be_ an input with exponent 0x7FF. All we know so + // far is that the two inputs ORed together had all the exponent bits + // set. So each of those bits is set in _at least one_ of the inputs, but + // not necessarily all in the _same_ input. + // + // Test each exponent individually for 0x7FF, using the same CMN idiom as + // above. 
If neither one carries into the sign bit then we have no NaNs _or_ + // infinities and can compare the registers and return again. + cmn op0h, r5 + bmi LOCAL_LABEL(NaN_check_positive) + cmn op1h, r5 + bmi LOCAL_LABEL(NaN_check_positive) + + // Second-tier return path, now we've ruled out anything difficult. By this + // time we know that the two operands have different exponents (because the + // exponents' bitwise OR is 0x7FF but neither one is 0x7FF by itself, so each + // must have a set bit not present in the other). So we only need to compare + // the high words. + cmp op0h, op1h + ReturnResult + +LOCAL_LABEL(NaN_check_positive): + // Third tier for positive numbers. Here we know that at least one of the + // inputs has exponent 0x7FF. But they might still be infinities rather than + // NaNs. So now we must check whether there's an actual NaN. + // + // We do this by shifting the high word of each input left to get rid of the + // sign bit, shifting a bit in at the bottom which is 1 if any bit is set in + // the low word. Then we check if the result is _greater_ than 0xFFE00000 + // (but not equal), via adding 0x00200000 to it and testing for the HI + // condition (carry flag set, but Z clear). + // + // We could have skipped the second-tier check and done this more rigorous + // test immediately. But that would cost an extra instruction in the case + // where there are no infinities or NaNs, and we assume that that is so much + // more common that it's worth optimizing for. + lsls r6, r5, #1 // set r6 = 1<<21 + cmp op0l, #1 // set C if op0l is nonzero + adcs op0h, op0h, op0h // shift op0h left, bringing in the C bit + cmn op0h, r6 // if HI, then op0 is a NaN + bhi LOCAL_LABEL(NaN) + cmp op1l, #1 // set C if op1l is nonzero + adcs op1h, op1h, op1h // shift op1h left, bringing in the C bit + cmn op1h, r6 // if HI, then op1 is a NaN + bhi LOCAL_LABEL(NaN) + + // Now we've finally ruled out NaNs! And we still know both inputs are + // positive. 
So the third-tier return path can just compare the top words
+ // again. (The fact that we've just shifted them left doesn't make a
+ // difference.)
+ cmp op0h, op1h
+ ReturnResult
+
+LOCAL_LABEL(negative):
+ // We come here if at least one operand is negative. We haven't checked for
+ // NaNs at all yet (the sign check came first), so repeat the first-tier
+ // check strategy of seeing if all exponent bits are set in r4.
+ //
+ // On this path, the sign bit in r4 is set, so if adding 1 to the low
+ // exponent bit carries all the way through into the sign bit, it will
+ // _clear_ the sign bit rather than setting it. So we expect MI to be the
+ // "definitely no NaNs" result, where it was PL on the positive branch.
+ cmn r4, r5
+ bpl LOCAL_LABEL(NaNInf_check_negative)
+
+ // Now we have no NaNs, but at least one negative number. This gives us two
+ // complications:
+ //
+ // 1. Floating-point numbers are sign/magnitude, not two's complement, so we
+ // have to consider separately the cases of "both negative" and "one of
+ // each sign".
+ //
+ // 2. -0 and +0 are required to compare equal.
+ //
+ // But problem #1 is not as hard as it sounds! If both operands are negative,
+ // then we can get the result we want by comparing them as unsigned integers
+ // the opposite way round, because the input with the smaller value (as an
+ // integer) is the larger number in an FP ordering sense. And if one operand
+ // is negative and the other is positive, the _same_ reversed comparison
+ // works, because the positive number (with zero sign bit) will always
+ // compare less than the negative one in an unsigned-integers sense.
+ //
+ // So we only have to worry about problem #2, signed zeroes. This only
+ // affects the answer if _both_ operands are zero. So we check that by
+ // testing all bits of both operands apart from the sign bit.
+ lsls r6, r4, #1 // logical OR of both high words except the signs
+ orrs r6, r6, op0l // combine that with the low word of op0
+ orrs r6, r6, op1l // and op1, so now only EQ if both are zero
+ beq LOCAL_LABEL(equal)
+ // Now we've ruled out confusing zero cases, just compare the operands in
+ // reverse sense.
+ cmp op1h, op0h
+ beq LOCAL_LABEL(low_word_negative)
+ ReturnResult
+LOCAL_LABEL(low_word_negative):
+ cmp op1l, op0l
+ ReturnResult
+
+LOCAL_LABEL(equal):
+ // We come here if we know the inputs are supposed to compare equal. Set up
+ // the flags by comparing a register with itself.
+ //
+ // (We might have come here via a BEQ, in which case we know Z=1, but we also
+ // need C=1 for our caller to get _all_ the right flags.)
+ cmp r0, r0 // compare a register with itself
+ ReturnResult
+
+LOCAL_LABEL(NaNInf_check_negative):
+ // Second tier for negative numbers: we know the OR of the exponents is
+ // 0x7FF, but again, we might not have either _actual_ exponent 0x7FF, and
+ // also, an exponent 0x7FF might be an infinity instead of a NaN.
+ //
+ // On this path we've already branched twice (once for negative numbers and
+ // once for the first-tier NaN check), so we'll just go straight to the
+ // precise check for NaNs.
+ //
+ // Like the NaNInf_check_positive case, we do each NaN check by making a
+ // word consisting of (high word << 1) OR (1 if low word is nonzero). But
+ // unlike the positive case, we can't make those words _in place_,
+ // overwriting op0h and op1h themselves, because that would shift the sign
+ // bits off the top, and we still need the sign bits to get the comparison
+ // right. (In the positive case, we knew both sign bits were 0, enabling a
+ // shortcut.)
+ lsls r6, r5, #1 // set r6 = 1<<21
+ movs r4, op0h // copy op0h into a scratch register to modify
+ cmp op0l, #1 // set C if op0l is nonzero
+ adcs r4, r4, r4 // shift left, bringing in the C bit
+ cmn r4, r6 // if HI, then op0 is a NaN
+ bhi LOCAL_LABEL(NaN)
+ movs r4, op1h // copy op1h into a scratch register to modify
+ cmp op1l, #1 // set C if op1l is nonzero
+ adcs r4, r4, r4 // shift left, bringing in the C bit
+ cmn r4, r6 // if HI, then op1 is a NaN
+ bhi LOCAL_LABEL(NaN)
+
+ // Now we've ruled out NaNs, so we can just compare the two input registers
+ // and return. On this path we _don't_ need to check for the special case of
+ // comparing two zeroes, because we only came here if the bitwise OR of the
+ // exponent fields was 0x7FF, which means the exponents can't both have been
+ // zero! So we can _just_ do the reversed CMP and finish.
+ cmp op1h, op0h
+ ReturnResult
diff --git a/compiler-rt/lib/builtins/arm/thumb1/gedf2.S b/compiler-rt/lib/builtins/arm/thumb1/gedf2.S
new file mode 100644
index 0000000000000..3673f24b5a160
--- /dev/null
+++ b/compiler-rt/lib/builtins/arm/thumb1/gedf2.S
@@ -0,0 +1,60 @@
+//===-- gedf2.S - double-precision floating point comparison --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This function has the semantics of GNU __cmpdf2, except for its NaN
+// handling. It's a three-way compare which returns <0 if x<y, 0 if x==y, and
+// >0 if x>y. If the result is unordered (i.e. x or y or both is NaN) then it
+// returns <0, where __cmpdf2 would return >0.
+//
+// This also makes it suitable for use as __gtdf2 or __gedf2 (or __eqdf2 or
+// __nedf2).
+// +//===----------------------------------------------------------------------===// + +#include "../../assembly.h" +#include "../crt_endian.h" + + .syntax unified + .text + .p2align 2 + +op0h .req xh +op0l .req xl +op1h .req yh +op1l .req yl +.macro ReturnResult + bhi 0f + blo 1f + movs r0, #0 + // This macro is always called immediately before returning from the + // function, so it's safe to use the same return instruction here, instead of + // wasting time branching forward to the end of the macro. + pop {r4,r5,r6,pc} +0: + movs r0, #1 + pop {r4,r5,r6,pc} +1: + movs r0, #1 + rsbs r0, r0, #0 + pop {r4,r5,r6,pc} +.endm + +DEFINE_COMPILERRT_FUNCTION_ALIAS(__gedf2, __compiler_rt_softfp_gedf2) +DEFINE_COMPILERRT_FUNCTION_ALIAS(__gtdf2, __gedf2) + +DEFINE_COMPILERRT_THUMB_FUNCTION(__compiler_rt_softfp_gedf2) + #include "dcmp.h" + +LOCAL_LABEL(NaN): + movs r0, #1 + rsbs r0, r0, #0 + pop {r4,r5,r6,pc} + +END_COMPILERRT_FUNCTION(__compiler_rt_softfp_gedf2) + +NO_EXEC_STACK_DIRECTIVE diff --git a/compiler-rt/lib/builtins/arm/thumb1/unorddf2.S b/compiler-rt/lib/builtins/arm/thumb1/unorddf2.S new file mode 100644 index 0000000000000..d3f4a1d4f27e9 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/thumb1/unorddf2.S @@ -0,0 +1,60 @@ +//===-- unorddf2.S - double-precision floating point comparison -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Return 1 if the result of comparing x with y is 'unordered', i.e. +// one of x and y is NaN. 
+// +//===----------------------------------------------------------------------===// + +#include "../../assembly.h" +#include "../crt_endian.h" + + .syntax unified + .text + .p2align 2 + +DEFINE_COMPILERRT_FUNCTION_ALIAS(__unorddf2, __aeabi_dcmpun) + +DEFINE_COMPILERRT_THUMB_FUNCTION(__aeabi_dcmpun) + + // This function isn't based on the general-purpose code in dcmp.h, because + // it's more effort than needed. Here we just need to identify whether or not + // there's at least one NaN in the inputs. There's no need to vary that check + // based on the sign bit, so we might as well just do the NaN test as quickly + // as possible. + // + // We do this by shifting the high word of each input left to get rid of the + // sign bit, shifting a bit in at the bottom which is 1 if any bit is set in + // the low word. Then we check if the result is _greater_ than 0xFFE00000 + // (but not equal), via adding 0x00200000 to it and testing for the HI + // condition (carry flag set, but Z clear). + // + // Once we've done that transformation to the first input xh:xl, we + // free up xl to contain our constant 0x00200000, so there's no need + // to push any registers. 
+ cmp xl, #1 // set C if xl is nonzero + adcs xh, xh, xh // shift xh left, bringing in the C bit + movs xl, #1 // now xl is free, make the test constant + lsls xl, xl, #21 // by shifting 1 left to make 0x00200000 + cmn xh, xl // HI if x is a NaN + bhi LOCAL_LABEL(NaN) + cmp yl, #1 // set C if yl is nonzero + adcs yh, yh, yh // shift yh left, bringing in the C bit + cmn yh, xl // HI if y is a NaN + bhi LOCAL_LABEL(NaN) + + movs r0, #0 + bx lr + +LOCAL_LABEL(NaN): + movs r0, #1 + bx lr + +END_COMPILERRT_FUNCTION(__aeabi_dcmpun) + +NO_EXEC_STACK_DIRECTIVE diff --git a/compiler-rt/lib/builtins/arm/unorddf2.S b/compiler-rt/lib/builtins/arm/unorddf2.S new file mode 100644 index 0000000000000..8816d4073b4f3 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/unorddf2.S @@ -0,0 +1,71 @@ +//===-- unorddf2.S - double-precision floating point comparison -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Return 1 if the result of comparing x with y is 'unordered', i.e. +// one of x and y is NaN. +// +//===----------------------------------------------------------------------===// + +#include "../assembly.h" +#include "crt_endian.h" + + + .syntax unified + .text + .p2align 2 + + +#if __ARM_PCS_VFP +DEFINE_COMPILERRT_FUNCTION(__unorddf2) + push {r4, lr} + VMOV_FROM_DOUBLE(r0, r1, d0) + VMOV_FROM_DOUBLE(r2, r3, d1) + bl __aeabi_dcmpun + pop {r4, pc} +#else +DEFINE_COMPILERRT_FUNCTION_ALIAS(__unorddf2, __aeabi_dcmpun) +#endif + +DEFINE_COMPILERRT_FUNCTION(__aeabi_dcmpun) + + // This function isn't based on the general-purpose code in dcmp.h, because + // it's more effort than needed. Here we just need to identify whether or not + // there's at least one NaN in the inputs. 
There's no need to vary that check + // based on the sign bit, so we might as well just do the NaN test as quickly + // as possible. + // + // We do this by shifting the high word of each input left to get rid of the + // sign bit, shifting a bit in at the bottom which is 1 if any bit is set in + // the low word. Then we check if the result is _greater_ than 0xFFE00000 + // (but not equal), via adding 0x00200000 to it and testing for the HI + // condition (carry flag set, but Z clear). + // + // Once we've done that transformation to the first input xh:xl, we + // free up xl to contain our constant 0x00200000, so there's no need + // to push any registers. + cmp xl, #1 // set C if xl is nonzero + adc xh, xh, xh // shift xh left, bringing in the C bit + cmp yl, #1 // set C if yl is nonzero + adc yh, yh, yh // shift yh left, bringing in the C bit + cmn xh, #1 << 21 // if HI, then x is a NaN + cmnls yh, #1 << 21 // if not HI, then do the same check for y + + // If LS, then we have no NaNs and return false. We do this as quickly as we + // can (not stopping to take two instructions setting up r0 for both + // possibilities), on the assumption that NaNs are rare and we want to + // optimize for the non-NaN path. + movls r0, #0 + bxls lr + + // Otherwise, we have at least one NaN, and return true. + mov r0, #1 + bx lr + +END_COMPILERRT_FUNCTION(__aeabi_dcmpun) + +NO_EXEC_STACK_DIRECTIVE diff --git a/compiler-rt/test/builtins/Unit/comparedf2new_test.c b/compiler-rt/test/builtins/Unit/comparedf2new_test.c new file mode 100644 index 0000000000000..d337cb3db4b0f --- /dev/null +++ b/compiler-rt/test/builtins/Unit/comparedf2new_test.c @@ -0,0 +1,619 @@ +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// RUN: %clang_builtins %s %librt -o %t && %run %t
+// REQUIRES: librt_has_comparedf2
+
+#include "int_lib.h"
+#include <inttypes.h>
+#include <stdio.h>
+
+#include "fp_test.h"
+
+COMPILER_RT_ABI int __eqdf2(double, double);
+COMPILER_RT_ABI int __nedf2(double, double);
+COMPILER_RT_ABI int __gedf2(double, double);
+COMPILER_RT_ABI int __gtdf2(double, double);
+COMPILER_RT_ABI int __ledf2(double, double);
+COMPILER_RT_ABI int __ltdf2(double, double);
+COMPILER_RT_ABI int __cmpdf2(double, double);
+COMPILER_RT_ABI int __unorddf2(double, double);
+
+enum Result { RESULT_LT, RESULT_GT, RESULT_EQ, RESULT_UN };
+
+int expect(uint64_t a_rep, uint64_t b_rep, const char *name, int result, int ok,
+           const char *expected, int line) {
+  if (!ok)
+    printf("error at line %d: %s(%016" PRIx64 ", %016" PRIx64
+           ") = %d, expected %s\n",
+           line, name, a_rep, b_rep, result, expected);
+  return !ok;
+}
+
+int test__comparedf2(uint64_t a_rep, uint64_t b_rep, enum Result result,
+                     int line) {
+  double a = fromRep64(a_rep), b = fromRep64(b_rep);
+
+  int eq = __eqdf2(a, b);
+  int ne = __nedf2(a, b);
+  int ge = __gedf2(a, b);
+  int gt = __gtdf2(a, b);
+  int le = __ledf2(a, b);
+  int lt = __ltdf2(a, b);
+#ifdef __ELF__
+  // The generic builtins/comparedf2.c does not define this function
+  // for object formats other than ELF
+  int cmp = __cmpdf2(a, b);
+#endif
+  int unord = __unorddf2(a, b);
+
+  int ret = 0;
+
+  switch (result) {
+  case RESULT_LT:
+    ret |= expect(a_rep, b_rep, "__eqdf2", eq, eq != 0, "!= 0", line);
+    ret |= expect(a_rep, b_rep, "__nedf2", ne, ne != 0, "!= 0", line);
+    ret |= expect(a_rep, b_rep, "__gedf2", ge, ge < 0, "< 0", line);
+    ret |= expect(a_rep, b_rep, "__gtdf2", gt, gt <= 0, "<= 0", line);
+    ret |= expect(a_rep, b_rep, "__ledf2", le, le <= 0, "<= 0", line);
+    ret |= expect(a_rep, b_rep, "__ltdf2", lt, lt < 0, "< 0", line);
+#ifdef __ELF__
+    ret |= expect(a_rep, b_rep, "__cmpdf2", cmp, cmp == -1, "== -1", line);
+#endif
+    ret
|= expect(a_rep, b_rep, "__unorddf2", unord, unord == 0, "== 0", line); + break; + case RESULT_GT: + ret |= expect(a_rep, b_rep, "__eqdf2", eq, eq != 0, "!= 0", line); + ret |= expect(a_rep, b_rep, "__nedf2", ne, ne != 0, "!= 0", line); + ret |= expect(a_rep, b_rep, "__gedf2", ge, ge >= 0, ">= 0", line); + ret |= expect(a_rep, b_rep, "__gtdf2", gt, gt > 0, "> 0", line); + ret |= expect(a_rep, b_rep, "__ledf2", le, le > 0, "> 0", line); + ret |= expect(a_rep, b_rep, "__ltdf2", lt, lt >= 0, ">= 0", line); +#ifdef __ELF__ + ret |= expect(a_rep, b_rep, "__cmpdf2", cmp, cmp == 1, "== 1", line); +#endif + ret |= expect(a_rep, b_rep, "__unorddf2", unord, unord == 0, "== 0", line); + break; + case RESULT_EQ: + ret |= expect(a_rep, b_rep, "__eqdf2", eq, eq == 0, "== 0", line); + ret |= expect(a_rep, b_rep, "__nedf2", ne, ne == 0, "== 0", line); + ret |= expect(a_rep, b_rep, "__gedf2", ge, ge >= 0, ">= 0", line); + ret |= expect(a_rep, b_rep, "__gtdf2", gt, gt <= 0, "<= 0", line); + ret |= expect(a_rep, b_rep, "__ledf2", le, le <= 0, "<= 0", line); + ret |= expect(a_rep, b_rep, "__ltdf2", lt, lt >= 0, ">= 0", line); +#ifdef __ELF__ + ret |= expect(a_rep, b_rep, "__cmpdf2", cmp, cmp == 0, "== 0", line); +#endif + ret |= expect(a_rep, b_rep, "__unorddf2", unord, unord == 0, "== 0", line); + break; + case RESULT_UN: + ret |= expect(a_rep, b_rep, "__eqdf2", eq, eq != 0, "!= 0", line); + ret |= expect(a_rep, b_rep, "__nedf2", ne, ne != 0, "!= 0", line); + ret |= expect(a_rep, b_rep, "__gedf2", ge, ge < 0, "< 0", line); + ret |= expect(a_rep, b_rep, "__gtdf2", gt, gt <= 0, "<= 0", line); + ret |= expect(a_rep, b_rep, "__ledf2", le, le > 0, "> 0", line); + ret |= expect(a_rep, b_rep, "__ltdf2", lt, lt >= 0, ">= 0", line); +#ifdef __ELF__ + ret |= expect(a_rep, b_rep, "__cmpdf2", cmp, cmp == 1, "== 1", line); +#endif + ret |= expect(a_rep, b_rep, "__unorddf2", unord, unord == 1, "== 1", line); + break; + } + + return ret; +} + +#define test__comparedf2(a, b, x) test__comparedf2(a, 
b, x, __LINE__) + +int main(void) { + int status = 0; + + status |= test__comparedf2(0x0000000000000000, 0x0000000000000001, RESULT_LT); + status |= test__comparedf2(0x0000000000000000, 0x000fffffffffffff, RESULT_LT); + status |= test__comparedf2(0x0000000000000000, 0x3ff0000000000000, RESULT_LT); + status |= test__comparedf2(0x0000000000000000, 0x7fe0000000000000, RESULT_LT); + status |= test__comparedf2(0x0000000000000000, 0x7ff0000000000000, RESULT_LT); + status |= test__comparedf2(0x0000000000000000, 0x7ff00000a5a42e09, RESULT_UN); + status |= test__comparedf2(0x0000000000000000, 0x7ffcd5b95f9b89ae, RESULT_UN); + status |= test__comparedf2(0x0000000000000000, 0x7ffcd5b95f9b89ae, RESULT_UN); + status |= test__comparedf2(0x0000000000000000, 0x8000000000000000, RESULT_EQ); + status |= test__comparedf2(0x0000000000000000, 0x8000000000000001, RESULT_GT); + status |= test__comparedf2(0x0000000000000000, 0x800fffffffffffff, RESULT_GT); + status |= test__comparedf2(0x0000000000000000, 0x8010000000000000, RESULT_GT); + status |= test__comparedf2(0x0000000000000000, 0xfff0000000000000, RESULT_GT); + status |= test__comparedf2(0x0000000000000000, 0xfff00000a5a42e09, RESULT_UN); + status |= test__comparedf2(0x0000000000000000, 0xfffcd5b95f9b89ae, RESULT_UN); + status |= test__comparedf2(0x0000000000000000, 0xfffcd5b95f9b89ae, RESULT_UN); + status |= test__comparedf2(0x0000000000000001, 0x0000000000000001, RESULT_EQ); + status |= test__comparedf2(0x0000000000000001, 0x3fefffffffffffff, RESULT_LT); + status |= test__comparedf2(0x0000000000000001, 0x3ff0000000000000, RESULT_LT); + status |= test__comparedf2(0x0000000000000001, 0x3ffffffffffffffe, RESULT_LT); + status |= test__comparedf2(0x0000000000000001, 0x3fffffffffffffff, RESULT_LT); + status |= test__comparedf2(0x0000000000000001, 0x7fdfffffffffffff, RESULT_LT); + status |= test__comparedf2(0x0000000000000001, 0x7fe0000000000000, RESULT_LT); + status |= test__comparedf2(0x0000000000000001, 0x7feffffffffffffe, RESULT_LT); 
+ status |= test__comparedf2(0x0000000000000001, 0x7fefffffffffffff, RESULT_LT); + status |= test__comparedf2(0x0000000000000001, 0x7ff00000887bcf03, RESULT_UN); + status |= test__comparedf2(0x0000000000000001, 0x7ff753b1887bcf03, RESULT_UN); + status |= test__comparedf2(0x0000000000000001, 0x7ffc3134b058fe20, RESULT_UN); + status |= test__comparedf2(0x0000000000000001, 0x8000000000000001, RESULT_GT); + status |= test__comparedf2(0x0000000000000001, 0xbfefffffffffffff, RESULT_GT); + status |= test__comparedf2(0x0000000000000001, 0xbff0000000000000, RESULT_GT); + status |= test__comparedf2(0x0000000000000001, 0xbffffffffffffffe, RESULT_GT); + status |= test__comparedf2(0x0000000000000001, 0xbfffffffffffffff, RESULT_GT); + status |= test__comparedf2(0x0000000000000001, 0xffdfffffffffffff, RESULT_GT); + status |= test__comparedf2(0x0000000000000001, 0xffe0000000000000, RESULT_GT); + status |= test__comparedf2(0x0000000000000001, 0xffeffffffffffffe, RESULT_GT); + status |= test__comparedf2(0x0000000000000001, 0xffefffffffffffff, RESULT_GT); + status |= test__comparedf2(0x0000000000000001, 0xfff00000887bcf03, RESULT_UN); + status |= test__comparedf2(0x0000000000000001, 0xfff753b1887bcf03, RESULT_UN); + status |= test__comparedf2(0x0000000000000001, 0xfffc3134b058fe20, RESULT_UN); + status |= test__comparedf2(0x0000000000000002, 0x0000000000000001, RESULT_GT); + status |= test__comparedf2(0x0000000000000003, 0x0000000000000002, RESULT_GT); + status |= test__comparedf2(0x0000000000000003, 0x4008000000000000, RESULT_LT); + status |= test__comparedf2(0x0000000000000003, 0x4014000000000000, RESULT_LT); + status |= test__comparedf2(0x0000000000000003, 0x7fe0000000000000, RESULT_LT); + status |= test__comparedf2(0x0000000000000003, 0xc014000000000000, RESULT_GT); + status |= test__comparedf2(0x0000000000000003, 0xffe0000000000000, RESULT_GT); + status |= test__comparedf2(0x0000000000000004, 0x0000000000000004, RESULT_EQ); + status |= test__comparedf2(0x000ffffffffffffc, 
0x800ffffffffffffc, RESULT_GT); + status |= test__comparedf2(0x000ffffffffffffd, 0x000ffffffffffffe, RESULT_LT); + status |= test__comparedf2(0x000fffffffffffff, 0x0000000000000000, RESULT_GT); + status |= test__comparedf2(0x000fffffffffffff, 0x000ffffffffffffe, RESULT_GT); + status |= test__comparedf2(0x000fffffffffffff, 0x000fffffffffffff, RESULT_EQ); + status |= test__comparedf2(0x000fffffffffffff, 0x0010000000000000, RESULT_LT); + status |= test__comparedf2(0x000fffffffffffff, 0x7ff0000000000000, RESULT_LT); + status |= test__comparedf2(0x000fffffffffffff, 0x7ff00000dfe15ee3, RESULT_UN); + status |= test__comparedf2(0x000fffffffffffff, 0x7ff6d1ebdfe15ee3, RESULT_UN); + status |= test__comparedf2(0x000fffffffffffff, 0x7ffed0664505a878, RESULT_UN); + status |= test__comparedf2(0x000fffffffffffff, 0x8000000000000000, RESULT_GT); + status |= test__comparedf2(0x000fffffffffffff, 0xfff0000000000000, RESULT_GT); + status |= test__comparedf2(0x000fffffffffffff, 0xfff00000dfe15ee3, RESULT_UN); + status |= test__comparedf2(0x000fffffffffffff, 0xfff6d1ebdfe15ee3, RESULT_UN); + status |= test__comparedf2(0x000fffffffffffff, 0xfffed0664505a878, RESULT_UN); + status |= test__comparedf2(0x0010000000000000, 0x0000000000000000, RESULT_GT); + status |= test__comparedf2(0x0010000000000000, 0x0010000000000000, RESULT_EQ); + status |= test__comparedf2(0x0010000000000000, 0x8010000000000000, RESULT_GT); + status |= test__comparedf2(0x0010000000000001, 0x0010000000000000, RESULT_GT); + status |= test__comparedf2(0x0010000000000001, 0x0010000000000002, RESULT_LT); + status |= test__comparedf2(0x001fffffffffffff, 0x0020000000000000, RESULT_LT); + status |= test__comparedf2(0x001fffffffffffff, 0x0020000000000002, RESULT_LT); + status |= test__comparedf2(0x001fffffffffffff, 0x0020000000000004, RESULT_LT); + status |= test__comparedf2(0x0020000000000000, 0x001fffffffffffff, RESULT_GT); + status |= test__comparedf2(0x0020000000000001, 0x0010000000000001, RESULT_GT); + status |= 
test__comparedf2(0x0020000000000001, 0x001fffffffffffff, RESULT_GT); + status |= test__comparedf2(0x0020000000000002, 0x0010000000000001, RESULT_GT); + status |= test__comparedf2(0x002fffffffffffff, 0x0030000000000000, RESULT_LT); + status |= test__comparedf2(0x0030000000000000, 0x002fffffffffffff, RESULT_GT); + status |= test__comparedf2(0x0030000000000001, 0x002fffffffffffff, RESULT_GT); + status |= test__comparedf2(0x0030000000000002, 0x0020000000000003, RESULT_GT); + status |= test__comparedf2(0x3fe0000000000000, 0x3fe0000000000000, RESULT_EQ); + status |= test__comparedf2(0x3fefffffffffffff, 0x0000000000000001, RESULT_GT); + status |= test__comparedf2(0x3fefffffffffffff, 0x8000000000000001, RESULT_GT); + status |= test__comparedf2(0x3ff0000000000000, 0x3ff0000000000000, RESULT_EQ); + status |= test__comparedf2(0x3ff0000000000000, 0x3ff0000000000003, RESULT_LT); + status |= test__comparedf2(0x3ff0000000000000, 0x4000000000000000, RESULT_LT); + status |= test__comparedf2(0x3ff0000000000000, 0x401c000000000000, RESULT_LT); + status |= test__comparedf2(0x3ff0000000000000, 0x7ff0000033022725, RESULT_UN); + status |= test__comparedf2(0x3ff0000000000000, 0x7ff4f5ad33022725, RESULT_UN); + status |= test__comparedf2(0x3ff0000000000000, 0x7ffd3870667efc9d, RESULT_UN); + status |= test__comparedf2(0x3ff0000000000000, 0x8000000000000000, RESULT_GT); + status |= test__comparedf2(0x3ff0000000000000, 0xbff0000000000000, RESULT_GT); + status |= test__comparedf2(0x3ff0000000000000, 0xbff0000000000003, RESULT_GT); + status |= test__comparedf2(0x3ff0000000000000, 0xfff0000033022725, RESULT_UN); + status |= test__comparedf2(0x3ff0000000000000, 0xfff4f5ad33022725, RESULT_UN); + status |= test__comparedf2(0x3ff0000000000000, 0xfffd3870667efc9d, RESULT_UN); + status |= test__comparedf2(0x3ff0000000000001, 0x3ff0000000000000, RESULT_GT); + status |= test__comparedf2(0x3ff0000000000001, 0x3ff0000000000002, RESULT_LT); + status |= test__comparedf2(0x3ff0000000000001, 
0xbff0000000000000, RESULT_GT); + status |= test__comparedf2(0x3ffffffffffffffc, 0x3ffffffffffffffd, RESULT_LT); + status |= test__comparedf2(0x3fffffffffffffff, 0x0000000000000001, RESULT_GT); + status |= test__comparedf2(0x3fffffffffffffff, 0x4000000000000000, RESULT_LT); + status |= test__comparedf2(0x4000000000000000, 0x3ff0000000000000, RESULT_GT); + status |= test__comparedf2(0x4000000000000000, 0x3fffffffffffffff, RESULT_GT); + status |= test__comparedf2(0x4000000000000000, 0x4000000000000000, RESULT_EQ); + status |= test__comparedf2(0x4000000000000000, 0x4000000000000001, RESULT_LT); + status |= test__comparedf2(0x4000000000000000, 0xc000000000000000, RESULT_GT); + status |= test__comparedf2(0x4000000000000000, 0xc000000000000001, RESULT_GT); + status |= test__comparedf2(0x4000000000000000, 0xc014000000000000, RESULT_GT); + status |= test__comparedf2(0x4000000000000001, 0x3ff0000000000001, RESULT_GT); + status |= test__comparedf2(0x4000000000000001, 0x4000000000000002, RESULT_LT); + status |= test__comparedf2(0x4000000000000001, 0xc000000000000002, RESULT_GT); + status |= test__comparedf2(0x4000000000000002, 0x3ff0000000000001, RESULT_GT); + status |= test__comparedf2(0x4000000000000002, 0x3ff0000000000003, RESULT_GT); + status |= test__comparedf2(0x4000000000000004, 0x4000000000000003, RESULT_GT); + status |= test__comparedf2(0x4008000000000000, 0x4008000000000000, RESULT_EQ); + status |= test__comparedf2(0x400fffffffffffff, 0x400ffffffffffffe, RESULT_GT); + status |= test__comparedf2(0x400fffffffffffff, 0x4010000000000002, RESULT_LT); + status |= test__comparedf2(0x4010000000000001, 0x400fffffffffffff, RESULT_GT); + status |= test__comparedf2(0x4014000000000000, 0x0000000000000000, RESULT_GT); + status |= test__comparedf2(0x4014000000000000, 0x8000000000000000, RESULT_GT); + status |= test__comparedf2(0x4014000000000000, 0xbff0000000000000, RESULT_GT); + status |= test__comparedf2(0x4014000000000000, 0xc014000000000000, RESULT_GT); + status |= 
test__comparedf2(0x7fb0000000000001, 0x7fafffffffffffff, RESULT_GT); + status |= test__comparedf2(0x7fcfffffffffffff, 0x7fcffffffffffffe, RESULT_GT); + status |= test__comparedf2(0x7fcfffffffffffff, 0x7fd0000000000002, RESULT_LT); + status |= test__comparedf2(0x7fd0000000000000, 0x7fcfffffffffffff, RESULT_GT); + status |= test__comparedf2(0x7fd0000000000000, 0x7fd0000000000000, RESULT_EQ); + status |= test__comparedf2(0x7fd0000000000000, 0x7fd0000000000001, RESULT_LT); + status |= test__comparedf2(0x7fd0000000000001, 0x7fd0000000000000, RESULT_GT); + status |= test__comparedf2(0x7fd0000000000001, 0x7fe0000000000001, RESULT_LT); + status |= test__comparedf2(0x7fd0000000000001, 0xffd0000000000000, RESULT_GT); + status |= test__comparedf2(0x7fd0000000000002, 0x7fc0000000000003, RESULT_GT); + status |= test__comparedf2(0x7fd0000000000004, 0x7fd0000000000003, RESULT_GT); + status |= test__comparedf2(0x7fdffffffffffffe, 0x7fdffffffffffffe, RESULT_EQ); + status |= test__comparedf2(0x7fdffffffffffffe, 0x7fdfffffffffffff, RESULT_LT); + status |= test__comparedf2(0x7fdffffffffffffe, 0xffdfffffffffffff, RESULT_GT); + status |= test__comparedf2(0x7fdfffffffffffff, 0x3ff0000000000000, RESULT_GT); + status |= test__comparedf2(0x7fdfffffffffffff, 0x7fe0000000000000, RESULT_LT); + status |= test__comparedf2(0x7fdfffffffffffff, 0xbff0000000000000, RESULT_GT); + status |= test__comparedf2(0x7fdfffffffffffff, 0xffe0000000000000, RESULT_GT); + status |= test__comparedf2(0x7fe0000000000000, 0x3ff0000000000000, RESULT_GT); + status |= test__comparedf2(0x7fe0000000000000, 0x7fe0000000000000, RESULT_EQ); + status |= test__comparedf2(0x7fe0000000000000, 0x7ff0000000000000, RESULT_LT); + status |= test__comparedf2(0x7fe0000000000000, 0xbff0000000000000, RESULT_GT); + status |= test__comparedf2(0x7fe0000000000000, 0xffe0000000000000, RESULT_GT); + status |= test__comparedf2(0x7fe0000000000000, 0xfff0000000000000, RESULT_GT); + status |= test__comparedf2(0x7fe0000000000001, 
0x7fe0000000000000, RESULT_GT); + status |= test__comparedf2(0x7fe0000000000001, 0x7fe0000000000002, RESULT_LT); + status |= test__comparedf2(0x7fe0000000000001, 0xffe0000000000000, RESULT_GT); + status |= test__comparedf2(0x7fe0000000000002, 0x7fd0000000000001, RESULT_GT); + status |= test__comparedf2(0x7feffffffffffffe, 0x3ff0000000000000, RESULT_GT); + status |= test__comparedf2(0x7feffffffffffffe, 0x7fefffffffffffff, RESULT_LT); + status |= test__comparedf2(0x7feffffffffffffe, 0xbff0000000000000, RESULT_GT); + status |= test__comparedf2(0x7feffffffffffffe, 0xffefffffffffffff, RESULT_GT); + status |= test__comparedf2(0x7fefffffffffffff, 0x0000000000000001, RESULT_GT); + status |= test__comparedf2(0x7fefffffffffffff, 0x3ff0000000000000, RESULT_GT); + status |= test__comparedf2(0x7fefffffffffffff, 0x7fefffffffffffff, RESULT_EQ); + status |= test__comparedf2(0x7fefffffffffffff, 0x7ff00000c901461b, RESULT_UN); + status |= test__comparedf2(0x7fefffffffffffff, 0x7ff784a9c901461b, RESULT_UN); + status |= test__comparedf2(0x7fefffffffffffff, 0x7ffe2c1db2e4a313, RESULT_UN); + status |= test__comparedf2(0x7fefffffffffffff, 0x8000000000000001, RESULT_GT); + status |= test__comparedf2(0x7fefffffffffffff, 0xbff0000000000000, RESULT_GT); + status |= test__comparedf2(0x7fefffffffffffff, 0xfff00000c901461b, RESULT_UN); + status |= test__comparedf2(0x7fefffffffffffff, 0xfff784a9c901461b, RESULT_UN); + status |= test__comparedf2(0x7fefffffffffffff, 0xfffe2c1db2e4a313, RESULT_UN); + status |= test__comparedf2(0x7ff0000000000000, 0x0000000000000000, RESULT_GT); + status |= test__comparedf2(0x7ff0000000000000, 0x0000000000000001, RESULT_GT); + status |= test__comparedf2(0x7ff0000000000000, 0x000fffffffffffff, RESULT_GT); + status |= test__comparedf2(0x7ff0000000000000, 0x7fe0000000000000, RESULT_GT); + status |= test__comparedf2(0x7ff0000000000000, 0x7fefffffffffffff, RESULT_GT); + status |= test__comparedf2(0x7ff0000000000000, 0x7ff0000000000000, RESULT_EQ); + status |= 
test__comparedf2(0x7ff0000000000000, 0x7ff0e6d059ac9171, RESULT_UN); + status |= test__comparedf2(0x7ff0000000000000, 0x7ffbda2fc9024ae6, RESULT_UN); + status |= test__comparedf2(0x7ff0000000000000, 0x8000000000000000, RESULT_GT); + status |= test__comparedf2(0x7ff0000000000000, 0x8000000000000001, RESULT_GT); + status |= test__comparedf2(0x7ff0000000000000, 0x800fffffffffffff, RESULT_GT); + status |= test__comparedf2(0x7ff0000000000000, 0xffe0000000000000, RESULT_GT); + status |= test__comparedf2(0x7ff0000000000000, 0xffefffffffffffff, RESULT_GT); + status |= test__comparedf2(0x7ff0000000000000, 0xfff0000000000000, RESULT_GT); + status |= test__comparedf2(0x7ff0000047e8b9a0, 0x0000000000000000, RESULT_UN); + status |= test__comparedf2(0x7ff4017647e8b9a0, 0x0000000000000000, RESULT_UN); + status |= test__comparedf2(0x7ff00000abfe5d29, 0x0000000000000001, RESULT_UN); + status |= test__comparedf2(0x7ff2a1cdabfe5d29, 0x0000000000000001, RESULT_UN); + status |= test__comparedf2(0x7ff000005155db76, 0x000fffffffffffff, RESULT_UN); + status |= test__comparedf2(0x7ff645cb5155db76, 0x000fffffffffffff, RESULT_UN); + status |= test__comparedf2(0x7ff0000070c46aa0, 0x3ff0000000000000, RESULT_UN); + status |= test__comparedf2(0x7ff2068470c46aa0, 0x3ff0000000000000, RESULT_UN); + status |= test__comparedf2(0x7ff00000b5aee637, 0x7fefffffffffffff, RESULT_UN); + status |= test__comparedf2(0x7ff72b19b5aee637, 0x7fefffffffffffff, RESULT_UN); + status |= test__comparedf2(0x7ff00000c08c2788, 0x7ff0000000000000, RESULT_UN); + status |= test__comparedf2(0x7ff1e0c1c08c2788, 0x7ff0000000000000, RESULT_UN); + status |= test__comparedf2(0x7ff00000ec581a54, 0x7ff0000021ebdfaf, RESULT_UN); + status |= test__comparedf2(0x7ff00000ec581a54, 0x7ff45d2221ebdfaf, RESULT_UN); + status |= test__comparedf2(0x7ff571eaec581a54, 0x7ff0000021ebdfaf, RESULT_UN); + status |= test__comparedf2(0x7ff571eaec581a54, 0x7ff45d2221ebdfaf, RESULT_UN); + status |= test__comparedf2(0x7ff000003a3a1f94, 
0x7ff00000229f3502, RESULT_UN); + status |= test__comparedf2(0x7ff000003a3a1f94, 0x7ffb8fa0229f3502, RESULT_UN); + status |= test__comparedf2(0x7ff6439e3a3a1f94, 0x7ff00000229f3502, RESULT_UN); + status |= test__comparedf2(0x7ff6439e3a3a1f94, 0x7ffb8fa0229f3502, RESULT_UN); + status |= test__comparedf2(0x7ff00000ec581a54, 0xfff0000021ebdfaf, RESULT_UN); + status |= test__comparedf2(0x7ff00000ec581a54, 0xfff45d2221ebdfaf, RESULT_UN); + status |= test__comparedf2(0x7ff571eaec581a54, 0xfff0000021ebdfaf, RESULT_UN); + status |= test__comparedf2(0x7ff571eaec581a54, 0xfff45d2221ebdfaf, RESULT_UN); + status |= test__comparedf2(0x7ff000003a3a1f94, 0xfff00000229f3502, RESULT_UN); + status |= test__comparedf2(0x7ff000003a3a1f94, 0xfffb8fa0229f3502, RESULT_UN); + status |= test__comparedf2(0x7ff6439e3a3a1f94, 0xfff00000229f3502, RESULT_UN); + status |= test__comparedf2(0x7ff6439e3a3a1f94, 0xfffb8fa0229f3502, RESULT_UN); + status |= test__comparedf2(0x7ff00000c31d528e, 0x8000000000000000, RESULT_UN); + status |= test__comparedf2(0x7ff5fb72c31d528e, 0x8000000000000000, RESULT_UN); + status |= test__comparedf2(0x7ff00000ac81d215, 0x8000000000000001, RESULT_UN); + status |= test__comparedf2(0x7ff4481aac81d215, 0x8000000000000001, RESULT_UN); + status |= test__comparedf2(0x7ff00000d12062fd, 0x800fffffffffffff, RESULT_UN); + status |= test__comparedf2(0x7ff707f6d12062fd, 0x800fffffffffffff, RESULT_UN); + status |= test__comparedf2(0x7ff000001c6481ef, 0xbff0000000000000, RESULT_UN); + status |= test__comparedf2(0x7ff66ee91c6481ef, 0xbff0000000000000, RESULT_UN); + status |= test__comparedf2(0x7ff00000985729a7, 0xffefffffffffffff, RESULT_UN); + status |= test__comparedf2(0x7ff19cff985729a7, 0xffefffffffffffff, RESULT_UN); + status |= test__comparedf2(0x7ff0000053ec80fe, 0xfff0000000000000, RESULT_UN); + status |= test__comparedf2(0x7ff7dbc153ec80fe, 0xfff0000000000000, RESULT_UN); + status |= test__comparedf2(0x7ff00000816fb493, 0x0000000000000000, RESULT_UN); + status |= 
test__comparedf2(0x7ff87f75816fb493, 0x0000000000000000, RESULT_UN); + status |= test__comparedf2(0x7ff000000c2d7c33, 0x0000000000000001, RESULT_UN); + status |= test__comparedf2(0x7ff91ecb0c2d7c33, 0x0000000000000001, RESULT_UN); + status |= test__comparedf2(0x7ff00000a68bae40, 0x000fffffffffffff, RESULT_UN); + status |= test__comparedf2(0x7ffc0acda68bae40, 0x000fffffffffffff, RESULT_UN); + status |= test__comparedf2(0x7ff000002fe14961, 0x3ff0000000000000, RESULT_UN); + status |= test__comparedf2(0x7ffcfa4e2fe14961, 0x3ff0000000000000, RESULT_UN); + status |= test__comparedf2(0x7ff000005c206da1, 0x7fefffffffffffff, RESULT_UN); + status |= test__comparedf2(0x7ff800bb5c206da1, 0x7fefffffffffffff, RESULT_UN); + status |= test__comparedf2(0x7ff0000051887a34, 0x7ff0000000000000, RESULT_UN); + status |= test__comparedf2(0x7ffce11951887a34, 0x7ff0000000000000, RESULT_UN); + status |= test__comparedf2(0x7ff000002b4c32a8, 0x7ff000001edb8786, RESULT_UN); + status |= test__comparedf2(0x7ff000002b4c32a8, 0x7ff342ea1edb8786, RESULT_UN); + status |= test__comparedf2(0x7ffbd6b52b4c32a8, 0x7ff000001edb8786, RESULT_UN); + status |= test__comparedf2(0x7ffbd6b52b4c32a8, 0x7ff342ea1edb8786, RESULT_UN); + status |= test__comparedf2(0x7ff00000bc88c2a9, 0x7ff000002fa062f4, RESULT_UN); + status |= test__comparedf2(0x7ff00000bc88c2a9, 0x7ffdc9ee2fa062f4, RESULT_UN); + status |= test__comparedf2(0x7ff8eaadbc88c2a9, 0x7ff000002fa062f4, RESULT_UN); + status |= test__comparedf2(0x7ff8eaadbc88c2a9, 0x7ffdc9ee2fa062f4, RESULT_UN); + status |= test__comparedf2(0x7ff000002b4c32a8, 0xfff000001edb8786, RESULT_UN); + status |= test__comparedf2(0x7ff000002b4c32a8, 0xfff342ea1edb8786, RESULT_UN); + status |= test__comparedf2(0x7ffbd6b52b4c32a8, 0xfff000001edb8786, RESULT_UN); + status |= test__comparedf2(0x7ffbd6b52b4c32a8, 0xfff342ea1edb8786, RESULT_UN); + status |= test__comparedf2(0x7ff00000bc88c2a9, 0xfff000002fa062f4, RESULT_UN); + status |= test__comparedf2(0x7ff00000bc88c2a9, 
0xfffdc9ee2fa062f4, RESULT_UN); + status |= test__comparedf2(0x7ff8eaadbc88c2a9, 0xfff000002fa062f4, RESULT_UN); + status |= test__comparedf2(0x7ff8eaadbc88c2a9, 0xfffdc9ee2fa062f4, RESULT_UN); + status |= test__comparedf2(0x7ff00000a47525ca, 0x8000000000000000, RESULT_UN); + status |= test__comparedf2(0x7ffcb028a47525ca, 0x8000000000000000, RESULT_UN); + status |= test__comparedf2(0x7ff0000097c1af12, 0x8000000000000001, RESULT_UN); + status |= test__comparedf2(0x7ffc541e97c1af12, 0x8000000000000001, RESULT_UN); + status |= test__comparedf2(0x7ff00000bb1c07a4, 0x800fffffffffffff, RESULT_UN); + status |= test__comparedf2(0x7ff966b7bb1c07a4, 0x800fffffffffffff, RESULT_UN); + status |= test__comparedf2(0x7ff000001d98f07c, 0xbff0000000000000, RESULT_UN); + status |= test__comparedf2(0x7ff9dbf61d98f07c, 0xbff0000000000000, RESULT_UN); + status |= test__comparedf2(0x7ff0000040e65504, 0xffefffffffffffff, RESULT_UN); + status |= test__comparedf2(0x7ffb2a7440e65504, 0xffefffffffffffff, RESULT_UN); + status |= test__comparedf2(0x7ff00000d9dc7412, 0xfff0000000000000, RESULT_UN); + status |= test__comparedf2(0x7ff8af62d9dc7412, 0xfff0000000000000, RESULT_UN); + status |= test__comparedf2(0x8000000000000000, 0x0000000000000000, RESULT_EQ); + status |= test__comparedf2(0x8000000000000000, 0x0000000000000001, RESULT_LT); + status |= test__comparedf2(0x8000000000000000, 0x000fffffffffffff, RESULT_LT); + status |= test__comparedf2(0x8000000000000000, 0x7fe0000000000000, RESULT_LT); + status |= test__comparedf2(0x8000000000000000, 0x7ff0000000000000, RESULT_LT); + status |= test__comparedf2(0x8000000000000000, 0x7ff000005a0faea3, RESULT_UN); + status |= test__comparedf2(0x8000000000000000, 0x7ff225cc5a0faea3, RESULT_UN); + status |= test__comparedf2(0x8000000000000000, 0x7ffa0cc436ad9daa, RESULT_UN); + status |= test__comparedf2(0x8000000000000000, 0x8000000000000001, RESULT_GT); + status |= test__comparedf2(0x8000000000000000, 0x800fffffffffffff, RESULT_GT); + status |= 
test__comparedf2(0x8000000000000000, 0x8010000000000000, RESULT_GT); + status |= test__comparedf2(0x8000000000000000, 0xbff0000000000000, RESULT_GT); + status |= test__comparedf2(0x8000000000000000, 0xfff0000000000000, RESULT_GT); + status |= test__comparedf2(0x8000000000000000, 0xfff000005a0faea3, RESULT_UN); + status |= test__comparedf2(0x8000000000000000, 0xfff225cc5a0faea3, RESULT_UN); + status |= test__comparedf2(0x8000000000000000, 0xfffa0cc436ad9daa, RESULT_UN); + status |= test__comparedf2(0x8000000000000001, 0x0000000000000001, RESULT_LT); + status |= test__comparedf2(0x8000000000000001, 0x3fefffffffffffff, RESULT_LT); + status |= test__comparedf2(0x8000000000000001, 0x3ff0000000000000, RESULT_LT); + status |= test__comparedf2(0x8000000000000001, 0x3ffffffffffffffe, RESULT_LT); + status |= test__comparedf2(0x8000000000000001, 0x3fffffffffffffff, RESULT_LT); + status |= test__comparedf2(0x8000000000000001, 0x7fdfffffffffffff, RESULT_LT); + status |= test__comparedf2(0x8000000000000001, 0x7fe0000000000000, RESULT_LT); + status |= test__comparedf2(0x8000000000000001, 0x7feffffffffffffe, RESULT_LT); + status |= test__comparedf2(0x8000000000000001, 0x7fefffffffffffff, RESULT_LT); + status |= test__comparedf2(0x8000000000000001, 0x7ff0000013fd5944, RESULT_UN); + status |= test__comparedf2(0x8000000000000001, 0x7ff4154313fd5944, RESULT_UN); + status |= test__comparedf2(0x8000000000000001, 0x7ffd397ba0f9b5e1, RESULT_UN); + status |= test__comparedf2(0x8000000000000001, 0x8000000000000001, RESULT_EQ); + status |= test__comparedf2(0x8000000000000001, 0xbfefffffffffffff, RESULT_GT); + status |= test__comparedf2(0x8000000000000001, 0xbff0000000000000, RESULT_GT); + status |= test__comparedf2(0x8000000000000001, 0xbffffffffffffffe, RESULT_GT); + status |= test__comparedf2(0x8000000000000001, 0xbfffffffffffffff, RESULT_GT); + status |= test__comparedf2(0x8000000000000001, 0xffdfffffffffffff, RESULT_GT); + status |= test__comparedf2(0x8000000000000001, 
0xffe0000000000000, RESULT_GT); + status |= test__comparedf2(0x8000000000000001, 0xffeffffffffffffe, RESULT_GT); + status |= test__comparedf2(0x8000000000000001, 0xffefffffffffffff, RESULT_GT); + status |= test__comparedf2(0x8000000000000001, 0xfff0000013fd5944, RESULT_UN); + status |= test__comparedf2(0x8000000000000001, 0xfff4154313fd5944, RESULT_UN); + status |= test__comparedf2(0x8000000000000001, 0xfffd397ba0f9b5e1, RESULT_UN); + status |= test__comparedf2(0x8000000000000002, 0x8000000000000001, RESULT_LT); + status |= test__comparedf2(0x8000000000000003, 0x4008000000000000, RESULT_LT); + status |= test__comparedf2(0x8000000000000003, 0x7fe0000000000000, RESULT_LT); + status |= test__comparedf2(0x8000000000000003, 0x8000000000000002, RESULT_LT); + status |= test__comparedf2(0x8000000000000003, 0xffe0000000000000, RESULT_GT); + status |= test__comparedf2(0x8000000000000004, 0x8000000000000004, RESULT_EQ); + status |= test__comparedf2(0x800ffffffffffffd, 0x800ffffffffffffe, RESULT_GT); + status |= test__comparedf2(0x800fffffffffffff, 0x0000000000000000, RESULT_LT); + status |= test__comparedf2(0x800fffffffffffff, 0x000fffffffffffff, RESULT_LT); + status |= test__comparedf2(0x800fffffffffffff, 0x7ff0000000000000, RESULT_LT); + status |= test__comparedf2(0x800fffffffffffff, 0x7ff00000a2b85efa, RESULT_UN); + status |= test__comparedf2(0x800fffffffffffff, 0x7ff1d4fba2b85efa, RESULT_UN); + status |= test__comparedf2(0x800fffffffffffff, 0x7ffd08c114a37fe6, RESULT_UN); + status |= test__comparedf2(0x800fffffffffffff, 0x8000000000000000, RESULT_LT); + status |= test__comparedf2(0x800fffffffffffff, 0x800ffffffffffffe, RESULT_LT); + status |= test__comparedf2(0x800fffffffffffff, 0x800fffffffffffff, RESULT_EQ); + status |= test__comparedf2(0x800fffffffffffff, 0x8010000000000000, RESULT_GT); + status |= test__comparedf2(0x800fffffffffffff, 0xfff0000000000000, RESULT_GT); + status |= test__comparedf2(0x800fffffffffffff, 0xfff00000a2b85efa, RESULT_UN); + status |= 
test__comparedf2(0x800fffffffffffff, 0xfff1d4fba2b85efa, RESULT_UN); + status |= test__comparedf2(0x800fffffffffffff, 0xfffd08c114a37fe6, RESULT_UN); + status |= test__comparedf2(0x8010000000000000, 0x0000000000000000, RESULT_LT); + status |= test__comparedf2(0x8010000000000000, 0x0010000000000000, RESULT_LT); + status |= test__comparedf2(0x8010000000000001, 0x8010000000000000, RESULT_LT); + status |= test__comparedf2(0x8010000000000001, 0x8010000000000002, RESULT_GT); + status |= test__comparedf2(0x801fffffffffffff, 0x8020000000000000, RESULT_GT); + status |= test__comparedf2(0x801fffffffffffff, 0x8020000000000002, RESULT_GT); + status |= test__comparedf2(0x801fffffffffffff, 0x8020000000000004, RESULT_GT); + status |= test__comparedf2(0x8020000000000000, 0x801fffffffffffff, RESULT_LT); + status |= test__comparedf2(0x8020000000000001, 0x8010000000000001, RESULT_LT); + status |= test__comparedf2(0x8020000000000001, 0x801fffffffffffff, RESULT_LT); + status |= test__comparedf2(0x8020000000000002, 0x8010000000000001, RESULT_LT); + status |= test__comparedf2(0x802fffffffffffff, 0x8030000000000000, RESULT_GT); + status |= test__comparedf2(0x8030000000000000, 0x802fffffffffffff, RESULT_LT); + status |= test__comparedf2(0x8030000000000001, 0x802fffffffffffff, RESULT_LT); + status |= test__comparedf2(0x8030000000000002, 0x8020000000000003, RESULT_LT); + status |= test__comparedf2(0xbff0000000000000, 0x3ff0000000000003, RESULT_LT); + status |= test__comparedf2(0xbff0000000000000, 0x7ff000000d32ab76, RESULT_UN); + status |= test__comparedf2(0xbff0000000000000, 0x7ff3d46c0d32ab76, RESULT_UN); + status |= test__comparedf2(0xbff0000000000000, 0x7ffb51e7ffa1e86b, RESULT_UN); + status |= test__comparedf2(0xbff0000000000000, 0x8000000000000000, RESULT_LT); + status |= test__comparedf2(0xbff0000000000000, 0xbff0000000000003, RESULT_GT); + status |= test__comparedf2(0xbff0000000000000, 0xfff000000d32ab76, RESULT_UN); + status |= test__comparedf2(0xbff0000000000000, 
0xfff3d46c0d32ab76, RESULT_UN); + status |= test__comparedf2(0xbff0000000000000, 0xfffb51e7ffa1e86b, RESULT_UN); + status |= test__comparedf2(0xbff0000000000001, 0x3ff0000000000000, RESULT_LT); + status |= test__comparedf2(0xbff0000000000001, 0xbff0000000000000, RESULT_LT); + status |= test__comparedf2(0xbff0000000000001, 0xbff0000000000002, RESULT_GT); + status |= test__comparedf2(0xbffffffffffffffc, 0xbffffffffffffffd, RESULT_GT); + status |= test__comparedf2(0xbfffffffffffffff, 0x0000000000000001, RESULT_LT); + status |= test__comparedf2(0xbfffffffffffffff, 0xc000000000000000, RESULT_GT); + status |= test__comparedf2(0xc000000000000000, 0x4000000000000001, RESULT_LT); + status |= test__comparedf2(0xc000000000000000, 0xbfffffffffffffff, RESULT_LT); + status |= test__comparedf2(0xc000000000000000, 0xc000000000000001, RESULT_GT); + status |= test__comparedf2(0xc000000000000001, 0x4000000000000002, RESULT_LT); + status |= test__comparedf2(0xc000000000000001, 0xbff0000000000001, RESULT_LT); + status |= test__comparedf2(0xc000000000000001, 0xc000000000000002, RESULT_GT); + status |= test__comparedf2(0xc000000000000002, 0xbff0000000000001, RESULT_LT); + status |= test__comparedf2(0xc000000000000002, 0xbff0000000000003, RESULT_LT); + status |= test__comparedf2(0xc000000000000004, 0xc000000000000003, RESULT_LT); + status |= test__comparedf2(0xc008000000000000, 0x4008000000000000, RESULT_LT); + status |= test__comparedf2(0xc00fffffffffffff, 0xc00ffffffffffffe, RESULT_LT); + status |= test__comparedf2(0xc00fffffffffffff, 0xc010000000000002, RESULT_GT); + status |= test__comparedf2(0xc010000000000001, 0xc00fffffffffffff, RESULT_LT); + status |= test__comparedf2(0xffb0000000000001, 0xffafffffffffffff, RESULT_LT); + status |= test__comparedf2(0xffcfffffffffffff, 0xffcffffffffffffe, RESULT_LT); + status |= test__comparedf2(0xffcfffffffffffff, 0xffd0000000000002, RESULT_GT); + status |= test__comparedf2(0xffd0000000000000, 0xffcfffffffffffff, RESULT_LT); + status |= 
test__comparedf2(0xffd0000000000000, 0xffd0000000000001, RESULT_GT); + status |= test__comparedf2(0xffd0000000000001, 0x7fd0000000000000, RESULT_LT); + status |= test__comparedf2(0xffd0000000000001, 0xffd0000000000000, RESULT_LT); + status |= test__comparedf2(0xffd0000000000001, 0xffe0000000000001, RESULT_GT); + status |= test__comparedf2(0xffd0000000000002, 0xffc0000000000003, RESULT_LT); + status |= test__comparedf2(0xffd0000000000004, 0xffd0000000000003, RESULT_LT); + status |= test__comparedf2(0xffdffffffffffffe, 0x7fdffffffffffffe, RESULT_LT); + status |= test__comparedf2(0xffdffffffffffffe, 0x7fdfffffffffffff, RESULT_LT); + status |= test__comparedf2(0xffdffffffffffffe, 0xffdffffffffffffe, RESULT_EQ); + status |= test__comparedf2(0xffdffffffffffffe, 0xffdfffffffffffff, RESULT_GT); + status |= test__comparedf2(0xffdfffffffffffff, 0x3ff0000000000000, RESULT_LT); + status |= test__comparedf2(0xffdfffffffffffff, 0x7fe0000000000000, RESULT_LT); + status |= test__comparedf2(0xffdfffffffffffff, 0xbff0000000000000, RESULT_LT); + status |= test__comparedf2(0xffdfffffffffffff, 0xffe0000000000000, RESULT_GT); + status |= test__comparedf2(0xffe0000000000000, 0x0000000000000000, RESULT_LT); + status |= test__comparedf2(0xffe0000000000000, 0x3ff0000000000000, RESULT_LT); + status |= test__comparedf2(0xffe0000000000000, 0x7ff0000000000000, RESULT_LT); + status |= test__comparedf2(0xffe0000000000000, 0x8000000000000000, RESULT_LT); + status |= test__comparedf2(0xffe0000000000000, 0xbff0000000000000, RESULT_LT); + status |= test__comparedf2(0xffe0000000000000, 0xffe0000000000000, RESULT_EQ); + status |= test__comparedf2(0xffe0000000000000, 0xfff0000000000000, RESULT_GT); + status |= test__comparedf2(0xffe0000000000001, 0x7fe0000000000000, RESULT_LT); + status |= test__comparedf2(0xffe0000000000001, 0xffe0000000000000, RESULT_LT); + status |= test__comparedf2(0xffe0000000000001, 0xffe0000000000002, RESULT_GT); + status |= test__comparedf2(0xffe0000000000002, 
0xffd0000000000001, RESULT_LT); + status |= test__comparedf2(0xffeffffffffffffe, 0x3ff0000000000000, RESULT_LT); + status |= test__comparedf2(0xffeffffffffffffe, 0x7fefffffffffffff, RESULT_LT); + status |= test__comparedf2(0xffeffffffffffffe, 0xbff0000000000000, RESULT_LT); + status |= test__comparedf2(0xffeffffffffffffe, 0xffefffffffffffff, RESULT_GT); + status |= test__comparedf2(0xffefffffffffffff, 0x0000000000000001, RESULT_LT); + status |= test__comparedf2(0xffefffffffffffff, 0x3ff0000000000000, RESULT_LT); + status |= test__comparedf2(0xffefffffffffffff, 0x7ff000007d4a42a6, RESULT_UN); + status |= test__comparedf2(0xffefffffffffffff, 0x7ff7252c7d4a42a6, RESULT_UN); + status |= test__comparedf2(0xffefffffffffffff, 0x7ff980ec6115c6fb, RESULT_UN); + status |= test__comparedf2(0xffefffffffffffff, 0x8000000000000001, RESULT_LT); + status |= test__comparedf2(0xffefffffffffffff, 0xbff0000000000000, RESULT_LT); + status |= test__comparedf2(0xffefffffffffffff, 0xffefffffffffffff, RESULT_EQ); + status |= test__comparedf2(0xffefffffffffffff, 0xfff000007d4a42a6, RESULT_UN); + status |= test__comparedf2(0xffefffffffffffff, 0xfff7252c7d4a42a6, RESULT_UN); + status |= test__comparedf2(0xffefffffffffffff, 0xfff980ec6115c6fb, RESULT_UN); + status |= test__comparedf2(0xfff0000000000000, 0x0000000000000000, RESULT_LT); + status |= test__comparedf2(0xfff0000000000000, 0x0000000000000001, RESULT_LT); + status |= test__comparedf2(0xfff0000000000000, 0x000fffffffffffff, RESULT_LT); + status |= test__comparedf2(0xfff0000000000000, 0x7fe0000000000000, RESULT_LT); + status |= test__comparedf2(0xfff0000000000000, 0x7fefffffffffffff, RESULT_LT); + status |= test__comparedf2(0xfff0000000000000, 0x7ff0000000000000, RESULT_LT); + status |= test__comparedf2(0xfff0000000000000, 0x7ff00000578bbe24, RESULT_UN); + status |= test__comparedf2(0xfff0000000000000, 0x7ff63d54578bbe24, RESULT_UN); + status |= test__comparedf2(0xfff0000000000000, 0x7ffbc66614390083, RESULT_UN); + status |= 
test__comparedf2(0xfff0000000000000, 0x8000000000000000, RESULT_LT); + status |= test__comparedf2(0xfff0000000000000, 0x8000000000000001, RESULT_LT); + status |= test__comparedf2(0xfff0000000000000, 0x800fffffffffffff, RESULT_LT); + status |= test__comparedf2(0xfff0000000000000, 0xffe0000000000000, RESULT_LT); + status |= test__comparedf2(0xfff0000000000000, 0xffefffffffffffff, RESULT_LT); + status |= test__comparedf2(0xfff0000000000000, 0xfff0000000000000, RESULT_EQ); + status |= test__comparedf2(0xfff0000000000000, 0xfff00000578bbe24, RESULT_UN); + status |= test__comparedf2(0xfff0000000000000, 0xfff63d54578bbe24, RESULT_UN); + status |= test__comparedf2(0xfff0000000000000, 0xfffbc66614390083, RESULT_UN); + status |= test__comparedf2(0xfff0000047e8b9a0, 0x0000000000000000, RESULT_UN); + status |= test__comparedf2(0xfff4017647e8b9a0, 0x0000000000000000, RESULT_UN); + status |= test__comparedf2(0xfff00000abfe5d29, 0x0000000000000001, RESULT_UN); + status |= test__comparedf2(0xfff2a1cdabfe5d29, 0x0000000000000001, RESULT_UN); + status |= test__comparedf2(0xfff000005155db76, 0x000fffffffffffff, RESULT_UN); + status |= test__comparedf2(0xfff645cb5155db76, 0x000fffffffffffff, RESULT_UN); + status |= test__comparedf2(0xfff0000070c46aa0, 0x3ff0000000000000, RESULT_UN); + status |= test__comparedf2(0xfff2068470c46aa0, 0x3ff0000000000000, RESULT_UN); + status |= test__comparedf2(0xfff00000b5aee637, 0x7fefffffffffffff, RESULT_UN); + status |= test__comparedf2(0xfff72b19b5aee637, 0x7fefffffffffffff, RESULT_UN); + status |= test__comparedf2(0xfff00000c08c2788, 0x7ff0000000000000, RESULT_UN); + status |= test__comparedf2(0xfff1e0c1c08c2788, 0x7ff0000000000000, RESULT_UN); + status |= test__comparedf2(0xfff00000ec581a54, 0x7ff0000021ebdfaf, RESULT_UN); + status |= test__comparedf2(0xfff00000ec581a54, 0x7ff45d2221ebdfaf, RESULT_UN); + status |= test__comparedf2(0xfff571eaec581a54, 0x7ff0000021ebdfaf, RESULT_UN); + status |= test__comparedf2(0xfff571eaec581a54, 
0x7ff45d2221ebdfaf, RESULT_UN); + status |= test__comparedf2(0xfff000003a3a1f94, 0x7ff00000229f3502, RESULT_UN); + status |= test__comparedf2(0xfff000003a3a1f94, 0x7ffb8fa0229f3502, RESULT_UN); + status |= test__comparedf2(0xfff6439e3a3a1f94, 0x7ff00000229f3502, RESULT_UN); + status |= test__comparedf2(0xfff6439e3a3a1f94, 0x7ffb8fa0229f3502, RESULT_UN); + status |= test__comparedf2(0xfff00000ec581a54, 0xfff0000021ebdfaf, RESULT_UN); + status |= test__comparedf2(0xfff00000ec581a54, 0xfff45d2221ebdfaf, RESULT_UN); + status |= test__comparedf2(0xfff571eaec581a54, 0xfff0000021ebdfaf, RESULT_UN); + status |= test__comparedf2(0xfff571eaec581a54, 0xfff45d2221ebdfaf, RESULT_UN); + status |= test__comparedf2(0xfff000003a3a1f94, 0xfff00000229f3502, RESULT_UN); + status |= test__comparedf2(0xfff000003a3a1f94, 0xfffb8fa0229f3502, RESULT_UN); + status |= test__comparedf2(0xfff6439e3a3a1f94, 0xfff00000229f3502, RESULT_UN); + status |= test__comparedf2(0xfff6439e3a3a1f94, 0xfffb8fa0229f3502, RESULT_UN); + status |= test__comparedf2(0xfff00000c31d528e, 0x8000000000000000, RESULT_UN); + status |= test__comparedf2(0xfff5fb72c31d528e, 0x8000000000000000, RESULT_UN); + status |= test__comparedf2(0xfff00000ac81d215, 0x8000000000000001, RESULT_UN); + status |= test__comparedf2(0xfff4481aac81d215, 0x8000000000000001, RESULT_UN); + status |= test__comparedf2(0xfff00000d12062fd, 0x800fffffffffffff, RESULT_UN); + status |= test__comparedf2(0xfff707f6d12062fd, 0x800fffffffffffff, RESULT_UN); + status |= test__comparedf2(0xfff000001c6481ef, 0xbff0000000000000, RESULT_UN); + status |= test__comparedf2(0xfff66ee91c6481ef, 0xbff0000000000000, RESULT_UN); + status |= test__comparedf2(0xfff00000985729a7, 0xffefffffffffffff, RESULT_UN); + status |= test__comparedf2(0xfff19cff985729a7, 0xffefffffffffffff, RESULT_UN); + status |= test__comparedf2(0xfff0000053ec80fe, 0xfff0000000000000, RESULT_UN); + status |= test__comparedf2(0xfff7dbc153ec80fe, 0xfff0000000000000, RESULT_UN); + status |= 
test__comparedf2(0xfff00000816fb493, 0x0000000000000000, RESULT_UN); + status |= test__comparedf2(0xfff87f75816fb493, 0x0000000000000000, RESULT_UN); + status |= test__comparedf2(0xfff000000c2d7c33, 0x0000000000000001, RESULT_UN); + status |= test__comparedf2(0xfff91ecb0c2d7c33, 0x0000000000000001, RESULT_UN); + status |= test__comparedf2(0xfff00000a68bae40, 0x000fffffffffffff, RESULT_UN); + status |= test__comparedf2(0xfffc0acda68bae40, 0x000fffffffffffff, RESULT_UN); + status |= test__comparedf2(0xfff000002fe14961, 0x3ff0000000000000, RESULT_UN); + status |= test__comparedf2(0xfffcfa4e2fe14961, 0x3ff0000000000000, RESULT_UN); + status |= test__comparedf2(0xfff000005c206da1, 0x7fefffffffffffff, RESULT_UN); + status |= test__comparedf2(0xfff800bb5c206da1, 0x7fefffffffffffff, RESULT_UN); + status |= test__comparedf2(0xfff0000051887a34, 0x7ff0000000000000, RESULT_UN); + status |= test__comparedf2(0xfffce11951887a34, 0x7ff0000000000000, RESULT_UN); + status |= test__comparedf2(0xfff000002b4c32a8, 0x7ff000001edb8786, RESULT_UN); + status |= test__comparedf2(0xfff000002b4c32a8, 0x7ff342ea1edb8786, RESULT_UN); + status |= test__comparedf2(0xfffbd6b52b4c32a8, 0x7ff000001edb8786, RESULT_UN); + status |= test__comparedf2(0xfffbd6b52b4c32a8, 0x7ff342ea1edb8786, RESULT_UN); + status |= test__comparedf2(0xfff00000bc88c2a9, 0x7ff000002fa062f4, RESULT_UN); + status |= test__comparedf2(0xfff00000bc88c2a9, 0x7ffdc9ee2fa062f4, RESULT_UN); + status |= test__comparedf2(0xfff8eaadbc88c2a9, 0x7ff000002fa062f4, RESULT_UN); + status |= test__comparedf2(0xfff8eaadbc88c2a9, 0x7ffdc9ee2fa062f4, RESULT_UN); + status |= test__comparedf2(0xfff000002b4c32a8, 0xfff000001edb8786, RESULT_UN); + status |= test__comparedf2(0xfff000002b4c32a8, 0xfff342ea1edb8786, RESULT_UN); + status |= test__comparedf2(0xfffbd6b52b4c32a8, 0xfff000001edb8786, RESULT_UN); + status |= test__comparedf2(0xfffbd6b52b4c32a8, 0xfff342ea1edb8786, RESULT_UN); + status |= test__comparedf2(0xfff00000bc88c2a9, 
0xfff000002fa062f4, RESULT_UN); + status |= test__comparedf2(0xfff00000bc88c2a9, 0xfffdc9ee2fa062f4, RESULT_UN); + status |= test__comparedf2(0xfff8eaadbc88c2a9, 0xfff000002fa062f4, RESULT_UN); + status |= test__comparedf2(0xfff8eaadbc88c2a9, 0xfffdc9ee2fa062f4, RESULT_UN); + status |= test__comparedf2(0xfff00000a47525ca, 0x8000000000000000, RESULT_UN); + status |= test__comparedf2(0xfffcb028a47525ca, 0x8000000000000000, RESULT_UN); + status |= test__comparedf2(0xfff0000097c1af12, 0x8000000000000001, RESULT_UN); + status |= test__comparedf2(0xfffc541e97c1af12, 0x8000000000000001, RESULT_UN); + status |= test__comparedf2(0xfff00000bb1c07a4, 0x800fffffffffffff, RESULT_UN); + status |= test__comparedf2(0xfff966b7bb1c07a4, 0x800fffffffffffff, RESULT_UN); + status |= test__comparedf2(0xfff000001d98f07c, 0xbff0000000000000, RESULT_UN); + status |= test__comparedf2(0xfff9dbf61d98f07c, 0xbff0000000000000, RESULT_UN); + status |= test__comparedf2(0xfff0000040e65504, 0xffefffffffffffff, RESULT_UN); + status |= test__comparedf2(0xfffb2a7440e65504, 0xffefffffffffffff, RESULT_UN); + status |= test__comparedf2(0xfff00000d9dc7412, 0xfff0000000000000, RESULT_UN); + status |= test__comparedf2(0xfff8af62d9dc7412, 0xfff0000000000000, RESULT_UN); + + return status; +} From c0ed919396de9ea18c6a247a9ecf42c38af23a37 Mon Sep 17 00:00:00 2001 From: Carl Ritson Date: Thu, 14 May 2026 18:05:21 +0900 Subject: [PATCH 45/95] [AMDGPU][GCNPreRAOptimizations] Reduce BVH premature reuse (#197386) Add implicit uses to ds_bvh_stack instructions to avoid reuse of VGPRs allocated to bvh_intersect_ray results prior to ds_bvh_stack. This reduces likelihood of a premature s_wait_bvhcnt occuring due to partial reallocation of unused bvh_intersect_ray results registers. 
--- .../Target/AMDGPU/GCNPreRAOptimizations.cpp | 100 ++++-- .../AMDGPU/optimize-ds-bvh-stack-pre-ra.ll | 300 ++++++++++++++++++ 2 files changed, 377 insertions(+), 23 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/optimize-ds-bvh-stack-pre-ra.ll diff --git a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp index cd56887fd46a8..825634d7af65b 100644 --- a/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp +++ b/llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp @@ -53,6 +53,8 @@ class GCNPreRAOptimizationsImpl { LiveIntervals *LIS; bool processReg(Register Reg); + void hintTrue16Copy(const MachineInstr &MI); + bool optimizeBVHStack(MachineInstr &MI); public: GCNPreRAOptimizationsImpl(LiveIntervals *LS) : LIS(LS) {} @@ -238,6 +240,65 @@ GCNPreRAOptimizationsPass::run(MachineFunction &MF, return PreservedAnalyses::all(); } +void GCNPreRAOptimizationsImpl::hintTrue16Copy(const MachineInstr &MI) { + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + const TargetRegisterClass *DstRC = TRI->getRegClassForReg(*MRI, Dst); + bool IsDst16Bit = AMDGPU::VGPR_16RegClass.hasSubClassEq(DstRC); + if (Dst.isVirtual() && IsDst16Bit && Src.isPhysical() && + TRI->getRegClassForReg(*MRI, Src) == &AMDGPU::VGPR_32RegClass) + MRI->setRegAllocationHint(Dst, 0, TRI->getSubReg(Src, AMDGPU::lo16)); + if (Src.isVirtual() && MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass && + Dst.isPhysical() && DstRC == &AMDGPU::VGPR_32RegClass) + MRI->setRegAllocationHint(Src, 0, TRI->getSubReg(Dst, AMDGPU::lo16)); + if (!Dst.isVirtual() || !Src.isVirtual()) + return; + if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_32RegClass && + MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass) { + MRI->setRegAllocationHint(Dst, AMDGPURI::Size32, Src); + MRI->setRegAllocationHint(Src, AMDGPURI::Size16, Dst); + } + if (IsDst16Bit && MRI->getRegClass(Src) == &AMDGPU::VGPR_32RegClass) + MRI->setRegAllocationHint(Dst, AMDGPURI::Size16, 
Src); +} + +bool GCNPreRAOptimizationsImpl::optimizeBVHStack(MachineInstr &MI) { + SmallVector UseRegs; + + // Find BVH sources for this DS_BVH_STACK instruction. + auto CheckUse = [&](MachineOperand &Use) { + Register Reg = Use.getReg(); + for (const MachineInstr &Src : MRI->def_instructions(Reg)) { + if (!SIInstrInfo::isImage(Src)) + continue; + const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Src.getOpcode()); + const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo = + AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode); + if (!BaseInfo->BVH) + continue; + UseRegs.push_back(Reg); + break; + } + }; + CheckUse(*TII->getNamedOperand(MI, AMDGPU::OpName::data0)); + CheckUse(*TII->getNamedOperand(MI, AMDGPU::OpName::data1)); + + if (UseRegs.empty()) + return false; + + // Add implicit uses for entire BVH source registers. + // This avoids partial reallocation of register which could + // introduce a premature s_wait_bvhcnt. + for (Register Reg : UseRegs) { + MI.addOperand(MachineOperand::CreateReg(Reg, false, true)); + LIS->removeInterval(Reg); + LIS->createAndComputeVirtRegInterval(Reg); + } + LLVM_DEBUG(dbgs() << "Added implicit uses to: " << MI); + + return true; +} + bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget(); TII = ST.getInstrInfo(); @@ -258,34 +319,27 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) { Changed |= processReg(Reg); } - if (!ST.useRealTrue16Insts()) + const bool HasBVHStack = ST.hasBVHDualAndBVH8Insts(); + const bool HasRealTrue16 = ST.useRealTrue16Insts(); + + if (!HasRealTrue16 && !HasBVHStack) return Changed; - // Add RA hints to improve True16 COPY elimination. - for (const MachineBasicBlock &MBB : MF) { - for (const MachineInstr &MI : MBB) { - if (MI.getOpcode() != AMDGPU::COPY) + for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + // Add RA hints to improve True16 COPY elimination. 
+ if (HasRealTrue16 && MI.getOpcode() == AMDGPU::COPY) { + hintTrue16Copy(MI); continue; - Register Dst = MI.getOperand(0).getReg(); - Register Src = MI.getOperand(1).getReg(); - const TargetRegisterClass *DstRC = TRI->getRegClassForReg(*MRI, Dst); - bool IsDst16Bit = AMDGPU::VGPR_16RegClass.hasSubClassEq(DstRC); - if (Dst.isVirtual() && IsDst16Bit && Src.isPhysical() && - TRI->getRegClassForReg(*MRI, Src) == &AMDGPU::VGPR_32RegClass) - MRI->setRegAllocationHint(Dst, 0, TRI->getSubReg(Src, AMDGPU::lo16)); - if (Src.isVirtual() && - MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass && - Dst.isPhysical() && DstRC == &AMDGPU::VGPR_32RegClass) - MRI->setRegAllocationHint(Src, 0, TRI->getSubReg(Dst, AMDGPU::lo16)); - if (!Dst.isVirtual() || !Src.isVirtual()) + } + // Add implicit uses to avoid early wait on intersect ray instructions. + if (HasBVHStack && + (MI.getOpcode() == AMDGPU::DS_BVH_STACK_RTN_B32 || + MI.getOpcode() == AMDGPU::DS_BVH_STACK_PUSH8_POP1_RTN_B32 || + MI.getOpcode() == AMDGPU::DS_BVH_STACK_PUSH8_POP2_RTN_B64)) { + Changed |= optimizeBVHStack(MI); continue; - if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_32RegClass && - MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass) { - MRI->setRegAllocationHint(Dst, AMDGPURI::Size32, Src); - MRI->setRegAllocationHint(Src, AMDGPURI::Size16, Dst); } - if (IsDst16Bit && MRI->getRegClass(Src) == &AMDGPU::VGPR_32RegClass) - MRI->setRegAllocationHint(Dst, AMDGPURI::Size16, Src); } } diff --git a/llvm/test/CodeGen/AMDGPU/optimize-ds-bvh-stack-pre-ra.ll b/llvm/test/CodeGen/AMDGPU/optimize-ds-bvh-stack-pre-ra.ll new file mode 100644 index 0000000000000..be351ea026a03 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/optimize-ds-bvh-stack-pre-ra.ll @@ -0,0 +1,300 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s + +define amdgpu_gs void @test_ds_bvh_stack_push4_pop1(i32 %addr, i32 %data.0, i64 %node_ptr, float 
%ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, i32 %offset, <4 x i32> inreg %tdescr, ptr addrspace(1) %p.0, ptr addrspace(1) %p.1, ptr addrspace(1) %p.2, ptr addrspace(1) %p.3) { +; CHECK-LABEL: test_ds_bvh_stack_push4_pop1: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: v_dual_mov_b32 v22, v7 :: v_dual_mov_b32 v21, v6 +; CHECK-NEXT: v_dual_mov_b32 v20, v5 :: v_dual_mov_b32 v5, 0 +; CHECK-NEXT: s_mov_b32 s4, exec_lo +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; CHECK-NEXT: v_dual_mov_b32 v33, v22 :: v_dual_mov_b32 v32, v21 +; CHECK-NEXT: v_mov_b32_e32 v31, v20 +; CHECK-NEXT: image_bvh8_intersect_ray v[21:30], [v[2:3], v[4:5], v[31:33], v[8:10], v11], s[0:3] +; CHECK-NEXT: v_cmpx_eq_f32_e32 0, v20 +; CHECK-NEXT: s_cbranch_execz .LBB0_2 +; CHECK-NEXT: ; %bb.1: ; %if +; CHECK-NEXT: global_load_b64 v[6:7], v[12:13], off +; CHECK-NEXT: global_load_b64 v[34:35], v[14:15], off +; CHECK-NEXT: global_load_b64 v[36:37], v[16:17], off +; CHECK-NEXT: global_load_b64 v[38:39], v[18:19], off +; CHECK-NEXT: s_wait_loadcnt 0x2 +; CHECK-NEXT: v_add_nc_u32_e32 v1, v7, v35 +; CHECK-NEXT: s_wait_loadcnt 0x1 +; CHECK-NEXT: v_add3_u32 v6, v6, v34, v36 +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; CHECK-NEXT: v_add3_u32 v1, v1, v37, v39 +; CHECK-NEXT: v_add3_u32 v1, v6, v38, v1 +; CHECK-NEXT: .LBB0_2: ; %end +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_wait_bvhcnt 0x0 +; CHECK-NEXT: ds_bvh_stack_push4_pop1_rtn_b32 v1, v0, v1, v[21:24] +; CHECK-NEXT: image_bvh8_intersect_ray v[20:29], [v[2:3], v[4:5], v[31:33], v[8:10], v11], s[0:3] +; CHECK-NEXT: s_wait_dscnt 0x0 +; CHECK-NEXT: global_store_b32 v[12:13], v1, off +; CHECK-NEXT: global_store_b32 v[14:15], v0, off +; CHECK-NEXT: s_wait_bvhcnt 0x0 +; CHECK-NEXT: global_store_b32 v[16:17], v20, off +; CHECK-NEXT: global_store_b32 
v[18:19], v21, off +; CHECK-NEXT: s_endpgm +entry: + %ray_origin0 = insertelement <3 x float> poison, float %ray_origin_x, i32 0 + %ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1 + %ray_origin = insertelement <3 x float> %ray_origin1, float %ray_origin_z, i32 2 + %ray_dir0 = insertelement <3 x float> poison, float %ray_dir_x, i32 0 + %ray_dir1 = insertelement <3 x float> %ray_dir0, float %ray_dir_y, i32 1 + %ray_dir = insertelement <3 x float> %ray_dir1, float %ray_dir_z, i32 2 + %v = call {<10 x i32>, <3 x float>, <3 x float>} @llvm.amdgcn.image.bvh8.intersect.ray(i64 %node_ptr, float %ray_extent, i8 0, <3 x float> %ray_origin, <3 x float> %ray_dir, i32 %offset, <4 x i32> %tdescr) + %a = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 0 + %val.0 = extractelement <10 x i32> %a, i32 0 + %val.1 = extractelement <10 x i32> %a, i32 1 + %val.2 = extractelement <10 x i32> %a, i32 2 + %val.3 = extractelement <10 x i32> %a, i32 3 + %bvh.0 = insertelement <4 x i32> poison, i32 %val.0, i32 0 + %bvh.1 = insertelement <4 x i32> %bvh.0, i32 %val.1, i32 1 + %bvh.2 = insertelement <4 x i32> %bvh.1, i32 %val.2, i32 2 + %bvh = insertelement <4 x i32> %bvh.2, i32 %val.3, i32 3 + %cnd = fcmp oeq float %ray_origin_x, 0.0 + br i1 %cnd, label %if, label %end + +if: + ; loads to force vgpr pressure + %load.0 = load <2 x i32>, ptr addrspace(1) %p.0 + %load.1 = load <2 x i32>, ptr addrspace(1) %p.1 + %load.2 = load <2 x i32>, ptr addrspace(1) %p.2 + %load.3 = load <2 x i32>, ptr addrspace(1) %p.3 + %add.0 = add <2 x i32> %load.0, %load.1 + %add.1 = add <2 x i32> %add.0, %load.2 + %add.2 = add <2 x i32> %add.1, %load.3 + %.i0 = extractelement <2 x i32> %add.2, i32 0 + %.i1 = extractelement <2 x i32> %add.2, i32 1 + %data.1 = add i32 %.i0, %.i1 + br label %end + +end: + %data = phi i32 [ %data.0, %entry ], [ %data.1, %if ] + %pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push4.pop1.rtn(i32 %addr, i32 %data, <4 x i32> %bvh, i32 0) + %vdst = 
extractvalue { i32, i32 } %pair, 0 + %newaddr = extractvalue { i32, i32 } %pair, 1 + + ; keep all intersect ray parameters live + %new.origin = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 1 + %new.dir = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 2 + %v.2 = call {<10 x i32>, <3 x float>, <3 x float>} @llvm.amdgcn.image.bvh8.intersect.ray(i64 %node_ptr, float %ray_extent, i8 0, <3 x float> %new.origin, <3 x float> %new.dir, i32 %offset, <4 x i32> %tdescr) + %b = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v.2, 0 + %c = extractelement <10 x i32> %b, i32 0 + %d = extractelement <10 x i32> %b, i32 1 + + ; stores keep pointers live + store i32 %vdst, ptr addrspace(1) %p.0 + store i32 %newaddr, ptr addrspace(1) %p.1 + store i32 %c, ptr addrspace(1) %p.2 + store i32 %d, ptr addrspace(1) %p.3 + + ret void +} + +define amdgpu_gs void @test_ds_bvh_stack_push8_pop1(i32 %addr, i32 %data.0, i64 %node_ptr, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, i32 %offset, <4 x i32> inreg %tdescr, ptr addrspace(1) %p.0, ptr addrspace(1) %p.1, ptr addrspace(1) %p.2, ptr addrspace(1) %p.3) { +; CHECK-LABEL: test_ds_bvh_stack_push8_pop1: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: v_dual_mov_b32 v22, v7 :: v_dual_mov_b32 v21, v6 +; CHECK-NEXT: v_dual_mov_b32 v20, v5 :: v_dual_mov_b32 v5, 0 +; CHECK-NEXT: s_mov_b32 s4, exec_lo +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; CHECK-NEXT: v_dual_mov_b32 v33, v22 :: v_dual_mov_b32 v32, v21 +; CHECK-NEXT: v_mov_b32_e32 v31, v20 +; CHECK-NEXT: image_bvh8_intersect_ray v[21:30], [v[2:3], v[4:5], v[31:33], v[8:10], v11], s[0:3] +; CHECK-NEXT: v_cmpx_eq_f32_e32 0, v20 +; CHECK-NEXT: s_cbranch_execz .LBB1_2 +; CHECK-NEXT: ; %bb.1: ; %if +; CHECK-NEXT: global_load_b64 v[6:7], v[12:13], off +; CHECK-NEXT: global_load_b64 v[34:35], v[14:15], off +; CHECK-NEXT: global_load_b64 v[36:37], 
v[16:17], off +; CHECK-NEXT: global_load_b64 v[38:39], v[18:19], off +; CHECK-NEXT: s_wait_loadcnt 0x2 +; CHECK-NEXT: v_add_nc_u32_e32 v1, v7, v35 +; CHECK-NEXT: s_wait_loadcnt 0x1 +; CHECK-NEXT: v_add3_u32 v6, v6, v34, v36 +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; CHECK-NEXT: v_add3_u32 v1, v1, v37, v39 +; CHECK-NEXT: v_add3_u32 v1, v6, v38, v1 +; CHECK-NEXT: .LBB1_2: ; %end +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_wait_bvhcnt 0x0 +; CHECK-NEXT: ds_bvh_stack_push8_pop1_rtn_b32 v1, v0, v1, v[21:28] +; CHECK-NEXT: image_bvh8_intersect_ray v[20:29], [v[2:3], v[4:5], v[31:33], v[8:10], v11], s[0:3] +; CHECK-NEXT: s_wait_dscnt 0x0 +; CHECK-NEXT: global_store_b32 v[12:13], v1, off +; CHECK-NEXT: global_store_b32 v[14:15], v0, off +; CHECK-NEXT: s_wait_bvhcnt 0x0 +; CHECK-NEXT: global_store_b32 v[16:17], v20, off +; CHECK-NEXT: global_store_b32 v[18:19], v21, off +; CHECK-NEXT: s_endpgm +entry: + %ray_origin0 = insertelement <3 x float> poison, float %ray_origin_x, i32 0 + %ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1 + %ray_origin = insertelement <3 x float> %ray_origin1, float %ray_origin_z, i32 2 + %ray_dir0 = insertelement <3 x float> poison, float %ray_dir_x, i32 0 + %ray_dir1 = insertelement <3 x float> %ray_dir0, float %ray_dir_y, i32 1 + %ray_dir = insertelement <3 x float> %ray_dir1, float %ray_dir_z, i32 2 + %v = call {<10 x i32>, <3 x float>, <3 x float>} @llvm.amdgcn.image.bvh8.intersect.ray(i64 %node_ptr, float %ray_extent, i8 0, <3 x float> %ray_origin, <3 x float> %ray_dir, i32 %offset, <4 x i32> %tdescr) + %a = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 0 + %val.0 = extractelement <10 x i32> %a, i32 0 + %val.1 = extractelement <10 x i32> %a, i32 1 + %val.2 = extractelement <10 x i32> %a, i32 2 + %val.3 = extractelement <10 x i32> %a, i32 3 + %val.4 = extractelement <10 x i32> %a, i32 4 + %val.5 = 
extractelement <10 x i32> %a, i32 5 + %val.6 = extractelement <10 x i32> %a, i32 6 + %val.7 = extractelement <10 x i32> %a, i32 7 + %bvh.0 = insertelement <8 x i32> poison, i32 %val.0, i32 0 + %bvh.1 = insertelement <8 x i32> %bvh.0, i32 %val.1, i32 1 + %bvh.2 = insertelement <8 x i32> %bvh.1, i32 %val.2, i32 2 + %bvh.3 = insertelement <8 x i32> %bvh.2, i32 %val.3, i32 3 + %bvh.4 = insertelement <8 x i32> %bvh.3, i32 %val.4, i32 4 + %bvh.5 = insertelement <8 x i32> %bvh.4, i32 %val.5, i32 5 + %bvh.6 = insertelement <8 x i32> %bvh.5, i32 %val.6, i32 6 + %bvh = insertelement <8 x i32> %bvh.6, i32 %val.7, i32 7 + %cnd = fcmp oeq float %ray_origin_x, 0.0 + br i1 %cnd, label %if, label %end + +if: + ; loads to force vgpr pressure + %load.0 = load <2 x i32>, ptr addrspace(1) %p.0 + %load.1 = load <2 x i32>, ptr addrspace(1) %p.1 + %load.2 = load <2 x i32>, ptr addrspace(1) %p.2 + %load.3 = load <2 x i32>, ptr addrspace(1) %p.3 + %add.0 = add <2 x i32> %load.0, %load.1 + %add.1 = add <2 x i32> %add.0, %load.2 + %add.2 = add <2 x i32> %add.1, %load.3 + %.i0 = extractelement <2 x i32> %add.2, i32 0 + %.i1 = extractelement <2 x i32> %add.2, i32 1 + %data.1 = add i32 %.i0, %.i1 + br label %end + +end: + %data = phi i32 [ %data.0, %entry ], [ %data.1, %if ] + %pair = call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop1.rtn(i32 %addr, i32 %data, <8 x i32> %bvh, i32 0) + %vdst = extractvalue { i32, i32 } %pair, 0 + %newaddr = extractvalue { i32, i32 } %pair, 1 + + ; keep all intersect ray parameters live + %new.origin = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 1 + %new.dir = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 2 + %v.2 = call {<10 x i32>, <3 x float>, <3 x float>} @llvm.amdgcn.image.bvh8.intersect.ray(i64 %node_ptr, float %ray_extent, i8 0, <3 x float> %new.origin, <3 x float> %new.dir, i32 %offset, <4 x i32> %tdescr) + %b = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v.2, 0 + %c = extractelement <10 x i32> %b, i32 0 + %d = 
extractelement <10 x i32> %b, i32 1 + + ; stores keep pointers live + store i32 %vdst, ptr addrspace(1) %p.0 + store i32 %newaddr, ptr addrspace(1) %p.1 + store i32 %c, ptr addrspace(1) %p.2 + store i32 %d, ptr addrspace(1) %p.3 + + ret void +} + +define amdgpu_gs void @test_ds_bvh_stack_push8_pop2(i32 %addr, i32 %data.0, i64 %node_ptr, float %ray_extent, float %ray_origin_x, float %ray_origin_y, float %ray_origin_z, float %ray_dir_x, float %ray_dir_y, float %ray_dir_z, i32 %offset, <4 x i32> inreg %tdescr, ptr addrspace(1) %p.0, ptr addrspace(1) %p.1, ptr addrspace(1) %p.2, ptr addrspace(1) %p.3) { +; CHECK-LABEL: test_ds_bvh_stack_push8_pop2: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: v_dual_mov_b32 v22, v7 :: v_dual_mov_b32 v21, v6 +; CHECK-NEXT: v_dual_mov_b32 v20, v5 :: v_dual_mov_b32 v5, 0 +; CHECK-NEXT: s_mov_b32 s4, exec_lo +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; CHECK-NEXT: v_dual_mov_b32 v33, v22 :: v_dual_mov_b32 v32, v21 +; CHECK-NEXT: v_mov_b32_e32 v31, v20 +; CHECK-NEXT: image_bvh8_intersect_ray v[21:30], [v[2:3], v[4:5], v[31:33], v[8:10], v11], s[0:3] +; CHECK-NEXT: v_cmpx_eq_f32_e32 0, v20 +; CHECK-NEXT: s_cbranch_execz .LBB2_2 +; CHECK-NEXT: ; %bb.1: ; %if +; CHECK-NEXT: global_load_b64 v[6:7], v[12:13], off +; CHECK-NEXT: global_load_b64 v[34:35], v[14:15], off +; CHECK-NEXT: global_load_b64 v[36:37], v[16:17], off +; CHECK-NEXT: global_load_b64 v[38:39], v[18:19], off +; CHECK-NEXT: s_wait_loadcnt 0x2 +; CHECK-NEXT: v_add_nc_u32_e32 v1, v7, v35 +; CHECK-NEXT: s_wait_loadcnt 0x1 +; CHECK-NEXT: v_add3_u32 v6, v6, v34, v36 +; CHECK-NEXT: s_wait_loadcnt 0x0 +; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; CHECK-NEXT: v_add3_u32 v1, v1, v37, v39 +; CHECK-NEXT: v_add3_u32 v1, v6, v38, v1 +; CHECK-NEXT: .LBB2_2: ; %end +; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; CHECK-NEXT: s_wait_bvhcnt 0x0 +; CHECK-NEXT: ds_bvh_stack_push8_pop2_rtn_b64 v[6:7], v0, v1, 
v[21:28] +; CHECK-NEXT: image_bvh8_intersect_ray v[20:29], [v[2:3], v[4:5], v[31:33], v[8:10], v11], s[0:3] +; CHECK-NEXT: s_wait_dscnt 0x0 +; CHECK-NEXT: global_store_b64 v[12:13], v[6:7], off +; CHECK-NEXT: global_store_b32 v[14:15], v0, off +; CHECK-NEXT: s_wait_bvhcnt 0x0 +; CHECK-NEXT: global_store_b32 v[16:17], v20, off +; CHECK-NEXT: global_store_b32 v[18:19], v21, off +; CHECK-NEXT: s_endpgm +entry: + %ray_origin0 = insertelement <3 x float> poison, float %ray_origin_x, i32 0 + %ray_origin1 = insertelement <3 x float> %ray_origin0, float %ray_origin_y, i32 1 + %ray_origin = insertelement <3 x float> %ray_origin1, float %ray_origin_z, i32 2 + %ray_dir0 = insertelement <3 x float> poison, float %ray_dir_x, i32 0 + %ray_dir1 = insertelement <3 x float> %ray_dir0, float %ray_dir_y, i32 1 + %ray_dir = insertelement <3 x float> %ray_dir1, float %ray_dir_z, i32 2 + %v = call {<10 x i32>, <3 x float>, <3 x float>} @llvm.amdgcn.image.bvh8.intersect.ray(i64 %node_ptr, float %ray_extent, i8 0, <3 x float> %ray_origin, <3 x float> %ray_dir, i32 %offset, <4 x i32> %tdescr) + %a = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 0 + %val.0 = extractelement <10 x i32> %a, i32 0 + %val.1 = extractelement <10 x i32> %a, i32 1 + %val.2 = extractelement <10 x i32> %a, i32 2 + %val.3 = extractelement <10 x i32> %a, i32 3 + %val.4 = extractelement <10 x i32> %a, i32 4 + %val.5 = extractelement <10 x i32> %a, i32 5 + %val.6 = extractelement <10 x i32> %a, i32 6 + %val.7 = extractelement <10 x i32> %a, i32 7 + %bvh.0 = insertelement <8 x i32> poison, i32 %val.0, i32 0 + %bvh.1 = insertelement <8 x i32> %bvh.0, i32 %val.1, i32 1 + %bvh.2 = insertelement <8 x i32> %bvh.1, i32 %val.2, i32 2 + %bvh.3 = insertelement <8 x i32> %bvh.2, i32 %val.3, i32 3 + %bvh.4 = insertelement <8 x i32> %bvh.3, i32 %val.4, i32 4 + %bvh.5 = insertelement <8 x i32> %bvh.4, i32 %val.5, i32 5 + %bvh.6 = insertelement <8 x i32> %bvh.5, i32 %val.6, i32 6 + %bvh = insertelement <8 x i32> %bvh.6, i32 
%val.7, i32 7 + %cnd = fcmp oeq float %ray_origin_x, 0.0 + br i1 %cnd, label %if, label %end + +if: + ; loads to force vgpr pressure + %load.0 = load <2 x i32>, ptr addrspace(1) %p.0 + %load.1 = load <2 x i32>, ptr addrspace(1) %p.1 + %load.2 = load <2 x i32>, ptr addrspace(1) %p.2 + %load.3 = load <2 x i32>, ptr addrspace(1) %p.3 + %add.0 = add <2 x i32> %load.0, %load.1 + %add.1 = add <2 x i32> %add.0, %load.2 + %add.2 = add <2 x i32> %add.1, %load.3 + %.i0 = extractelement <2 x i32> %add.2, i32 0 + %.i1 = extractelement <2 x i32> %add.2, i32 1 + %data.1 = add i32 %.i0, %.i1 + br label %end + +end: + %data = phi i32 [ %data.0, %entry ], [ %data.1, %if ] + %pair = call { i64, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop2.rtn(i32 %addr, i32 %data, <8 x i32> %bvh, i32 0) + %vdst = extractvalue { i64, i32 } %pair, 0 + %newaddr = extractvalue { i64, i32 } %pair, 1 + + ; keep all intersect ray parameters live + %new.origin = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 1 + %new.dir = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v, 2 + %v.2 = call {<10 x i32>, <3 x float>, <3 x float>} @llvm.amdgcn.image.bvh8.intersect.ray(i64 %node_ptr, float %ray_extent, i8 0, <3 x float> %new.origin, <3 x float> %new.dir, i32 %offset, <4 x i32> %tdescr) + %b = extractvalue {<10 x i32>, <3 x float>, <3 x float>} %v.2, 0 + %c = extractelement <10 x i32> %b, i32 0 + %d = extractelement <10 x i32> %b, i32 1 + + ; stores keep pointers live + store i64 %vdst, ptr addrspace(1) %p.0 + store i32 %newaddr, ptr addrspace(1) %p.1 + store i32 %c, ptr addrspace(1) %p.2 + store i32 %d, ptr addrspace(1) %p.3 + + ret void +} From 7ae1962cf848637ec3b4b03616235dedb18eb5db Mon Sep 17 00:00:00 2001 From: argothiel Date: Thu, 14 May 2026 11:08:22 +0200 Subject: [PATCH 46/95] [clangd] Fix parens suppression in mid-identifier code-completion (#197249) When completing in the middle of an existing identifier (e.g. 
`fo^o(42)`), the next-token check lexes the character immediately after the cursor, which prevents parens suppression to kick in. After the fix, we go to the end of the current identifier first and only then we start lexing for the next token, which handles redundant parens even when the cursor is mid-identifier. This also fixes the parens suppression in the replace mode which by design is used mid-identifier. Fixes https://github.com/clangd/clangd/issues/387 --- clang-tools-extra/clangd/CodeComplete.cpp | 75 +++++-------------- .../clangd/unittests/CodeCompleteTests.cpp | 17 ++++- 2 files changed, 31 insertions(+), 61 deletions(-) diff --git a/clang-tools-extra/clangd/CodeComplete.cpp b/clang-tools-extra/clangd/CodeComplete.cpp index 71189ab9de8b2..9f7da6b6f8fac 100644 --- a/clang-tools-extra/clangd/CodeComplete.cpp +++ b/clang-tools-extra/clangd/CodeComplete.cpp @@ -1536,46 +1536,6 @@ FuzzyFindRequest speculativeFuzzyFindRequestForCompletion( return CachedReq; } -// This function is similar to Lexer::findNextToken(), but assumes -// that the input SourceLocation is the completion point (which is -// a case findNextToken() does not handle). -std::optional -findTokenAfterCompletionPoint(SourceLocation CompletionPoint, - const SourceManager &SM, - const LangOptions &LangOpts) { - SourceLocation Loc = CompletionPoint; - if (Loc.isMacroID()) { - if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc)) - return std::nullopt; - } - - // Advance to the next SourceLocation after the completion point. - // Lexer::findNextToken() would call MeasureTokenLength() here, - // which does not handle the completion point (and can't, because - // the Lexer instance it constructs internally doesn't have a - // Preprocessor and so doesn't know about the completion point). - Loc = Loc.getLocWithOffset(1); - - // Break down the source location. - std::pair LocInfo = SM.getDecomposedLoc(Loc); - - // Try to load the file buffer. 
- bool InvalidTemp = false; - StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp); - if (InvalidTemp) - return std::nullopt; - - const char *TokenBegin = File.data() + LocInfo.second; - - // Lex from the start of the given location. - Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(), - TokenBegin, File.end()); - // Find the token. - Token Tok; - TheLexer.LexFromRawLexer(Tok); - return Tok; -} - // Runs Sema-based (AST) and Index-based completion, returns merged results. // // There are a few tricky considerations: @@ -1619,6 +1579,9 @@ class CodeCompleteFlow { // location is an opening parenthesis (tok::l_paren) because this would add // extra parenthesis. tok::TokenKind NextTokenKind = tok::eof; + // End of the identifier suffix after the completion cursor. + // Shared by NextTokenKind detection and replace-range calculation. + SourceLocation IdentifierSuffixEnd; // Counters for logging. int NSema = 0, NIndex = 0, NSemaAndIndex = 0, NIdent = 0; bool Incomplete = false; // Would more be available with a higher limit? @@ -1675,11 +1638,20 @@ class CodeCompleteFlow { auto Style = getFormatStyleForFile(SemaCCInput.FileName, SemaCCInput.ParseInput.Contents, *SemaCCInput.ParseInput.TFS, false); - const auto NextToken = findTokenAfterCompletionPoint( - Recorder->CCSema->getPreprocessor().getCodeCompletionLoc(), - Recorder->CCSema->getSourceManager(), Recorder->CCSema->LangOpts); - if (NextToken) - NextTokenKind = NextToken->getKind(); + const auto &SM = Recorder->CCSema->getSourceManager(); + const LangOptions &LangOpts = Recorder->CCSema->getLangOpts(); + // Skip past the NUL byte inserted at the cursor, then scan through any + // identifier continuation characters to find where the suffix ends. 
+ IdentifierSuffixEnd = Lexer::findEndOfIdentifierContinuation( + Recorder->CCSema->getPreprocessor() + .getCodeCompletionLoc() + .getLocWithOffset(1), + SM, LangOpts); + // Lex the token after the identifier suffix to determine NextTokenKind. + if (Token NextToken; + !Lexer::getRawToken(IdentifierSuffixEnd, NextToken, SM, LangOpts, + /*IgnoreWhiteSpace=*/true)) + NextTokenKind = NextToken.getKind(); // If preprocessor was run, inclusions from preprocessor callback should // already be added to Includes. Inserter.emplace( @@ -1696,7 +1668,6 @@ class CodeCompleteFlow { // that happens here (though the per-URI-scheme initialization is lazy). // The per-result proximity scoring is (amortized) very cheap. FileDistanceOptions ProxOpts{}; // Use defaults. - const auto &SM = Recorder->CCSema->getSourceManager(); llvm::StringMap ProxSources; auto MainFileID = Includes.getID(SM.getFileEntryForID(SM.getMainFileID())); @@ -1905,17 +1876,7 @@ class CodeCompleteFlow { // Returns the LSP position at the end of the identifier suffix after the // code completion cursor. Position getEndOfCodeCompletionReplace(const SourceManager &SM) { - const Preprocessor &PP = Recorder->CCSema->getPreprocessor(); - const LangOptions &LangOpts = Recorder->CCSema->getLangOpts(); - - // Skip past the code completion NUL byte and scan forward through - // identifier continuation characters (letters, digits, _, $, UCN, - // unicode). This handles all cases uniformly: with prefix ("vac^1abc"), - // without prefix ("vec.^asdf"), and digit-starting ("vec.^1abc"). - const SourceLocation SuffixBegin = - PP.getCodeCompletionLoc().getLocWithOffset(1); - Position End = sourceLocToPosition( - SM, Lexer::findEndOfIdentifierContinuation(SuffixBegin, SM, LangOpts)); + Position End = sourceLocToPosition(SM, IdentifierSuffixEnd); // Adjust for the NUL byte inserted at the cursor by code completion, // which inflates the column by 1. 
End.character--; diff --git a/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp b/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp index 726fee9c2f0fe..f3a432a3b2632 100644 --- a/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp +++ b/clang-tools-extra/clangd/unittests/CodeCompleteTests.cpp @@ -4377,15 +4377,16 @@ TEST(CompletionTest, FunctionArgsExist) { EXPECT_THAT( completions(Context + "int y = fo^(42)", {}, Opts).Completions, UnorderedElementsAre(AllOf(labeled("foo(int A)"), snippetSuffix("")))); - // FIXME(kirillbobyrev): No snippet should be produced here. - EXPECT_THAT(completions(Context + "int y = fo^o(42)", {}, Opts).Completions, - UnorderedElementsAre( - AllOf(labeled("foo(int A)"), snippetSuffix("(${1:int A})")))); + EXPECT_THAT( + completions(Context + "int y = fo^o(42)", {}, Opts).Completions, + UnorderedElementsAre(AllOf(labeled("foo(int A)"), snippetSuffix("")))); EXPECT_THAT( completions(Context + "int y = ba^", {}, Opts).Completions, UnorderedElementsAre(AllOf(labeled("bar()"), snippetSuffix("()")))); EXPECT_THAT(completions(Context + "int y = ba^()", {}, Opts).Completions, UnorderedElementsAre(AllOf(labeled("bar()"), snippetSuffix("")))); + EXPECT_THAT(completions(Context + "int y = ba^r()", {}, Opts).Completions, + UnorderedElementsAre(AllOf(labeled("bar()"), snippetSuffix("")))); EXPECT_THAT( completions(Context + "Object o = Obj^", {}, Opts).Completions, Contains(AllOf(labeled("Object(int B)"), snippetSuffix("(${1:int B})"), @@ -4408,9 +4409,17 @@ TEST(CompletionTest, FunctionArgsExist) { Contains(AllOf(labeled("Container(int Size)"), snippetSuffix(""), kind(CompletionItemKind::Constructor)))); + EXPECT_THAT( + completions(Context + "Container c = Cont^ainer()", {}, Opts).Completions, + Contains(AllOf(labeled("Container(int Size)"), + snippetSuffix("<${1:typename T}>"), + kind(CompletionItemKind::Constructor)))); EXPECT_THAT(completions(Context + "MAC^(2)", {}, Opts).Completions, Contains(AllOf(labeled("MACRO(x)"), 
snippetSuffix(""), kind(CompletionItemKind::Function)))); + EXPECT_THAT(completions(Context + "MAC^RO(2)", {}, Opts).Completions, + Contains(AllOf(labeled("MACRO(x)"), snippetSuffix(""), + kind(CompletionItemKind::Function)))); } TEST(CompletionTest, FunctionArgsExist_Issue1785) { From 3fda43d2e8560311a9f349666aaf3ca29cac54b6 Mon Sep 17 00:00:00 2001 From: Mariusz Sikora Date: Thu, 14 May 2026 11:19:35 +0200 Subject: [PATCH 47/95] [AMDGPU] Update permlane_bcast/down/up/xor intrinsic to support more types (#197141) Co-authored-by: Acim Maravic --- clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 12 + .../CodeGenOpenCL/builtins-amdgcn-gfx1250.cl | 8 +- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 24 +- .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 18 +- .../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 8 +- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 33 +- llvm/lib/Target/AMDGPU/VOP3Instructions.td | 14 +- .../UniformityAnalysis/AMDGPU/intrinsics.ll | 8 +- .../AMDGPU/llvm.amdgcn.permlane.bcast.ll | 1105 ++++++ .../AMDGPU/llvm.amdgcn.permlane.down.ll | 1105 ++++++ .../AMDGPU/llvm.amdgcn.permlane.gfx1250.ll | 440 --- .../AMDGPU/llvm.amdgcn.permlane.idx.gen.ll | 103 + .../CodeGen/AMDGPU/llvm.amdgcn.permlane.ll | 3435 +++++++++++++++++ .../CodeGen/AMDGPU/llvm.amdgcn.permlane.up.ll | 1105 ++++++ .../AMDGPU/llvm.amdgcn.permlane.xor.ll | 1105 ++++++ 15 files changed, 8043 insertions(+), 480 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.bcast.ll create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.down.ll delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.gfx1250.ll create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.idx.gen.ll create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.up.ll create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.xor.ll diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index cfad312d7535a..751cd9847bd31 100644 --- 
a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -2205,6 +2205,18 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case Builtin::BI__builtin_scalbn: return emitBinaryExpMaybeConstrainedFPBuiltin( *this, E, Intrinsic::ldexp, Intrinsic::experimental_constrained_ldexp); + case AMDGPU::BI__builtin_amdgcn_permlane_bcast: + return emitBuiltinWithOneOverloadedType<3>( + *this, E, Intrinsic::amdgcn_permlane_bcast); + case AMDGPU::BI__builtin_amdgcn_permlane_up: + return emitBuiltinWithOneOverloadedType<3>(*this, E, + Intrinsic::amdgcn_permlane_up); + case AMDGPU::BI__builtin_amdgcn_permlane_down: + return emitBuiltinWithOneOverloadedType<3>(*this, E, + Intrinsic::amdgcn_permlane_down); + case AMDGPU::BI__builtin_amdgcn_permlane_xor: + return emitBuiltinWithOneOverloadedType<3>(*this, E, + Intrinsic::amdgcn_permlane_xor); default: return nullptr; } diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index c40172a0d7fcd..8b09216057167 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -1189,7 +1189,7 @@ void test_permlane16_swap(global uint2* out, uint old, uint src) { // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC0_ADDR]], align 4 // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4 // CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[SRC2_ADDR]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.bcast(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]]) +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.bcast.i32(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]]) // CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 // CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4 // CHECK-NEXT: ret void @@ -1211,7 +1211,7 @@ void test_permlane_bcast(global uint* 
out, uint src0, uint src1, uint src2) { // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC0_ADDR]], align 4 // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4 // CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[SRC2_ADDR]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.down(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]]) +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.down.i32(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]]) // CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 // CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4 // CHECK-NEXT: ret void @@ -1233,7 +1233,7 @@ void test_permlane_down(global uint* out, uint src0, uint src1, uint src2) { // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC0_ADDR]], align 4 // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4 // CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[SRC2_ADDR]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.up(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]]) +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.up.i32(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]]) // CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 // CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4 // CHECK-NEXT: ret void @@ -1255,7 +1255,7 @@ void test_permlane_up(global uint* out, uint src0, uint src1, uint src2) { // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[SRC0_ADDR]], align 4 // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[SRC1_ADDR]], align 4 // CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[SRC2_ADDR]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.xor(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]]) +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.permlane.xor.i32(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]]) // 
CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 // CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4 // CHECK-NEXT: ret void diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 8631985de9a0a..63920e91ffcaf 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -3763,27 +3763,27 @@ def int_amdgcn_sat_pk4_u4_u8 : ClangBuiltin<"__builtin_amdgcn_sat_pk4_u4_u8">, PureIntrinsic<[llvm_i16_ty], [llvm_i32_ty]>; // llvm.amdgcn.permlane.bcast -def int_amdgcn_permlane_bcast : ClangBuiltin<"__builtin_amdgcn_permlane_bcast">, - Intrinsic<[llvm_i32_ty], - [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], +def int_amdgcn_permlane_bcast : + Intrinsic<[llvm_any_ty], + [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; // llvm.amdgcn.permlane.up -def int_amdgcn_permlane_up : ClangBuiltin<"__builtin_amdgcn_permlane_up">, - Intrinsic<[llvm_i32_ty], - [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], +def int_amdgcn_permlane_up : + Intrinsic<[llvm_any_ty], + [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; // llvm.amdgcn.permlane.down -def int_amdgcn_permlane_down : ClangBuiltin<"__builtin_amdgcn_permlane_down">, - Intrinsic<[llvm_i32_ty], - [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], +def int_amdgcn_permlane_down : + Intrinsic<[llvm_any_ty], + [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; // llvm.amdgcn.permlane.xor -def int_amdgcn_permlane_xor : ClangBuiltin<"__builtin_amdgcn_permlane_xor">, - Intrinsic<[llvm_i32_ty], - [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], +def int_amdgcn_permlane_xor : + Intrinsic<[llvm_any_ty], + [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>; // 
llvm.amdgcn.permlane.idx.gen diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 8e38532fe315c..1a0e4f2eaa416 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -6130,6 +6130,10 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper, IID == Intrinsic::amdgcn_permlanex16; bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive || IID == Intrinsic::amdgcn_set_inactive_chain_arg; + bool IsPermlaneShuffle = IID == Intrinsic::amdgcn_permlane_bcast || + IID == Intrinsic::amdgcn_permlane_up || + IID == Intrinsic::amdgcn_permlane_down || + IID == Intrinsic::amdgcn_permlane_xor; auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1, Register Src2, LLT VT) -> Register { @@ -6143,6 +6147,10 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper, case Intrinsic::amdgcn_set_inactive_chain_arg: return LaneOp.addUse(Src1).getReg(0); case Intrinsic::amdgcn_writelane: + case Intrinsic::amdgcn_permlane_bcast: + case Intrinsic::amdgcn_permlane_up: + case Intrinsic::amdgcn_permlane_down: + case Intrinsic::amdgcn_permlane_xor: return LaneOp.addUse(Src1).addUse(Src2).getReg(0); case Intrinsic::amdgcn_permlane16: case Intrinsic::amdgcn_permlanex16: { @@ -6174,9 +6182,11 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper, Register Src0 = MI.getOperand(2).getReg(); Register Src1, Src2; if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane || - IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) { + IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16 || + IsPermlaneShuffle) { Src1 = MI.getOperand(3).getReg(); - if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) { + if (IID == Intrinsic::amdgcn_writelane || IsPermLane16 || + IsPermlaneShuffle) { Src2 = MI.getOperand(4).getReg(); } } @@ -8451,6 +8461,10 @@ bool 
AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_set_inactive_chain_arg: case Intrinsic::amdgcn_mov_dpp8: case Intrinsic::amdgcn_update_dpp: + case Intrinsic::amdgcn_permlane_bcast: + case Intrinsic::amdgcn_permlane_up: + case Intrinsic::amdgcn_permlane_down: + case Intrinsic::amdgcn_permlane_xor: return legalizeLaneOp(Helper, MI, IntrID); case Intrinsic::amdgcn_s_buffer_prefetch_data: return legalizeSBufferPrefetch(Helper, MI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 36aeef3558672..ca29635578945 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -1704,10 +1704,10 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, addRulesForIOpcs({amdgcn_permlane_bcast, amdgcn_permlane_up, amdgcn_permlane_down, amdgcn_permlane_xor}, - Standard) - .Div(S32, - {{Vgpr32}, - {IntrId, Vgpr32, SgprB32_ReadFirstLane, SgprB32_ReadFirstLane}}); + StandardB) + .Div(B32, + {{VgprB32}, + {IntrId, VgprB32, SgprB32_ReadFirstLane, SgprB32_ReadFirstLane}}); addRulesForIOpcs({amdgcn_permlane_idx_gen}, Standard) .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, SgprB32_ReadFirstLane}}); diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 3c830726bf98a..a1c14fa0f4521 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -7861,6 +7861,10 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, IID == Intrinsic::amdgcn_permlanex16; bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive || IID == Intrinsic::amdgcn_set_inactive_chain_arg; + bool IsPermlaneShuffle = IID == Intrinsic::amdgcn_permlane_bcast || + IID == Intrinsic::amdgcn_permlane_up || + IID == Intrinsic::amdgcn_permlane_down || + IID == Intrinsic::amdgcn_permlane_xor; SDLoc SL(N); MVT IntVT = 
MVT::getIntegerVT(ValSize); const GCNSubtarget *ST = TLI.getSubtarget(); @@ -7882,6 +7886,10 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, Operands.push_back(N->getOperand(4)); [[fallthrough]]; case Intrinsic::amdgcn_writelane: + case Intrinsic::amdgcn_permlane_bcast: + case Intrinsic::amdgcn_permlane_up: + case Intrinsic::amdgcn_permlane_down: + case Intrinsic::amdgcn_permlane_xor: Operands.push_back(Src2); [[fallthrough]]; case Intrinsic::amdgcn_readlane: @@ -7915,10 +7923,12 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SDValue Src1, Src2; if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane || IID == Intrinsic::amdgcn_mov_dpp8 || - IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) { + IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16 || + IsPermlaneShuffle) { Src1 = N->getOperand(2); if (IID == Intrinsic::amdgcn_writelane || - IID == Intrinsic::amdgcn_update_dpp || IsPermLane16) + IID == Intrinsic::amdgcn_update_dpp || IsPermLane16 || + IsPermlaneShuffle) Src2 = N->getOperand(3); } @@ -8013,18 +8023,21 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, DAG.getConstant(EltIdx, SL, MVT::i32)); if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || - IsPermLane16) + IsPermLane16) { Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1, DAG.getConstant(EltIdx, SL, MVT::i32)); - if (IID == Intrinsic::amdgcn_writelane) + Pieces.push_back( + createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)); + } else if (IID == Intrinsic::amdgcn_writelane) { Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2, DAG.getConstant(EltIdx, SL, MVT::i32)); + Pieces.push_back( + createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT)); + } else { + Pieces.push_back(createLaneOp(Src0SubVec, Src1, Src2, SubVecVT)); + } - Pieces.push_back( - IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16 - ? 
createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT) - : createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT)); EltIdx += SubVecNumElt; } return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces); @@ -11150,6 +11163,10 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::amdgcn_set_inactive_chain_arg: case Intrinsic::amdgcn_mov_dpp8: case Intrinsic::amdgcn_update_dpp: + case Intrinsic::amdgcn_permlane_bcast: + case Intrinsic::amdgcn_permlane_up: + case Intrinsic::amdgcn_permlane_down: + case Intrinsic::amdgcn_permlane_xor: return lowerLaneOp(*this, Op.getNode(), DAG); case Intrinsic::amdgcn_dead: { SmallVector Poisons; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 3a477cad4248c..1c6447de407f3 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -1205,8 +1205,8 @@ class PermlaneVarPat; class PermlaneNoDppPat3Src : GCNPat< - (permlane i32:$src0, i32:$src1, i32:$src2), + Instruction inst, ValueType vt> : GCNPat< + (vt (permlane vt:$src0, i32:$src1, i32:$src2)), (inst VGPR_32:$src0, SCSrc_b32:$src1, SCSrc_b32:$src2) >; @@ -1611,10 +1611,12 @@ let SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32 in { defm V_PERMLANE_IDX_GEN_B32 : VOP3Inst<"v_permlane_idx_gen_b32", VOP3_PERMLANE_NOOPSEL_Profile>; } // End isConvergent = 1 - def : PermlaneNoDppPat3Src; - def : PermlaneNoDppPat3Src; - def : PermlaneNoDppPat3Src; - def : PermlaneNoDppPat3Src; + foreach vt = Reg32Types.types in { + def : PermlaneNoDppPat3Src; + def : PermlaneNoDppPat3Src; + def : PermlaneNoDppPat3Src; + def : PermlaneNoDppPat3Src; + } def : PermlaneNoDppPat2Src; } // End SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32 diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll index 68663ae820b57..9c57f1f2e5367 100644 --- 
a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll @@ -802,28 +802,28 @@ define amdgpu_kernel void @v_permlane32_swap(ptr addrspace(1) %out, i32 %src0, i ret void } -; CHECK: DIVERGENT: %result = call i32 @llvm.amdgcn.permlane.bcast(i32 %src0, i32 %src1, i32 %src2) +; CHECK: DIVERGENT: %result = call i32 @llvm.amdgcn.permlane.bcast.i32(i32 %src0, i32 %src1, i32 %src2) define amdgpu_kernel void @v_permlane_bcast_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { %result= call i32 @llvm.amdgcn.permlane.bcast(i32 %src0, i32 %src1, i32 %src2) store i32 %result, ptr addrspace(1) %out ret void } -; CHECK: DIVERGENT: %result = call i32 @llvm.amdgcn.permlane.up(i32 %src0, i32 %src1, i32 %src2) +; CHECK: DIVERGENT: %result = call i32 @llvm.amdgcn.permlane.up.i32(i32 %src0, i32 %src1, i32 %src2) define amdgpu_kernel void @v_permlane_up_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { %result= call i32 @llvm.amdgcn.permlane.up(i32 %src0, i32 %src1, i32 %src2) store i32 %result, ptr addrspace(1) %out ret void } -; CHECK: DIVERGENT: %result = call i32 @llvm.amdgcn.permlane.down(i32 %src0, i32 %src1, i32 %src2) +; CHECK: DIVERGENT: %result = call i32 @llvm.amdgcn.permlane.down.i32(i32 %src0, i32 %src1, i32 %src2) define amdgpu_kernel void @v_permlane_down_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { %result= call i32 @llvm.amdgcn.permlane.down(i32 %src0, i32 %src1, i32 %src2) store i32 %result, ptr addrspace(1) %out ret void } -; CHECK: DIVERGENT: %result = call i32 @llvm.amdgcn.permlane.xor(i32 %src0, i32 %src1, i32 %src2) +; CHECK: DIVERGENT: %result = call i32 @llvm.amdgcn.permlane.xor.i32(i32 %src0, i32 %src1, i32 %src2) define amdgpu_kernel void @v_permlane_xor_b32(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { %result= call i32 @llvm.amdgcn.permlane.xor(i32 %src0, i32 %src1, i32 %src2) store i32 %result, ptr addrspace(1) %out diff --git 
a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.bcast.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.bcast.ll new file mode 100644 index 0000000000000..1498c0a57b96f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.bcast.ll @@ -0,0 +1,1105 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s +; RUN: llc -global-isel=1 -new-reg-bank-select -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s +; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1310 < %s | FileCheck -check-prefixes=GFX13,GFX13-SDAG %s +; RUN: llc -global-isel=1 -new-reg-bank-select -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1310 < %s | FileCheck -check-prefixes=GFX13,GFX13-GISEL %s + +define i32 @v_permlane_bcast_b32_vss(i32 %src0, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_bcast_b32_vss: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_bcast_b32_vss: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v1 +; GFX13-NEXT: v_readfirstlane_b32 s1, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_bcast_b32 v0, v0, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i32 @llvm.amdgcn.permlane.bcast(i32 %src0, i32 %src1, i32 %src2) + ret i32 %v +} + +define i32 
@v_permlane_bcast_b32_vii(i32 %src0) { +; GFX1250-LABEL: v_permlane_bcast_b32_vii: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, 1, 2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_bcast_b32_vii: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_permlane_bcast_b32 v0, v0, 1, 2 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i32 @llvm.amdgcn.permlane.bcast(i32 %src0, i32 1, i32 2) + ret i32 %v +} + +define i32 @v_permlane_bcast_b32_vll(i32 %src0) { +; GFX1250-LABEL: v_permlane_bcast_b32_vll: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_movk_i32 s0, 0x64 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, s0, 0x66 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_bcast_b32_vll: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_movk_i32 s0, 0x64 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: v_permlane_bcast_b32 v0, v0, s0, 0x66 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i32 @llvm.amdgcn.permlane.bcast(i32 %src0, i32 100, i32 102) + ret i32 %v +} + +define i32 @v_permlane_bcast_b32_vvv(i32 %src0) { +; GFX1250-LABEL: v_permlane_bcast_b32_vvv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GFX1250-NEXT: v_bfe_u32 v2, v31, 10, 10 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1250-NEXT: 
v_readfirstlane_b32 s1, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_bcast_b32_vvv: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GFX13-NEXT: v_bfe_u32 v2, v31, 10, 10 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_readfirstlane_b32 s0, v1 +; GFX13-NEXT: v_readfirstlane_b32 s1, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_bcast_b32 v0, v0, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i32 @llvm.amdgcn.permlane.bcast(i32 %src0, i32 %tidx, i32 %tidy) + ret i32 %v +} + +define float @v_permlane_bcast_f32_vss(float %src0, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_bcast_f32_vss: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_bcast_f32_vss: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v1 +; GFX13-NEXT: v_readfirstlane_b32 s1, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_bcast_b32 v0, v0, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call float @llvm.amdgcn.permlane.bcast(float %src0, i32 %src1, i32 %src2) + ret float %v +} + +define 
float @v_permlane_bcast_f32_vii(float %src0) { +; GFX1250-LABEL: v_permlane_bcast_f32_vii: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, 1, 2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_bcast_f32_vii: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_permlane_bcast_b32 v0, v0, 1, 2 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call float @llvm.amdgcn.permlane.bcast(float %src0, i32 1, i32 2) + ret float %v +} + +define float @v_permlane_bcast_f32_vll(float %src0) { +; GFX1250-LABEL: v_permlane_bcast_f32_vll: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_movk_i32 s0, 0x64 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, s0, 0x66 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_bcast_f32_vll: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_movk_i32 s0, 0x64 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: v_permlane_bcast_b32 v0, v0, s0, 0x66 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call float @llvm.amdgcn.permlane.bcast(float %src0, i32 100, i32 102) + ret float %v +} + +define float @v_permlane_bcast_f32_vvv(float %src0) { +; GFX1250-LABEL: v_permlane_bcast_f32_vvv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GFX1250-NEXT: v_bfe_u32 v2, v31, 10, 10 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v1 
+; GFX1250-NEXT: v_readfirstlane_b32 s1, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_bcast_f32_vvv: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GFX13-NEXT: v_bfe_u32 v2, v31, 10, 10 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_readfirstlane_b32 s0, v1 +; GFX13-NEXT: v_readfirstlane_b32 s1, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_bcast_b32 v0, v0, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call float @llvm.amdgcn.permlane.bcast(float %src0, i32 %tidx, i32 %tidy) + ret float %v +} + +define i64 @v_permlane_bcast_i64_vss(i64 %src0, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_bcast_i64_vss: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, s0, s1 +; GFX1250-NEXT: v_permlane_bcast_b32 v1, v1, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_bcast_i64_vss: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v2 +; GFX13-NEXT: v_readfirstlane_b32 s1, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_bcast_b32 v0, v0, s0, s1 +; GFX13-NEXT: v_permlane_bcast_b32 v1, v1, s0, s1 +; GFX13-NEXT: 
s_set_pc_i64 s[30:31] + %v = call i64 @llvm.amdgcn.permlane.bcast(i64 %src0, i32 %src1, i32 %src2) + ret i64 %v +} + +define i64 @v_permlane_bcast_i64_vii(i64 %src0) { +; GFX1250-LABEL: v_permlane_bcast_i64_vii: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, 1, 2 +; GFX1250-NEXT: v_permlane_bcast_b32 v1, v1, 1, 2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_bcast_i64_vii: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_permlane_bcast_b32 v0, v0, 1, 2 +; GFX13-NEXT: v_permlane_bcast_b32 v1, v1, 1, 2 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i64 @llvm.amdgcn.permlane.bcast(i64 %src0, i32 1, i32 2) + ret i64 %v +} + +define i64 @v_permlane_bcast_i64_vll(i64 %src0) { +; GFX1250-LABEL: v_permlane_bcast_i64_vll: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_movk_i32 s0, 0x64 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, s0, 0x66 +; GFX1250-NEXT: v_permlane_bcast_b32 v1, v1, s0, 0x66 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_bcast_i64_vll: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_movk_i32 s0, 0x64 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: v_permlane_bcast_b32 v0, v0, s0, 0x66 +; GFX13-NEXT: v_permlane_bcast_b32 v1, v1, s0, 0x66 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i64 @llvm.amdgcn.permlane.bcast(i64 %src0, i32 100, i32 102) + ret i64 %v +} + +define i64 @v_permlane_bcast_i64_vvv(i64 %src0) { +; GFX1250-LABEL: v_permlane_bcast_i64_vvv: +; 
GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v31 +; GFX1250-NEXT: v_bfe_u32 v3, v31, 10, 10 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, s0, s1 +; GFX1250-NEXT: v_permlane_bcast_b32 v1, v1, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_bcast_i64_vvv: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_and_b32_e32 v2, 0x3ff, v31 +; GFX13-NEXT: v_bfe_u32 v3, v31, 10, 10 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_readfirstlane_b32 s0, v2 +; GFX13-NEXT: v_readfirstlane_b32 s1, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_bcast_b32 v0, v0, s0, s1 +; GFX13-NEXT: v_permlane_bcast_b32 v1, v1, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i64 @llvm.amdgcn.permlane.bcast(i64 %src0, i32 %tidx, i32 %tidy) + ret i64 %v +} + +define double @v_permlane_bcast_f64_vss(double %src0, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_bcast_f64_vss: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, s0, s1 +; GFX1250-NEXT: v_permlane_bcast_b32 v1, v1, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_bcast_f64_vss: +; GFX13: ; %bb.0: +; 
GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v2 +; GFX13-NEXT: v_readfirstlane_b32 s1, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_bcast_b32 v0, v0, s0, s1 +; GFX13-NEXT: v_permlane_bcast_b32 v1, v1, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call double @llvm.amdgcn.permlane.bcast(double %src0, i32 %src1, i32 %src2) + ret double %v +} + +define double @v_permlane_bcast_f64_vii(double %src0) { +; GFX1250-LABEL: v_permlane_bcast_f64_vii: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, 1, 2 +; GFX1250-NEXT: v_permlane_bcast_b32 v1, v1, 1, 2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_bcast_f64_vii: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_permlane_bcast_b32 v0, v0, 1, 2 +; GFX13-NEXT: v_permlane_bcast_b32 v1, v1, 1, 2 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call double @llvm.amdgcn.permlane.bcast(double %src0, i32 1, i32 2) + ret double %v +} + +define double @v_permlane_bcast_f64_vll(double %src0) { +; GFX1250-LABEL: v_permlane_bcast_f64_vll: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_movk_i32 s0, 0x64 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, s0, 0x66 +; GFX1250-NEXT: v_permlane_bcast_b32 v1, v1, s0, 0x66 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_bcast_f64_vll: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; 
GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_movk_i32 s0, 0x64 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: v_permlane_bcast_b32 v0, v0, s0, 0x66 +; GFX13-NEXT: v_permlane_bcast_b32 v1, v1, s0, 0x66 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call double @llvm.amdgcn.permlane.bcast(double %src0, i32 100, i32 102) + ret double %v +} + +define double @v_permlane_bcast_f64_vvv(double %src0) { +; GFX1250-LABEL: v_permlane_bcast_f64_vvv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v31 +; GFX1250-NEXT: v_bfe_u32 v3, v31, 10, 10 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, s0, s1 +; GFX1250-NEXT: v_permlane_bcast_b32 v1, v1, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_bcast_f64_vvv: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_and_b32_e32 v2, 0x3ff, v31 +; GFX13-NEXT: v_bfe_u32 v3, v31, 10, 10 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_readfirstlane_b32 s0, v2 +; GFX13-NEXT: v_readfirstlane_b32 s1, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_bcast_b32 v0, v0, s0, s1 +; GFX13-NEXT: v_permlane_bcast_b32 v1, v1, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call double @llvm.amdgcn.permlane.bcast(double %src0, i32 %tidx, i32 %tidy) + ret double %v +} + +; does not work for GISEL +;define void @v_permlane_bcast_bfloat(ptr addrspace(1) %out, bfloat %src, i32 
%src1, i32 %src2) { +; %v = call bfloat @llvm.amdgcn.permlane.bcast.bf16(bfloat %src, i32 %src1, i32 %src2) +; store bfloat %v, ptr addrspace(1) %out, align 4 +; ret void +;} + +define void @v_permlane_bcast_i16(ptr addrspace(1) %out, i16 %src, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_bcast_i16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v3 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX1250-NEXT: global_store_b16 v[0:1], v2, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_bcast_i16: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v3 +; GFX13-NEXT: v_readfirstlane_b32 s1, v4 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX13-NEXT: global_store_b16 v[0:1], v2, off +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i16 @llvm.amdgcn.permlane.bcast.i16(i16 %src, i32 %src1, i32 %src2) + store i16 %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_bcast_v2f16(ptr addrspace(1) %out, <2 x half> %src, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_bcast_v2f16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v3 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX1250-NEXT: global_store_b32 v[0:1], v2, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_bcast_v2f16: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: 
s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v3 +; GFX13-NEXT: v_readfirstlane_b32 s1, v4 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX13-NEXT: global_store_b32 v[0:1], v2, off +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call <2 x half> @llvm.amdgcn.permlane.bcast.v2f16(<2 x half> %src, i32 %src1, i32 %src2) + store <2 x half> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_bcast_v2f32(ptr addrspace(1) %out, <2 x float> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_bcast_v2f32: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v4 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_bcast_v2f32: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v4 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v5 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_bcast_v2f32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: 
v_readfirstlane_b32 s0, v4 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_bcast_v2f32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v4 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v5 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <2 x float> @llvm.amdgcn.permlane.bcast.v2f32(<2 x float> %src, i32 %src1, i32 %src2) + store <2 x float> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_bcast_v7i32(ptr addrspace(1) %out, <7 x i32> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_bcast_v7i32: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v9 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v8, v8, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v7, v7, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v6, v6, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: s_clause 
0x1 +; GFX1250-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_bcast_v7i32: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v9 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v6, v6, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v7, v7, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v8, v8, s0, s1 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_bcast_v7i32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v9 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v10 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v8, v8, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: 
global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_bcast_v7i32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v9 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v10 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v8, v8, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <7 x i32> @llvm.amdgcn.permlane.bcast.v7i32(<7 x i32> %src, i32 %src1, i32 %src2) + store <7 x i32> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_bcast_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_bcast_v8i16: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; 
GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_bcast_v8i16: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_bcast_v8i16: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_bcast_v8i16: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; 
GFX13-GISEL-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <8 x i16> @llvm.amdgcn.permlane.bcast.v8i16(<8 x i16> %src, i32 %src1, i32 %src2) + store <8 x i16> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_bcast_v2i64(ptr addrspace(1) %out, <2 x i64> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_bcast_v2i64: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_bcast_v2i64: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_bcast_v2i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 
+; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_bcast_v2i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <2 x i64> @llvm.amdgcn.permlane.bcast.v2i64(<2 x i64> %src, i32 %src1, i32 %src2) + store <2 x i64> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_bcast_v3i64(ptr addrspace(1) %out, <3 x i64> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_bcast_v3i64: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v7, v7, s0, s1 +; GFX1250-SDAG-NEXT: 
v_permlane_bcast_b32 v6, v6, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_bcast_v3i64: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v6, v6, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v7, v7, s0, s1 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_bcast_v3i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v8 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v9 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v4, v4, 
s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_bcast_v3i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v8 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v9 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <3 x i64> @llvm.amdgcn.permlane.bcast.v3i64(<3 x i64> %src, i32 %src1, i32 %src2) + store <3 x i64> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_bcast_v4f64(ptr addrspace(1) %out, <4 x double> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_bcast_v4f64: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v11 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v9, v9, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v8, v8, s0, s1 +;
GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v7, v7, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v6, v6, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_bcast_v4f64: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v11 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v6, v6, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v7, v7, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v8, v8, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v9, v9, s0, s1 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_bcast_v4f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v10 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v11 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; 
GFX13-SDAG-NEXT: v_permlane_bcast_b32 v9, v9, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v8, v8, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_bcast_v4f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v10 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v11 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v8, v8, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v9, v9, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <4 x double> @llvm.amdgcn.permlane.bcast.v4f64(<4 x double> %src, i32 %src1, i32 %src2) + store <4 x double> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_bcast_v8f64(ptr addrspace(1) %out, <8 x double> %src, i32 %src1, 
i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_bcast_v8f64: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v18 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v19 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v17, v17, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v16, v16, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v15, v15, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v14, v14, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v13, v13, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v12, v12, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v11, v11, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v10, v10, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v9, v9, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v8, v8, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v7, v7, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v6, v6, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: s_clause 0x3 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_bcast_v8f64: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v18 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v19 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v2, v2, 
s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v6, v6, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v7, v7, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v8, v8, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v9, v9, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v10, v10, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v11, v11, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v12, v12, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v13, v13, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v14, v14, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v15, v15, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v16, v16, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_bcast_b32 v17, v17, s0, s1 +; GFX1250-GISEL-NEXT: s_clause 0x3 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_bcast_v8f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v18 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v19 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v17, v17, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v16, v16, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v15, v15, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v14, v14, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v13, v13, s0, s1 +; GFX13-SDAG-NEXT: 
v_permlane_bcast_b32 v12, v12, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v11, v11, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v10, v10, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v9, v9, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v8, v8, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x3 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_bcast_v8f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v18 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v19 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v8, v8, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v9, v9, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v10, v10, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v11, v11, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v12, v12, 
s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v13, v13, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v14, v14, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v15, v15, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v16, v16, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_bcast_b32 v17, v17, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x3 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <8 x double> @llvm.amdgcn.permlane.bcast.v8f64(<8 x double> %src, i32 %src1, i32 %src2) + store <8 x double> %v, ptr addrspace(1) %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.down.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.down.ll new file mode 100644 index 0000000000000..75548d5cc0594 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.down.ll @@ -0,0 +1,1105 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s +; RUN: llc -global-isel=1 -new-reg-bank-select -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s +; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1310 < %s | FileCheck -check-prefixes=GFX13,GFX13-SDAG %s +; RUN: llc -global-isel=1 -new-reg-bank-select -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1310 < %s | FileCheck -check-prefixes=GFX13,GFX13-GISEL %s + +define i32 @v_permlane_down_b32_vss(i32 %src0, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_down_b32_vss: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 
0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_down_b32 v0, v0, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_down_b32_vss: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v1 +; GFX13-NEXT: v_readfirstlane_b32 s1, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_down_b32 v0, v0, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i32 @llvm.amdgcn.permlane.down(i32 %src0, i32 %src1, i32 %src2) + ret i32 %v +} + +define i32 @v_permlane_down_b32_vii(i32 %src0) { +; GFX1250-LABEL: v_permlane_down_b32_vii: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_permlane_down_b32 v0, v0, 1, 2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_down_b32_vii: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_permlane_down_b32 v0, v0, 1, 2 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i32 @llvm.amdgcn.permlane.down(i32 %src0, i32 1, i32 2) + ret i32 %v +} + +define i32 @v_permlane_down_b32_vll(i32 %src0) { +; GFX1250-LABEL: v_permlane_down_b32_vll: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_movk_i32 s0, 0x64 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_permlane_down_b32 v0, v0, s0, 0x66 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_down_b32_vll: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: 
s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_movk_i32 s0, 0x64 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: v_permlane_down_b32 v0, v0, s0, 0x66 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i32 @llvm.amdgcn.permlane.down(i32 %src0, i32 100, i32 102) + ret i32 %v +} + +define i32 @v_permlane_down_b32_vvv(i32 %src0) { +; GFX1250-LABEL: v_permlane_down_b32_vvv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GFX1250-NEXT: v_bfe_u32 v2, v31, 10, 10 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_down_b32 v0, v0, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_down_b32_vvv: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GFX13-NEXT: v_bfe_u32 v2, v31, 10, 10 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_readfirstlane_b32 s0, v1 +; GFX13-NEXT: v_readfirstlane_b32 s1, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_down_b32 v0, v0, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i32 @llvm.amdgcn.permlane.down(i32 %src0, i32 %tidx, i32 %tidy) + ret i32 %v +} + +define float @v_permlane_down_f32_vss(float %src0, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_down_f32_vss: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: 
v_readfirstlane_b32 s0, v1 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_down_b32 v0, v0, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_down_f32_vss: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v1 +; GFX13-NEXT: v_readfirstlane_b32 s1, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_down_b32 v0, v0, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call float @llvm.amdgcn.permlane.down(float %src0, i32 %src1, i32 %src2) + ret float %v +} + +define float @v_permlane_down_f32_vii(float %src0) { +; GFX1250-LABEL: v_permlane_down_f32_vii: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_permlane_down_b32 v0, v0, 1, 2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_down_f32_vii: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_permlane_down_b32 v0, v0, 1, 2 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call float @llvm.amdgcn.permlane.down(float %src0, i32 1, i32 2) + ret float %v +} + +define float @v_permlane_down_f32_vll(float %src0) { +; GFX1250-LABEL: v_permlane_down_f32_vll: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_movk_i32 s0, 0x64 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_permlane_down_b32 v0, v0, s0, 0x66 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_down_f32_vll: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: 
s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_movk_i32 s0, 0x64 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: v_permlane_down_b32 v0, v0, s0, 0x66 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call float @llvm.amdgcn.permlane.down(float %src0, i32 100, i32 102) + ret float %v +} + +define float @v_permlane_down_f32_vvv(float %src0) { +; GFX1250-LABEL: v_permlane_down_f32_vvv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GFX1250-NEXT: v_bfe_u32 v2, v31, 10, 10 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_down_b32 v0, v0, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_down_f32_vvv: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GFX13-NEXT: v_bfe_u32 v2, v31, 10, 10 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_readfirstlane_b32 s0, v1 +; GFX13-NEXT: v_readfirstlane_b32 s1, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_down_b32 v0, v0, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call float @llvm.amdgcn.permlane.down(float %src0, i32 %tidx, i32 %tidy) + ret float %v +} + +define i64 @v_permlane_down_i64_vss(i64 %src0, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_down_i64_vss: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_down_b32 v0, v0, s0, s1 +; GFX1250-NEXT: v_permlane_down_b32 v1, v1, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_down_i64_vss: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v2 +; GFX13-NEXT: v_readfirstlane_b32 s1, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_down_b32 v0, v0, s0, s1 +; GFX13-NEXT: v_permlane_down_b32 v1, v1, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i64 @llvm.amdgcn.permlane.down(i64 %src0, i32 %src1, i32 %src2) + ret i64 %v +} + +define i64 @v_permlane_down_i64_vii(i64 %src0) { +; GFX1250-LABEL: v_permlane_down_i64_vii: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_permlane_down_b32 v0, v0, 1, 2 +; GFX1250-NEXT: v_permlane_down_b32 v1, v1, 1, 2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_down_i64_vii: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_permlane_down_b32 v0, v0, 1, 2 +; GFX13-NEXT: v_permlane_down_b32 v1, v1, 1, 2 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i64 @llvm.amdgcn.permlane.down(i64 %src0, i32 1, i32 2) + ret i64 %v +} + +define i64 @v_permlane_down_i64_vll(i64 %src0) { +; GFX1250-LABEL: v_permlane_down_i64_vll: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_movk_i32 s0, 0x64 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_permlane_down_b32 v0, v0, s0, 0x66 +; 
GFX1250-NEXT: v_permlane_down_b32 v1, v1, s0, 0x66 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_down_i64_vll: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_movk_i32 s0, 0x64 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: v_permlane_down_b32 v0, v0, s0, 0x66 +; GFX13-NEXT: v_permlane_down_b32 v1, v1, s0, 0x66 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i64 @llvm.amdgcn.permlane.down(i64 %src0, i32 100, i32 102) + ret i64 %v +} + +define i64 @v_permlane_down_i64_vvv(i64 %src0) { +; GFX1250-LABEL: v_permlane_down_i64_vvv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v31 +; GFX1250-NEXT: v_bfe_u32 v3, v31, 10, 10 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_down_b32 v0, v0, s0, s1 +; GFX1250-NEXT: v_permlane_down_b32 v1, v1, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_down_i64_vvv: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_and_b32_e32 v2, 0x3ff, v31 +; GFX13-NEXT: v_bfe_u32 v3, v31, 10, 10 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_readfirstlane_b32 s0, v2 +; GFX13-NEXT: v_readfirstlane_b32 s1, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_down_b32 v0, v0, s0, s1 +; GFX13-NEXT: v_permlane_down_b32 v1, v1, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %tidx = call i32 
@llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i64 @llvm.amdgcn.permlane.down(i64 %src0, i32 %tidx, i32 %tidy) + ret i64 %v +} + +define double @v_permlane_down_f64_vss(double %src0, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_down_f64_vss: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_down_b32 v0, v0, s0, s1 +; GFX1250-NEXT: v_permlane_down_b32 v1, v1, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_down_f64_vss: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v2 +; GFX13-NEXT: v_readfirstlane_b32 s1, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_down_b32 v0, v0, s0, s1 +; GFX13-NEXT: v_permlane_down_b32 v1, v1, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call double @llvm.amdgcn.permlane.down(double %src0, i32 %src1, i32 %src2) + ret double %v +} + +define double @v_permlane_down_f64_vii(double %src0) { +; GFX1250-LABEL: v_permlane_down_f64_vii: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_permlane_down_b32 v0, v0, 1, 2 +; GFX1250-NEXT: v_permlane_down_b32 v1, v1, 1, 2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_down_f64_vii: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_permlane_down_b32 v0, v0, 1, 2 +; GFX13-NEXT: v_permlane_down_b32 v1, v1, 1, 2 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call 
double @llvm.amdgcn.permlane.down(double %src0, i32 1, i32 2) + ret double %v +} + +define double @v_permlane_down_f64_vll(double %src0) { +; GFX1250-LABEL: v_permlane_down_f64_vll: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_movk_i32 s0, 0x64 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_permlane_down_b32 v0, v0, s0, 0x66 +; GFX1250-NEXT: v_permlane_down_b32 v1, v1, s0, 0x66 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_down_f64_vll: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_movk_i32 s0, 0x64 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: v_permlane_down_b32 v0, v0, s0, 0x66 +; GFX13-NEXT: v_permlane_down_b32 v1, v1, s0, 0x66 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call double @llvm.amdgcn.permlane.down(double %src0, i32 100, i32 102) + ret double %v +} + +define double @v_permlane_down_f64_vvv(double %src0) { +; GFX1250-LABEL: v_permlane_down_f64_vvv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v31 +; GFX1250-NEXT: v_bfe_u32 v3, v31, 10, 10 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_down_b32 v0, v0, s0, s1 +; GFX1250-NEXT: v_permlane_down_b32 v1, v1, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_down_f64_vvv: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_and_b32_e32 v2, 
0x3ff, v31 +; GFX13-NEXT: v_bfe_u32 v3, v31, 10, 10 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_readfirstlane_b32 s0, v2 +; GFX13-NEXT: v_readfirstlane_b32 s1, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_down_b32 v0, v0, s0, s1 +; GFX13-NEXT: v_permlane_down_b32 v1, v1, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call double @llvm.amdgcn.permlane.down(double %src0, i32 %tidx, i32 %tidy) + ret double %v +} + +; does not work for GISEL +;define void @v_permlane_down_bfloat(ptr addrspace(1) %out, bfloat %src, i32 %src1, i32 %src2) { +; %v = call bfloat @llvm.amdgcn.permlane.down.bf16(bfloat %src, i32 %src1, i32 %src2) +; store bfloat %v, ptr addrspace(1) %out, align 4 +; ret void +;} + +define void @v_permlane_down_i16(ptr addrspace(1) %out, i16 %src, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_down_i16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v3 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX1250-NEXT: global_store_b16 v[0:1], v2, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_down_i16: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v3 +; GFX13-NEXT: v_readfirstlane_b32 s1, v4 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX13-NEXT: global_store_b16 v[0:1], v2, off +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i16 @llvm.amdgcn.permlane.down.i16(i16 %src, i32 %src1, i32 %src2) + store i16 %v, ptr 
addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_down_v2f16(ptr addrspace(1) %out, <2 x half> %src, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_down_v2f16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v3 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX1250-NEXT: global_store_b32 v[0:1], v2, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_down_v2f16: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v3 +; GFX13-NEXT: v_readfirstlane_b32 s1, v4 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX13-NEXT: global_store_b32 v[0:1], v2, off +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call <2 x half> @llvm.amdgcn.permlane.down.v2f16(<2 x half> %src, i32 %src1, i32 %src2) + store <2 x half> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_down_v2f32(ptr addrspace(1) %out, <2 x float> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_down_v2f32: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v4 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_down_v2f32: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v4 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v5 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_down_v2f32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v4 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_down_v2f32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v4 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v5 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <2 x float> @llvm.amdgcn.permlane.down.v2f32(<2 x float> %src, i32 %src1, i32 %src2) + store <2 x float> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_down_v7i32(ptr addrspace(1) %out, <7 x i32> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_down_v7i32: +; 
GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v9 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v8, v8, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v7, v7, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v6, v6, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_down_v7i32: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v9 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v6, v6, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v7, v7, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v8, v8, s0, s1 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_down_v7i32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 
0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v9 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v10 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_down_b32 v8, v8, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_down_v7i32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v9 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v10 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v8, v8, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <7 x i32> @llvm.amdgcn.permlane.down.v7i32(<7 x i32> %src, i32 %src1, i32 %src2) + store <7 x i32> %v, ptr addrspace(1) %out, align 4 + ret void +} + 
+define void @v_permlane_down_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_down_v8i16: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_down_v8i16: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_down_v8i16: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: 
v_permlane_down_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_down_v8i16: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <8 x i16> @llvm.amdgcn.permlane.down.v8i16(<8 x i16> %src, i32 %src1, i32 %src2) + store <8 x i16> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_down_v2i64(ptr addrspace(1) %out, <2 x i64> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_down_v2i64: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_down_v2i64: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 
s0, v6 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_down_v2i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_down_v2i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <2 x i64> @llvm.amdgcn.permlane.down.v2i64(<2 x 
i64> %src, i32 %src1, i32 %src2) + store <2 x i64> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_down_v3i64(ptr addrspace(1) %out, <3 x i64> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_down_v3i64: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v7, v7, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v6, v6, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_down_v3i64: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v6, v6, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v7, v7, s0, s1 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_down_v3i64: +; 
GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v8 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v9 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_down_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_down_v3i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v8 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v9 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <3 x i64> @llvm.amdgcn.permlane.down.v2i64(<3 x i64> %src, i32 %src1, i32 %src2) + store <3 x i64> %v, ptr addrspace(1) %out, 
align 4 + ret void +} + +define void @v_permlane_down_v4f64(ptr addrspace(1) %out, <4 x double> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_down_v4f64: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v11 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v9, v9, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v8, v8, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v7, v7, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v6, v6, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_down_v4f64: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v11 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v6, v6, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v7, v7, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v8, v8, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v9, v9, s0, s1 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; 
GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_down_v4f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v10 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v11 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_down_b32 v9, v9, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v8, v8, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_down_v4f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v10 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v11 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v8, v8, s0, s1 +; 
GFX13-GISEL-NEXT: v_permlane_down_b32 v9, v9, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <4 x double> @llvm.amdgcn.permlane.down.v4f64(<4 x double> %src, i32 %src1, i32 %src2) + store <4 x double> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_down_v8f64(ptr addrspace(1) %out, <8 x double> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_down_v8f64: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v18 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v19 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v17, v17, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v16, v16, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v15, v15, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v14, v14, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v13, v13, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v12, v12, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v11, v11, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v10, v10, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v9, v9, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v8, v8, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v7, v7, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v6, v6, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: s_clause 0x3 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], 
v[6:9], off offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_down_v8f64: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v18 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v19 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v6, v6, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v7, v7, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v8, v8, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v9, v9, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v10, v10, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v11, v11, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v12, v12, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v13, v13, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v14, v14, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v15, v15, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v16, v16, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_down_b32 v17, v17, s0, s1 +; GFX1250-GISEL-NEXT: s_clause 0x3 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_down_v8f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; 
GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v18 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v19 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_down_b32 v17, v17, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v16, v16, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v15, v15, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v14, v14, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v13, v13, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v12, v12, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v11, v11, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v10, v10, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v9, v9, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v8, v8, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x3 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_down_v8f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v18 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v19 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_down_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 
v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v8, v8, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v9, v9, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v10, v10, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v11, v11, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v12, v12, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v13, v13, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v14, v14, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v15, v15, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v16, v16, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_down_b32 v17, v17, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x3 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <8 x double> @llvm.amdgcn.permlane.down.v8f64(<8 x double> %src, i32 %src1, i32 %src2) + store <8 x double> %v, ptr addrspace(1) %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.gfx1250.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.gfx1250.ll deleted file mode 100644 index 72a14536bebd4..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.gfx1250.ll +++ /dev/null @@ -1,440 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s -; RUN: llc -global-isel=1 -new-reg-bank-select -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s - -define amdgpu_kernel void 
@v_permlane_bcast_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX1250-LABEL: v_permlane_bcast_b32_vss: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_clause 0x1 -; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv -; GFX1250-NEXT: s_load_b32 s6, s[4:5], 0x34 nv -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, s3, s6 -; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1250-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane.bcast(i32 %src0, i32 %src1, i32 %src2) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane_bcast_b32_vii(ptr addrspace(1) %out, i32 %src0) { -; GFX1250-LABEL: v_permlane_bcast_b32_vii: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, 1, 2 -; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1250-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane.bcast(i32 %src0, i32 1, i32 2) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane_bcast_b32_vll(ptr addrspace(1) %out, i32 %src0) { -; GFX1250-LABEL: v_permlane_bcast_b32_vll: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX1250-NEXT: s_movk_i32 s2, 0x64 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instid1(SALU_CYCLE_1) -; GFX1250-NEXT: v_permlane_bcast_b32 v0, v0, s2, 0x66 -; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1250-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane.bcast(i32 %src0, i32 100, i32 102) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane_bcast_b32_vvv(ptr addrspace(1) %out, i32 %src0) { -; GFX1250-SDAG-LABEL: v_permlane_bcast_b32_vvv: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv -; GFX1250-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1250-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1250-SDAG-NEXT: v_permlane_bcast_b32 v1, v1, s3, s2 -; GFX1250-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-SDAG-NEXT: s_endpgm -; -; GFX1250-GISEL-LABEL: v_permlane_bcast_b32_vvv: -; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv -; GFX1250-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1250-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: 
v_permlane_bcast_b32 v0, v0, s3, s4 -; GFX1250-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1250-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidy = call i32 @llvm.amdgcn.workitem.id.y() - %v = call i32 @llvm.amdgcn.permlane.bcast(i32 %src0, i32 %tidx, i32 %tidy) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane_down_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX1250-LABEL: v_permlane_down_b32_vss: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_clause 0x1 -; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv -; GFX1250-NEXT: s_load_b32 s6, s[4:5], 0x34 nv -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_permlane_down_b32 v0, v0, s3, s6 -; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1250-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane.down(i32 %src0, i32 %src1, i32 %src2) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane_down_b32_vii(ptr addrspace(1) %out, i32 %src0) { -; GFX1250-LABEL: v_permlane_down_b32_vii: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_permlane_down_b32 v0, v0, 1, 2 -; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1250-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane.down(i32 %src0, i32 1, i32 2) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane_down_b32_vll(ptr addrspace(1) %out, i32 %src0) { -; GFX1250-LABEL: v_permlane_down_b32_vll: -; 
GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX1250-NEXT: s_movk_i32 s2, 0x64 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: v_permlane_down_b32 v0, v0, s2, 0x66 -; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1250-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane.down(i32 %src0, i32 100, i32 102) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane_down_b32_vvv(ptr addrspace(1) %out, i32 %src0) { -; GFX1250-SDAG-LABEL: v_permlane_down_b32_vvv: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv -; GFX1250-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1250-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1250-SDAG-NEXT: v_permlane_down_b32 v1, v1, s3, s2 -; GFX1250-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-SDAG-NEXT: s_endpgm -; -; GFX1250-GISEL-LABEL: v_permlane_down_b32_vvv: -; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv -; GFX1250-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1250-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | 
instid1(VALU_DEP_2) -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_permlane_down_b32 v0, v0, s3, s4 -; GFX1250-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1250-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidy = call i32 @llvm.amdgcn.workitem.id.y() - %v = call i32 @llvm.amdgcn.permlane.down(i32 %src0, i32 %tidx, i32 %tidy) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane_up_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX1250-LABEL: v_permlane_up_b32_vss: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_clause 0x1 -; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv -; GFX1250-NEXT: s_load_b32 s6, s[4:5], 0x34 nv -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_permlane_up_b32 v0, v0, s3, s6 -; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1250-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane.up(i32 %src0, i32 %src1, i32 %src2) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane_up_b32_vii(ptr addrspace(1) %out, i32 %src0) { -; GFX1250-LABEL: v_permlane_up_b32_vii: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_permlane_up_b32 v0, v0, 1, 2 
-; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1250-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane.up(i32 %src0, i32 1, i32 2) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane_up_b32_vll(ptr addrspace(1) %out, i32 %src0) { -; GFX1250-LABEL: v_permlane_up_b32_vll: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX1250-NEXT: s_movk_i32 s2, 0x64 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: v_permlane_up_b32 v0, v0, s2, 0x66 -; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1250-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane.up(i32 %src0, i32 100, i32 102) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane_up_b32_vvv(ptr addrspace(1) %out, i32 %src0) { -; GFX1250-SDAG-LABEL: v_permlane_up_b32_vvv: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv -; GFX1250-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1250-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1250-SDAG-NEXT: v_permlane_up_b32 v1, v1, s3, s2 -; GFX1250-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-SDAG-NEXT: s_endpgm -; -; GFX1250-GISEL-LABEL: v_permlane_up_b32_vvv: -; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: 
s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv -; GFX1250-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1250-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_permlane_up_b32 v0, v0, s3, s4 -; GFX1250-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1250-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidy = call i32 @llvm.amdgcn.workitem.id.y() - %v = call i32 @llvm.amdgcn.permlane.up(i32 %src0, i32 %tidx, i32 %tidy) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane_xor_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) { -; GFX1250-LABEL: v_permlane_xor_b32_vss: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_clause 0x1 -; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv -; GFX1250-NEXT: s_load_b32 s6, s[4:5], 0x34 nv -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, s3, s6 -; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1250-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane.xor(i32 %src0, i32 %src1, i32 %src2) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane_xor_b32_vii(ptr addrspace(1) %out, i32 %src0) { -; GFX1250-LABEL: v_permlane_xor_b32_vii: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: 
s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, 1, 2 -; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1250-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane.xor(i32 %src0, i32 1, i32 2) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane_xor_b32_vll(ptr addrspace(1) %out, i32 %src0) { -; GFX1250-LABEL: v_permlane_xor_b32_vll: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX1250-NEXT: s_movk_i32 s2, 0x64 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, s2, 0x66 -; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1250-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane.xor(i32 %src0, i32 100, i32 102) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane_xor_b32_vvv(ptr addrspace(1) %out, i32 %src0) { -; GFX1250-SDAG-LABEL: v_permlane_xor_b32_vvv: -; GFX1250-SDAG: ; %bb.0: -; GFX1250-SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv -; GFX1250-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1250-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX1250-SDAG-NEXT: v_mov_b32_e32 v1, s2 -; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s2, v0 -; 
GFX1250-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v1, v1, s3, s2 -; GFX1250-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX1250-SDAG-NEXT: s_endpgm -; -; GFX1250-GISEL-LABEL: v_permlane_xor_b32_vvv: -; GFX1250-GISEL: ; %bb.0: -; GFX1250-GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv -; GFX1250-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 -; GFX1250-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s3, v1 -; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0 -; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s4, v0 -; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX1250-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v0, v0, s3, s4 -; GFX1250-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1250-GISEL-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidy = call i32 @llvm.amdgcn.workitem.id.y() - %v = call i32 @llvm.amdgcn.permlane.xor(i32 %src0, i32 %tidx, i32 %tidy) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane_idx_gen_b32_vs(ptr addrspace(1) %out, i32 %src0, i32 %src1) { -; GFX1250-LABEL: v_permlane_idx_gen_b32_vs: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_permlane_idx_gen_b32 v0, v0, s3 -; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1250-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane.idx.gen(i32 
%src0, i32 %src1) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane_idx_gen_b32_vi(ptr addrspace(1) %out, i32 %src0) { -; GFX1250-LABEL: v_permlane_idx_gen_b32_vi: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_permlane_idx_gen_b32 v0, v0, 1 -; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1250-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane.idx.gen(i32 %src0, i32 1) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane_idx_gen_b32_vl(ptr addrspace(1) %out, i32 %src0) { -; GFX1250-LABEL: v_permlane_idx_gen_b32_vl: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_permlane_idx_gen_b32 v0, v0, 0x64 -; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1250-NEXT: s_endpgm - %v = call i32 @llvm.amdgcn.permlane.idx.gen(i32 %src0, i32 100) - store i32 %v, ptr addrspace(1) %out - ret void -} - -define amdgpu_kernel void @v_permlane_idx_gen_b32_vv(ptr addrspace(1) %out) { -; GFX1250-LABEL: v_permlane_idx_gen_b32_vv: -; GFX1250: ; %bb.0: -; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0 -; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv -; GFX1250-NEXT: v_bfe_u32 v1, v0, 10, 10 -; GFX1250-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX1250-NEXT: v_readfirstlane_b32 
s2, v1 -; GFX1250-NEXT: v_mov_b32_e32 v1, 0 -; GFX1250-NEXT: v_permlane_idx_gen_b32 v0, v0, s2 -; GFX1250-NEXT: s_wait_kmcnt 0x0 -; GFX1250-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX1250-NEXT: s_endpgm - %tidx = call i32 @llvm.amdgcn.workitem.id.x() - %tidy = call i32 @llvm.amdgcn.workitem.id.y() - %v = call i32 @llvm.amdgcn.permlane.idx.gen(i32 %tidx, i32 %tidy) - store i32 %v, ptr addrspace(1) %out - ret void -} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.idx.gen.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.idx.gen.ll new file mode 100644 index 0000000000000..887c9cd3d8483 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.idx.gen.ll @@ -0,0 +1,103 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250 %s +; RUN: llc -global-isel=1 -new-reg-bank-select -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250 %s +; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1310 < %s | FileCheck -check-prefixes=GFX13 %s +; RUN: llc -global-isel=1 -new-reg-bank-select -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1310 < %s | FileCheck -check-prefixes=GFX13 %s + +define i32 @v_permlane_idx_gen_b32_vs(i32 %src0, i32 %src1) { +; GFX1250-LABEL: v_permlane_idx_gen_b32_vs: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_idx_gen_b32 v0, v0, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_idx_gen_b32_vs: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: 
v_readfirstlane_b32 s0, v1 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_idx_gen_b32 v0, v0, s0 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i32 @llvm.amdgcn.permlane.idx.gen(i32 %src0, i32 %src1) + ret i32 %v +} + +define i32 @v_permlane_idx_gen_b32_vi(i32 %src0) { +; GFX1250-LABEL: v_permlane_idx_gen_b32_vi: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_permlane_idx_gen_b32 v0, v0, 1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_idx_gen_b32_vi: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_permlane_idx_gen_b32 v0, v0, 1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i32 @llvm.amdgcn.permlane.idx.gen(i32 %src0, i32 1) + ret i32 %v +} + +define i32 @v_permlane_idx_gen_b32_vl(i32 %src0) { +; GFX1250-LABEL: v_permlane_idx_gen_b32_vl: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_permlane_idx_gen_b32 v0, v0, 0x64 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_idx_gen_b32_vl: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_permlane_idx_gen_b32 v0, v0, 0x64 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i32 @llvm.amdgcn.permlane.idx.gen(i32 %src0, i32 100) + ret i32 %v +} + +define i32 @v_permlane_idx_gen_b32_vv() { +; GFX1250-LABEL: v_permlane_idx_gen_b32_vv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_bfe_u32 v0, v31, 10, 10 +; GFX1250-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX1250-NEXT: v_readfirstlane_b32 s0, v0 +; GFX1250-NEXT: v_permlane_idx_gen_b32 v0, v1, s0 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_idx_gen_b32_vv: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_bfe_u32 v0, v31, 10, 10 +; GFX13-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX13-NEXT: v_readfirstlane_b32 s0, v0 +; GFX13-NEXT: v_permlane_idx_gen_b32 v0, v1, s0 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i32 @llvm.amdgcn.permlane.idx.gen(i32 %tidx, i32 %tidy) + ret i32 %v +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll index 925e91022a27b..ed3d57d9fbda0 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll @@ -5,6 +5,8 @@ ; RUN: llc -global-isel=1 -new-reg-bank-select -global-isel-abort=2 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s ; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s ; RUN: llc -global-isel=1 -new-reg-bank-select -global-isel-abort=2 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s +; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1310 < %s | FileCheck -check-prefixes=GFX13,GFX13-SDAG %s +; RUN: llc -global-isel=1 -new-reg-bank-select -global-isel-abort=2 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1310 < %s | FileCheck -check-prefixes=GFX13,GFX13-GISEL %s declare i32 
@llvm.amdgcn.permlane16(i32, i32, i32, i32, i1, i1) declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1, i1) @@ -47,6 +49,18 @@ define amdgpu_kernel void @v_permlane16_b32_vss_i32(ptr addrspace(1) %out, i32 % ; GFX12-NEXT: v_permlane16_b32 v0, v0, s3, s4 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlane16_b32_vss_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out ret void @@ -88,6 +102,18 @@ define amdgpu_kernel void @v_permlane16_b32_vss_f32(ptr addrspace(1) %out, float ; GFX12-NEXT: v_permlane16_b32 v0, v0, s3, s4 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlane16_b32_vss_f32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 false, i1 false) store float %v, ptr addrspace(1) %out ret void @@ -177,6 +203,34 @@ define amdgpu_kernel void @v_permlane16_b32_vss_i64(ptr addrspace(1) %out, i64 % ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; 
GFX13-SDAG-LABEL: v_permlane16_b32_vss_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vss_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 false, i1 false) store i64 %v, ptr addrspace(1) %out ret void @@ -266,6 +320,34 @@ define amdgpu_kernel void @v_permlane16_b32_vss_f64(ptr addrspace(1) %out, doubl ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vss_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 
s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vss_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 false, i1 false) store double %v, ptr addrspace(1) %out ret void @@ -305,6 +387,16 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i32(ptr addrspace(1) %out, i32 % ; GFX12-NEXT: v_permlane16_b32 v0, v0, 1, 2 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlane16_b32_vii_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane16_b32 v0, v0, 1, 2 +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 1, i32 2, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out ret void @@ -344,6 +436,16 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f32(ptr addrspace(1) %out, float ; GFX12-NEXT: 
v_permlane16_b32 v0, v0, 1, 2 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlane16_b32_vii_f32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane16_b32 v0, v0, 1, 2 +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 1, i32 2, i1 false, i1 false) store float %v, ptr addrspace(1) %out ret void @@ -421,6 +523,30 @@ define amdgpu_kernel void @v_permlane16_b32_vii_i64(ptr addrspace(1) %out, i64 % ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, 1, 2 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vii_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, 1, 2 +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, 1, 2 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vii_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, 1, 2 +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, 1, 2 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call i64 
@llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 1, i32 2, i1 false, i1 false) store i64 %v, ptr addrspace(1) %out ret void @@ -498,6 +624,30 @@ define amdgpu_kernel void @v_permlane16_b32_vii_f64(ptr addrspace(1) %out, doubl ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, 1, 2 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vii_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, 1, 2 +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, 1, 2 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vii_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, 1, 2 +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, 1, 2 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 1, i32 2, i1 false, i1 false) store double %v, ptr addrspace(1) %out ret void @@ -542,6 +692,17 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i32(ptr addrspace(1) %out, i32 % ; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlane16_b32_vll_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 
+; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_movk_i32 s2, 0x1234 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX13-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 4660, i32 49617, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out ret void @@ -629,6 +790,34 @@ define amdgpu_kernel void @v_permlane16_b32_vll_i64(ptr addrspace(1) %out, i64 % ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vll_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_movk_i32 s2, 0x1234 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vll_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_movk_i32 s2, 0x1234 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call i64 
@llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 4660, i32 49617, i1 false, i1 false) store i64 %v, ptr addrspace(1) %out ret void @@ -672,6 +861,17 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f32(ptr addrspace(1) %out,float ; GFX12-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlane16_b32_vll_f32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_movk_i32 s2, 0x1234 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX13-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 4660, i32 49617, i1 false, i1 false) store float %v, ptr addrspace(1) %out ret void @@ -759,6 +959,34 @@ define amdgpu_kernel void @v_permlane16_b32_vll_f64(ptr addrspace(1) %out, doubl ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vll_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_movk_i32 s2, 0x1234 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vll_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: 
s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_movk_i32 s2, 0x1234 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s2, 0xc1d1 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, s2, 0xc1d1 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 4660, i32 49617, i1 false, i1 false) store double %v, ptr addrspace(1) %out ret void @@ -862,6 +1090,37 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i32(ptr addrspace(1) %out, i32 % ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vvv_i32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX13-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s3, v1 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 +; GFX13-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vvv_i32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s3, v1 +; GFX13-GISEL-NEXT: 
v_readfirstlane_b32 s4, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX13-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) @@ -935,6 +1194,40 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_i64(ptr addrspace(1) %out, i64 % ; GFX12-NEXT: v_permlane16_b32 v1, v1, s4, s5 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vvv_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX13-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s4, v1 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vvv_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s4, v1 +; 
GFX13-GISEL-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) @@ -1040,6 +1333,37 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f32(ptr addrspace(1) %out, float ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vvv_f32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX13-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s3, v1 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 +; GFX13-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vvv_f32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s3, v1 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s4, v0 
+; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX13-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) @@ -1113,6 +1437,40 @@ define amdgpu_kernel void @v_permlane16_b32_vvv_f64(ptr addrspace(1) %out, doubl ; GFX12-NEXT: v_permlane16_b32 v1, v1, s4, s5 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vvv_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX13-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s4, v1 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vvv_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s4, v1 +; GFX13-GISEL-NEXT: 
v_readfirstlane_b32 s5, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) @@ -1192,6 +1550,32 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i32(ptr addrspace(1) %out, i32 % ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vvs_i32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX13-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vvs_i32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 +; GFX13-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; 
GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out @@ -1264,6 +1648,40 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_i64(ptr addrspace(1) %out, i64 % ; GFX12-NEXT: v_permlane16_b32 v0, v0, s5, s4 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vvs_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s5, s4 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s5, s4 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vvs_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s5, s4 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, s5, s4 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] 
+; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) store i64 %v, ptr addrspace(1) %out @@ -1342,6 +1760,32 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f32(ptr addrspace(1) %out, float ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vvs_f32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s2, s3 +; GFX13-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vvs_f32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s3 +; GFX13-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %tidx, i32 %src2, i1 false, i1 false) store float %v, ptr addrspace(1) %out @@ -1414,6 +1858,40 @@ define amdgpu_kernel void @v_permlane16_b32_vvs_f64(ptr addrspace(1) %out, doubl ; GFX12-NEXT: v_permlane16_b32 v0, v0, s5, s4 ; GFX12-NEXT: global_store_b64 v2, v[0:1], 
s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vvs_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s5, s4 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s5, s4 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vvs_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s5, s4 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, s5, s4 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %tidx, i32 %src2, i1 false, i1 false) store double %v, ptr addrspace(1) %out @@ -1496,6 +1974,32 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i32(ptr addrspace(1) %out, i32 % ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: 
global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vsv_i32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 +; GFX13-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vsv_i32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX13-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %tidy, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out @@ -1602,6 +2106,40 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_i64(ptr addrspace(1) %out, i64 % ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vsv_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vsv_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %tidy, i1 false, i1 false) store i64 %v, ptr addrspace(1) %out @@ -1684,6 +2222,32 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f32(ptr addrspace(1) %out, float ; GFX12-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vsv_f32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | 
instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s3, s2 +; GFX13-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vsv_f32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s3, s4 +; GFX13-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %tidy, i1 false, i1 false) store float %v, ptr addrspace(1) %out @@ -1790,6 +2354,40 @@ define amdgpu_kernel void @v_permlane16_b32_vsv_f64(ptr addrspace(1) %out, doubl ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vsv_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX13-SDAG-NEXT: global_store_b64 v2, 
v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vsv_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %tidy, i1 false, i1 false) store double %v, ptr addrspace(1) %out @@ -1832,6 +2430,18 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_i32(ptr addrspace(1) %out, i3 ; GFX12-NEXT: v_permlane16_b32 v0, v0, s3, s4 op_sel:[1,0] ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlane16_b32_vss_fi_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane16_b32 v0, v0, s3, s4 op_sel:[1,0] +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 false) store i32 %v, ptr addrspace(1) %out ret void @@ -1921,6 +2531,34 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_i64(ptr addrspace(1) %out, i6 ; 
GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[1,0] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vss_fi_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[1,0] +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[1,0] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vss_fi_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[1,0] +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[1,0] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 true, i1 false) store i64 %v, ptr addrspace(1) %out ret void @@ -1962,6 +2600,18 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_f32(ptr addrspace(1) %out, fl ; GFX12-NEXT: v_permlane16_b32 v0, v0, s3, s4 op_sel:[1,0] ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlane16_b32_vss_fi_f32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: 
s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane16_b32 v0, v0, s3, s4 op_sel:[1,0] +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 true, i1 false) store float %v, ptr addrspace(1) %out ret void @@ -2051,6 +2701,34 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_f64(ptr addrspace(1) %out, do ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[1,0] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vss_fi_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[1,0] +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[1,0] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vss_fi_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[1,0] +; GFX13-GISEL-NEXT: 
v_permlane16_b32 v1, v1, s4, s5 op_sel:[1,0] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 true, i1 false) store double %v, ptr addrspace(1) %out ret void @@ -2092,6 +2770,18 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_i32(ptr addrspace(1) %out, i3 ; GFX12-NEXT: v_permlane16_b32 v0, v0, s3, s4 op_sel:[0,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlane16_b32_vss_bc_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane16_b32 v0, v0, s3, s4 op_sel:[0,1] +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 true) store i32 %v, ptr addrspace(1) %out ret void @@ -2181,6 +2871,34 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_i64(ptr addrspace(1) %out, i6 ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[0,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vss_bc_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[0,1] +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[0,1] +; GFX13-SDAG-NEXT: 
global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vss_bc_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[0,1] +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[0,1] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 false, i1 true) store i64 %v, ptr addrspace(1) %out ret void @@ -2222,6 +2940,18 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_f32(ptr addrspace(1) %out, fl ; GFX12-NEXT: v_permlane16_b32 v0, v0, s3, s4 op_sel:[0,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlane16_b32_vss_bc_f32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane16_b32 v0, v0, s3, s4 op_sel:[0,1] +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 false, i1 true) store float %v, ptr addrspace(1) %out ret void @@ -2311,6 +3041,34 @@ define amdgpu_kernel void @v_permlane16_b32_vss_bc_f64(ptr addrspace(1) %out, do ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[0,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: 
s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vss_bc_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[0,1] +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[0,1] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vss_bc_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[0,1] +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[0,1] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 false, i1 true) store double %v, ptr addrspace(1) %out ret void @@ -2352,6 +3110,18 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i32(ptr addrspace(1) %out, ; GFX12-NEXT: v_permlane16_b32 v0, v0, s3, s4 op_sel:[1,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlane16_b32_vss_fi_bc_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: 
v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane16_b32 v0, v0, s3, s4 op_sel:[1,1] +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true) store i32 %v, ptr addrspace(1) %out ret void @@ -2441,6 +3211,34 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_i64(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[1,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[1,1] +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[1,1] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[1,1] +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[1,1] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call 
i64 @llvm.amdgcn.permlane16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 true, i1 true) store i64 %v, ptr addrspace(1) %out ret void @@ -2482,6 +3280,18 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f32(ptr addrspace(1) %out, ; GFX12-NEXT: v_permlane16_b32 v0, v0, s3, s4 op_sel:[1,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlane16_b32_vss_fi_bc_f32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane16_b32 v0, v0, s3, s4 op_sel:[1,1] +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call float @llvm.amdgcn.permlane16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 true, i1 true) store float %v, ptr addrspace(1) %out ret void @@ -2571,6 +3381,34 @@ define amdgpu_kernel void @v_permlane16_b32_vss_fi_bc_f64(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[1,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_vss_fi_bc_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[1,1] +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[1,1] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_vss_fi_bc_f64: +; GFX13-GISEL: ; 
%bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s4, s5 op_sel:[1,1] +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, s4, s5 op_sel:[1,1] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlane16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 true, i1 true) store double %v, ptr addrspace(1) %out ret void @@ -2612,6 +3450,18 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_i32(ptr addrspace(1) %out, i32 ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s3, s4 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_b32_vss_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out ret void @@ -2653,6 +3503,18 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_f32(ptr addrspace(1) %out, floa ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s3, s4 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_b32_vss_f32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_load_b32 s4, s[4:5], 0x34 
nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 false, i1 false) store float %v, ptr addrspace(1) %out ret void @@ -2742,6 +3604,34 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_i64(ptr addrspace(1) %out, i64 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vss_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vss_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call i64 
@llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 false, i1 false) store i64 %v, ptr addrspace(1) %out ret void @@ -2831,6 +3721,34 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_f64(ptr addrspace(1) %out, doub ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vss_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vss_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 false, i1 false) store double %v, ptr addrspace(1) %out ret void @@ -2870,6 +3788,16 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i32(ptr addrspace(1) %out, i32 ; GFX12-NEXT: v_permlanex16_b32 v0, v0, 1, 2 ; 
GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_b32_vii_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 1, i32 2, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out ret void @@ -2909,6 +3837,16 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f32(ptr addrspace(1) %out, floa ; GFX12-NEXT: v_permlanex16_b32 v0, v0, 1, 2 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_b32_vii_f32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 1, i32 2, i1 false, i1 false) store float %v, ptr addrspace(1) %out ret void @@ -2986,6 +3924,30 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_i64(ptr addrspace(1) %out, i64 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, 1, 2 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vii_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, 1, 2 +; 
GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vii_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, 1, 2 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 1, i32 2, i1 false, i1 false) store i64 %v, ptr addrspace(1) %out ret void @@ -3063,6 +4025,30 @@ define amdgpu_kernel void @v_permlanex16_b32_vii_f64(ptr addrspace(1) %out, doub ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, 1, 2 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vii_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, 1, 2 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vii_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, 1, 2 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, 1, 2 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 1, i32 2, i1 false, i1 false) store double %v, ptr addrspace(1) %out ret void @@ -3107,6 +4093,17 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i32(ptr addrspace(1) %out, i32 ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_b32_vll_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_movk_i32 s2, 0x1234 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX13-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 4660, i32 49617, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out ret void @@ -3150,6 +4147,17 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f32(ptr addrspace(1) %out, floa ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_b32_vll_f32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_movk_i32 s2, 0x1234 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX13-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 4660, i32 49617, i1 false, i1 false) store float %v, ptr addrspace(1) %out ret 
void @@ -3237,6 +4245,34 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_i64(ptr addrspace(1) %out, i64 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vll_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_movk_i32 s2, 0x1234 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vll_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_movk_i32 s2, 0x1234 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 4660, i32 49617, i1 false, i1 false) store i64 %v, ptr addrspace(1) %out ret void @@ -3324,6 +4360,34 @@ define amdgpu_kernel void @v_permlanex16_b32_vll_f64(ptr addrspace(1) %out, doub ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vll_f64: 
+; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_movk_i32 s2, 0x1234 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vll_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_movk_i32 s2, 0x1234 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s2, 0xc1d1 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, s2, 0xc1d1 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 4660, i32 49617, i1 false, i1 false) store double %v, ptr addrspace(1) %out ret void @@ -3427,6 +4491,37 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i32(ptr addrspace(1) %out, i32 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vvv_i32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX13-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX13-SDAG-NEXT: 
v_readfirstlane_b32 s3, v1 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 +; GFX13-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vvv_i32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s3, v1 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX13-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) @@ -3532,6 +4627,37 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f32(ptr addrspace(1) %out, floa ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vvv_f32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX13-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s3, v1 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX13-SDAG-NEXT: 
v_readfirstlane_b32 s2, v0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 +; GFX13-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vvv_f32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s3, v1 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX13-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) @@ -3605,6 +4731,40 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_i64(ptr addrspace(1) %out, i64 ; GFX12-NEXT: v_permlanex16_b32 v1, v1, s4, s5 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vvv_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX13-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s4, v1 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vvv_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s4, v1 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) @@ -3678,6 +4838,40 @@ define amdgpu_kernel void @v_permlanex16_b32_vvv_f64(ptr addrspace(1) %out, doub ; GFX12-NEXT: v_permlanex16_b32 v1, v1, s4, s5 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vvv_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX13-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s4, v1 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; 
GFX13-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vvv_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v1, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s4, v1 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %tidx, i32 %tidy, i1 false, i1 false) @@ -3757,6 +4951,32 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i32(ptr addrspace(1) %out, i32 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vvs_i32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; 
GFX13-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX13-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vvs_i32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 +; GFX13-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out @@ -3835,6 +5055,32 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f32(ptr addrspace(1) %out, floa ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vvs_f32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s2, s3 +; GFX13-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vvs_f32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; 
GFX13-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s3 +; GFX13-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %tidx, i32 %src2, i1 false, i1 false) store float %v, ptr addrspace(1) %out @@ -3907,6 +5153,40 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_i64(ptr addrspace(1) %out, i64 ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s5, s4 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vvs_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s5, s4 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s5, s4 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vvs_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | 
instid1(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s5, s4 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, s5, s4 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %tidx, i32 %src2, i1 false, i1 false) store i64 %v, ptr addrspace(1) %out @@ -3979,6 +5259,40 @@ define amdgpu_kernel void @v_permlanex16_b32_vvs_f64(ptr addrspace(1) %out, doub ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s5, s4 ; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vvs_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s5, s4 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s5, s4 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vvs_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | 
instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s5, s4 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, s5, s4 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %tidx, i32 %src2, i1 false, i1 false) store double %v, ptr addrspace(1) %out @@ -4061,6 +5375,32 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i32(ptr addrspace(1) %out, i32 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vsv_i32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 +; GFX13-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vsv_i32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX13-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; 
GFX13-GISEL-NEXT: s_endpgm %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %tidy, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out @@ -4143,6 +5483,32 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f32(ptr addrspace(1) %out, floa ; GFX12-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 ; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vsv_f32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s2, v0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s3, s2 +; GFX13-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vsv_f32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s4, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s3, s4 +; GFX13-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %tidy, i1 false, i1 false) store float %v, ptr addrspace(1) %out @@ -4249,6 +5615,40 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_i64(ptr addrspace(1) %out, i64 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 ; GFX12-GISEL-NEXT: global_store_b64 v2, 
v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vsv_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vsv_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %tidy, i1 false, i1 false) store i64 %v, ptr addrspace(1) %out @@ -4355,6 +5755,40 @@ define amdgpu_kernel void @v_permlanex16_b32_vsv_f64(ptr addrspace(1) %out, doub ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 ; GFX12-GISEL-NEXT: 
global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vsv_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vsv_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: v_bfe_u32 v0, v0, 10, 10 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s5, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %tidy = call i32 @llvm.amdgcn.workitem.id.y() %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %tidy, i1 false, i1 false) store double %v, ptr addrspace(1) %out @@ -4397,6 +5831,18 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i32(ptr addrspace(1) %out, i ; GFX12-NEXT: v_permlanex16_b32 v0, v0, 
s3, s4 op_sel:[1,0] ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_b32_vss_fi_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlanex16_b32 v0, v0, s3, s4 op_sel:[1,0] +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 false) store i32 %v, ptr addrspace(1) %out ret void @@ -4438,6 +5884,18 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_f32(ptr addrspace(1) %out, f ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s3, s4 op_sel:[1,0] ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_b32_vss_fi_f32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlanex16_b32 v0, v0, s3, s4 op_sel:[1,0] +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 true, i1 false) store float %v, ptr addrspace(1) %out ret void @@ -4527,6 +5985,34 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_i64(ptr addrspace(1) %out, i ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[1,0] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vss_fi_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; 
GFX13-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[1,0] +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[1,0] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vss_fi_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[1,0] +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[1,0] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 true, i1 false) store i64 %v, ptr addrspace(1) %out ret void @@ -4616,6 +6102,34 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_f64(ptr addrspace(1) %out, d ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[1,0] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vss_fi_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[1,0] +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[1,0] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vss_fi_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[1,0] +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[1,0] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 true, i1 false) store double %v, ptr addrspace(1) %out ret void @@ -4657,6 +6171,18 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_i32(ptr addrspace(1) %out, i ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s3, s4 op_sel:[0,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_b32_vss_bc_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlanex16_b32 v0, v0, s3, s4 op_sel:[0,1] +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 false, i1 true) store i32 %v, ptr addrspace(1) %out ret void @@ -4698,6 +6224,18 @@ 
define amdgpu_kernel void @v_permlanex16_b32_vss_bc_f32(ptr addrspace(1) %out, f ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s3, s4 op_sel:[0,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_b32_vss_bc_f32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlanex16_b32 v0, v0, s3, s4 op_sel:[0,1] +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call float @llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 false, i1 true) store float %v, ptr addrspace(1) %out ret void @@ -4787,6 +6325,34 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_i64(ptr addrspace(1) %out, i ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[0,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vss_bc_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[0,1] +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[0,1] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vss_bc_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; 
GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[0,1] +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[0,1] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 false, i1 true) store i64 %v, ptr addrspace(1) %out ret void @@ -4876,6 +6442,34 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_bc_f64(ptr addrspace(1) %out, d ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[0,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vss_bc_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[0,1] +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[0,1] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vss_bc_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | 
instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[0,1] +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[0,1] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 false, i1 true) store double %v, ptr addrspace(1) %out ret void @@ -4917,6 +6511,18 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_i32(ptr addrspace(1) %out ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s3, s4 op_sel:[1,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_b32_vss_fi_bc_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlanex16_b32 v0, v0, s3, s4 op_sel:[1,1] +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %src1, i32 %src2, i1 true, i1 true) store i32 %v, ptr addrspace(1) %out ret void @@ -4958,6 +6564,18 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f32(ptr addrspace(1) %out ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s3, s4 op_sel:[1,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_b32_vss_fi_bc_f32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-NEXT: s_load_b32 s4, s[4:5], 0x34 nv +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlanex16_b32 v0, v0, s3, s4 op_sel:[1,1] +; GFX13-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX13-NEXT: s_endpgm %v = call float 
@llvm.amdgcn.permlanex16.f32(float %src0, float %src0, i32 %src1, i32 %src2, i1 true, i1 true) store float %v, ptr addrspace(1) %out ret void @@ -5047,6 +6665,34 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_i64(ptr addrspace(1) %out ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[1,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[1,1] +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[1,1] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[1,1] +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[1,1] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %src0, i64 %src0, i32 %src1, i32 %src2, i1 true, i1 true) store i64 %v, ptr addrspace(1) %out ret void @@ -5136,6 +6782,34 @@ define amdgpu_kernel void @v_permlanex16_b32_vss_fi_bc_f64(ptr 
addrspace(1) %out ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[1,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_vss_fi_bc_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v0, s2 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[1,1] +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[1,1] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_vss_fi_bc_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 nv +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, s3 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s4, s5 op_sel:[1,1] +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, s4, s5 op_sel:[1,1] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX13-GISEL-NEXT: s_endpgm %v = call double @llvm.amdgcn.permlanex16.f64(double %src0, double %src0, i32 %src1, i32 %src2, i1 true, i1 true) store double %v, ptr addrspace(1) %out ret void @@ -5176,6 +6850,19 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_i32(ptr addrspace(1) %out, i ; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlane16_b32_tid_tid_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: 
s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX13-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX13-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %tidx, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out @@ -5217,6 +6904,19 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_f32(ptr addrspace(1) %out, i ; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlane16_b32_tid_tid_f32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX13-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX13-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %v = call float @llvm.amdgcn.permlane16.f32(float %tidx_f32, float %tidx_f32, i32 %src1, i32 %src2, i1 false, i1 false) @@ -5306,6 +7006,34 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_i64(ptr addrspace(1) %out, i ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_tid_tid_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 
+; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_tid_tid_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_i64 = zext i32 %tidx to i64 %v = call i64 @llvm.amdgcn.permlane16.i64(i64 %tidx_i64, i64 %tidx_i64, i32 %src1, i32 %src2, i1 false, i1 false) @@ -5399,6 +7127,36 @@ define amdgpu_kernel void @v_permlane16_b32_tid_tid_f64(ptr addrspace(1) %out, f ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_tid_tid_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, 
s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_tid_tid_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %tidx_f64 = fpext float %tidx_f32 to double @@ -5442,6 +7200,19 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_i32(ptr addrspace(1) %out, ; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlane16_b32_undef_tid_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX13-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX13-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) @@ -5484,6 +7255,19 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_f32(ptr addrspace(1) %out, ; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 ; GFX12-NEXT: 
global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlane16_b32_undef_tid_f32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX13-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX13-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %undef = freeze float poison @@ -5572,6 +7356,34 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_i64(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_undef_tid_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_undef_tid_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 
v0, v0, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_i64 = zext i32 %tidx to i64 %undef = freeze i64 poison @@ -5666,6 +7478,36 @@ define amdgpu_kernel void @v_permlane16_b32_undef_tid_f64(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_undef_tid_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_undef_tid_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %tidx_f64 = fpext float %tidx_f32 to double 
@@ -5753,6 +7595,33 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i32(ptr addrspace(1) %out, i32 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_i_tid_i32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX13-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_i_tid_i32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX13-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlane16.i32(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out @@ -5837,6 +7706,33 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f32(ptr addrspace(1) %out, i32 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_i_tid_f32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 
nv +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v1, 0x449a5000 :: v_dual_mov_b32 v2, 0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX13-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_i_tid_f32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX13-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %v = call float @llvm.amdgcn.permlane16.f32(float 1234.5, float %tidx_f32, i32 %src1, i32 %src2, i1 false, i1 false) @@ -5928,6 +7824,36 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_i64(ptr addrspace(1) %out, i32 ; GFX12-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v3, v[1:2], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_i_tid_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x3039 +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v3, 0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX13-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; 
GFX13-SDAG-NEXT: global_store_b64 v3, v[1:2], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_i_tid_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v0, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: global_store_b64 v3, v[1:2], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_i64 = zext i32 %tidx to i64 %v = call i64 @llvm.amdgcn.permlane16.i64(i64 12345, i64 %tidx_i64, i32 %src1, i32 %src2, i1 false, i1 false) @@ -6029,6 +7955,38 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_f64(ptr addrspace(1) %out, i32 ; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v1, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v4, v[2:3], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_i_tid_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_dual_mov_b32 v3, 0x40934a00 :: v_dual_mov_b32 v2, 0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX13-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v3, v1, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v2, v0, s0, s1 +; GFX13-SDAG-NEXT: global_store_b64 v4, v[2:3], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_i_tid_f64: +; GFX13-GISEL: ; 
%bb.0: +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40934a00 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX13-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v2, v0, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v3, v1, s0, s1 +; GFX13-GISEL-NEXT: global_store_b64 v4, v[2:3], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %tidx_f64 = fpext float %tidx_f32 to double @@ -6072,6 +8030,19 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i32(ptr addrspace(1) %out, ; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlane16_b32_i_tid_fi_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX13-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX13-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 true, i1 false) @@ -6114,6 +8085,19 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_f32(ptr addrspace(1) %out, ; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlane16_b32_i_tid_fi_f32: +; 
GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX13-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX13-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %undef = freeze float poison @@ -6202,6 +8186,34 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_i64(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_i_tid_fi_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0] +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_i_tid_fi_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX13-GISEL-NEXT: 
v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,0] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_i64 = zext i32 %tidx to i64 %undef = freeze i64 poison @@ -6296,6 +8308,36 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_f64(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_i_tid_fi_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_i_tid_fi_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float 
%tidx_f64 = fpext float %tidx_f32 to double @@ -6340,6 +8382,19 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i32(ptr addrspace(1) %out, ; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlane16_b32_i_tid_bc_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX13-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX13-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 true) @@ -6382,6 +8437,19 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_f32(ptr addrspace(1) %out, ; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlane16_b32_i_tid_bc_f32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX13-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX13-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %undef = freeze float poison @@ -6470,6 +8538,34 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; 
GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_i_tid_bc_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1] +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_i_tid_bc_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[0,1] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_i64 = zext i32 %tidx to i64 %undef = freeze i64 poison @@ -6564,6 +8660,36 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_bc_f64(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_i_tid_bc_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_i_tid_bc_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %tidx_f64 = fpext float %tidx_f32 to double @@ -6608,6 +8734,19 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i32(ptr addrspace(1) %ou ; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlane16_b32_i_tid_fi_bc_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX13-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX13-NEXT: s_endpgm %tidx = call i32 
@llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison %v = call i32 @llvm.amdgcn.permlane16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 true, i1 true) @@ -6650,6 +8789,19 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_f32(ptr addrspace(1) %ou ; GFX12-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlane16_b32_i_tid_fi_bc_f32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX13-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX13-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %undef = freeze float poison @@ -6738,6 +8890,34 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %ou ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1] +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_i64: +; GFX13-GISEL: ; %bb.0: +; 
GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v2, s0, s1 op_sel:[1,1] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_i64 = zext i32 %tidx to i64 %undef = freeze i64 poison @@ -6832,6 +9012,36 @@ define amdgpu_kernel void @v_permlane16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %ou ; GFX12-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlane16_b32_i_tid_fi_bc_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX13-SDAG-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlane16_b32_i_tid_fi_bc_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: 
v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlane16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX13-GISEL-NEXT: v_permlane16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %tidx_f64 = fpext float %tidx_f32 to double @@ -6876,6 +9086,19 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i32(ptr addrspace(1) %out, ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_b32_tid_tid_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX13-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX13-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %tidx, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out @@ -6917,6 +9140,19 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_f32(ptr addrspace(1) %out, ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_b32_tid_tid_f32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-NEXT: v_permlanex16_b32 v0, 
v0, s0, s1 +; GFX13-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX13-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %v = call float @llvm.amdgcn.permlanex16.f32(float %tidx_f32, float %tidx_f32, i32 %src1, i32 %src2, i1 false, i1 false) @@ -7006,6 +9242,34 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_i64(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_tid_tid_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_tid_tid_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_i64 = zext i32 %tidx to i64 %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 %tidx_i64, i64 
%tidx_i64, i32 %src1, i32 %src2, i1 false, i1 false) @@ -7099,6 +9363,36 @@ define amdgpu_kernel void @v_permlanex16_b32_tid_tid_f64(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_tid_tid_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_tid_tid_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %tidx_f64 = fpext float %tidx_f32 to double @@ -7142,6 +9436,19 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i32(ptr addrspace(1) %out ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; 
GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_b32_undef_tid_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX13-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX13-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) @@ -7184,6 +9491,19 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_f32(ptr addrspace(1) %out ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_b32_undef_tid_f32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX13-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX13-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %undef = freeze float poison @@ -7272,6 +9592,34 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_i64(ptr addrspace(1) %out ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_undef_tid_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; 
GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_undef_tid_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_i64 = zext i32 %tidx to i64 %undef = freeze i64 poison @@ -7366,6 +9714,36 @@ define amdgpu_kernel void @v_permlanex16_b32_undef_tid_f64(ptr addrspace(1) %out ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_undef_tid_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX13-SDAG-NEXT: global_store_b64 v2, 
v[0:1], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_undef_tid_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %tidx_f64 = fpext float %tidx_f32 to double @@ -7453,6 +9831,33 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i32(ptr addrspace(1) %out, i3 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_i_tid_i32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX13-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_i_tid_i32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, 0x3039 +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: 
s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX13-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 12345, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 false) store i32 %v, ptr addrspace(1) %out @@ -7537,6 +9942,33 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f32(ptr addrspace(1) %out, i3 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_i_tid_f32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: v_dual_mov_b32 v1, 0x449a5000 :: v_dual_mov_b32 v2, 0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX13-SDAG-NEXT: global_store_b32 v2, v1, s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_i_tid_f32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_mov_b32_e32 v1, 0x449a5000 +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX13-GISEL-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %v = call float @llvm.amdgcn.permlanex16.f32(float 1234.5, float %tidx_f32, i32 %src1, i32 
%src2, i1 false, i1 false) @@ -7628,6 +10060,36 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_i64(ptr addrspace(1) %out, i3 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v3, v[1:2], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_i_tid_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, 0x3039 +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v3, 0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX13-SDAG-NEXT: global_store_b64 v3, v[1:2], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_i_tid_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_dual_mov_b32 v1, 0x3039 :: v_dual_mov_b32 v2, 0 +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v0, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: global_store_b64 v3, v[1:2], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_i64 = zext i32 %tidx to i64 %v = call i64 @llvm.amdgcn.permlanex16.i64(i64 12345, i64 %tidx_i64, i32 %src1, i32 %src2, i1 false, i1 false) @@ -7729,6 +10191,38 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_f64(ptr addrspace(1) %out, i3 ; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v1, s0, s1 ; 
GFX12-GISEL-NEXT: global_store_b64 v4, v[2:3], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_i_tid_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_dual_mov_b32 v3, 0x40934a00 :: v_dual_mov_b32 v2, 0 +; GFX13-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX13-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v3, v1, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v2, v0, s0, s1 +; GFX13-SDAG-NEXT: global_store_b64 v4, v[2:3], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_i_tid_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40934a00 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v4, 0 +; GFX13-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v2, v0, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v3, v1, s0, s1 +; GFX13-GISEL-NEXT: global_store_b64 v4, v[2:3], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %tidx_f64 = fpext float %tidx_f32 to double @@ -7772,6 +10266,19 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i32(ptr addrspace(1) %out, ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: 
v_permlanex16_b32_i_tid_fi_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX13-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX13-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 true, i1 false) @@ -7814,6 +10321,19 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_f32(ptr addrspace(1) %out, ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_b32_i_tid_fi_f32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX13-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX13-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %undef = freeze float poison @@ -7902,6 +10422,34 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_i64(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 
0 +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0] +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,0] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_i64 = zext i32 %tidx to i64 %undef = freeze i64 poison @@ -7996,6 +10544,36 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_f64(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] +; 
GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,0] +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,0] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %tidx_f64 = fpext float %tidx_f32 to double @@ -8040,6 +10618,19 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i32(ptr addrspace(1) %out, ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_b32_i_tid_bc_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX13-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX13-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 false, i1 true) @@ -8082,6 +10673,19 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_f32(ptr addrspace(1) 
%out, ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_b32_i_tid_bc_f32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX13-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX13-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %undef = freeze float poison @@ -8170,6 +10774,34 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_i64(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1] +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 
0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[0,1] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_i64 = zext i32 %tidx to i64 %undef = freeze i64 poison @@ -8264,6 +10896,36 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_bc_f64(ptr addrspace(1) %out, ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_i_tid_bc_f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[0,1] +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_i_tid_bc_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[0,1] +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, 
s0, s1 op_sel:[0,1] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %tidx_f64 = fpext float %tidx_f32 to double @@ -8308,6 +10970,19 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i32(ptr addrspace(1) %o ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_b32_i_tid_fi_bc_i32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX13-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX13-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %undef = freeze i32 poison %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %undef, i32 %tidx, i32 %src1, i32 %src2, i1 true, i1 true) @@ -8350,6 +11025,19 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_f32(ptr addrspace(1) %o ; GFX12-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] ; GFX12-NEXT: global_store_b32 v1, v0, s[2:3] ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_b32_i_tid_fi_bc_f32: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_clause 0x1 +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: v_mov_b32_e32 v1, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX13-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX13-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX13-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %undef = freeze float poison @@ -8438,6 +11126,34 
@@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_i64(ptr addrspace(1) %o ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1] +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v2, s0, s1 op_sel:[1,1] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_i64 = zext i32 %tidx to i64 %undef = freeze i64 poison @@ -8532,6 +11248,36 @@ define amdgpu_kernel void @v_permlanex16_b32_i_tid_fi_bc_f64(ptr addrspace(1) %o ; GFX12-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] ; GFX12-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-NEXT: s_endpgm +; +; GFX13-SDAG-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64: +; 
GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-SDAG-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-SDAG-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX13-SDAG-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX13-SDAG-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-SDAG-NEXT: s_endpgm +; +; GFX13-GISEL-LABEL: v_permlanex16_b32_i_tid_fi_bc_f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x30 nv +; GFX13-GISEL-NEXT: s_load_b64 s[2:3], s[4:5], 0x24 nv +; GFX13-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX13-GISEL-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v0, v0, s0, s1 op_sel:[1,1] +; GFX13-GISEL-NEXT: v_permlanex16_b32 v1, v1, s0, s1 op_sel:[1,1] +; GFX13-GISEL-NEXT: global_store_b64 v2, v[0:1], s[2:3] +; GFX13-GISEL-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %tidx_f32 = bitcast i32 %tidx to float %tidx_f64 = fpext float %tidx_f32 to double @@ -8575,6 +11321,20 @@ define void @v_permlane16_half(ptr addrspace(1) %out, half %src0, i32 %src1, i32 ; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b16 v[0:1], v2, off ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-LABEL: v_permlane16_half: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, 
v3 +; GFX13-NEXT: v_readfirstlane_b32 s1, v4 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX13-NEXT: global_store_b16 v[0:1], v2, off +; GFX13-NEXT: s_set_pc_i64 s[30:31] %v = call half @llvm.amdgcn.permlane16.f16(half %src0, half %src0, i32 %src1, i32 %src2, i1 false, i1 false) store half %v, ptr addrspace(1) %out ret void @@ -8614,6 +11374,20 @@ define void @v_permlanex16_half(ptr addrspace(1) %out, half %src0, i32 %src1, i3 ; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b16 v[0:1], v2, off ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-LABEL: v_permlanex16_half: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v3 +; GFX13-NEXT: v_readfirstlane_b32 s1, v4 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX13-NEXT: global_store_b16 v[0:1], v2, off +; GFX13-NEXT: s_set_pc_i64 s[30:31] %v = call half @llvm.amdgcn.permlanex16.f16(half %src0, half %src0, i32 %src1, i32 %src2, i1 false, i1 false) store half %v, ptr addrspace(1) %out ret void @@ -8653,6 +11427,20 @@ define void @v_permlane16_bfloat(ptr addrspace(1) %out, bfloat %src0, i32 %src1, ; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b16 v[0:1], v2, off ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-LABEL: v_permlane16_bfloat: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v3 +; GFX13-NEXT: v_readfirstlane_b32 s1, v4 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX13-NEXT: global_store_b16 v[0:1], v2, off +; GFX13-NEXT: s_set_pc_i64 s[30:31] %v = 
call bfloat @llvm.amdgcn.permlane16.f16(bfloat %src0, bfloat %src0, i32 %src1, i32 %src2, i1 false, i1 false) store bfloat %v, ptr addrspace(1) %out ret void @@ -8692,6 +11480,20 @@ define void @v_permlanex16_bfloat(ptr addrspace(1) %out, bfloat %src0, i32 %src1 ; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b16 v[0:1], v2, off ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-LABEL: v_permlanex16_bfloat: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v3 +; GFX13-NEXT: v_readfirstlane_b32 s1, v4 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX13-NEXT: global_store_b16 v[0:1], v2, off +; GFX13-NEXT: s_set_pc_i64 s[30:31] %v = call bfloat @llvm.amdgcn.permlanex16.f16(bfloat %src0, bfloat %src0, i32 %src1, i32 %src2, i1 false, i1 false) store bfloat %v, ptr addrspace(1) %out ret void @@ -8731,6 +11533,20 @@ define void @v_permlane16_i16(ptr addrspace(1) %out, i16 %src0, i32 %src1, i32 % ; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b16 v[0:1], v2, off ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-LABEL: v_permlane16_i16: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v3 +; GFX13-NEXT: v_readfirstlane_b32 s1, v4 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX13-NEXT: global_store_b16 v[0:1], v2, off +; GFX13-NEXT: s_set_pc_i64 s[30:31] %v = call i16 @llvm.amdgcn.permlane16.i16(i16 %src0, i16 %src0, i32 %src1, i32 %src2, i1 false, i1 false) store i16 %v, ptr addrspace(1) %out ret void @@ -8770,6 +11586,20 @@ define void @v_permlanex16_i16(ptr 
addrspace(1) %out, i16 %src0, i32 %src1, i32 ; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b16 v[0:1], v2, off ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-LABEL: v_permlanex16_i16: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v3 +; GFX13-NEXT: v_readfirstlane_b32 s1, v4 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX13-NEXT: global_store_b16 v[0:1], v2, off +; GFX13-NEXT: s_set_pc_i64 s[30:31] %v = call i16 @llvm.amdgcn.permlanex16.i16(i16 %src0, i16 %src0, i32 %src1, i32 %src2, i1 false, i1 false) store i16 %v, ptr addrspace(1) %out ret void @@ -8809,6 +11639,20 @@ define void @v_permlane16_v2f16(ptr addrspace(1) %out, <2 x half> %src0, i32 %sr ; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-LABEL: v_permlane16_v2f16: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v3 +; GFX13-NEXT: v_readfirstlane_b32 s1, v4 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX13-NEXT: global_store_b32 v[0:1], v2, off +; GFX13-NEXT: s_set_pc_i64 s[30:31] %v = call <2 x half> @llvm.amdgcn.permlane16.v2f16(<2 x half> %src0, <2 x half> %src0, i32 %src1, i32 %src2, i1 false, i1 false) store <2 x half> %v, ptr addrspace(1) %out ret void @@ -8848,6 +11692,20 @@ define void @v_permlanex16_v2f16(ptr addrspace(1) %out, <2 x half> %src0, i32 %s ; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b32 v[0:1], v2, off ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-LABEL: 
v_permlanex16_v2f16: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v3 +; GFX13-NEXT: v_readfirstlane_b32 s1, v4 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX13-NEXT: global_store_b32 v[0:1], v2, off +; GFX13-NEXT: s_set_pc_i64 s[30:31] %v = call <2 x half> @llvm.amdgcn.permlanex16.v2f16(<2 x half> %src0, <2 x half> %src0, i32 %src1, i32 %src2, i1 false, i1 false) store <2 x half> %v, ptr addrspace(1) %out ret void @@ -8927,6 +11785,36 @@ define void @v_permlane16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 %s ; GFX12-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane16_v2f32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v4 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane16_v2f32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v4 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v5 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane16_b32 v2, v2, 
s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] %v = call <2 x float> @llvm.amdgcn.permlane16.v2f32(<2 x float> %src0, <2 x float> %src0, i32 %src1, i32 %src2, i1 false, i1 false) store <2 x float> %v, ptr addrspace(1) %out ret void @@ -9006,6 +11894,36 @@ define void @v_permlanex16_v2f32(ptr addrspace(1) %out, <2 x float> %src0, i32 % ; GFX12-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 ; GFX12-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlanex16_v2f32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v4 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlanex16_v2f32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v4 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v5 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] %v = call <2 x float> @llvm.amdgcn.permlanex16.v2f32(<2 x float> %src0, <2 x float> %src0, i32 %src1, i32 %src2, i1 false, i1 false) store <2 x 
float> %v, ptr addrspace(1) %out ret void @@ -9125,6 +12043,50 @@ define void @v_permlane16_v7i32(ptr addrspace(1) %out, <7 x i32> %src0, i32 %src ; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX12-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane16_v7i32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v9 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v10 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane16_b32 v8, v8, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane16_v7i32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v9 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v10 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; 
GFX13-GISEL-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v8, v8, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] %v = call <7 x i32> @llvm.amdgcn.permlane16.v7i32(<7 x i32> %src0, <7 x i32> %src0, i32 %src1, i32 %src2, i1 false, i1 false) store <7 x i32> %v, ptr addrspace(1) %out ret void @@ -9244,6 +12206,50 @@ define void @v_permlanex16_v7i32(ptr addrspace(1) %out, <7 x i32> %src0, i32 %sr ; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX12-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlanex16_v7i32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v9 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v10 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v8, v8, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlanex16_v7i32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: 
s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v9 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v10 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v8, v8, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] %v = call <7 x i32> @llvm.amdgcn.permlanex16.v7i32(<7 x i32> %src0, <7 x i32> %src0, i32 %src1, i32 %src2, i1 false, i1 false) store <7 x i32> %v, ptr addrspace(1) %out ret void @@ -9292,6 +12298,23 @@ define void @v_permlane16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %src ; GFX12-NEXT: v_permlane16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-LABEL: v_permlane16_v8i16: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX13-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX13-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX13-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX13-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-NEXT: s_set_pc_i64 s[30:31] %v = call <8 x i16> @llvm.amdgcn.permlane16.v8i16(<8 x i16> %src0, <8 x i16> %src0, i32 %src1, i32 %src2, i1 false, i1 false) store <8 x i16> %v, ptr addrspace(1) %out ret void @@ -9340,6 
+12363,23 @@ define void @v_permlanex16_v8i16(ptr addrspace(1) %out, <8 x i16> %src0, i32 %sr ; GFX12-NEXT: v_permlanex16_b32 v2, v2, s0, s1 ; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX12-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-LABEL: v_permlanex16_v8i16: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX13-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX13-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX13-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX13-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-NEXT: s_set_pc_i64 s[30:31] %v = call <8 x i16> @llvm.amdgcn.permlanex16.v8i16(<8 x i16> %src0, <8 x i16> %src0, i32 %src1, i32 %src2, i1 false, i1 false) store <8 x i16> %v, ptr addrspace(1) %out ret void @@ -9431,6 +12471,40 @@ define void @v_permlane16_v2i64(ptr addrspace(1) %out, <2 x i64> %src0, i32 %src ; GFX12-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1 ; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane16_v2i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; 
GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane16_v2i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] %v = call <2 x i64> @llvm.amdgcn.permlane16.v2i64(<2 x i64> %src0, <2 x i64> %src0, i32 %src1, i32 %src2, i1 false, i1 false) store <2 x i64> %v, ptr addrspace(1) %out ret void @@ -9544,6 +12618,48 @@ define void @v_permlane16_v3i64(ptr addrspace(1) %out, <3 x i64> %src0, i32 %src ; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX12-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane16_v3i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v8 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v9 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x1 +; 
GFX13-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane16_v3i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v8 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v9 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] %v = call <3 x i64> @llvm.amdgcn.permlane16.v3i64(<3 x i64> %src0, <3 x i64> %src0, i32 %src1, i32 %src2, i1 false, i1 false) store <3 x i64> %v, ptr addrspace(1) %out ret void @@ -9669,6 +12785,52 @@ define void @v_permlane16_v4f64(ptr addrspace(1) %out, <4 x double> %src0, i32 % ; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane16_v4f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v10 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v11 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: 
v_permlane16_b32 v9, v9, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v8, v8, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane16_v4f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v10 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v11 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v8, v8, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v9, v9, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] %v = call <4 x double> @llvm.amdgcn.permlane16.v4f64(<4 x double> %src0, <4 x double> %src0, i32 %src1, i32 %src2, i1 false, i1 false) store <4 x double> %v, ptr addrspace(1) %out ret void @@ -9854,6 +13016,72 @@ define void @v_permlane16_v8f64(ptr addrspace(1) %out, <8 x double> %src0, i32 % ; GFX12-GISEL-NEXT: global_store_b128 v[0:1], 
v[10:13], off offset:32 ; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane16_v8f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v18 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v19 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane16_b32 v17, v17, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v16, v16, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v15, v15, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v14, v14, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v13, v13, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v12, v12, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v11, v11, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v10, v10, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v9, v9, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v8, v8, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x3 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane16_v8f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: 
s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v18 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v19 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane16_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v8, v8, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v9, v9, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v10, v10, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v11, v11, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v12, v12, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v13, v13, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v14, v14, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v15, v15, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v16, v16, s0, s1 +; GFX13-GISEL-NEXT: v_permlane16_b32 v17, v17, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x3 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] %v = call <8 x double> @llvm.amdgcn.permlane16.v8f64(<8 x double> %src0, <8 x double> %src0, i32 %src1, i32 %src2, i1 false, i1 false) store <8 x double> %v, ptr addrspace(1) %out ret void @@ -9945,6 +13173,40 @@ define void @v_permlanex16_v2i64(ptr addrspace(1) %out, <2 x i64> %src0, i32 %sr ; GFX12-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1 ; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlanex16_v2i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: 
s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlanex16_v2i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] %v = call <2 x i64> @llvm.amdgcn.permlanex16.v2i64(<2 x i64> %src0, <2 x i64> %src0, i32 %src1, i32 %src2, i1 false, i1 false) store <2 x i64> %v, ptr addrspace(1) %out ret void @@ -10058,6 +13320,48 @@ define void @v_permlanex16_v3i64(ptr addrspace(1) %out, <3 x i64> %src0, i32 %sr ; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX12-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlanex16_v3i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: 
s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v8 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v9 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlanex16_v3i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v8 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v9 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] %v = call <3 x i64> @llvm.amdgcn.permlanex16.v3i64(<3 x i64> %src0, <3 x i64> %src0, i32 %src1, i32 %src2, i1 false, i1 false) store <3 x i64> %v, ptr addrspace(1) %out ret void @@ -10183,6 +13487,52 @@ define void @v_permlanex16_v4f64(ptr addrspace(1) %out, <4 x double> %src0, i32 ; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off ; GFX12-GISEL-NEXT: 
global_store_b128 v[0:1], v[6:9], off offset:16 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlanex16_v4f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v10 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v11 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v9, v9, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v8, v8, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlanex16_v4f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v10 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v11 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v8, v8, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v9, v9, s0, s1 +; 
GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] %v = call <4 x double> @llvm.amdgcn.permlanex16.v4f64(<4 x double> %src0, <4 x double> %src0, i32 %src1, i32 %src2, i1 false, i1 false) store <4 x double> %v, ptr addrspace(1) %out ret void @@ -10368,6 +13718,72 @@ define void @v_permlanex16_v8f64(ptr addrspace(1) %out, <8 x double> %src0, i32 ; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 ; GFX12-GISEL-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlanex16_v8f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v18 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v19 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlanex16_b32 v17, v17, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v16, v16, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v15, v15, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v14, v14, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v13, v13, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v12, v12, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v11, v11, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v10, v10, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v9, v9, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v8, v8, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x3 +; 
GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlanex16_v8f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v18 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v19 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlanex16_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v8, v8, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v9, v9, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v10, v10, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v11, v11, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v12, v12, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v13, v13, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v14, v14, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v15, v15, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v16, v16, s0, s1 +; GFX13-GISEL-NEXT: v_permlanex16_b32 v17, v17, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x3 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] %v = call <8 x double> 
@llvm.amdgcn.permlanex16.v8f64(<8 x double> %src0, <8 x double> %src0, i32 %src1, i32 %src2, i1 false, i1 false) store <8 x double> %v, ptr addrspace(1) %out ret void @@ -10430,6 +13846,25 @@ define amdgpu_kernel void @v_permlanex16_convergent(ptr addrspace(1) %out, i32 % ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX12-NEXT: .LBB142_2: ; %f ; GFX12-NEXT: s_endpgm +; +; GFX13-LABEL: v_permlanex16_convergent: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_load_b96 s[0:2], s[4:5], 0x2c nv +; GFX13-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_mov_b32_e32 v1, s0 +; GFX13-NEXT: s_mov_b32 s0, exec_lo +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlanex16_b32 v1, v1, s1, s2 +; GFX13-NEXT: v_cmpx_eq_u32_e32 0, v0 +; GFX13-NEXT: s_cbranch_execz .LBB142_2 +; GFX13-NEXT: ; %bb.1: ; %t +; GFX13-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 nv +; GFX13-NEXT: v_mov_b32_e32 v0, 0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX13-NEXT: .LBB142_2: ; %f +; GFX13-NEXT: s_endpgm %tidx = call i32 @llvm.amdgcn.workitem.id.x() %v = call i32 @llvm.amdgcn.permlanex16.i32(i32 %src0, i32 %src0, i32 %pattern_lo, i32 %pattern_hi, i1 false, i1 false) %select = icmp eq i32 %tidx, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.up.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.up.ll new file mode 100644 index 0000000000000..0290764b9fe00 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.up.ll @@ -0,0 +1,1105 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s +; RUN: llc -global-isel=1 -new-reg-bank-select -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s +; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn 
-mcpu=gfx1310 < %s | FileCheck -check-prefixes=GFX13,GFX13-SDAG %s +; RUN: llc -global-isel=1 -new-reg-bank-select -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1310 < %s | FileCheck -check-prefixes=GFX13,GFX13-GISEL %s + +define i32 @v_permlane_up_b32_vss(i32 %src0, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_up_b32_vss: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_up_b32 v0, v0, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_up_b32_vss: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v1 +; GFX13-NEXT: v_readfirstlane_b32 s1, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_up_b32 v0, v0, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i32 @llvm.amdgcn.permlane.up(i32 %src0, i32 %src1, i32 %src2) + ret i32 %v +} + +define i32 @v_permlane_up_b32_vii(i32 %src0) { +; GFX1250-LABEL: v_permlane_up_b32_vii: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_permlane_up_b32 v0, v0, 1, 2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_up_b32_vii: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_permlane_up_b32 v0, v0, 1, 2 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i32 @llvm.amdgcn.permlane.up(i32 %src0, i32 1, i32 2) + ret i32 %v +} + +define i32 @v_permlane_up_b32_vll(i32 %src0) { +; GFX1250-LABEL: v_permlane_up_b32_vll: +; GFX1250: ; %bb.0: +; 
GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_movk_i32 s0, 0x64 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_permlane_up_b32 v0, v0, s0, 0x66 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_up_b32_vll: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_movk_i32 s0, 0x64 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: v_permlane_up_b32 v0, v0, s0, 0x66 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i32 @llvm.amdgcn.permlane.up(i32 %src0, i32 100, i32 102) + ret i32 %v +} + +define i32 @v_permlane_up_b32_vvv(i32 %src0) { +; GFX1250-LABEL: v_permlane_up_b32_vvv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GFX1250-NEXT: v_bfe_u32 v2, v31, 10, 10 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_up_b32 v0, v0, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_up_b32_vvv: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GFX13-NEXT: v_bfe_u32 v2, v31, 10, 10 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_readfirstlane_b32 s0, v1 +; GFX13-NEXT: v_readfirstlane_b32 s1, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_up_b32 v0, v0, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %tidx = call i32 
@llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i32 @llvm.amdgcn.permlane.up(i32 %src0, i32 %tidx, i32 %tidy) + ret i32 %v +} + +define float @v_permlane_up_f32_vss(float %src0, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_up_f32_vss: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_up_b32 v0, v0, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_up_f32_vss: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v1 +; GFX13-NEXT: v_readfirstlane_b32 s1, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_up_b32 v0, v0, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call float @llvm.amdgcn.permlane.up(float %src0, i32 %src1, i32 %src2) + ret float %v +} + +define float @v_permlane_up_f32_vii(float %src0) { +; GFX1250-LABEL: v_permlane_up_f32_vii: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_permlane_up_b32 v0, v0, 1, 2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_up_f32_vii: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_permlane_up_b32 v0, v0, 1, 2 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call float @llvm.amdgcn.permlane.up(float %src0, i32 1, i32 2) + ret float %v +} + +define float @v_permlane_up_f32_vll(float %src0) { +; GFX1250-LABEL: v_permlane_up_f32_vll: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_movk_i32 s0, 0x64 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_permlane_up_b32 v0, v0, s0, 0x66 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_up_f32_vll: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_movk_i32 s0, 0x64 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: v_permlane_up_b32 v0, v0, s0, 0x66 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call float @llvm.amdgcn.permlane.up(float %src0, i32 100, i32 102) + ret float %v +} + +define float @v_permlane_up_f32_vvv(float %src0) { +; GFX1250-LABEL: v_permlane_up_f32_vvv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GFX1250-NEXT: v_bfe_u32 v2, v31, 10, 10 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_up_b32 v0, v0, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_up_f32_vvv: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GFX13-NEXT: v_bfe_u32 v2, v31, 10, 10 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_readfirstlane_b32 s0, v1 +; GFX13-NEXT: v_readfirstlane_b32 s1, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_up_b32 v0, v0, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 
@llvm.amdgcn.workitem.id.y() + %v = call float @llvm.amdgcn.permlane.up(float %src0, i32 %tidx, i32 %tidy) + ret float %v +} + +define i64 @v_permlane_up_i64_vss(i64 %src0, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_up_i64_vss: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_up_b32 v0, v0, s0, s1 +; GFX1250-NEXT: v_permlane_up_b32 v1, v1, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_up_i64_vss: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v2 +; GFX13-NEXT: v_readfirstlane_b32 s1, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_up_b32 v0, v0, s0, s1 +; GFX13-NEXT: v_permlane_up_b32 v1, v1, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i64 @llvm.amdgcn.permlane.up(i64 %src0, i32 %src1, i32 %src2) + ret i64 %v +} + +define i64 @v_permlane_up_i64_vii(i64 %src0) { +; GFX1250-LABEL: v_permlane_up_i64_vii: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_permlane_up_b32 v0, v0, 1, 2 +; GFX1250-NEXT: v_permlane_up_b32 v1, v1, 1, 2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_up_i64_vii: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_permlane_up_b32 v0, v0, 1, 2 +; GFX13-NEXT: v_permlane_up_b32 v1, v1, 1, 2 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i64 @llvm.amdgcn.permlane.up(i64 %src0, i32 1, i32 2) + ret i64 %v +} + +define i64 
@v_permlane_up_i64_vll(i64 %src0) { +; GFX1250-LABEL: v_permlane_up_i64_vll: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_movk_i32 s0, 0x64 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_permlane_up_b32 v0, v0, s0, 0x66 +; GFX1250-NEXT: v_permlane_up_b32 v1, v1, s0, 0x66 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_up_i64_vll: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_movk_i32 s0, 0x64 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: v_permlane_up_b32 v0, v0, s0, 0x66 +; GFX13-NEXT: v_permlane_up_b32 v1, v1, s0, 0x66 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i64 @llvm.amdgcn.permlane.up(i64 %src0, i32 100, i32 102) + ret i64 %v +} + +define i64 @v_permlane_up_i64_vvv(i64 %src0) { +; GFX1250-LABEL: v_permlane_up_i64_vvv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v31 +; GFX1250-NEXT: v_bfe_u32 v3, v31, 10, 10 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_up_b32 v0, v0, s0, s1 +; GFX1250-NEXT: v_permlane_up_b32 v1, v1, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_up_i64_vvv: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_and_b32_e32 v2, 0x3ff, v31 +; GFX13-NEXT: v_bfe_u32 v3, v31, 10, 10 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; 
GFX13-NEXT: v_readfirstlane_b32 s0, v2 +; GFX13-NEXT: v_readfirstlane_b32 s1, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_up_b32 v0, v0, s0, s1 +; GFX13-NEXT: v_permlane_up_b32 v1, v1, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i64 @llvm.amdgcn.permlane.up(i64 %src0, i32 %tidx, i32 %tidy) + ret i64 %v +} + +define double @v_permlane_up_f64_vss(double %src0, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_up_f64_vss: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_up_b32 v0, v0, s0, s1 +; GFX1250-NEXT: v_permlane_up_b32 v1, v1, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_up_f64_vss: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v2 +; GFX13-NEXT: v_readfirstlane_b32 s1, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_up_b32 v0, v0, s0, s1 +; GFX13-NEXT: v_permlane_up_b32 v1, v1, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call double @llvm.amdgcn.permlane.up(double %src0, i32 %src1, i32 %src2) + ret double %v +} + +define double @v_permlane_up_f64_vii(double %src0) { +; GFX1250-LABEL: v_permlane_up_f64_vii: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_permlane_up_b32 v0, v0, 1, 2 +; GFX1250-NEXT: v_permlane_up_b32 v1, v1, 1, 2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_up_f64_vii: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 
+; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_permlane_up_b32 v0, v0, 1, 2 +; GFX13-NEXT: v_permlane_up_b32 v1, v1, 1, 2 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call double @llvm.amdgcn.permlane.up(double %src0, i32 1, i32 2) + ret double %v +} + +define double @v_permlane_up_f64_vll(double %src0) { +; GFX1250-LABEL: v_permlane_up_f64_vll: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_movk_i32 s0, 0x64 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_permlane_up_b32 v0, v0, s0, 0x66 +; GFX1250-NEXT: v_permlane_up_b32 v1, v1, s0, 0x66 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_up_f64_vll: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_movk_i32 s0, 0x64 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: v_permlane_up_b32 v0, v0, s0, 0x66 +; GFX13-NEXT: v_permlane_up_b32 v1, v1, s0, 0x66 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call double @llvm.amdgcn.permlane.up(double %src0, i32 100, i32 102) + ret double %v +} + +define double @v_permlane_up_f64_vvv(double %src0) { +; GFX1250-LABEL: v_permlane_up_f64_vvv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v31 +; GFX1250-NEXT: v_bfe_u32 v3, v31, 10, 10 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_up_b32 v0, v0, s0, s1 +; GFX1250-NEXT: v_permlane_up_b32 v1, v1, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_up_f64_vvv: +; GFX13: ; 
%bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_and_b32_e32 v2, 0x3ff, v31 +; GFX13-NEXT: v_bfe_u32 v3, v31, 10, 10 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_readfirstlane_b32 s0, v2 +; GFX13-NEXT: v_readfirstlane_b32 s1, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_up_b32 v0, v0, s0, s1 +; GFX13-NEXT: v_permlane_up_b32 v1, v1, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call double @llvm.amdgcn.permlane.up(double %src0, i32 %tidx, i32 %tidy) + ret double %v +} + +; does not work for GISEL +;define void @v_permlane_up_bfloat(ptr addrspace(1) %out, bfloat %src, i32 %src1, i32 %src2) { +; %v = call bfloat @llvm.amdgcn.permlane.up.bf16(bfloat %src, i32 %src1, i32 %src2) +; store bfloat %v, ptr addrspace(1) %out, align 4 +; ret void +;} + +define void @v_permlane_up_i16(ptr addrspace(1) %out, i16 %src, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_up_i16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v3 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX1250-NEXT: global_store_b16 v[0:1], v2, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_up_i16: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v3 +; GFX13-NEXT: v_readfirstlane_b32 s1, v4 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_up_b32 v2, v2, s0, s1 
+; GFX13-NEXT: global_store_b16 v[0:1], v2, off +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i16 @llvm.amdgcn.permlane.up.i16(i16 %src, i32 %src1, i32 %src2) + store i16 %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_up_v2f16(ptr addrspace(1) %out, <2 x half> %src, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_up_v2f16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v3 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX1250-NEXT: global_store_b32 v[0:1], v2, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_up_v2f16: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v3 +; GFX13-NEXT: v_readfirstlane_b32 s1, v4 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX13-NEXT: global_store_b32 v[0:1], v2, off +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call <2 x half> @llvm.amdgcn.permlane.up.v2f16(<2 x half> %src, i32 %src1, i32 %src2) + store <2 x half> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_up_v2f32(ptr addrspace(1) %out, <2 x float> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_up_v2f32: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v4 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX1250-SDAG-NEXT: 
s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_up_v2f32: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v4 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v5 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_up_v2f32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v4 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_up_v2f32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v4 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v5 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <2 x float> @llvm.amdgcn.permlane.up.v2f32(<2 x float> %src, i32 %src1, i32 %src2) + store <2 x float> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void 
@v_permlane_up_v7i32(ptr addrspace(1) %out, <7 x i32> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_up_v7i32: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v9 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v8, v8, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v7, v7, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v6, v6, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_up_v7i32: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v9 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v6, v6, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v7, v7, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v8, v8, s0, s1 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_up_v7i32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: 
s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v9 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v10 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_up_b32 v8, v8, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_up_v7i32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v9 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v10 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v8, v8, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <7 x i32> @llvm.amdgcn.permlane.up.v7i32(<7 x i32> %src, i32 %src1, i32 %src2) + 
store <7 x i32> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_up_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_up_v8i16: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_up_v8i16: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_up_v8i16: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: 
v_permlane_up_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_up_v8i16: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <8 x i16> @llvm.amdgcn.permlane.up.v8i16(<8 x i16> %src, i32 %src1, i32 %src2) + store <8 x i16> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_up_v2i64(ptr addrspace(1) %out, <2 x i64> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_up_v2i64: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_up_v2i64: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_up_v2i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_up_v2i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <2 x i64> 
@llvm.amdgcn.permlane.up.v2i64(<2 x i64> %src, i32 %src1, i32 %src2) + store <2 x i64> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_up_v3i64(ptr addrspace(1) %out, <3 x i64> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_up_v3i64: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v7, v7, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v6, v6, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_up_v3i64: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v6, v6, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v7, v7, s0, s1 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_up_v3i64: +; 
GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v8 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v9 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_up_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_up_v3i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v8 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v9 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <3 x i64> @llvm.amdgcn.permlane.up.v3i64(<3 x i64> %src, i32 %src1, i32 %src2) + store <3 x i64> %v, ptr addrspace(1) %out, align 4 + ret void +} + 
+define void @v_permlane_up_v4f64(ptr addrspace(1) %out, <4 x double> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_up_v4f64: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v11 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v9, v9, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v8, v8, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v7, v7, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v6, v6, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_up_v4f64: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v11 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v6, v6, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v7, v7, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v8, v8, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v9, v9, s0, s1 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; 
GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_up_v4f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v10 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v11 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_up_b32 v9, v9, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v8, v8, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_up_v4f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v10 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v11 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v8, v8, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v9, v9, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: 
global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <4 x double> @llvm.amdgcn.permlane.up.v4f64(<4 x double> %src, i32 %src1, i32 %src2) + store <4 x double> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_up_v8f64(ptr addrspace(1) %out, <8 x double> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_up_v8f64: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v18 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v19 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v17, v17, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v16, v16, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v15, v15, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v14, v14, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v13, v13, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v12, v12, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v11, v11, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v10, v10, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v9, v9, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v8, v8, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v7, v7, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v6, v6, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: s_clause 0x3 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; 
GFX1250-GISEL-LABEL: v_permlane_up_v8f64: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v18 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v19 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v6, v6, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v7, v7, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v8, v8, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v9, v9, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v10, v10, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v11, v11, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v12, v12, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v13, v13, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v14, v14, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v15, v15, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v16, v16, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_up_b32 v17, v17, s0, s1 +; GFX1250-GISEL-NEXT: s_clause 0x3 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_up_v8f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v18 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v19 +; GFX13-SDAG-NEXT: s_delay_alu 
instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_up_b32 v17, v17, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v16, v16, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v15, v15, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v14, v14, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v13, v13, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v12, v12, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v11, v11, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v10, v10, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v9, v9, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v8, v8, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x3 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_up_v8f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v18 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v19 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_up_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: 
v_permlane_up_b32 v8, v8, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v9, v9, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v10, v10, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v11, v11, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v12, v12, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v13, v13, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v14, v14, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v15, v15, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v16, v16, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_up_b32 v17, v17, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x3 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <8 x double> @llvm.amdgcn.permlane.up.v8f64(<8 x double> %src, i32 %src1, i32 %src2) + store <8 x double> %v, ptr addrspace(1) %out, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.xor.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.xor.ll new file mode 100644 index 0000000000000..476f2894c29b2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.xor.ll @@ -0,0 +1,1105 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s +; RUN: llc -global-isel=1 -new-reg-bank-select -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s +; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1310 < %s | FileCheck -check-prefixes=GFX13,GFX13-SDAG %s +; RUN: llc -global-isel=1 -new-reg-bank-select -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1310 < %s | FileCheck 
-check-prefixes=GFX13,GFX13-GISEL %s + +define i32 @v_permlane_xor_b32_vss(i32 %src0, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_xor_b32_vss: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_xor_b32_vss: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v1 +; GFX13-NEXT: v_readfirstlane_b32 s1, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_xor_b32 v0, v0, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i32 @llvm.amdgcn.permlane.xor(i32 %src0, i32 %src1, i32 %src2) + ret i32 %v +} + +define i32 @v_permlane_xor_b32_vii(i32 %src0) { +; GFX1250-LABEL: v_permlane_xor_b32_vii: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, 1, 2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_xor_b32_vii: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_permlane_xor_b32 v0, v0, 1, 2 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i32 @llvm.amdgcn.permlane.xor(i32 %src0, i32 1, i32 2) + ret i32 %v +} + +define i32 @v_permlane_xor_b32_vll(i32 %src0) { +; GFX1250-LABEL: v_permlane_xor_b32_vll: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_movk_i32 s0, 0x64 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: 
v_permlane_xor_b32 v0, v0, s0, 0x66 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_xor_b32_vll: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_movk_i32 s0, 0x64 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: v_permlane_xor_b32 v0, v0, s0, 0x66 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i32 @llvm.amdgcn.permlane.xor(i32 %src0, i32 100, i32 102) + ret i32 %v +} + +define i32 @v_permlane_xor_b32_vvv(i32 %src0) { +; GFX1250-LABEL: v_permlane_xor_b32_vvv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GFX1250-NEXT: v_bfe_u32 v2, v31, 10, 10 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_xor_b32_vvv: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GFX13-NEXT: v_bfe_u32 v2, v31, 10, 10 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_readfirstlane_b32 s0, v1 +; GFX13-NEXT: v_readfirstlane_b32 s1, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_xor_b32 v0, v0, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i32 @llvm.amdgcn.permlane.xor(i32 %src0, i32 %tidx, i32 %tidy) + ret i32 %v +} + +define float 
@v_permlane_xor_f32_vss(float %src0, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_xor_f32_vss: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_xor_f32_vss: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v1 +; GFX13-NEXT: v_readfirstlane_b32 s1, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_xor_b32 v0, v0, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call float @llvm.amdgcn.permlane.xor(float %src0, i32 %src1, i32 %src2) + ret float %v +} + +define float @v_permlane_xor_f32_vii(float %src0) { +; GFX1250-LABEL: v_permlane_xor_f32_vii: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, 1, 2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_xor_f32_vii: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_permlane_xor_b32 v0, v0, 1, 2 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call float @llvm.amdgcn.permlane.xor(float %src0, i32 1, i32 2) + ret float %v +} + +define float @v_permlane_xor_f32_vll(float %src0) { +; GFX1250-LABEL: v_permlane_xor_f32_vll: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_movk_i32 s0, 0x64 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, s0, 
0x66 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_xor_f32_vll: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_movk_i32 s0, 0x64 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: v_permlane_xor_b32 v0, v0, s0, 0x66 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call float @llvm.amdgcn.permlane.xor(float %src0, i32 100, i32 102) + ret float %v +} + +define float @v_permlane_xor_f32_vvv(float %src0) { +; GFX1250-LABEL: v_permlane_xor_f32_vvv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GFX1250-NEXT: v_bfe_u32 v2, v31, 10, 10 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v1 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v2 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_xor_f32_vvv: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_and_b32_e32 v1, 0x3ff, v31 +; GFX13-NEXT: v_bfe_u32 v2, v31, 10, 10 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_readfirstlane_b32 s0, v1 +; GFX13-NEXT: v_readfirstlane_b32 s1, v2 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_xor_b32 v0, v0, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call float @llvm.amdgcn.permlane.xor(float %src0, i32 %tidx, i32 %tidy) + ret float %v +} + +define i64 
@v_permlane_xor_i64_vss(i64 %src0, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_xor_i64_vss: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, s0, s1 +; GFX1250-NEXT: v_permlane_xor_b32 v1, v1, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_xor_i64_vss: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v2 +; GFX13-NEXT: v_readfirstlane_b32 s1, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_xor_b32 v0, v0, s0, s1 +; GFX13-NEXT: v_permlane_xor_b32 v1, v1, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i64 @llvm.amdgcn.permlane.xor(i64 %src0, i32 %src1, i32 %src2) + ret i64 %v +} + +define i64 @v_permlane_xor_i64_vii(i64 %src0) { +; GFX1250-LABEL: v_permlane_xor_i64_vii: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, 1, 2 +; GFX1250-NEXT: v_permlane_xor_b32 v1, v1, 1, 2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_xor_i64_vii: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_permlane_xor_b32 v0, v0, 1, 2 +; GFX13-NEXT: v_permlane_xor_b32 v1, v1, 1, 2 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i64 @llvm.amdgcn.permlane.xor(i64 %src0, i32 1, i32 2) + ret i64 %v +} + +define i64 @v_permlane_xor_i64_vll(i64 %src0) { +; GFX1250-LABEL: v_permlane_xor_i64_vll: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 
0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_movk_i32 s0, 0x64 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, s0, 0x66 +; GFX1250-NEXT: v_permlane_xor_b32 v1, v1, s0, 0x66 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_xor_i64_vll: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_movk_i32 s0, 0x64 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: v_permlane_xor_b32 v0, v0, s0, 0x66 +; GFX13-NEXT: v_permlane_xor_b32 v1, v1, s0, 0x66 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call i64 @llvm.amdgcn.permlane.xor(i64 %src0, i32 100, i32 102) + ret i64 %v +} + +define i64 @v_permlane_xor_i64_vvv(i64 %src0) { +; GFX1250-LABEL: v_permlane_xor_i64_vvv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v31 +; GFX1250-NEXT: v_bfe_u32 v3, v31, 10, 10 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, s0, s1 +; GFX1250-NEXT: v_permlane_xor_b32 v1, v1, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_xor_i64_vvv: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_and_b32_e32 v2, 0x3ff, v31 +; GFX13-NEXT: v_bfe_u32 v3, v31, 10, 10 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_readfirstlane_b32 s0, v2 +; GFX13-NEXT: v_readfirstlane_b32 s1, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) 
+; GFX13-NEXT: v_permlane_xor_b32 v0, v0, s0, s1 +; GFX13-NEXT: v_permlane_xor_b32 v1, v1, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call i64 @llvm.amdgcn.permlane.xor(i64 %src0, i32 %tidx, i32 %tidy) + ret i64 %v +} + +define double @v_permlane_xor_f64_vss(double %src0, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_xor_f64_vss: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, s0, s1 +; GFX1250-NEXT: v_permlane_xor_b32 v1, v1, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_xor_f64_vss: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v2 +; GFX13-NEXT: v_readfirstlane_b32 s1, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_xor_b32 v0, v0, s0, s1 +; GFX13-NEXT: v_permlane_xor_b32 v1, v1, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call double @llvm.amdgcn.permlane.xor(double %src0, i32 %src1, i32 %src2) + ret double %v +} + +define double @v_permlane_xor_f64_vii(double %src0) { +; GFX1250-LABEL: v_permlane_xor_f64_vii: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, 1, 2 +; GFX1250-NEXT: v_permlane_xor_b32 v1, v1, 1, 2 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_xor_f64_vii: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; 
GFX13-NEXT: v_permlane_xor_b32 v0, v0, 1, 2 +; GFX13-NEXT: v_permlane_xor_b32 v1, v1, 1, 2 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call double @llvm.amdgcn.permlane.xor(double %src0, i32 1, i32 2) + ret double %v +} + +define double @v_permlane_xor_f64_vll(double %src0) { +; GFX1250-LABEL: v_permlane_xor_f64_vll: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: s_movk_i32 s0, 0x64 +; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, s0, 0x66 +; GFX1250-NEXT: v_permlane_xor_b32 v1, v1, s0, 0x66 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_xor_f64_vll: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: s_movk_i32 s0, 0x64 +; GFX13-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX13-NEXT: v_permlane_xor_b32 v0, v0, s0, 0x66 +; GFX13-NEXT: v_permlane_xor_b32 v1, v1, s0, 0x66 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call double @llvm.amdgcn.permlane.xor(double %src0, i32 100, i32 102) + ret double %v +} + +define double @v_permlane_xor_f64_vvv(double %src0) { +; GFX1250-LABEL: v_permlane_xor_f64_vvv: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v31 +; GFX1250-NEXT: v_bfe_u32 v3, v31, 10, 10 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX1250-NEXT: v_readfirstlane_b32 s0, v2 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v3 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_xor_b32 v0, v0, s0, s1 +; GFX1250-NEXT: v_permlane_xor_b32 v1, v1, s0, s1 +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_xor_f64_vvv: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; 
GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_and_b32_e32 v2, 0x3ff, v31 +; GFX13-NEXT: v_bfe_u32 v3, v31, 10, 10 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX13-NEXT: v_readfirstlane_b32 s0, v2 +; GFX13-NEXT: v_readfirstlane_b32 s1, v3 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_xor_b32 v0, v0, s0, s1 +; GFX13-NEXT: v_permlane_xor_b32 v1, v1, s0, s1 +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %tidx = call i32 @llvm.amdgcn.workitem.id.x() + %tidy = call i32 @llvm.amdgcn.workitem.id.y() + %v = call double @llvm.amdgcn.permlane.xor(double %src0, i32 %tidx, i32 %tidy) + ret double %v +} + +; does not work for GISEL +;define void @v_permlane_xor_bfloat(ptr addrspace(1) %out, bfloat %src, i32 %src1, i32 %src2) { +; %v = call bfloat @llvm.amdgcn.permlane.xor.bf16(bfloat %src, i32 %src1, i32 %src2) +; store bfloat %v, ptr addrspace(1) %out, align 4 +; ret void +;} + +define void @v_permlane_xor_i16(ptr addrspace(1) %out, i16 %src, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_xor_i16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v3 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX1250-NEXT: global_store_b16 v[0:1], v2, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_xor_i16: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v3 +; GFX13-NEXT: v_readfirstlane_b32 s1, v4 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX13-NEXT: global_store_b16 v[0:1], v2, off +; GFX13-NEXT: 
s_set_pc_i64 s[30:31] + %v = call i16 @llvm.amdgcn.permlane.xor.i16(i16 %src, i32 %src1, i32 %src2) + store i16 %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_xor_v2f16(ptr addrspace(1) %out, <2 x half> %src, i32 %src1, i32 %src2) { +; GFX1250-LABEL: v_permlane_xor_v2f16: +; GFX1250: ; %bb.0: +; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-NEXT: s_wait_kmcnt 0x0 +; GFX1250-NEXT: v_readfirstlane_b32 s0, v3 +; GFX1250-NEXT: v_readfirstlane_b32 s1, v4 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX1250-NEXT: global_store_b32 v[0:1], v2, off +; GFX1250-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-LABEL: v_permlane_xor_v2f16: +; GFX13: ; %bb.0: +; GFX13-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-NEXT: s_wait_expcnt 0x0 +; GFX13-NEXT: s_wait_samplecnt 0x0 +; GFX13-NEXT: s_wait_bvhcnt 0x0 +; GFX13-NEXT: s_wait_kmcnt 0x0 +; GFX13-NEXT: v_readfirstlane_b32 s0, v3 +; GFX13-NEXT: v_readfirstlane_b32 s1, v4 +; GFX13-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX13-NEXT: global_store_b32 v[0:1], v2, off +; GFX13-NEXT: s_set_pc_i64 s[30:31] + %v = call <2 x half> @llvm.amdgcn.permlane.xor.v2f16(<2 x half> %src, i32 %src1, i32 %src2) + store <2 x half> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_xor_v2f32(ptr addrspace(1) %out, <2 x float> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_xor_v2f32: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v4 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: 
v_permlane_xor_v2f32: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v4 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v5 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_xor_v2f32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v4 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v5 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_xor_v2f32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v4 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v5 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <2 x float> @llvm.amdgcn.permlane.xor.v2f32(<2 x float> %src, i32 %src1, i32 %src2) + store <2 x float> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_xor_v7i32(ptr addrspace(1) %out, <7 x 
i32> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_xor_v7i32: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v9 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v8, v8, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v7, v7, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v6, v6, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_xor_v7i32: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v9 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v10 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v6, v6, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v7, v7, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v8, v8, s0, s1 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_xor_v7i32: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v9 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v10 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v8, v8, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_xor_v7i32: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v9 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v10 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v8, v8, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b96 v[0:1], v[6:8], off offset:16 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <7 x i32> @llvm.amdgcn.permlane.xor.v7i32(<7 x i32> %src, i32 %src1, i32 %src2) + store <7 x 
i32> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_xor_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_xor_v8i16: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_xor_v8i16: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_xor_v8i16: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: 
v_permlane_xor_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_xor_v8i16: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <8 x i16> @llvm.amdgcn.permlane.xor.v8i16(<8 x i16> %src, i32 %src1, i32 %src2) + store <8 x i16> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_xor_v2i64(ptr addrspace(1) %out, <2 x i64> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_xor_v2i64: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_xor_v2i64: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; 
GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_xor_v2i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_xor_v2i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v6 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v7 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <2 x i64> 
@llvm.amdgcn.permlane.xor.v2i64(<2 x i64> %src, i32 %src1, i32 %src2) + store <2 x i64> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_xor_v3i64(ptr addrspace(1) %out, <3 x i64> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_xor_v3i64: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v7, v7, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v6, v6, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_xor_v3i64: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v8 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v9 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v6, v6, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v7, v7, s0, s1 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: 
v_permlane_xor_v3i64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v8 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v9 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_xor_v3i64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v8 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v9 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b64 v[0:1], v[6:7], off offset:16 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <3 x i64> @llvm.amdgcn.permlane.xor.v3i64(<3 x i64> %src, i32 %src1, i32 %src2) + store <3 x i64> %v, ptr
addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_xor_v4f64(ptr addrspace(1) %out, <4 x double> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_xor_v4f64: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v11 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v9, v9, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v8, v8, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v7, v7, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v6, v6, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: s_clause 0x1 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_xor_v4f64: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v10 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v11 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v6, v6, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v7, v7, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v8, v8, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v9, v9, s0, s1 +; GFX1250-GISEL-NEXT: s_clause 0x1 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; 
GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_xor_v4f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s0, v10 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v11 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v9, v9, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v8, v8, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x1 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_xor_v4f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v10 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v11 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v8, v8, s0, s1 +; GFX13-GISEL-NEXT: 
v_permlane_xor_b32 v9, v9, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x1 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <4 x double> @llvm.amdgcn.permlane.xor.v4f64(<4 x double> %src, i32 %src1, i32 %src2) + store <4 x double> %v, ptr addrspace(1) %out, align 4 + ret void +} + +define void @v_permlane_xor_v8f64(ptr addrspace(1) %out, <8 x double> %src, i32 %src1, i32 %src2) { +; GFX1250-SDAG-LABEL: v_permlane_xor_v8f64: +; GFX1250-SDAG: ; %bb.0: +; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s0, v18 +; GFX1250-SDAG-NEXT: v_readfirstlane_b32 s1, v19 +; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v17, v17, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v16, v16, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v15, v15, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v14, v14, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v13, v13, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v12, v12, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v11, v11, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v10, v10, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v9, v9, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v8, v8, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v7, v7, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v6, v6, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX1250-SDAG-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX1250-SDAG-NEXT: s_clause 0x3 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; 
GFX1250-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX1250-GISEL-LABEL: v_permlane_xor_v8f64: +; GFX1250-GISEL: ; %bb.0: +; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s0, v18 +; GFX1250-GISEL-NEXT: v_readfirstlane_b32 s1, v19 +; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v6, v6, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v7, v7, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v8, v8, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v9, v9, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v10, v10, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v11, v11, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v12, v12, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v13, v13, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v14, v14, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v15, v15, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v16, v16, s0, s1 +; GFX1250-GISEL-NEXT: v_permlane_xor_b32 v17, v17, s0, s1 +; GFX1250-GISEL-NEXT: s_clause 0x3 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX1250-GISEL-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-SDAG-LABEL: v_permlane_xor_v8f64: +; GFX13-SDAG: ; %bb.0: +; GFX13-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX13-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX13-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX13-SDAG-NEXT: 
v_readfirstlane_b32 s0, v18 +; GFX13-SDAG-NEXT: v_readfirstlane_b32 s1, v19 +; GFX13-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v17, v17, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v16, v16, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v15, v15, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v14, v14, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v13, v13, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v12, v12, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v11, v11, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v10, v10, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v9, v9, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v8, v8, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v7, v7, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v6, v6, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX13-SDAG-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX13-SDAG-NEXT: s_clause 0x3 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-SDAG-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-SDAG-NEXT: s_set_pc_i64 s[30:31] +; +; GFX13-GISEL-LABEL: v_permlane_xor_v8f64: +; GFX13-GISEL: ; %bb.0: +; GFX13-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX13-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX13-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX13-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s0, v18 +; GFX13-GISEL-NEXT: v_readfirstlane_b32 s1, v19 +; GFX13-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v2, v2, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v3, v3, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v4, v4, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v5, v5, s0, s1 +; 
GFX13-GISEL-NEXT: v_permlane_xor_b32 v6, v6, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v7, v7, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v8, v8, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v9, v9, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v10, v10, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v11, v11, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v12, v12, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v13, v13, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v14, v14, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v15, v15, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v16, v16, s0, s1 +; GFX13-GISEL-NEXT: v_permlane_xor_b32 v17, v17, s0, s1 +; GFX13-GISEL-NEXT: s_clause 0x3 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[2:5], off +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[6:9], off offset:16 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[10:13], off offset:32 +; GFX13-GISEL-NEXT: global_store_b128 v[0:1], v[14:17], off offset:48 +; GFX13-GISEL-NEXT: s_set_pc_i64 s[30:31] + %v = call <8 x double> @llvm.amdgcn.permlane.xor.v8f64(<8 x double> %src, i32 %src1, i32 %src2) + store <8 x double> %v, ptr addrspace(1) %out, align 4 + ret void +} From 6293f16dd426cb3ab175e4f4c7572d72ad894789 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Thu, 14 May 2026 10:26:35 +0100 Subject: [PATCH 48/95] [TableGen] Simplify Record type checks. NFC. 
(#197450) --- llvm/lib/TableGen/Record.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp index ce6c63560ed1a..a043119006312 100644 --- a/llvm/lib/TableGen/Record.cpp +++ b/llvm/lib/TableGen/Record.cpp @@ -810,7 +810,7 @@ std::string ListInit::getAsString() const { } const Init *OpInit::getBit(unsigned Bit) const { - if (getType() == BitRecTy::get(getRecordKeeper())) + if (isa(getType())) return this; return VarBitInit::get(this, Bit); } @@ -2390,7 +2390,7 @@ const RecTy *TypedInit::getFieldType(const StringInit *FieldName) const { } const Init *TypedInit::convertInitializerTo(const RecTy *Ty) const { - if (getType() == Ty || getType()->typeIsA(Ty)) + if (getType()->typeIsA(Ty)) return this; if (isa(getType()) && isa(Ty) && @@ -2419,7 +2419,7 @@ TypedInit::convertInitializerBitRange(ArrayRef Bits) const { const Init *TypedInit::getCastTo(const RecTy *Ty) const { // Handle the common case quickly - if (getType() == Ty || getType()->typeIsA(Ty)) + if (getType()->typeIsA(Ty)) return this; if (const Init *Converted = convertInitializerTo(Ty)) { @@ -2453,7 +2453,7 @@ StringRef VarInit::getName() const { } const Init *VarInit::getBit(unsigned Bit) const { - if (getType() == BitRecTy::get(getRecordKeeper())) + if (isa(getType())) return this; return VarBitInit::get(this, Bit); } @@ -2646,7 +2646,7 @@ const FieldInit *FieldInit::get(const Init *R, const StringInit *FN) { } const Init *FieldInit::getBit(unsigned Bit) const { - if (getType() == BitRecTy::get(getRecordKeeper())) + if (isa(getType())) return this; return VarBitInit::get(this, Bit); } @@ -2932,7 +2932,7 @@ StringRef RecordVal::getName() const { } std::string RecordVal::getPrintType() const { - if (getType() == StringRecTy::get(getRecordKeeper())) { + if (isa(getType())) { if (const auto *StrInit = dyn_cast(Value)) { if (StrInit->hasCodeFormat()) return "code"; From d566c8c8feb3cca568943a68245e2422b8abb02f Mon Sep 17 
00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Sch=C3=A4pers?= Date: Thu, 14 May 2026 11:26:39 +0200 Subject: [PATCH 49/95] [clang-format] Fix parsing of goto labels (#197538) Fixes #196662. --------- Co-authored-by: owenca --- clang/lib/Format/UnwrappedLineParser.cpp | 6 +-- clang/unittests/Format/FormatTest.cpp | 59 ++++++++++++++++-------- 2 files changed, 44 insertions(+), 21 deletions(-) diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index 2da8cf93d4a0a..9536a233def58 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -3374,6 +3374,7 @@ void UnwrappedLineParser::parseDoWhile() { void UnwrappedLineParser::parseLabel( FormatStyle::IndentGotoLabelStyle IndentGotoLabels) { + const bool IsGotoLabel = FormatTok->is(TT_GotoLabelColon); nextToken(); unsigned OldLineLevel = Line->Level; @@ -3390,9 +3391,8 @@ void UnwrappedLineParser::parseLabel( break; } - if (!Style.IndentCaseBlocks && CommentsBeforeNextToken.empty() && - FormatTok->is(tok::l_brace)) { - + if (!IsGotoLabel && !Style.IndentCaseBlocks && + CommentsBeforeNextToken.empty() && FormatTok->is(tok::l_brace)) { CompoundStatementIndenter Indenter(this, Line->Level, Style.BraceWrapping.AfterCaseLabel, Style.BraceWrapping.IndentBraces); diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index dbc8a00ad1c9b..54529a3d4e590 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -3139,14 +3139,18 @@ TEST_F(FormatTest, FormatsLabels) { "}"); verifyFormat("{\n" " some_code();\n" - "test_label: { some_other_code(); }\n" + "test_label:\n" + " {\n" + " some_other_code();\n" + " }\n" "}"); verifyFormat("{\n" " some_code();\n" - "test_label: {\n" - " some_other_code();\n" - " some_other_code();\n" - "}\n" + "test_label:\n" + " {\n" + " some_other_code();\n" + " some_other_code();\n" + " }\n" "}"); verifyFormat("{\n" "L0:\n" @@ -3155,10 +3159,11 
@@ TEST_F(FormatTest, FormatsLabels) { " g();\n" "}"); verifyFormat("{\n" - "[[foo]] L1: {\n" - "[[bar]] [[baz]] L2:\n" - " g();\n" - "}\n" + "[[foo]] L1:\n" + " {\n" + " [[bar]] [[baz]] L2:\n" + " g();\n" + " }\n" "}"); verifyFormat("{\n" "[[foo]] L1:\n" @@ -3168,6 +3173,18 @@ TEST_F(FormatTest, FormatsLabels) { " g();\n" " }\n" "}"); + verifyFormat("void func() {\n" + "label:\n" + " {\n" + " // Block\n" + " }\n" + "}"); + verifyFormat("void func() {\n" + "label: // Comment\n" + " {\n" + " // Block\n" + " }\n" + "}"); FormatStyle Style = getLLVMStyle(); Style.IndentGotoLabels = FormatStyle::IGLS_NoIndent; @@ -3196,7 +3213,10 @@ TEST_F(FormatTest, FormatsLabels) { Style); verifyFormat("{\n" " some_code();\n" - "test_label: { some_other_code(); }\n" + "test_label:\n" + " {\n" + " some_other_code();\n" + " }\n" "}", Style); verifyFormat("{\n" @@ -3331,17 +3351,17 @@ TEST_F(FormatTest, FormatsLabels) { verifyFormat("{\n" " some_code();\n" "test_label:\n" - "{\n" - " some_other_code();\n" - "}\n" + " {\n" + " some_other_code();\n" + " }\n" "}", Style); verifyFormat("{\n" "[[foo]] L1:\n" - "{\n" - "[[bar]] [[baz]] L2:\n" - " g();\n" - "}\n" + " {\n" + " [[bar]] [[baz]] L2:\n" + " g();\n" + " }\n" "}", Style); } @@ -18738,7 +18758,10 @@ TEST_F(FormatTest, ConfigurableSpaceBeforeColon) { "}", CaseStyle); verifyFormat("switch (x) {\n" - "goto_label: { break; }\n" + "goto_label:\n" + " {\n" + " break;\n" + " }\n" "default : {\n" " break;\n" "}\n" From 1cb92d817468c6fbe1b9c6905bcf84f712de742c Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Thu, 14 May 2026 10:28:46 +0100 Subject: [PATCH 50/95] [compiler-rt][ARM] Optimized single-precision FP comparisons (#179925) These comparison functions follow the same structure as the double-precision ones in a prior commit, of a header file containing the main logic and some entry points varying the construction of the return value. In this case, we have provided versions for Thumb1 as well as Arm/Thumb2. 
--- compiler-rt/lib/builtins/CMakeLists.txt | 16 +- compiler-rt/lib/builtins/arm/cmpsf2.S | 56 +++ compiler-rt/lib/builtins/arm/fcmp.h | 176 +++++++ compiler-rt/lib/builtins/arm/gesf2.S | 54 +++ compiler-rt/lib/builtins/arm/thumb1/cmpsf2.S | 55 +++ compiler-rt/lib/builtins/arm/thumb1/fcmp.h | 189 ++++++++ compiler-rt/lib/builtins/arm/thumb1/gesf2.S | 54 +++ .../lib/builtins/arm/thumb1/unordsf2.S | 49 ++ compiler-rt/lib/builtins/arm/unordsf2.S | 56 +++ .../test/builtins/Unit/comparesf2new_test.c | 443 ++++++++++++++++++ 10 files changed, 1147 insertions(+), 1 deletion(-) create mode 100644 compiler-rt/lib/builtins/arm/cmpsf2.S create mode 100644 compiler-rt/lib/builtins/arm/fcmp.h create mode 100644 compiler-rt/lib/builtins/arm/gesf2.S create mode 100644 compiler-rt/lib/builtins/arm/thumb1/cmpsf2.S create mode 100644 compiler-rt/lib/builtins/arm/thumb1/fcmp.h create mode 100644 compiler-rt/lib/builtins/arm/thumb1/gesf2.S create mode 100644 compiler-rt/lib/builtins/arm/thumb1/unordsf2.S create mode 100644 compiler-rt/lib/builtins/arm/unordsf2.S create mode 100644 compiler-rt/test/builtins/Unit/comparesf2new_test.c diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index d5034cd4d286a..3fa21578c86ad 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -472,8 +472,11 @@ if(COMPILER_RT_ARM_OPTIMIZED_FP AND BUILTIN_SUPPORTED_ARCH MATCHES "arm") arm/muldf3.S arm/divdf3.S arm/cmpdf2.S + arm/cmpsf2.S arm/gedf2.S + arm/gesf2.S arm/unorddf2.S + arm/unordsf2.S ) set_source_files_properties(${assembly_files} PROPERTIES COMPILE_OPTIONS ${implicit_it_flag}) @@ -517,7 +520,6 @@ set(arm_sync_SOURCES set(thumb1_base_SOURCES arm/divsi3.S arm/udivsi3.S - arm/comparesf2.S arm/addsf3.S ${GENERIC_SOURCES} ) @@ -527,8 +529,11 @@ if(COMPILER_RT_ARM_OPTIMIZED_FP) set(thumb1_base_SOURCES arm/thumb1/mulsf3.S arm/thumb1/cmpdf2.S + arm/thumb1/cmpsf2.S arm/thumb1/gedf2.S + arm/thumb1/gesf2.S 
arm/thumb1/unorddf2.S + arm/thumb1/unordsf2.S arm/fnan2.c arm/fnorm2.c arm/funder.c @@ -536,6 +541,15 @@ if(COMPILER_RT_ARM_OPTIMIZED_FP) ) set_special_properties(arm/thumb1/cmpdf2.S SUPERSEDES comparedf2.c PROVIDES comparedf2) + set_special_properties(arm/thumb1/cmpsf2.S + SUPERSEDES comparesf2.c PROVIDES comparesf2) +else() + # Other Thumb1 assembly implementations which do not fall under the + # COMPILER_RT_ARM_OPTIMIZED_FP umbrella + set(thumb1_base_SOURCES + arm/comparesf2.S + ${thumb1_base_SOURCES} + ) endif() set(arm_EABI_RT_SOURCES diff --git a/compiler-rt/lib/builtins/arm/cmpsf2.S b/compiler-rt/lib/builtins/arm/cmpsf2.S new file mode 100644 index 0000000000000..14166246101af --- /dev/null +++ b/compiler-rt/lib/builtins/arm/cmpsf2.S @@ -0,0 +1,56 @@ +//===-- cmpsf2.S - single-precision floating point comparison -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This function has the semantics of GNU __cmpsf2: it's a three-way compare +// which returns <0 if x0 if x>y. If the result is +// unordered (i.e. x or y or both is NaN) then it returns >0. +// +// This also makes it suitable for use as all of __eqsf2, __nesf2, __ltsf2 or +// __lesf2. 
+// +//===----------------------------------------------------------------------===// + +#include "../assembly.h" + + .syntax unified + .text + .p2align 2 + +op0 .req r0 +op1 .req r1 +.macro SetReturnRegister + mov r0, #0 + movhi r0, #1 + movlo r0, #-1 +.endm + +#if __ARM_PCS_VFP +DEFINE_COMPILERRT_FUNCTION(__cmpsf2) + push {r4, lr} + vmov r0, s0 + vmov r1, s1 + bl __compiler_rt_softfp_cmpsf2 + pop {r4, pc} +#else +DEFINE_COMPILERRT_FUNCTION_ALIAS(__cmpsf2, __compiler_rt_softfp_cmpsf2) +#endif +DEFINE_COMPILERRT_FUNCTION_ALIAS(__lesf2, __cmpsf2) +DEFINE_COMPILERRT_FUNCTION_ALIAS(__ltsf2, __cmpsf2) +DEFINE_COMPILERRT_FUNCTION_ALIAS(__eqsf2, __cmpsf2) +DEFINE_COMPILERRT_FUNCTION_ALIAS(__nesf2, __cmpsf2) + +DEFINE_COMPILERRT_FUNCTION(__compiler_rt_softfp_cmpsf2) + #include "fcmp.h" + +LOCAL_LABEL(NaN): + mov r0, #+1 + bx lr + +END_COMPILERRT_FUNCTION(__compiler_rt_softfp_cmpsf2) + +NO_EXEC_STACK_DIRECTIVE diff --git a/compiler-rt/lib/builtins/arm/fcmp.h b/compiler-rt/lib/builtins/arm/fcmp.h new file mode 100644 index 0000000000000..4860479f45158 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/fcmp.h @@ -0,0 +1,176 @@ +//===-- fcmp.h - shared code for single-precision FP comparison functions -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This code is the skeleton of a single-precision FP compare, with two details +// left out: which input value is in which register, and how to make the return +// value. It allows the main comparison logic to be shared between (for +// example) __lesf2 and __gesf2, varying only those details.
+// +//===----------------------------------------------------------------------===// + +// How to use this header file: +// +// This header file is expected to be #included from inside a function +// definition in a .S file. The source file including this header should +// provide the following: +// +// op0 and op1: register aliases (via .req) for the registers containing the +// input operands. +// - For most comparisons, op0 will correspond to r0 and op1 to r1. +// - But a function with the reversed semantics of __aeabi_cfrcmple will define +// them the other way round. +// +// SetReturnRegister: an assembly macro that looks at the PSR flags and sets up +// an appropriate return value in r0, for the cases that do *not* involve NaN. +// - On entry to this macro, the condition codes LO, EQ and HI indicate that +// op0 < op1, op0 == op1 or op0 > op1 respectively. +// - For functions that return a result in the flags, this macro can be empty, +// because those are the correct flags to return anyway. +// - Functions that return a boolean in r0 should set it up by checking the +// flags. +// +// LOCAL_LABEL(NaN): a label defined within the compare function, after the +// #include of this header. Called when at least one input is a NaN, and sets +// up the appropriate return value for that case. + +// -------------------------------------------------- +// The actual entry point of the compare function. +// +// The basic plan is to start by ORing together the two inputs. This tells us +// two things: +// - the top bit of the output tells us whether both inputs are positive, or +// whether at least one is negative +// - if the 8 exponent bits of the output are not all 1, then there are +// definitely no NaNs, so a fast path can handle most non-NaN cases. + +// clang-format off + + // First diverge control for the negative-numbers case. + orrs r12, op0, op1 + bmi LOCAL_LABEL(negative) // high bit set => at least one negative input + + // Here, both inputs are positive. 
Try adding 1<<23 to their bitwise OR in + // r12. This will carry all the way into the top bit, setting the N flag, if + // all 8 exponent bits were set. + cmn r12, #1 << 23 + bmi LOCAL_LABEL(NaNInf_check_positive) // need to look harder for NaNs + + // The fastest fast path: both inputs positive and we could easily tell there + // were no NaNs. So we just compare op0 and op1 as unsigned integers. + cmp op0, op1 + SetReturnRegister + bx lr + +LOCAL_LABEL(NaNInf_check_positive): + // Second tier for positive numbers. We come here if both inputs are + // positive, but our fast initial check didn't manage to rule out a NaN. But + // it's not guaranteed that there _is_ a NaN, for two reasons: + // + // 1. An input with exponent 0xFF might be an infinity instead. Those behave + // normally under comparison. + // + // 2. There might not even _be_ an input with exponent 0xFF. All we know so + // far is that the two inputs ORed together had all the exponent bits + // set. So each of those bits is set in _at least one_ of the inputs, but + // not necessarily all in the _same_ input. + // + // Test each exponent individually for 0xFF, using the same CMN idiom as + // above. If neither one carries into the sign bit then we have no NaNs _or_ + // infinities and can compare the registers and return again. + cmn op0, #1 << 23 + cmnpl op1, #1 << 23 + bmi LOCAL_LABEL(NaN_check_positive) + + // Second-tier return path, now we've ruled out anything difficult. + cmp op0, op1 + SetReturnRegister + bx lr + +LOCAL_LABEL(NaN_check_positive): + // Third tier for positive numbers. Here we know that at least one of the + // inputs has exponent 0xFF. But they might still be infinities rather than + // NaNs. So now we must check whether there's an actual NaN, by shifting each + // input left to get rid of the sign bit, and seeing if the result is + // _greater_ than 0xFF000000 (but not equal). 
+ // + // We could have skipped the second-tier check and done this more rigorous + // test immediately. But that would cost an extra instruction in the case + // where there are no infinities or NaNs, and we assume that that is so much + // more common that it's worth optimizing for. + mov r12, #0xFF << 24 + cmp r12, op0, LSL #1 // if LO, then r12 < (op0 << 1), so op0 is a NaN + cmphs r12, op1, LSL #1 // if not LO, then do the same check for op1 + blo LOCAL_LABEL(NaN) // now, if LO, there's definitely a NaN + + // Now we've finally ruled out NaNs! And we still know both inputs are + // positive. So the third-tier return path can just compare the numbers + // again. + cmp op0, op1 + SetReturnRegister + bx lr + +LOCAL_LABEL(negative): + // We come here if at least one operand is negative. We haven't checked for + // NaNs at all yet (the sign check came first), so repeat the first-tier + // check strategy of seeing if all exponent bits are set in r12. + // + // On this path, the sign bit in r12 is set, so if adding 1 to the low + // exponent bit carries all the way through into the sign bit, it will + // _clear_ the sign bit rather than setting it. So we expect MI to be the + // "definitely no NaNs" result, where it was PL on the positive branch. + cmn r12, #1 << 23 + bpl LOCAL_LABEL(NaNInf_check_negative) + + // Now we have no NaNs, but at least one negative number. This gives us two + // complications: + // + // 1. Floating-point numbers are sign/magnitude, not two's complement, so we + // have to consider separately the cases of "both negative" and "one of + // each sign". + // + // 2. -0 and +0 are required to compare equal. + // + // But problem #1 is not as hard as it sounds! If both operands are negative, + // then we can get the result we want by comparing them as unsigned integers + // the opposite way round, because the input with the smaller value (as an + // integer) is the larger number in an FP ordering sense. 
And if one operand + // is negative and the other is positive, the _same_ reversed comparison + // works, because the positive number (with zero sign bit) will always + // compare less than the negative one in an unsigned-integers sense. + // + // So we only have to worry about problem #2, signed zeroes. This only + // affects the answer if _both_ operands are zero. And we can check that + // easily, because it happens if and only if r12 = 0x80000000. (We know r12 + // has its sign bit set; if it has no other bits set, that's because both + // inputs were either 0x80000000 or 0x00000000.) + cmp r12, #0x80000000 // EQ if both inputs are zero + cmpne op1, op0 // otherwise, compare them backwards + SetReturnRegister + bx lr + +LOCAL_LABEL(NaNInf_check_negative): + // Second tier for negative numbers: we know the OR of the exponents is 0xFF, + // but again, we might not have either _actual_ exponent 0xFF, and also, an + // exponent 0xFF might be an infinity instead of a NaN. + // + // On this path we've already branched twice (once for negative numbers and + // once for the first-tier NaN check), so we'll just go straight to the + // precise check for NaNs. + mov r12, #0xFF << 24 + cmp r12, op0, LSL #1 // if LO, then r12 < (op0 << 1), so op0 is a NaN + cmphs r12, op1, LSL #1 // if not LO, then do the same check for op1 + blo LOCAL_LABEL(NaN) + + // Now we've ruled out NaNs, so we can just compare the two input registers + // and return. On this path we _don't_ need to check for the special case of + // comparing two zeroes, because we only came here if the bitwise OR of the + // exponent fields was 0xFF, which means the exponents can't both have been + // zero! So we can _just_ do the reversed CMP and finish. 
+ cmp op1, op0 + SetReturnRegister + bx lr diff --git a/compiler-rt/lib/builtins/arm/gesf2.S b/compiler-rt/lib/builtins/arm/gesf2.S new file mode 100644 index 0000000000000..c149eea589f05 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/gesf2.S @@ -0,0 +1,54 @@ +//===-- gesf2.S - single-precision floating point comparison --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This function has the semantics of GNU __cmpsf2, except for its NaN +// handling. It's a three-way compare which returns <0 if x0 if x>y. If the result is unordered (i.e. x or y or both is NaN) then it +// returns <0, where __cmpsf2 would return >0. +// +// This also makes it suitable for use as __gtsf2 or __gesf2 (or __eqsf2 or +// __nesf2). +// +//===----------------------------------------------------------------------===// + +#include "../assembly.h" + + .syntax unified + .text + .p2align 2 + +op0 .req r0 +op1 .req r1 +.macro SetReturnRegister + mov r0, #0 + movhi r0, #1 + movlo r0, #-1 +.endm + +#if __ARM_PCS_VFP +DEFINE_COMPILERRT_FUNCTION(__gesf2) + push {r4, lr} + vmov r0, s0 + vmov r1, s1 + bl __compiler_rt_softfp_gesf2 + pop {r4, pc} +#else +DEFINE_COMPILERRT_FUNCTION_ALIAS(__gesf2, __compiler_rt_softfp_gesf2) +#endif +DEFINE_COMPILERRT_FUNCTION_ALIAS(__gtsf2, __gesf2) + +DEFINE_COMPILERRT_FUNCTION(__compiler_rt_softfp_gesf2) + #include "fcmp.h" + +LOCAL_LABEL(NaN): + mov r0, #-1 + bx lr + +END_COMPILERRT_FUNCTION(__compiler_rt_softfp_gesf2) + +NO_EXEC_STACK_DIRECTIVE diff --git a/compiler-rt/lib/builtins/arm/thumb1/cmpsf2.S b/compiler-rt/lib/builtins/arm/thumb1/cmpsf2.S new file mode 100644 index 0000000000000..e4a5e08c35181 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/thumb1/cmpsf2.S @@ -0,0 +1,55 @@ +//===-- 
cmpsf2.S - single-precision floating point comparison -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This function has the semantics of GNU __cmpsf2: it's a three-way compare +// which returns <0 if x0 if x>y. If the result is +// unordered (i.e. x or y or both is NaN) then it returns >0. +// +// This also makes it suitable for use as all of __eqsf2, __nesf2, __ltsf2 or +// __lesf2. +// +//===----------------------------------------------------------------------===// + +#include "../../assembly.h" + + .syntax unified + .text + .p2align 2 + +op0 .req r0 +op1 .req r1 +.macro ReturnResult + bhi 0f + blo 1f + movs r0, #0 + bx lr +0: + movs r0, #1 + bx lr +1: + movs r0, #1 + rsbs r0, r0, #0 + bx lr +.endm + +DEFINE_COMPILERRT_FUNCTION_ALIAS(__cmpsf2, __compiler_rt_softfp_cmpsf2) +DEFINE_COMPILERRT_FUNCTION_ALIAS(__lesf2, __cmpsf2) +DEFINE_COMPILERRT_FUNCTION_ALIAS(__ltsf2, __cmpsf2) +DEFINE_COMPILERRT_FUNCTION_ALIAS(__eqsf2, __cmpsf2) +DEFINE_COMPILERRT_FUNCTION_ALIAS(__nesf2, __cmpsf2) + +DEFINE_COMPILERRT_THUMB_FUNCTION(__compiler_rt_softfp_cmpsf2) + #include "fcmp.h" + +LOCAL_LABEL(NaN): + movs r0, #1 + bx lr + +END_COMPILERRT_FUNCTION(__compiler_rt_softfp_cmpsf2) + +NO_EXEC_STACK_DIRECTIVE diff --git a/compiler-rt/lib/builtins/arm/thumb1/fcmp.h b/compiler-rt/lib/builtins/arm/thumb1/fcmp.h new file mode 100644 index 0000000000000..7d85abae05129 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/thumb1/fcmp.h @@ -0,0 +1,189 @@ +//===-- fcmp.h - shared code for single-precision FP comparison functions -===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This code is the skeleton of a single-precision FP compare, with two details +// left out: which input value is in which register, and how to make the return +// value. It allows the main comparison logic to be shared between (for +// example) __lesf2 and __gesf2, varying only those details. +// +//===----------------------------------------------------------------------===// + +// How to use this header file: +// +// This header file is expected to be #included from inside a function +// definition in a .S file. The source file including this header should +// provide the following: +// +// op0 and op1: register aliases (via .req) for the registers containing the +// input operands. +// - For most comparisons, op0 will correspond to r0 and op1 to r1. +// - But a function with the reversed semantics of __aeabi_cfrcmple will define +// them the other way round. +// +// ReturnResult: an assembly macro that looks at the PSR flags, sets up an +// appropriate return value in r0, and returns it, for the cases that do *not* +// involve NaN. +// - On entry to this macro, the condition codes LO, EQ and HI indicate that +// op0 < op1, op0 == op1 or op0 > op1 respectively. +// - For functions that return a result in the flags, this macro can just +// return immediately, because those are the correct flags to return anyway. +// - Functions that return a boolean in r0 should set it up by checking the +// flags. +// +// LOCAL_LABEL(NaN): a label defined within the compare function, after the +// #include of this header. Called when at least one input is a NaN, and sets +// up the appropriate return value for that case. + +// -------------------------------------------------- +// The actual entry point of the compare function. +// +// The basic plan is to start by ORing together the two inputs. 
This tells us +// two things: +// - the top bit of the output tells us whether both inputs are positive, or +// whether at least one is negative +// - if the 8 exponent bits of the output are not all 1, then there are +// definitely no NaNs, so a fast path can handle most non-NaN cases. + +// clang-format off + + // Set up the constant 1 << 23 in a register, which we'll need on all + // branches. + movs r3, #1 + lsls r3, r3, #23 + + // Diverge control for the negative-numbers case. + movs r2, op0 + orrs r2, r2, op1 + bmi LOCAL_LABEL(negative) // high bit set => at least one negative input + + // Here, both inputs are positive. Try adding 1<<23 to their bitwise OR in + // r2. This will carry all the way into the top bit, setting the N flag, if + // all 8 exponent bits were set. + cmn r2, r3 + bmi LOCAL_LABEL(NaNInf_check_positive) // need to look harder for NaNs + + // The fastest fast path: both inputs positive and we could easily tell there + // were no NaNs. So we just compare op0 and op1 as unsigned integers. + cmp op0, op1 + ReturnResult + +LOCAL_LABEL(NaNInf_check_positive): + // Second tier for positive numbers. We come here if both inputs are + // positive, but our fast initial check didn't manage to rule out a NaN. But + // it's not guaranteed that there _is_ a NaN, for two reasons: + // + // 1. An input with exponent 0xFF might be an infinity instead. Those behave + // normally under comparison. + // + // 2. There might not even _be_ an input with exponent 0xFF. All we know so + // far is that the two inputs ORed together had all the exponent bits + // set. So each of those bits is set in _at least one_ of the inputs, but + // not necessarily all in the _same_ input. + // + // Test each exponent individually for 0xFF, using the same CMN idiom as + // above. If neither one carries into the sign bit then we have no NaNs _or_ + // infinities and can compare the registers and return again. 
+ cmn op0, r3 + bmi LOCAL_LABEL(NaN_check_positive) + cmn op1, r3 + bmi LOCAL_LABEL(NaN_check_positive) + + // Second-tier return path, now we've ruled out anything difficult. + cmp op0, op1 + ReturnResult + +LOCAL_LABEL(NaN_check_positive): + // Third tier for positive numbers. Here we know that at least one of the + // inputs has exponent 0xFF. But they might still be infinities rather than + // NaNs. So now we must check whether there's an actual NaN, by shifting each + // input left to get rid of the sign bit, and seeing if the result is + // _greater_ than 0xFF000000 (but not equal). + // + // We could have skipped the second-tier check and done this more rigorous + // test immediately. But that would cost an extra instruction in the case + // where there are no infinities or NaNs, and we assume that that is so much + // more common that it's worth optimizing for. + movs r2, #0xFF + lsls r2, r2, #24 + lsls r3, op0, #1 + cmp r3, r2 + bhi LOCAL_LABEL(NaN) + lsls r3, op1, #1 + cmp r3, r2 + bhi LOCAL_LABEL(NaN) + + // Now we've finally ruled out NaNs! And we still know both inputs are + // positive. So the third-tier return path can just compare the numbers + // again. + cmp op0, op1 + ReturnResult + +LOCAL_LABEL(negative): + // We come here if at least one operand is negative. We haven't checked for + // NaNs at all yet (the sign check came first), so repeat the first-tier + // check strategy of seeing if all exponent bits are set in r2. + // + // On this path, the sign bit in r2 is set, so if adding 1 to the low + // exponent bit carries all the way through into the sign bit, it will + // _clear_ the sign bit rather than setting it. So we expect MI to be the + // "definitely no NaNs" result, where it was PL on the positive branch. + cmn r2, r3 + bpl LOCAL_LABEL(NaNInf_check_negative) + + // Now we have no NaNs, but at least one negative number. This gives us two + // complications: + // + // 1. 
Floating-point numbers are sign/magnitude, not two's complement, so we + // have to consider separately the cases of "both negative" and "one of + // each sign". + // + // 2. -0 and +0 are required to compare equal. + // + // But problem #1 is not as hard as it sounds! If both operands are negative, + // then we can get the result we want by comparing them as unsigned integers + // the opposite way round, because the input with the smaller value (as an + // integer) is the larger number in an FP ordering sense. And if one operand + // is negative and the other is positive, the _same_ reversed comparison + // works, because the positive number (with zero sign bit) will always + // compare less than the negative one in an unsigned-integers sense. + // + // So we only have to worry about problem #2, signed zeroes. This only + // affects the answer if _both_ operands are zero. And we can check that + // easily, because it happens if and only if r2 = 0x80000000. (We know r2 + // has its sign bit set; if it has no other bits set, that's because both + // inputs were either 0x80000000 or 0x00000000.) + lsls r2, r2, #1 // EQ if both inputs are zero (also sets C) + beq 1f + cmp op1, op0 // otherwise, compare them backwards +1: + ReturnResult + +LOCAL_LABEL(NaNInf_check_negative): + // Second tier for negative numbers: we know the OR of the exponents is 0xFF, + // but again, we might not have either _actual_ exponent 0xFF, and also, an + // exponent 0xFF might be an infinity instead of a NaN. + // + // On this path we've already branched twice (once for negative numbers and + // once for the first-tier NaN check), so we'll just go straight to the + // precise check for NaNs. + movs r2, #0xFF + lsls r2, r2, #24 + lsls r3, op0, #1 + cmp r3, r2 + bhi LOCAL_LABEL(NaN) + lsls r3, op1, #1 + cmp r3, r2 + bhi LOCAL_LABEL(NaN) + + // Now we've ruled out NaNs, so we can just compare the two input registers + // and return. 
On this path we _don't_ need to check for the special case of + // comparing two zeroes, because we only came here if the bitwise OR of the + // exponent fields was 0xFF, which means the exponents can't both have been + // zero! So we can _just_ do the reversed CMP and finish. + cmp op1, op0 + ReturnResult diff --git a/compiler-rt/lib/builtins/arm/thumb1/gesf2.S b/compiler-rt/lib/builtins/arm/thumb1/gesf2.S new file mode 100644 index 0000000000000..3830b6cb21c29 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/thumb1/gesf2.S @@ -0,0 +1,54 @@ +//===-- gesf2.S - single-precision floating point comparison --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This function has the semantics of GNU __cmpsf2, except for its NaN +// handling. It's a three-way compare which returns <0 if x0 if x>y. If the result is unordered (i.e. x or y or both is NaN) then it +// returns <0, where __cmpsf2 would return >0. +// +// This also makes it suitable for use as __gtsf2 or __gesf2 (or __eqsf2 or +// __nesf2). 
+// +//===----------------------------------------------------------------------===// + +#include "../../assembly.h" + + .syntax unified + .text + .p2align 2 + +op0 .req r0 +op1 .req r1 +.macro ReturnResult + bhi 0f + blo 1f + movs r0, #0 + bx lr +0: + movs r0, #1 + bx lr +1: + movs r0, #1 + rsbs r0, r0, #0 + bx lr +.endm + +DEFINE_COMPILERRT_FUNCTION_ALIAS(__gesf2, __compiler_rt_softfp_gesf2) +DEFINE_COMPILERRT_FUNCTION_ALIAS(__gtsf2, __gesf2) + +DEFINE_COMPILERRT_THUMB_FUNCTION(__compiler_rt_softfp_gesf2) + #include "fcmp.h" + +LOCAL_LABEL(NaN): + movs r0, #1 + rsbs r0, r0, #0 + bx lr + +END_COMPILERRT_FUNCTION(__compiler_rt_softfp_gesf2) + +NO_EXEC_STACK_DIRECTIVE diff --git a/compiler-rt/lib/builtins/arm/thumb1/unordsf2.S b/compiler-rt/lib/builtins/arm/thumb1/unordsf2.S new file mode 100644 index 0000000000000..5d74e0fdfe159 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/thumb1/unordsf2.S @@ -0,0 +1,49 @@ +//===-- unordsf2.S - single-precision floating point comparison -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Return 1 if the result of comparing x with y is 'unordered', i.e. +// one of x and y is NaN. +// +//===----------------------------------------------------------------------===// + +#include "../../assembly.h" + + .syntax unified + .text + .p2align 2 + +DEFINE_COMPILERRT_FUNCTION_ALIAS(__unordsf2, __aeabi_fcmpun) + +DEFINE_COMPILERRT_THUMB_FUNCTION(__aeabi_fcmpun) + + // This function isn't based on the general-purpose code in fcmp.h, because + // it's more effort than needed. Here we just need to identify whether or not + // there's at least one NaN in the inputs. 
There's no need to vary that check + // based on the sign bit, so we might as well just do the NaN test as quickly + // as possible. + movs r2, #0xFF + lsls r2, r2, #24 + lsls r3, r0, #1 + cmp r3, r2 + bhi LOCAL_LABEL(NaN) + lsls r3, r1, #1 + cmp r3, r2 + bhi LOCAL_LABEL(NaN) + + // If HS, then we have no NaNs and return false. + movs r0, #0 + bx lr + + // Otherwise, we have at least one NaN, and return true. +LOCAL_LABEL(NaN): + movs r0, #1 + bx lr + +END_COMPILERRT_FUNCTION(__aeabi_fcmpun) + +NO_EXEC_STACK_DIRECTIVE diff --git a/compiler-rt/lib/builtins/arm/unordsf2.S b/compiler-rt/lib/builtins/arm/unordsf2.S new file mode 100644 index 0000000000000..1930996779888 --- /dev/null +++ b/compiler-rt/lib/builtins/arm/unordsf2.S @@ -0,0 +1,56 @@ +//===-- unordsf2.S - single-precision floating point comparison -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Return 1 if the result of comparing x with y is 'unordered', i.e. +// one of x and y is NaN. +// +//===----------------------------------------------------------------------===// + +#include "../assembly.h" + + + .syntax unified + .text + .p2align 2 + +#if __ARM_PCS_VFP +DEFINE_COMPILERRT_FUNCTION(__unordsf2) + push {r4, lr} + vmov r0, s0 + vmov r1, s1 + bl __aeabi_fcmpun + pop {r4, pc} +#else +DEFINE_COMPILERRT_FUNCTION_ALIAS(__unordsf2, __aeabi_fcmpun) +#endif + +DEFINE_COMPILERRT_FUNCTION(__aeabi_fcmpun) + + // This function isn't based on the general-purpose code in fcmp.h, because + // it's more effort than needed. Here we just need to identify whether or not + // there's at least one NaN in the inputs. There's no need to vary that check + // based on the sign bit, so we might as well just do the NaN test as quickly + // as possible. 
+ mov r12, #0xFF << 24 + cmp r12, r0, lsl #1 // if LO, then r12 < (r0 << 1), so r0 is a NaN + cmphs r12, r1, lsl #1 // if not LO, then do the same check for r1 + + // If HS, then we have no NaNs and return false. We do this as quickly as we + // can (not stopping to take two instructions setting up r0 for both + // possibilities), on the assumption that NaNs are rare and we want to + // optimize for the non-NaN path. + movhs r0, #0 + bxhs lr + + // Otherwise, we have at least one NaN, and return true. + mov r0, #1 + bx lr + +END_COMPILERRT_FUNCTION(__aeabi_fcmpun) + +NO_EXEC_STACK_DIRECTIVE diff --git a/compiler-rt/test/builtins/Unit/comparesf2new_test.c b/compiler-rt/test/builtins/Unit/comparesf2new_test.c new file mode 100644 index 0000000000000..b5dfe2352958f --- /dev/null +++ b/compiler-rt/test/builtins/Unit/comparesf2new_test.c @@ -0,0 +1,443 @@ +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// RUN: %clang_builtins %s %librt -o %t && %run %t +// REQUIRES: librt_has_comparesf2 + +#include "int_lib.h" +#include +#include + +#include "fp_test.h" + +COMPILER_RT_ABI int __eqsf2(float, float); +COMPILER_RT_ABI int __nesf2(float, float); +COMPILER_RT_ABI int __gesf2(float, float); +COMPILER_RT_ABI int __gtsf2(float, float); +COMPILER_RT_ABI int __lesf2(float, float); +COMPILER_RT_ABI int __ltsf2(float, float); +COMPILER_RT_ABI int __cmpsf2(float, float); +COMPILER_RT_ABI int __unordsf2(float, float); + +enum Result { RESULT_LT, RESULT_GT, RESULT_EQ, RESULT_UN }; + +int expect(uint32_t a_rep, uint32_t b_rep, const char *name, int result, int ok, + const char *expected, int line) { + if (!ok) + printf("error at line %d: %s(%08" PRIx32 ", %08" PRIx32 + ") = %d, expected %s\n", + line, name, a_rep, b_rep, result, expected); + return !ok; +} + +int test__comparesf2(uint32_t a_rep, uint32_t b_rep, enum Result result, + int line) { + float a = fromRep32(a_rep), b = fromRep32(b_rep); + + int eq = __eqsf2(a, b); + int ne = __nesf2(a, b); + int ge = __gesf2(a, b); + int gt = __gtsf2(a, b); + int le = __lesf2(a, b); + int lt = __ltsf2(a, b); +#ifdef __ELF__ + // The generic builtins/comparedf2.c does not define this function + // for object formats other than ELF + int cmp = __cmpsf2(a, b); +#endif + int unord = __unordsf2(a, b); + + int ret = 0; + + switch (result) { + case RESULT_LT: + ret |= expect(a_rep, b_rep, "__eqsf2", eq, eq != 0, "!= 0", line); + ret |= expect(a_rep, b_rep, "__nesf2", ne, ne != 0, "!= 0", line); + ret |= expect(a_rep, b_rep, "__gesf2", ge, ge < 0, "< 0", line); + ret |= expect(a_rep, b_rep, "__gtsf2", gt, gt <= 0, "<= 0", line); + ret |= expect(a_rep, b_rep, "__lesf2", le, le <= 0, "<= 0", line); + ret |= expect(a_rep, b_rep, "__ltsf2", lt, lt < 0, "< 0", line); +#ifdef __ELF__ + ret |= expect(a_rep, b_rep, "__cmpsf2", cmp, cmp == -1, "== -1", line); +#endif + ret |= expect(a_rep, 
b_rep, "__unordsf2", unord, unord == 0, "== 0", line); + break; + case RESULT_GT: + ret |= expect(a_rep, b_rep, "__eqsf2", eq, eq != 0, "!= 0", line); + ret |= expect(a_rep, b_rep, "__nesf2", ne, ne != 0, "!= 0", line); + ret |= expect(a_rep, b_rep, "__gesf2", ge, ge >= 0, ">= 0", line); + ret |= expect(a_rep, b_rep, "__gtsf2", gt, gt > 0, "> 0", line); + ret |= expect(a_rep, b_rep, "__lesf2", le, le > 0, "> 0", line); + ret |= expect(a_rep, b_rep, "__ltsf2", lt, lt >= 0, ">= 0", line); +#ifdef __ELF__ + ret |= expect(a_rep, b_rep, "__cmpsf2", cmp, cmp == 1, "== 1", line); +#endif + ret |= expect(a_rep, b_rep, "__unordsf2", unord, unord == 0, "== 0", line); + break; + case RESULT_EQ: + ret |= expect(a_rep, b_rep, "__eqsf2", eq, eq == 0, "== 0", line); + ret |= expect(a_rep, b_rep, "__nesf2", ne, ne == 0, "== 0", line); + ret |= expect(a_rep, b_rep, "__gesf2", ge, ge >= 0, ">= 0", line); + ret |= expect(a_rep, b_rep, "__gtsf2", gt, gt <= 0, "<= 0", line); + ret |= expect(a_rep, b_rep, "__lesf2", le, le <= 0, "<= 0", line); + ret |= expect(a_rep, b_rep, "__ltsf2", lt, lt >= 0, ">= 0", line); +#ifdef __ELF__ + ret |= expect(a_rep, b_rep, "__cmpsf2", cmp, cmp == 0, "== 0", line); +#endif + ret |= expect(a_rep, b_rep, "__unordsf2", unord, unord == 0, "== 0", line); + break; + case RESULT_UN: + ret |= expect(a_rep, b_rep, "__eqsf2", eq, eq != 0, "!= 0", line); + ret |= expect(a_rep, b_rep, "__nesf2", ne, ne != 0, "!= 0", line); + ret |= expect(a_rep, b_rep, "__gesf2", ge, ge < 0, "< 0", line); + ret |= expect(a_rep, b_rep, "__gtsf2", gt, gt <= 0, "<= 0", line); + ret |= expect(a_rep, b_rep, "__lesf2", le, le > 0, "> 0", line); + ret |= expect(a_rep, b_rep, "__ltsf2", lt, lt >= 0, ">= 0", line); +#ifdef __ELF__ + ret |= expect(a_rep, b_rep, "__cmpsf2", cmp, cmp == 1, "== 1", line); +#endif + ret |= expect(a_rep, b_rep, "__unordsf2", unord, unord == 1, "== 1", line); + break; + } + + return ret; +} + +#define test__comparesf2(a, b, x) test__comparesf2(a, b, x, __LINE__) + 
+int main(void) { + int status = 0; + + status |= test__comparesf2(0x00000000, 0x00000001, RESULT_LT); + status |= test__comparesf2(0x00000000, 0x007fffff, RESULT_LT); + status |= test__comparesf2(0x00000000, 0x3f800000, RESULT_LT); + status |= test__comparesf2(0x00000000, 0x7f000000, RESULT_LT); + status |= test__comparesf2(0x00000000, 0x7f800000, RESULT_LT); + status |= test__comparesf2(0x00000000, 0x7f872da0, RESULT_UN); + status |= test__comparesf2(0x00000000, 0x7fe42e09, RESULT_UN); + status |= test__comparesf2(0x00000000, 0x80000000, RESULT_EQ); + status |= test__comparesf2(0x00000000, 0x80000001, RESULT_GT); + status |= test__comparesf2(0x00000000, 0x807fffff, RESULT_GT); + status |= test__comparesf2(0x00000000, 0x80800000, RESULT_GT); + status |= test__comparesf2(0x00000000, 0xff800000, RESULT_GT); + status |= test__comparesf2(0x00000001, 0x00000001, RESULT_EQ); + status |= test__comparesf2(0x00000001, 0x3f7fffff, RESULT_LT); + status |= test__comparesf2(0x00000001, 0x3f800000, RESULT_LT); + status |= test__comparesf2(0x00000001, 0x3ffffffe, RESULT_LT); + status |= test__comparesf2(0x00000001, 0x3fffffff, RESULT_LT); + status |= test__comparesf2(0x00000001, 0x7effffff, RESULT_LT); + status |= test__comparesf2(0x00000001, 0x7f000000, RESULT_LT); + status |= test__comparesf2(0x00000001, 0x7f7ffffe, RESULT_LT); + status |= test__comparesf2(0x00000001, 0x7f7fffff, RESULT_LT); + status |= test__comparesf2(0x00000001, 0x7f94d5b9, RESULT_UN); + status |= test__comparesf2(0x00000001, 0x7fef53b1, RESULT_UN); + status |= test__comparesf2(0x00000001, 0x80000001, RESULT_GT); + status |= test__comparesf2(0x00000001, 0xbf7fffff, RESULT_GT); + status |= test__comparesf2(0x00000001, 0xbf800000, RESULT_GT); + status |= test__comparesf2(0x00000001, 0xbffffffe, RESULT_GT); + status |= test__comparesf2(0x00000001, 0xbfffffff, RESULT_GT); + status |= test__comparesf2(0x00000001, 0xfeffffff, RESULT_GT); + status |= test__comparesf2(0x00000001, 0xff000000, RESULT_GT); + status |= 
test__comparesf2(0x00000001, 0xff7ffffe, RESULT_GT); + status |= test__comparesf2(0x00000001, 0xff7fffff, RESULT_GT); + status |= test__comparesf2(0x00000002, 0x00000001, RESULT_GT); + status |= test__comparesf2(0x00000003, 0x00000002, RESULT_GT); + status |= test__comparesf2(0x00000003, 0x40400000, RESULT_LT); + status |= test__comparesf2(0x00000003, 0x40a00000, RESULT_LT); + status |= test__comparesf2(0x00000003, 0x7f000000, RESULT_LT); + status |= test__comparesf2(0x00000003, 0xc0a00000, RESULT_GT); + status |= test__comparesf2(0x00000003, 0xff000000, RESULT_GT); + status |= test__comparesf2(0x00000004, 0x00000004, RESULT_EQ); + status |= test__comparesf2(0x007ffffc, 0x807ffffc, RESULT_GT); + status |= test__comparesf2(0x007ffffd, 0x007ffffe, RESULT_LT); + status |= test__comparesf2(0x007fffff, 0x00000000, RESULT_GT); + status |= test__comparesf2(0x007fffff, 0x007ffffe, RESULT_GT); + status |= test__comparesf2(0x007fffff, 0x007fffff, RESULT_EQ); + status |= test__comparesf2(0x007fffff, 0x00800000, RESULT_LT); + status |= test__comparesf2(0x007fffff, 0x7f800000, RESULT_LT); + status |= test__comparesf2(0x007fffff, 0x7fa111d3, RESULT_UN); + status |= test__comparesf2(0x007fffff, 0x7ff43134, RESULT_UN); + status |= test__comparesf2(0x007fffff, 0x80000000, RESULT_GT); + status |= test__comparesf2(0x007fffff, 0xff800000, RESULT_GT); + status |= test__comparesf2(0x00800000, 0x00000000, RESULT_GT); + status |= test__comparesf2(0x00800000, 0x00800000, RESULT_EQ); + status |= test__comparesf2(0x00800000, 0x80800000, RESULT_GT); + status |= test__comparesf2(0x00800001, 0x00800000, RESULT_GT); + status |= test__comparesf2(0x00800001, 0x00800002, RESULT_LT); + status |= test__comparesf2(0x00ffffff, 0x01000000, RESULT_LT); + status |= test__comparesf2(0x00ffffff, 0x01000002, RESULT_LT); + status |= test__comparesf2(0x00ffffff, 0x01000004, RESULT_LT); + status |= test__comparesf2(0x01000000, 0x00ffffff, RESULT_GT); + status |= test__comparesf2(0x01000001, 0x00800001, 
RESULT_GT); + status |= test__comparesf2(0x01000001, 0x00ffffff, RESULT_GT); + status |= test__comparesf2(0x01000002, 0x00800001, RESULT_GT); + status |= test__comparesf2(0x017fffff, 0x01800000, RESULT_LT); + status |= test__comparesf2(0x01800000, 0x017fffff, RESULT_GT); + status |= test__comparesf2(0x01800001, 0x017fffff, RESULT_GT); + status |= test__comparesf2(0x01800002, 0x01000003, RESULT_GT); + status |= test__comparesf2(0x3f000000, 0x3f000000, RESULT_EQ); + status |= test__comparesf2(0x3f7fffff, 0x00000001, RESULT_GT); + status |= test__comparesf2(0x3f7fffff, 0x80000001, RESULT_GT); + status |= test__comparesf2(0x3f800000, 0x3f800000, RESULT_EQ); + status |= test__comparesf2(0x3f800000, 0x3f800003, RESULT_LT); + status |= test__comparesf2(0x3f800000, 0x40000000, RESULT_LT); + status |= test__comparesf2(0x3f800000, 0x40e00000, RESULT_LT); + status |= test__comparesf2(0x3f800000, 0x7fb27f62, RESULT_UN); + status |= test__comparesf2(0x3f800000, 0x7fd9d4b4, RESULT_UN); + status |= test__comparesf2(0x3f800000, 0x80000000, RESULT_GT); + status |= test__comparesf2(0x3f800000, 0xbf800000, RESULT_GT); + status |= test__comparesf2(0x3f800000, 0xbf800003, RESULT_GT); + status |= test__comparesf2(0x3f800001, 0x3f800000, RESULT_GT); + status |= test__comparesf2(0x3f800001, 0x3f800002, RESULT_LT); + status |= test__comparesf2(0x3f800001, 0xbf800000, RESULT_GT); + status |= test__comparesf2(0x3ffffffc, 0x3ffffffd, RESULT_LT); + status |= test__comparesf2(0x3fffffff, 0x00000001, RESULT_GT); + status |= test__comparesf2(0x3fffffff, 0x40000000, RESULT_LT); + status |= test__comparesf2(0x40000000, 0x3f800000, RESULT_GT); + status |= test__comparesf2(0x40000000, 0x3fffffff, RESULT_GT); + status |= test__comparesf2(0x40000000, 0x40000000, RESULT_EQ); + status |= test__comparesf2(0x40000000, 0x40000001, RESULT_LT); + status |= test__comparesf2(0x40000000, 0xc0000000, RESULT_GT); + status |= test__comparesf2(0x40000000, 0xc0000001, RESULT_GT); + status |= 
test__comparesf2(0x40000000, 0xc0a00000, RESULT_GT); + status |= test__comparesf2(0x40000001, 0x3f800001, RESULT_GT); + status |= test__comparesf2(0x40000001, 0x40000002, RESULT_LT); + status |= test__comparesf2(0x40000001, 0xc0000002, RESULT_GT); + status |= test__comparesf2(0x40000002, 0x3f800001, RESULT_GT); + status |= test__comparesf2(0x40000002, 0x3f800003, RESULT_GT); + status |= test__comparesf2(0x40000004, 0x40000003, RESULT_GT); + status |= test__comparesf2(0x40400000, 0x40400000, RESULT_EQ); + status |= test__comparesf2(0x407fffff, 0x407ffffe, RESULT_GT); + status |= test__comparesf2(0x407fffff, 0x40800002, RESULT_LT); + status |= test__comparesf2(0x40800001, 0x407fffff, RESULT_GT); + status |= test__comparesf2(0x40a00000, 0x00000000, RESULT_GT); + status |= test__comparesf2(0x40a00000, 0x80000000, RESULT_GT); + status |= test__comparesf2(0x40a00000, 0xbf800000, RESULT_GT); + status |= test__comparesf2(0x40a00000, 0xc0a00000, RESULT_GT); + status |= test__comparesf2(0x7d800001, 0x7d7fffff, RESULT_GT); + status |= test__comparesf2(0x7e7fffff, 0x7e7ffffe, RESULT_GT); + status |= test__comparesf2(0x7e7fffff, 0x7e800002, RESULT_LT); + status |= test__comparesf2(0x7e800000, 0x7e7fffff, RESULT_GT); + status |= test__comparesf2(0x7e800000, 0x7e800000, RESULT_EQ); + status |= test__comparesf2(0x7e800000, 0x7e800001, RESULT_LT); + status |= test__comparesf2(0x7e800001, 0x7e800000, RESULT_GT); + status |= test__comparesf2(0x7e800001, 0x7f000001, RESULT_LT); + status |= test__comparesf2(0x7e800001, 0xfe800000, RESULT_GT); + status |= test__comparesf2(0x7e800002, 0x7e000003, RESULT_GT); + status |= test__comparesf2(0x7e800004, 0x7e800003, RESULT_GT); + status |= test__comparesf2(0x7efffffe, 0x7efffffe, RESULT_EQ); + status |= test__comparesf2(0x7efffffe, 0x7effffff, RESULT_LT); + status |= test__comparesf2(0x7efffffe, 0xfeffffff, RESULT_GT); + status |= test__comparesf2(0x7effffff, 0x3f800000, RESULT_GT); + status |= test__comparesf2(0x7effffff, 0x7f000000, 
RESULT_LT); + status |= test__comparesf2(0x7effffff, 0xbf800000, RESULT_GT); + status |= test__comparesf2(0x7effffff, 0xff000000, RESULT_GT); + status |= test__comparesf2(0x7f000000, 0x3f800000, RESULT_GT); + status |= test__comparesf2(0x7f000000, 0x7f000000, RESULT_EQ); + status |= test__comparesf2(0x7f000000, 0x7f800000, RESULT_LT); + status |= test__comparesf2(0x7f000000, 0xbf800000, RESULT_GT); + status |= test__comparesf2(0x7f000000, 0xff000000, RESULT_GT); + status |= test__comparesf2(0x7f000000, 0xff800000, RESULT_GT); + status |= test__comparesf2(0x7f000001, 0x7f000000, RESULT_GT); + status |= test__comparesf2(0x7f000001, 0x7f000002, RESULT_LT); + status |= test__comparesf2(0x7f000001, 0xff000000, RESULT_GT); + status |= test__comparesf2(0x7f000002, 0x7e800001, RESULT_GT); + status |= test__comparesf2(0x7f7ffffe, 0x3f800000, RESULT_GT); + status |= test__comparesf2(0x7f7ffffe, 0x7f7fffff, RESULT_LT); + status |= test__comparesf2(0x7f7ffffe, 0xbf800000, RESULT_GT); + status |= test__comparesf2(0x7f7ffffe, 0xff7fffff, RESULT_GT); + status |= test__comparesf2(0x7f7fffff, 0x00000001, RESULT_GT); + status |= test__comparesf2(0x7f7fffff, 0x3f800000, RESULT_GT); + status |= test__comparesf2(0x7f7fffff, 0x7f7fffff, RESULT_EQ); + status |= test__comparesf2(0x7f7fffff, 0x7fbed1eb, RESULT_UN); + status |= test__comparesf2(0x7f7fffff, 0x7fe15ee3, RESULT_UN); + status |= test__comparesf2(0x7f7fffff, 0x80000001, RESULT_GT); + status |= test__comparesf2(0x7f7fffff, 0xbf800000, RESULT_GT); + status |= test__comparesf2(0x7f800000, 0x00000000, RESULT_GT); + status |= test__comparesf2(0x7f800000, 0x00000001, RESULT_GT); + status |= test__comparesf2(0x7f800000, 0x007fffff, RESULT_GT); + status |= test__comparesf2(0x7f800000, 0x7f000000, RESULT_GT); + status |= test__comparesf2(0x7f800000, 0x7f7fffff, RESULT_GT); + status |= test__comparesf2(0x7f800000, 0x7f800000, RESULT_EQ); + status |= test__comparesf2(0x7f800000, 0x7f91a4da, RESULT_UN); + status |= 
test__comparesf2(0x7f800000, 0x7fd44a09, RESULT_UN); + status |= test__comparesf2(0x7f800000, 0x80000000, RESULT_GT); + status |= test__comparesf2(0x7f800000, 0x80000001, RESULT_GT); + status |= test__comparesf2(0x7f800000, 0x807fffff, RESULT_GT); + status |= test__comparesf2(0x7f800000, 0xff000000, RESULT_GT); + status |= test__comparesf2(0x7f800000, 0xff7fffff, RESULT_GT); + status |= test__comparesf2(0x7f800000, 0xff800000, RESULT_GT); + status |= test__comparesf2(0x7f86d066, 0x00000000, RESULT_UN); + status |= test__comparesf2(0x7f85a878, 0x00000001, RESULT_UN); + status |= test__comparesf2(0x7f8c0dca, 0x007fffff, RESULT_UN); + status |= test__comparesf2(0x7f822725, 0x3f800000, RESULT_UN); + status |= test__comparesf2(0x7f853870, 0x7f7fffff, RESULT_UN); + status |= test__comparesf2(0x7fbefc9d, 0x7f800000, RESULT_UN); + status |= test__comparesf2(0x7f9f84a9, 0x7f81461b, RESULT_UN); + status |= test__comparesf2(0x7f9e2c1d, 0x7fe4a313, RESULT_UN); + status |= test__comparesf2(0x7fb0e6d0, 0x80000000, RESULT_UN); + status |= test__comparesf2(0x7fac9171, 0x80000001, RESULT_UN); + status |= test__comparesf2(0x7f824ae6, 0x807fffff, RESULT_UN); + status |= test__comparesf2(0x7fa8b9a0, 0xbf800000, RESULT_UN); + status |= test__comparesf2(0x7f92a1cd, 0xff7fffff, RESULT_UN); + status |= test__comparesf2(0x7fbe5d29, 0xff800000, RESULT_UN); + status |= test__comparesf2(0x7fcc9a57, 0x00000000, RESULT_UN); + status |= test__comparesf2(0x7fec9d71, 0x00000001, RESULT_UN); + status |= test__comparesf2(0x7fd5db76, 0x007fffff, RESULT_UN); + status |= test__comparesf2(0x7fd003d9, 0x3f800000, RESULT_UN); + status |= test__comparesf2(0x7fca0684, 0x7f7fffff, RESULT_UN); + status |= test__comparesf2(0x7fc46aa0, 0x7f800000, RESULT_UN); + status |= test__comparesf2(0x7ff72b19, 0x7faee637, RESULT_UN); + status |= test__comparesf2(0x7fe9e0c1, 0x7fcc2788, RESULT_UN); + status |= test__comparesf2(0x7fc571ea, 0x80000000, RESULT_UN); + status |= test__comparesf2(0x7fd81a54, 0x80000001, 
RESULT_UN); + status |= test__comparesf2(0x7febdfaf, 0x807fffff, RESULT_UN); + status |= test__comparesf2(0x7ffa1f94, 0xbf800000, RESULT_UN); + status |= test__comparesf2(0x7ff38fa0, 0xff7fffff, RESULT_UN); + status |= test__comparesf2(0x7fdf3502, 0xff800000, RESULT_UN); + status |= test__comparesf2(0x80000000, 0x00000000, RESULT_EQ); + status |= test__comparesf2(0x80000000, 0x00000001, RESULT_LT); + status |= test__comparesf2(0x80000000, 0x007fffff, RESULT_LT); + status |= test__comparesf2(0x80000000, 0x7f000000, RESULT_LT); + status |= test__comparesf2(0x80000000, 0x7f800000, RESULT_LT); + status |= test__comparesf2(0x80000000, 0x7fbdfb72, RESULT_UN); + status |= test__comparesf2(0x80000000, 0x7fdd528e, RESULT_UN); + status |= test__comparesf2(0x80000000, 0x80000001, RESULT_GT); + status |= test__comparesf2(0x80000000, 0x807fffff, RESULT_GT); + status |= test__comparesf2(0x80000000, 0x80800000, RESULT_GT); + status |= test__comparesf2(0x80000000, 0xbf800000, RESULT_GT); + status |= test__comparesf2(0x80000000, 0xff800000, RESULT_GT); + status |= test__comparesf2(0x80000001, 0x00000001, RESULT_LT); + status |= test__comparesf2(0x80000001, 0x3f7fffff, RESULT_LT); + status |= test__comparesf2(0x80000001, 0x3f800000, RESULT_LT); + status |= test__comparesf2(0x80000001, 0x3ffffffe, RESULT_LT); + status |= test__comparesf2(0x80000001, 0x3fffffff, RESULT_LT); + status |= test__comparesf2(0x80000001, 0x7effffff, RESULT_LT); + status |= test__comparesf2(0x80000001, 0x7f000000, RESULT_LT); + status |= test__comparesf2(0x80000001, 0x7f7ffffe, RESULT_LT); + status |= test__comparesf2(0x80000001, 0x7f7fffff, RESULT_LT); + status |= test__comparesf2(0x80000001, 0x7fac481a, RESULT_UN); + status |= test__comparesf2(0x80000001, 0x7fcf111d, RESULT_UN); + status |= test__comparesf2(0x80000001, 0x80000001, RESULT_EQ); + status |= test__comparesf2(0x80000001, 0xbf7fffff, RESULT_GT); + status |= test__comparesf2(0x80000001, 0xbf800000, RESULT_GT); + status |= 
test__comparesf2(0x80000001, 0xbffffffe, RESULT_GT); + status |= test__comparesf2(0x80000001, 0xbfffffff, RESULT_GT); + status |= test__comparesf2(0x80000001, 0xfeffffff, RESULT_GT); + status |= test__comparesf2(0x80000001, 0xff000000, RESULT_GT); + status |= test__comparesf2(0x80000001, 0xff7ffffe, RESULT_GT); + status |= test__comparesf2(0x80000001, 0xff7fffff, RESULT_GT); + status |= test__comparesf2(0x80000002, 0x80000001, RESULT_LT); + status |= test__comparesf2(0x80000003, 0x40400000, RESULT_LT); + status |= test__comparesf2(0x80000003, 0x7f000000, RESULT_LT); + status |= test__comparesf2(0x80000003, 0x80000002, RESULT_LT); + status |= test__comparesf2(0x80000003, 0xff000000, RESULT_GT); + status |= test__comparesf2(0x80000004, 0x80000004, RESULT_EQ); + status |= test__comparesf2(0x807ffffd, 0x807ffffe, RESULT_GT); + status |= test__comparesf2(0x807fffff, 0x00000000, RESULT_LT); + status |= test__comparesf2(0x807fffff, 0x007fffff, RESULT_LT); + status |= test__comparesf2(0x807fffff, 0x7f800000, RESULT_LT); + status |= test__comparesf2(0x807fffff, 0x7faf07f6, RESULT_UN); + status |= test__comparesf2(0x807fffff, 0x7fd18a54, RESULT_UN); + status |= test__comparesf2(0x807fffff, 0x80000000, RESULT_LT); + status |= test__comparesf2(0x807fffff, 0x807ffffe, RESULT_LT); + status |= test__comparesf2(0x807fffff, 0x807fffff, RESULT_EQ); + status |= test__comparesf2(0x807fffff, 0x80800000, RESULT_GT); + status |= test__comparesf2(0x807fffff, 0xff800000, RESULT_GT); + status |= test__comparesf2(0x80800000, 0x00000000, RESULT_LT); + status |= test__comparesf2(0x80800000, 0x00800000, RESULT_LT); + status |= test__comparesf2(0x80800001, 0x80800000, RESULT_LT); + status |= test__comparesf2(0x80800001, 0x80800002, RESULT_GT); + status |= test__comparesf2(0x80ffffff, 0x81000000, RESULT_GT); + status |= test__comparesf2(0x80ffffff, 0x81000002, RESULT_GT); + status |= test__comparesf2(0x80ffffff, 0x81000004, RESULT_GT); + status |= test__comparesf2(0x81000000, 0x80ffffff, 
RESULT_LT); + status |= test__comparesf2(0x81000001, 0x80800001, RESULT_LT); + status |= test__comparesf2(0x81000001, 0x80ffffff, RESULT_LT); + status |= test__comparesf2(0x81000002, 0x80800001, RESULT_LT); + status |= test__comparesf2(0x817fffff, 0x81800000, RESULT_GT); + status |= test__comparesf2(0x81800000, 0x817fffff, RESULT_LT); + status |= test__comparesf2(0x81800001, 0x817fffff, RESULT_LT); + status |= test__comparesf2(0x81800002, 0x81000003, RESULT_LT); + status |= test__comparesf2(0xbf800000, 0x3f800003, RESULT_LT); + status |= test__comparesf2(0xbf800000, 0x7fa66ee9, RESULT_UN); + status |= test__comparesf2(0xbf800000, 0x7fe481ef, RESULT_UN); + status |= test__comparesf2(0xbf800000, 0x80000000, RESULT_LT); + status |= test__comparesf2(0xbf800000, 0xbf800003, RESULT_GT); + status |= test__comparesf2(0xbf800001, 0x3f800000, RESULT_LT); + status |= test__comparesf2(0xbf800001, 0xbf800000, RESULT_LT); + status |= test__comparesf2(0xbf800001, 0xbf800002, RESULT_GT); + status |= test__comparesf2(0xbffffffc, 0xbffffffd, RESULT_GT); + status |= test__comparesf2(0xbfffffff, 0x00000001, RESULT_LT); + status |= test__comparesf2(0xbfffffff, 0xc0000000, RESULT_GT); + status |= test__comparesf2(0xc0000000, 0x40000001, RESULT_LT); + status |= test__comparesf2(0xc0000000, 0xbfffffff, RESULT_LT); + status |= test__comparesf2(0xc0000000, 0xc0000001, RESULT_GT); + status |= test__comparesf2(0xc0000001, 0x40000002, RESULT_LT); + status |= test__comparesf2(0xc0000001, 0xbf800001, RESULT_LT); + status |= test__comparesf2(0xc0000001, 0xc0000002, RESULT_GT); + status |= test__comparesf2(0xc0000002, 0xbf800001, RESULT_LT); + status |= test__comparesf2(0xc0000002, 0xbf800003, RESULT_LT); + status |= test__comparesf2(0xc0000004, 0xc0000003, RESULT_LT); + status |= test__comparesf2(0xc0400000, 0x40400000, RESULT_LT); + status |= test__comparesf2(0xc07fffff, 0xc07ffffe, RESULT_LT); + status |= test__comparesf2(0xc07fffff, 0xc0800002, RESULT_GT); + status |= 
test__comparesf2(0xc0800001, 0xc07fffff, RESULT_LT); + status |= test__comparesf2(0xfd800001, 0xfd7fffff, RESULT_LT); + status |= test__comparesf2(0xfe7fffff, 0xfe7ffffe, RESULT_LT); + status |= test__comparesf2(0xfe7fffff, 0xfe800002, RESULT_GT); + status |= test__comparesf2(0xfe800000, 0xfe7fffff, RESULT_LT); + status |= test__comparesf2(0xfe800000, 0xfe800001, RESULT_GT); + status |= test__comparesf2(0xfe800001, 0x7e800000, RESULT_LT); + status |= test__comparesf2(0xfe800001, 0xfe800000, RESULT_LT); + status |= test__comparesf2(0xfe800001, 0xff000001, RESULT_GT); + status |= test__comparesf2(0xfe800002, 0xfe000003, RESULT_LT); + status |= test__comparesf2(0xfe800004, 0xfe800003, RESULT_LT); + status |= test__comparesf2(0xfefffffe, 0x7efffffe, RESULT_LT); + status |= test__comparesf2(0xfefffffe, 0x7effffff, RESULT_LT); + status |= test__comparesf2(0xfefffffe, 0xfefffffe, RESULT_EQ); + status |= test__comparesf2(0xfefffffe, 0xfeffffff, RESULT_GT); + status |= test__comparesf2(0xfeffffff, 0x3f800000, RESULT_LT); + status |= test__comparesf2(0xfeffffff, 0x7f000000, RESULT_LT); + status |= test__comparesf2(0xfeffffff, 0xbf800000, RESULT_LT); + status |= test__comparesf2(0xfeffffff, 0xff000000, RESULT_GT); + status |= test__comparesf2(0xff000000, 0x00000000, RESULT_LT); + status |= test__comparesf2(0xff000000, 0x3f800000, RESULT_LT); + status |= test__comparesf2(0xff000000, 0x7f800000, RESULT_LT); + status |= test__comparesf2(0xff000000, 0x80000000, RESULT_LT); + status |= test__comparesf2(0xff000000, 0xbf800000, RESULT_LT); + status |= test__comparesf2(0xff000000, 0xff000000, RESULT_EQ); + status |= test__comparesf2(0xff000000, 0xff800000, RESULT_GT); + status |= test__comparesf2(0xff000001, 0x7f000000, RESULT_LT); + status |= test__comparesf2(0xff000001, 0xff000000, RESULT_LT); + status |= test__comparesf2(0xff000001, 0xff000002, RESULT_GT); + status |= test__comparesf2(0xff000002, 0xfe800001, RESULT_LT); + status |= test__comparesf2(0xff7ffffe, 0x3f800000, 
RESULT_LT); + status |= test__comparesf2(0xff7ffffe, 0x7f7fffff, RESULT_LT); + status |= test__comparesf2(0xff7ffffe, 0xbf800000, RESULT_LT); + status |= test__comparesf2(0xff7ffffe, 0xff7fffff, RESULT_GT); + status |= test__comparesf2(0xff7fffff, 0x00000001, RESULT_LT); + status |= test__comparesf2(0xff7fffff, 0x3f800000, RESULT_LT); + status |= test__comparesf2(0xff7fffff, 0x7f919cff, RESULT_UN); + status |= test__comparesf2(0xff7fffff, 0x7fd729a7, RESULT_UN); + status |= test__comparesf2(0xff7fffff, 0x80000001, RESULT_LT); + status |= test__comparesf2(0xff7fffff, 0xbf800000, RESULT_LT); + status |= test__comparesf2(0xff7fffff, 0xff7fffff, RESULT_EQ); + status |= test__comparesf2(0xff800000, 0x00000000, RESULT_LT); + status |= test__comparesf2(0xff800000, 0x00000001, RESULT_LT); + status |= test__comparesf2(0xff800000, 0x007fffff, RESULT_LT); + status |= test__comparesf2(0xff800000, 0x7f000000, RESULT_LT); + status |= test__comparesf2(0xff800000, 0x7f7fffff, RESULT_LT); + status |= test__comparesf2(0xff800000, 0x7f800000, RESULT_LT); + status |= test__comparesf2(0xff800000, 0x7fafdbc1, RESULT_UN); + status |= test__comparesf2(0xff800000, 0x7fec80fe, RESULT_UN); + status |= test__comparesf2(0xff800000, 0x80000000, RESULT_LT); + status |= test__comparesf2(0xff800000, 0x80000001, RESULT_LT); + status |= test__comparesf2(0xff800000, 0x807fffff, RESULT_LT); + status |= test__comparesf2(0xff800000, 0xff000000, RESULT_LT); + status |= test__comparesf2(0xff800000, 0xff7fffff, RESULT_LT); + status |= test__comparesf2(0xff800000, 0xff800000, RESULT_EQ); + + return status; +} From a4acc5c28beab95b1ed9bdee3e75441546e5b30c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 14 May 2026 10:56:42 +0100 Subject: [PATCH 51/95] [X86] Improve lowering of i32/i64 minmax reductions (#197578) Allow 32-bit targets to correctly lower i64 ISD::VECREDUCE min/max nodes via ReplaceNodeResults - this is necessary once we're finally ready for #194473 and remove combineMinMaxReduction 
entirely Improve handling of v2iXX reduction stages by consistently preferring binop(extract(),extract()) scalarisation on SSE targets (if the vector binop isn't legal). --- llvm/lib/Target/X86/X86ISelLowering.cpp | 36 +- .../CodeGen/X86/horizontal-reduce-smax.ll | 359 +++-- .../CodeGen/X86/horizontal-reduce-smin.ll | 362 +++-- .../CodeGen/X86/horizontal-reduce-umax.ll | 454 +++--- .../CodeGen/X86/horizontal-reduce-umin.ll | 503 +++---- llvm/test/CodeGen/X86/intrinsic-cttz-elts.ll | 18 +- .../CodeGen/X86/vector-extract-last-active.ll | 9 +- llvm/test/CodeGen/X86/vector-reduce-smax.ll | 985 ++++++------- llvm/test/CodeGen/X86/vector-reduce-smin.ll | 975 ++++++------- llvm/test/CodeGen/X86/vector-reduce-umax.ll | 1205 ++++++++-------- llvm/test/CodeGen/X86/vector-reduce-umin.ll | 1255 ++++++++--------- 11 files changed, 2825 insertions(+), 3336 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 3ca4e85b671cd..f41f4b8784dd0 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -1170,7 +1170,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // SSE2 can use basic vector unrolling. // SSE41 can use PHMINPOS to perform v16i8/v8i16 minmax reductions. - for (auto VT : {MVT::v16i8, MVT::v8i16}) { + // Fallback to ReplaceNodeResults for vXi64 reductions on 32-bit targets. + for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::i64}) { setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); @@ -1427,15 +1428,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal); } - // Allow v4i32/v2i64 minmax reductions with SSE41 vector comparison, - // select and minmax handling. 
- for (auto VT : { MVT::v4i32, MVT::v2i64 }) { - setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); - setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); - setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); - setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); - } - // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) { setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal); @@ -29692,6 +29684,7 @@ static SDValue LowerVECREDUCE(SDValue Op, const X86Subtarget &Subtarget, SDValue Src = Op.getOperand(0); EVT SrcVT = Src.getValueType(); EVT SrcSVT = SrcVT.getScalarType(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDLoc DL(Op); if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0) @@ -29709,6 +29702,14 @@ static SDValue LowerVECREDUCE(SDValue Op, const X86Subtarget &Subtarget, // Expand 128-bit shuffle tree + reduction binops. unsigned NumSrcElts = SrcVT.getVectorNumElements(); for (unsigned NumElts = NumSrcElts; NumElts != 1; NumElts /= 2) { + // Scalarize the last 2 elements if the vector binop isn't legal. 
+ if (NumElts == 2 && !Subtarget.hasAVX512() && + !TLI.isOperationLegal(BinOp, SrcVT) && TLI.isTypeLegal(ExtractVT)) { + return DAG.getNode(BinOp, DL, ExtractVT, + DAG.getExtractVectorElt(DL, ExtractVT, Src, 0), + DAG.getExtractVectorElt(DL, ExtractVT, Src, 1)); + } + SmallVector Mask(NumSrcElts, -1); std::iota(Mask.begin(), Mask.begin() + (NumElts / 2), NumElts / 2); SDValue Upper = @@ -35346,6 +35347,15 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, } return; } + case ISD::VECREDUCE_SMAX: + case ISD::VECREDUCE_SMIN: + case ISD::VECREDUCE_UMAX: + case ISD::VECREDUCE_UMIN: { + assert(N->getValueType(0) == MVT::i64 && "Unexpected vector reduction"); + if (SDValue Res = LowerMINMAX_REDUCE(SDValue(N, 0), Subtarget, DAG)) + Results.push_back(Res); + return; + } case ISD::FP_TO_SINT_SAT: case ISD::FP_TO_UINT_SAT: { if (!Subtarget.hasAVX10_2()) @@ -47127,10 +47137,6 @@ static SDValue createPSADBW(SelectionDAG &DAG, SDValue N0, SDValue N1, // ISD::VECREDUCE_SMIN/SMAX/UMIN/UMAX. static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - EVT ExtractVT = Extract->getValueType(0); - if (!DAG.getTargetLoweringInfo().isTypeLegal(ExtractVT)) - return SDValue(); - // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns. ISD::NodeType BinOp; SDValue Src = DAG.matchBinOpReduction( @@ -47160,7 +47166,7 @@ static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG, llvm_unreachable("Unexpected reduction"); } - return DAG.getNode(RdxOp, SDLoc(Extract), ExtractVT, Src); + return DAG.getNode(RdxOp, SDLoc(Extract), Extract->getValueType(0), Src); } // Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK. 
diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll index ebb5bc9069890..8dc76308edfc0 100644 --- a/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-smax.ll @@ -13,13 +13,13 @@ ; 128-bit Vectors ; -define i64 @test_reduce_v2i64(<2 x i64> %a0) { +define i64 @test_reduce_v2i64(<2 x i64> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v2i64: ; X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm1, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4 @@ -67,19 +67,18 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; ; X64-SSE42-LABEL: test_reduce_v2i64: ; X64-SSE42: ## %bb.0: -; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; X64-SSE42-NEXT: movq %xmm2, %rax +; X64-SSE42-NEXT: pextrq $1, %xmm0, %rax +; X64-SSE42-NEXT: movq %xmm0, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovgq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X64-AVX1OR2-LABEL: test_reduce_v2i64: ; X64-AVX1OR2: ## %bb.0: -; X64-AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX1OR2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; X64-AVX1OR2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX1OR2-NEXT: vmovq %xmm0, %rax +; X64-AVX1OR2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1OR2-NEXT: vmovq %xmm0, %rcx +; X64-AVX1OR2-NEXT: cmpq %rax, %rcx +; X64-AVX1OR2-NEXT: cmovgq %rcx, %rax ; X64-AVX1OR2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v2i64: @@ -95,7 +94,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ret i64 %4 } -define i32 @test_reduce_v4i32(<4 x i32> %a0) { +define i32 
@test_reduce_v4i32(<4 x i32> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v4i32: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -171,7 +170,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) { ret i32 %7 } -define i16 @test_reduce_v8i16(<8 x i16> %a0) { +define i16 @test_reduce_v8i16(<8 x i16> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v8i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -255,7 +254,7 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) { ret i16 %10 } -define i8 @test_reduce_v16i8(<16 x i8> %a0) { +define i8 @test_reduce_v16i8(<16 x i8> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v16i8: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -270,21 +269,18 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) { ; X86-SSE2-NEXT: pand %xmm0, %xmm2 ; X86-SSE2-NEXT: pandn %xmm1, %xmm0 ; X86-SSE2-NEXT: por %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movl %ecx, %eax +; X86-SSE2-NEXT: shrl $8, %eax +; X86-SSE2-NEXT: cmpb %al, %cl +; X86-SSE2-NEXT: cmovgl %ecx, %eax ; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; @@ -332,14 +328,11 @@ define i8 @test_reduce_v16i8(<16 
x i8> %a0) { ; X64-SSE2-NEXT: pand %xmm2, %xmm1 ; X64-SSE2-NEXT: pandn %xmm0, %xmm2 ; X64-SSE2-NEXT: por %xmm1, %xmm2 -; X64-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE2-NEXT: psrlw $8, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X64-SSE2-NEXT: pand %xmm1, %xmm2 -; X64-SSE2-NEXT: pandn %xmm0, %xmm1 -; X64-SSE2-NEXT: por %xmm2, %xmm1 -; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: movd %xmm2, %ecx +; X64-SSE2-NEXT: movl %ecx, %eax +; X64-SSE2-NEXT: shrl $8, %eax +; X64-SSE2-NEXT: cmpb %al, %cl +; X64-SSE2-NEXT: cmovgl %ecx, %eax ; X64-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE2-NEXT: retq ; @@ -396,7 +389,7 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) { ; 256-bit Vectors ; -define i64 @test_reduce_v4i64(<4 x i64> %a0) { +define i64 @test_reduce_v4i64(<4 x i64> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v4i64: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0] @@ -415,9 +408,9 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X86-SSE2-NEXT: pand %xmm3, %xmm0 ; X86-SSE2-NEXT: pandn %xmm1, %xmm3 ; X86-SSE2-NEXT: por %xmm0, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm0, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4 @@ -503,11 +496,10 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-SSE42-NEXT: movdqa %xmm0, %xmm2 ; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; X64-SSE42-NEXT: movq %xmm2, %rax +; X64-SSE42-NEXT: pextrq $1, %xmm1, %rax +; X64-SSE42-NEXT: movq %xmm1, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovgq %rcx, 
%rax ; X64-SSE42-NEXT: retq ; ; X64-AVX1-LABEL: test_reduce_v4i64: @@ -515,10 +507,10 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovgq %rcx, %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -527,10 +519,10 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovgq %rcx, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; @@ -553,7 +545,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ret i64 %7 } -define i32 @test_reduce_v8i32(<8 x i32> %a0) { +define i32 @test_reduce_v8i32(<8 x i32> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v8i32: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 @@ -686,7 +678,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ret i32 %10 } -define i16 @test_reduce_v16i16(<16 x i16> %a0) { +define i16 @test_reduce_v16i16(<16 x i16> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v16i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pmaxsw %xmm1, %xmm0 @@ -810,7 +802,7 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) { ret i16 %13 } -define i8 @test_reduce_v32i8(<32 x i8> %a0) { +define i8 
@test_reduce_v32i8(<32 x i8> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v32i8: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 @@ -830,21 +822,18 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X86-SSE2-NEXT: pand %xmm0, %xmm1 ; X86-SSE2-NEXT: pandn %xmm2, %xmm0 ; X86-SSE2-NEXT: por %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movl %ecx, %eax +; X86-SSE2-NEXT: shrl $8, %eax +; X86-SSE2-NEXT: cmpb %al, %cl +; X86-SSE2-NEXT: cmovgl %ecx, %eax ; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; @@ -915,14 +904,11 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X64-SSE2-NEXT: pand %xmm1, %xmm2 ; X64-SSE2-NEXT: pandn %xmm0, %xmm1 ; X64-SSE2-NEXT: por %xmm2, %xmm1 -; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE2-NEXT: psrlw $8, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 -; X64-SSE2-NEXT: pand %xmm2, %xmm1 -; X64-SSE2-NEXT: pandn %xmm0, %xmm2 -; X64-SSE2-NEXT: por %xmm1, %xmm2 -; X64-SSE2-NEXT: movd %xmm2, %eax +; X64-SSE2-NEXT: movd %xmm1, %ecx +; X64-SSE2-NEXT: movl %ecx, %eax +; X64-SSE2-NEXT: shrl $8, %eax +; X64-SSE2-NEXT: cmpb %al, %cl +; X64-SSE2-NEXT: cmovgl %ecx, %eax ; X64-SSE2-NEXT: ## kill: def 
$al killed $al killed $eax ; X64-SSE2-NEXT: retq ; @@ -1003,13 +989,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; 512-bit Vectors ; -define i64 @test_reduce_v8i64(<8 x i64> %a0) { +define i64 @test_reduce_v8i64(<8 x i64> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v8i64: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm5 ; X86-SSE2-NEXT: pxor %xmm4, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 ; X86-SSE2-NEXT: pxor %xmm4, %xmm6 ; X86-SSE2-NEXT: movdqa %xmm6, %xmm7 ; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7 @@ -1019,42 +1005,42 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X86-SSE2-NEXT: pand %xmm5, %xmm6 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] ; X86-SSE2-NEXT: por %xmm6, %xmm5 -; X86-SSE2-NEXT: pand %xmm5, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm5 -; X86-SSE2-NEXT: por %xmm0, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm3, %xmm0 -; X86-SSE2-NEXT: pxor %xmm4, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE2-NEXT: pxor %xmm4, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm6 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm6 -; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm0, %xmm7 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: por %xmm7, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm3, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE2-NEXT: pxor %xmm4, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm1 +; X86-SSE2-NEXT: pand %xmm5, %xmm1 +; X86-SSE2-NEXT: pandn %xmm3, %xmm5 +; X86-SSE2-NEXT: por %xmm1, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 ; X86-SSE2-NEXT: pxor %xmm4, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; 
X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm4, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm6 +; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm6 +; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm1, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: por %xmm3, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm3 +; X86-SSE2-NEXT: pxor %xmm4, %xmm3 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm1 +; X86-SSE2-NEXT: por %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: pxor %xmm4, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm2 +; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm3, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: por %xmm6, %xmm0 ; X86-SSE2-NEXT: pand %xmm0, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; X86-SSE2-NEXT: pandn %xmm5, %xmm0 ; X86-SSE2-NEXT: por %xmm1, %xmm0 -; X86-SSE2-NEXT: pand %xmm0, %xmm5 -; X86-SSE2-NEXT: pandn %xmm2, %xmm0 -; X86-SSE2-NEXT: por %xmm5, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: pxor %xmm4, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm1, %xmm4 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 ; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm3 @@ -1191,11 +1177,10 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X64-SSE42-NEXT: movapd %xmm2, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; X64-SSE42-NEXT: movdqa %xmm3, %xmm0 -; 
X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; X64-SSE42-NEXT: movq %xmm1, %rax +; X64-SSE42-NEXT: pextrq $1, %xmm3, %rax +; X64-SSE42-NEXT: movq %xmm3, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovgq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X64-AVX1-LABEL: test_reduce_v8i64: @@ -1208,10 +1193,10 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X64-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 ; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm1 ; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 -; X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovgq %rcx, %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -1222,10 +1207,10 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovgq %rcx, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; @@ -1253,7 +1238,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ret i64 %10 } -define i32 @test_reduce_v16i32(<16 x i32> %a0) { +define i32 @test_reduce_v16i32(<16 x i32> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v16i32: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 @@ -1423,7 +1408,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ret i32 %13 } -define i16 @test_reduce_v32i16(<32 x i16> %a0) { +define i16 
@test_reduce_v32i16(<32 x i16> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v32i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pmaxsw %xmm3, %xmm1 @@ -1568,7 +1553,7 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) { ret i16 %16 } -define i8 @test_reduce_v64i8(<64 x i8> %a0) { +define i8 @test_reduce_v64i8(<64 x i8> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v64i8: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 @@ -1598,21 +1583,18 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X86-SSE2-NEXT: pand %xmm0, %xmm2 ; X86-SSE2-NEXT: pandn %xmm1, %xmm0 ; X86-SSE2-NEXT: por %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movl %ecx, %eax +; X86-SSE2-NEXT: shrl $8, %eax +; X86-SSE2-NEXT: cmpb %al, %cl +; X86-SSE2-NEXT: cmovgl %ecx, %eax ; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; @@ -1699,14 +1681,11 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X64-SSE2-NEXT: pand %xmm2, %xmm1 ; X64-SSE2-NEXT: pandn %xmm0, %xmm2 ; X64-SSE2-NEXT: por %xmm1, %xmm2 -; X64-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE2-NEXT: psrlw $8, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X64-SSE2-NEXT: pand %xmm1, %xmm2 -; X64-SSE2-NEXT: 
pandn %xmm0, %xmm1 -; X64-SSE2-NEXT: por %xmm2, %xmm1 -; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: movd %xmm2, %ecx +; X64-SSE2-NEXT: movl %ecx, %eax +; X64-SSE2-NEXT: shrl $8, %eax +; X64-SSE2-NEXT: cmpb %al, %cl +; X64-SSE2-NEXT: cmovgl %ecx, %eax ; X64-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE2-NEXT: retq ; @@ -1798,7 +1777,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; Partial Vector Reductions ; -define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { +define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v16i16_v8i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -1885,7 +1864,7 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { ret i16 %10 } -define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { +define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v32i16_v8i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -1972,7 +1951,7 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { ret i16 %10 } -define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { +define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v32i8_v16i8: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -1987,21 +1966,18 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { ; X86-SSE2-NEXT: pand %xmm0, %xmm2 ; X86-SSE2-NEXT: pandn %xmm1, %xmm0 ; X86-SSE2-NEXT: por %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; 
X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movl %ecx, %eax +; X86-SSE2-NEXT: shrl $8, %eax +; X86-SSE2-NEXT: cmpb %al, %cl +; X86-SSE2-NEXT: cmovgl %ecx, %eax ; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; @@ -2050,14 +2026,11 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { ; X64-SSE2-NEXT: pand %xmm2, %xmm1 ; X64-SSE2-NEXT: pandn %xmm0, %xmm2 ; X64-SSE2-NEXT: por %xmm1, %xmm2 -; X64-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE2-NEXT: psrlw $8, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X64-SSE2-NEXT: pand %xmm1, %xmm2 -; X64-SSE2-NEXT: pandn %xmm0, %xmm1 -; X64-SSE2-NEXT: por %xmm2, %xmm1 -; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: movd %xmm2, %ecx +; X64-SSE2-NEXT: movl %ecx, %eax +; X64-SSE2-NEXT: shrl $8, %eax +; X64-SSE2-NEXT: cmpb %al, %cl +; X64-SSE2-NEXT: cmovgl %ecx, %eax ; X64-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE2-NEXT: retq ; @@ -2112,7 +2085,7 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { ret i8 %13 } -define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) { +define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v64i8_v16i8: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -2127,21 +2100,18 @@ define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) { ; X86-SSE2-NEXT: pand %xmm0, %xmm2 ; X86-SSE2-NEXT: pandn %xmm1, %xmm0 ; X86-SSE2-NEXT: por %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; 
X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movl %ecx, %eax +; X86-SSE2-NEXT: shrl $8, %eax +; X86-SSE2-NEXT: cmpb %al, %cl +; X86-SSE2-NEXT: cmovgl %ecx, %eax ; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; @@ -2190,14 +2160,11 @@ define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) { ; X64-SSE2-NEXT: pand %xmm2, %xmm1 ; X64-SSE2-NEXT: pandn %xmm0, %xmm2 ; X64-SSE2-NEXT: por %xmm1, %xmm2 -; X64-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE2-NEXT: psrlw $8, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X64-SSE2-NEXT: pand %xmm1, %xmm2 -; X64-SSE2-NEXT: pandn %xmm0, %xmm1 -; X64-SSE2-NEXT: por %xmm2, %xmm1 -; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: movd %xmm2, %ecx +; X64-SSE2-NEXT: movl %ecx, %eax +; X64-SSE2-NEXT: shrl $8, %eax +; X64-SSE2-NEXT: cmpb %al, %cl +; X64-SSE2-NEXT: cmovgl %ecx, %eax ; X64-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll index 5e93f93a6d599..197f3ecf2290d 100644 --- a/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-smin.ll @@ -13,13 +13,13 @@ ; 128-bit Vectors ; -define i64 @test_reduce_v2i64(<2 x i64> %a0) { +define i64 @test_reduce_v2i64(<2 x i64> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v2i64: ; X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: 
pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm1, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 @@ -68,20 +68,18 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; ; X64-SSE42-LABEL: test_reduce_v2i64: ; X64-SSE42: ## %bb.0: -; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X64-SSE42-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; X64-SSE42-NEXT: movq %xmm2, %rax +; X64-SSE42-NEXT: pextrq $1, %xmm0, %rax +; X64-SSE42-NEXT: movq %xmm0, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovlq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X64-AVX1OR2-LABEL: test_reduce_v2i64: ; X64-AVX1OR2: ## %bb.0: -; X64-AVX1OR2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX1OR2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X64-AVX1OR2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX1OR2-NEXT: vmovq %xmm0, %rax +; X64-AVX1OR2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1OR2-NEXT: vmovq %xmm0, %rcx +; X64-AVX1OR2-NEXT: cmpq %rax, %rcx +; X64-AVX1OR2-NEXT: cmovlq %rcx, %rax ; X64-AVX1OR2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v2i64: @@ -97,7 +95,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ret i64 %4 } -define i32 @test_reduce_v4i32(<4 x i32> %a0) { +define i32 @test_reduce_v4i32(<4 x i32> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v4i32: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -173,7 +171,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) { ret i32 %7 } -define i16 @test_reduce_v8i16(<8 x i16> %a0) { +define i16 @test_reduce_v8i16(<8 x i16> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v8i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ 
-257,7 +255,7 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) { ret i16 %10 } -define i8 @test_reduce_v16i8(<16 x i8> %a0) { +define i8 @test_reduce_v16i8(<16 x i8> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v16i8: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -272,21 +270,18 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) { ; X86-SSE2-NEXT: pand %xmm0, %xmm2 ; X86-SSE2-NEXT: pandn %xmm1, %xmm0 ; X86-SSE2-NEXT: por %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movl %ecx, %eax +; X86-SSE2-NEXT: shrl $8, %eax +; X86-SSE2-NEXT: cmpb %al, %cl +; X86-SSE2-NEXT: cmovll %ecx, %eax ; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; @@ -334,14 +329,11 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) { ; X64-SSE2-NEXT: pand %xmm2, %xmm1 ; X64-SSE2-NEXT: pandn %xmm0, %xmm2 ; X64-SSE2-NEXT: por %xmm1, %xmm2 -; X64-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE2-NEXT: psrlw $8, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X64-SSE2-NEXT: pand %xmm1, %xmm2 -; X64-SSE2-NEXT: pandn %xmm0, %xmm1 -; X64-SSE2-NEXT: por %xmm2, %xmm1 -; X64-SSE2-NEXT: movd %xmm1, %eax +; 
X64-SSE2-NEXT: movd %xmm2, %ecx +; X64-SSE2-NEXT: movl %ecx, %eax +; X64-SSE2-NEXT: shrl $8, %eax +; X64-SSE2-NEXT: cmpb %al, %cl +; X64-SSE2-NEXT: cmovll %ecx, %eax ; X64-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE2-NEXT: retq ; @@ -398,7 +390,7 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) { ; 256-bit Vectors ; -define i64 @test_reduce_v4i64(<4 x i64> %a0) { +define i64 @test_reduce_v4i64(<4 x i64> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v4i64: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0] @@ -417,9 +409,9 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X86-SSE2-NEXT: pand %xmm3, %xmm0 ; X86-SSE2-NEXT: pandn %xmm1, %xmm3 ; X86-SSE2-NEXT: por %xmm0, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm0, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm4 @@ -507,11 +499,10 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X64-SSE42-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; X64-SSE42-NEXT: movq %xmm2, %rax +; X64-SSE42-NEXT: pextrq $1, %xmm1, %rax +; X64-SSE42-NEXT: movq %xmm1, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovlq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X64-AVX1-LABEL: test_reduce_v4i64: @@ -519,10 +510,10 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X64-AVX1-NEXT: vblendvpd 
%xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovlq %rcx, %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -531,10 +522,10 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovlq %rcx, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; @@ -557,7 +548,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ret i64 %7 } -define i32 @test_reduce_v8i32(<8 x i32> %a0) { +define i32 @test_reduce_v8i32(<8 x i32> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v8i32: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 @@ -690,7 +681,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ret i32 %10 } -define i16 @test_reduce_v16i16(<16 x i16> %a0) { +define i16 @test_reduce_v16i16(<16 x i16> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v16i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pminsw %xmm1, %xmm0 @@ -814,7 +805,7 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) { ret i16 %13 } -define i8 @test_reduce_v32i8(<32 x i8> %a0) { +define i8 @test_reduce_v32i8(<32 x i8> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v32i8: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 @@ -834,21 +825,18 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X86-SSE2-NEXT: pand %xmm0, %xmm1 ; X86-SSE2-NEXT: pandn %xmm2, %xmm0 ; X86-SSE2-NEXT: por %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 -; X86-SSE2-NEXT: movdqa 
%xmm2, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movl %ecx, %eax +; X86-SSE2-NEXT: shrl $8, %eax +; X86-SSE2-NEXT: cmpb %al, %cl +; X86-SSE2-NEXT: cmovll %ecx, %eax ; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; @@ -919,14 +907,11 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; X64-SSE2-NEXT: pand %xmm1, %xmm2 ; X64-SSE2-NEXT: pandn %xmm0, %xmm1 ; X64-SSE2-NEXT: por %xmm2, %xmm1 -; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE2-NEXT: psrlw $8, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; X64-SSE2-NEXT: pand %xmm2, %xmm1 -; X64-SSE2-NEXT: pandn %xmm0, %xmm2 -; X64-SSE2-NEXT: por %xmm1, %xmm2 -; X64-SSE2-NEXT: movd %xmm2, %eax +; X64-SSE2-NEXT: movd %xmm1, %ecx +; X64-SSE2-NEXT: movl %ecx, %eax +; X64-SSE2-NEXT: shrl $8, %eax +; X64-SSE2-NEXT: cmpb %al, %cl +; X64-SSE2-NEXT: cmovll %ecx, %eax ; X64-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE2-NEXT: retq ; @@ -1007,13 +992,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; 512-bit Vectors ; -define i64 @test_reduce_v8i64(<8 x i64> %a0) { +define i64 @test_reduce_v8i64(<8 x i64> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v8i64: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0] -; 
X86-SSE2-NEXT: movdqa %xmm1, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm5 ; X86-SSE2-NEXT: pxor %xmm4, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm3, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm6 ; X86-SSE2-NEXT: pxor %xmm4, %xmm6 ; X86-SSE2-NEXT: movdqa %xmm6, %xmm7 ; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7 @@ -1023,42 +1008,42 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X86-SSE2-NEXT: pand %xmm5, %xmm6 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] ; X86-SSE2-NEXT: por %xmm6, %xmm5 -; X86-SSE2-NEXT: pand %xmm5, %xmm1 -; X86-SSE2-NEXT: pandn %xmm3, %xmm5 -; X86-SSE2-NEXT: por %xmm1, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pxor %xmm4, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 -; X86-SSE2-NEXT: pxor %xmm4, %xmm3 -; X86-SSE2-NEXT: movdqa %xmm3, %xmm6 -; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm6 -; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm1, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: por %xmm3, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm5, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm5 +; X86-SSE2-NEXT: por %xmm0, %xmm5 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE2-NEXT: pxor %xmm4, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm2 ; X86-SSE2-NEXT: pxor %xmm4, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm6 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm6 ; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm0, %xmm7 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: por %xmm7, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm0 +; X86-SSE2-NEXT: pxor %xmm4, %xmm0 +; 
X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm3, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pxor %xmm4, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; X86-SSE2-NEXT: por %xmm2, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; X86-SSE2-NEXT: pand %xmm0, %xmm1 -; X86-SSE2-NEXT: pandn %xmm5, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] ; X86-SSE2-NEXT: por %xmm1, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE2-NEXT: pand %xmm0, %xmm5 +; X86-SSE2-NEXT: pandn %xmm2, %xmm0 +; X86-SSE2-NEXT: por %xmm5, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: pxor %xmm4, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm1, %xmm4 ; X86-SSE2-NEXT: movdqa %xmm4, %xmm3 ; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3 @@ -1195,11 +1180,10 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X64-SSE42-NEXT: movapd %xmm3, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; X64-SSE42-NEXT: movq %xmm1, %rax +; X64-SSE42-NEXT: pextrq $1, %xmm3, %rax +; X64-SSE42-NEXT: movq %xmm3, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovlq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X64-AVX1-LABEL: test_reduce_v8i64: @@ -1212,10 +1196,10 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X64-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 ; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm1 ; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0 -; 
X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovlq %rcx, %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -1226,10 +1210,10 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovlq %rcx, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; @@ -1257,7 +1241,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ret i64 %10 } -define i32 @test_reduce_v16i32(<16 x i32> %a0) { +define i32 @test_reduce_v16i32(<16 x i32> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v16i32: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 @@ -1427,7 +1411,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ret i32 %13 } -define i16 @test_reduce_v32i16(<32 x i16> %a0) { +define i16 @test_reduce_v32i16(<32 x i16> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v32i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pminsw %xmm3, %xmm1 @@ -1572,7 +1556,7 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) { ret i16 %16 } -define i8 @test_reduce_v64i8(<64 x i8> %a0) { +define i8 @test_reduce_v64i8(<64 x i8> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v64i8: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 @@ -1602,21 +1586,18 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X86-SSE2-NEXT: pand %xmm0, %xmm2 ; X86-SSE2-NEXT: pandn 
%xmm1, %xmm0 ; X86-SSE2-NEXT: por %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movl %ecx, %eax +; X86-SSE2-NEXT: shrl $8, %eax +; X86-SSE2-NEXT: cmpb %al, %cl +; X86-SSE2-NEXT: cmovll %ecx, %eax ; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; @@ -1703,14 +1684,11 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; X64-SSE2-NEXT: pand %xmm2, %xmm1 ; X64-SSE2-NEXT: pandn %xmm0, %xmm2 ; X64-SSE2-NEXT: por %xmm1, %xmm2 -; X64-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE2-NEXT: psrlw $8, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X64-SSE2-NEXT: pand %xmm1, %xmm2 -; X64-SSE2-NEXT: pandn %xmm0, %xmm1 -; X64-SSE2-NEXT: por %xmm2, %xmm1 -; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: movd %xmm2, %ecx +; X64-SSE2-NEXT: movl %ecx, %eax +; X64-SSE2-NEXT: shrl $8, %eax +; X64-SSE2-NEXT: cmpb %al, %cl +; X64-SSE2-NEXT: cmovll %ecx, %eax ; X64-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE2-NEXT: retq ; @@ -1802,7 +1780,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; Partial Vector Reductions ; -define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { +define i16 
@test_reduce_v16i16_v8i16(<16 x i16> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v16i16_v8i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -1889,7 +1867,7 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { ret i16 %10 } -define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { +define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v32i16_v8i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -1976,7 +1954,7 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { ret i16 %10 } -define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { +define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v32i8_v16i8: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -1991,21 +1969,18 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { ; X86-SSE2-NEXT: pand %xmm0, %xmm2 ; X86-SSE2-NEXT: pandn %xmm1, %xmm0 ; X86-SSE2-NEXT: por %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movl %ecx, %eax +; X86-SSE2-NEXT: shrl $8, %eax +; X86-SSE2-NEXT: cmpb %al, %cl +; X86-SSE2-NEXT: cmovll %ecx, %eax ; 
X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; @@ -2054,14 +2029,11 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { ; X64-SSE2-NEXT: pand %xmm2, %xmm1 ; X64-SSE2-NEXT: pandn %xmm0, %xmm2 ; X64-SSE2-NEXT: por %xmm1, %xmm2 -; X64-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE2-NEXT: psrlw $8, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X64-SSE2-NEXT: pand %xmm1, %xmm2 -; X64-SSE2-NEXT: pandn %xmm0, %xmm1 -; X64-SSE2-NEXT: por %xmm2, %xmm1 -; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: movd %xmm2, %ecx +; X64-SSE2-NEXT: movl %ecx, %eax +; X64-SSE2-NEXT: shrl $8, %eax +; X64-SSE2-NEXT: cmpb %al, %cl +; X64-SSE2-NEXT: cmovll %ecx, %eax ; X64-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE2-NEXT: retq ; @@ -2116,7 +2088,7 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { ret i8 %13 } -define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) { +define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v64i8_v16i8: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -2131,21 +2103,18 @@ define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) { ; X86-SSE2-NEXT: pand %xmm0, %xmm2 ; X86-SSE2-NEXT: pandn %xmm1, %xmm0 ; X86-SSE2-NEXT: por %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm0, 
%xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movl %ecx, %eax +; X86-SSE2-NEXT: shrl $8, %eax +; X86-SSE2-NEXT: cmpb %al, %cl +; X86-SSE2-NEXT: cmovll %ecx, %eax ; X86-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; @@ -2194,14 +2163,11 @@ define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) { ; X64-SSE2-NEXT: pand %xmm2, %xmm1 ; X64-SSE2-NEXT: pandn %xmm0, %xmm2 ; X64-SSE2-NEXT: por %xmm1, %xmm2 -; X64-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE2-NEXT: psrlw $8, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X64-SSE2-NEXT: pand %xmm1, %xmm2 -; X64-SSE2-NEXT: pandn %xmm0, %xmm1 -; X64-SSE2-NEXT: por %xmm2, %xmm1 -; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: movd %xmm2, %ecx +; X64-SSE2-NEXT: movl %ecx, %eax +; X64-SSE2-NEXT: shrl $8, %eax +; X64-SSE2-NEXT: cmpb %al, %cl +; X64-SSE2-NEXT: cmovll %ecx, %eax ; X64-SSE2-NEXT: ## kill: def $al killed $al killed $eax ; X64-SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll index aa2b6bacdd902..29fa565023c26 100644 --- a/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-umax.ll @@ -13,13 +13,13 @@ ; 128-bit Vectors ; -define i64 @test_reduce_v2i64(<2 x i64> %a0) { +define i64 @test_reduce_v2i64(<2 x i64> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v2i64: ; X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm1, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4 @@ -40,37 +40,37 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; 
X86-SSE42-LABEL: test_reduce_v2i64: ; X86-SSE42: ## %bb.0: ; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X86-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648] -; X86-SSE42-NEXT: pxor %xmm3, %xmm0 -; X86-SSE42-NEXT: pxor %xmm2, %xmm3 -; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; X86-SSE42-NEXT: movd %xmm2, %eax -; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] +; X86-SSE42-NEXT: pxor %xmm2, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; X86-SSE42-NEXT: pxor %xmm3, %xmm2 +; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 +; X86-SSE42-NEXT: movd %xmm3, %eax +; X86-SSE42-NEXT: pextrd $1, %xmm3, %edx ; X86-SSE42-NEXT: retl ; ; X86-AVX1-LABEL: test_reduce_v2i64: ; X86-AVX1: ## %bb.0: -; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,2147483648,0,2147483648] -; X86-AVX1-NEXT: ## xmm2 = mem[0,0] -; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm3 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm2 -; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [0,2147483648,0,2147483648] +; X86-AVX1-NEXT: ## xmm1 = mem[0,0] +; X86-AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm2 +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vxorps %xmm1, %xmm3, %xmm1 +; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 +; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX1-NEXT: retl ; ; X86-AVX2-LABEL: test_reduce_v2i64: ; X86-AVX2: ## %bb.0: -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,2147483648,0,2147483648] -; X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 -; X86-AVX2-NEXT: vpxor %xmm2, %xmm1, 
%xmm2 -; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm2 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm3, %xmm1 +; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 +; X86-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX2-NEXT: retl @@ -86,37 +86,26 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; ; X64-SSE42-LABEL: test_reduce_v2i64: ; X64-SSE42: ## %bb.0: -; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] -; X64-SSE42-NEXT: pxor %xmm2, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; X64-SSE42-NEXT: pxor %xmm3, %xmm2 -; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; X64-SSE42-NEXT: movq %xmm3, %rax +; X64-SSE42-NEXT: pextrq $1, %xmm0, %rax +; X64-SSE42-NEXT: movq %xmm0, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovaq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X64-AVX1-LABEL: test_reduce_v2i64: ; X64-AVX1: ## %bb.0: -; X64-AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] -; X64-AVX1-NEXT: ## xmm1 = mem[0,0] -; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm2 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm1 -; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 -; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovaq %rcx, %rax ; X64-AVX1-NEXT: retq ; ; X64-AVX2-LABEL: test_reduce_v2i64: ; X64-AVX2: ## %bb.0: -; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] -; 
X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm2 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpxor %xmm1, %xmm3, %xmm1 -; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 -; X64-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovaq %rcx, %rax ; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v2i64: @@ -132,7 +121,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ret i64 %4 } -define i32 @test_reduce_v4i32(<4 x i32> %a0) { +define i32 @test_reduce_v4i32(<4 x i32> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v4i32: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] @@ -214,7 +203,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) { ret i32 %7 } -define i16 @test_reduce_v8i16(<8 x i16> %a0) { +define i16 @test_reduce_v8i16(<8 x i16> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v8i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -223,11 +212,10 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) { ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; X86-SSE2-NEXT: psubusw %xmm1, %xmm0 ; X86-SSE2-NEXT: paddw %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: psubusw %xmm0, %xmm1 -; X86-SSE2-NEXT: paddw %xmm0, %xmm1 -; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: pextrw $1, %xmm0, %eax +; X86-SSE2-NEXT: movd %xmm0, %ecx +; X86-SSE2-NEXT: cmpw %ax, %cx +; X86-SSE2-NEXT: cmoval %ecx, %eax ; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE2-NEXT: retl ; @@ -259,11 +247,10 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) { ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; X64-SSE2-NEXT: psubusw %xmm1, %xmm0 ; X64-SSE2-NEXT: paddw %xmm1, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: psrld $16, %xmm1 -; 
X64-SSE2-NEXT: psubusw %xmm0, %xmm1 -; X64-SSE2-NEXT: paddw %xmm0, %xmm1 -; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: pextrw $1, %xmm0, %eax +; X64-SSE2-NEXT: movd %xmm0, %ecx +; X64-SSE2-NEXT: cmpw %ax, %cx +; X64-SSE2-NEXT: cmoval %ecx, %eax ; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE2-NEXT: retq ; @@ -318,7 +305,7 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) { ret i16 %10 } -define i8 @test_reduce_v16i8(<16 x i8> %a0) { +define i8 @test_reduce_v16i8(<16 x i8> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v16i8: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -443,7 +430,7 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) { ; 256-bit Vectors ; -define i64 @test_reduce_v4i64(<4 x i64> %a0) { +define i64 @test_reduce_v4i64(<4 x i64> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v4i64: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] @@ -462,9 +449,9 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X86-SSE2-NEXT: pand %xmm3, %xmm0 ; X86-SSE2-NEXT: pandn %xmm1, %xmm3 ; X86-SSE2-NEXT: por %xmm0, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm0, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4 @@ -491,9 +478,9 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X86-SSE42-NEXT: pxor %xmm3, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X86-SSE42-NEXT: movapd %xmm1, %xmm0 +; X86-SSE42-NEXT: xorpd %xmm3, %xmm0 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT: pxor %xmm3, %xmm0 ; X86-SSE42-NEXT: pxor %xmm2, %xmm3 ; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 @@ -510,11 +497,11 @@ define i64 
@test_reduce_v4i64(<4 x i64> %a0) { ; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm4 ; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; X86-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2 -; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 +; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX1-NEXT: vzeroupper @@ -528,11 +515,11 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm4 ; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 -; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vxorpd %xmm2, %xmm3, %xmm2 +; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 +; X86-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX2-NEXT: vzeroupper @@ -565,19 +552,16 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-SSE42-LABEL: test_reduce_v4i64: ; X64-SSE42: ## %bb.0: ; X64-SSE42-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] -; X64-SSE42-NEXT: movdqa %xmm1, %xmm4 -; X64-SSE42-NEXT: pxor %xmm3, %xmm4 -; X64-SSE42-NEXT: pxor %xmm3, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 -; 
X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; X64-SSE42-NEXT: movapd %xmm1, %xmm0 -; X64-SSE42-NEXT: xorpd %xmm3, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X64-SSE42-NEXT: pxor %xmm2, %xmm3 +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808] +; X64-SSE42-NEXT: movdqa %xmm1, %xmm3 +; X64-SSE42-NEXT: pxor %xmm0, %xmm3 +; X64-SSE42-NEXT: pxor %xmm2, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; X64-SSE42-NEXT: movq %xmm2, %rax +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X64-SSE42-NEXT: pextrq $1, %xmm1, %rax +; X64-SSE42-NEXT: movq %xmm1, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovaq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X64-AVX1-LABEL: test_reduce_v4i64: @@ -586,15 +570,13 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; X64-AVX1-NEXT: ## xmm2 = mem[0,0] ; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3 -; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4 -; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X64-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 -; X64-AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm2 -; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 -; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2 +; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 +; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovaq %rcx, %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -603,15 +585,13 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = 
[9223372036854775808,9223372036854775808] ; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 -; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm4 -; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm1 -; X64-AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vxorpd %xmm2, %xmm3, %xmm2 -; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 -; X64-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 +; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 +; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovaq %rcx, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; @@ -634,7 +614,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ret i64 %7 } -define i32 @test_reduce_v8i32(<8 x i32> %a0) { +define i32 @test_reduce_v8i32(<8 x i32> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v8i32: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] @@ -779,7 +759,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ret i32 %10 } -define i16 @test_reduce_v16i16(<16 x i16> %a0) { +define i16 @test_reduce_v16i16(<16 x i16> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v16i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: psubusw %xmm0, %xmm1 @@ -790,11 +770,10 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) { ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-SSE2-NEXT: psubusw %xmm0, %xmm1 ; X86-SSE2-NEXT: paddw %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrld $16, %xmm0 -; X86-SSE2-NEXT: psubusw %xmm1, %xmm0 -; X86-SSE2-NEXT: paddw %xmm1, %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: pextrw $1, %xmm1, %eax +; X86-SSE2-NEXT: movd %xmm1, %ecx +; X86-SSE2-NEXT: cmpw %ax, %cx +; X86-SSE2-NEXT: cmoval 
%ecx, %eax ; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE2-NEXT: retl ; @@ -845,11 +824,10 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) { ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-SSE2-NEXT: psubusw %xmm0, %xmm1 ; X64-SSE2-NEXT: paddw %xmm0, %xmm1 -; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE2-NEXT: psrld $16, %xmm0 -; X64-SSE2-NEXT: psubusw %xmm1, %xmm0 -; X64-SSE2-NEXT: paddw %xmm1, %xmm0 -; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: pextrw $1, %xmm1, %eax +; X64-SSE2-NEXT: movd %xmm1, %ecx +; X64-SSE2-NEXT: cmpw %ax, %cx +; X64-SSE2-NEXT: cmoval %ecx, %eax ; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE2-NEXT: retq ; @@ -917,7 +895,7 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) { ret i16 %13 } -define i8 @test_reduce_v32i8(<32 x i8> %a0) { +define i8 @test_reduce_v32i8(<32 x i8> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v32i8: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pmaxub %xmm1, %xmm0 @@ -1076,13 +1054,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; 512-bit Vectors ; -define i64 @test_reduce_v8i64(<8 x i64> %a0) { +define i64 @test_reduce_v8i64(<8 x i64> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v8i64: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm5 ; X86-SSE2-NEXT: pxor %xmm4, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 ; X86-SSE2-NEXT: pxor %xmm4, %xmm6 ; X86-SSE2-NEXT: movdqa %xmm6, %xmm7 ; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7 @@ -1092,42 +1070,42 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X86-SSE2-NEXT: pand %xmm5, %xmm6 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] ; X86-SSE2-NEXT: por %xmm6, %xmm5 -; X86-SSE2-NEXT: pand %xmm5, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm5 -; X86-SSE2-NEXT: por %xmm0, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm3, %xmm0 -; X86-SSE2-NEXT: pxor %xmm4, %xmm0 -; 
X86-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE2-NEXT: pxor %xmm4, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm6 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm6 -; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm0, %xmm7 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: por %xmm7, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm3, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE2-NEXT: pxor %xmm4, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm1 +; X86-SSE2-NEXT: pand %xmm5, %xmm1 +; X86-SSE2-NEXT: pandn %xmm3, %xmm5 +; X86-SSE2-NEXT: por %xmm1, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 ; X86-SSE2-NEXT: pxor %xmm4, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm3 -; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE2-NEXT: pxor %xmm4, %xmm3 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm6 +; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm6 +; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm1, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: por %xmm3, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm3 +; X86-SSE2-NEXT: pxor %xmm4, %xmm3 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm1 +; X86-SSE2-NEXT: por %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: pxor %xmm4, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm2 +; X86-SSE2-NEXT: pcmpeqd %xmm3, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm3, %xmm6 +; 
X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: por %xmm6, %xmm0 ; X86-SSE2-NEXT: pand %xmm0, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; X86-SSE2-NEXT: pandn %xmm5, %xmm0 ; X86-SSE2-NEXT: por %xmm1, %xmm0 -; X86-SSE2-NEXT: pand %xmm0, %xmm5 -; X86-SSE2-NEXT: pandn %xmm2, %xmm0 -; X86-SSE2-NEXT: por %xmm5, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: pxor %xmm4, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm1, %xmm4 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 ; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm3 @@ -1149,26 +1127,27 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X86-SSE42: ## %bb.0: ; X86-SSE42-NEXT: movdqa %xmm0, %xmm4 ; X86-SSE42-NEXT: movdqa {{.*#+}} xmm5 = [0,2147483648,0,2147483648] -; X86-SSE42-NEXT: movdqa %xmm2, %xmm6 +; X86-SSE42-NEXT: movdqa %xmm3, %xmm6 ; X86-SSE42-NEXT: pxor %xmm5, %xmm6 -; X86-SSE42-NEXT: pxor %xmm5, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2 -; X86-SSE42-NEXT: movdqa %xmm3, %xmm4 -; X86-SSE42-NEXT: pxor %xmm5, %xmm4 ; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE42-NEXT: pxor %xmm5, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 ; X86-SSE42-NEXT: movapd %xmm3, %xmm1 ; X86-SSE42-NEXT: xorpd %xmm5, %xmm1 +; X86-SSE42-NEXT: movdqa %xmm2, %xmm6 +; X86-SSE42-NEXT: pxor %xmm5, %xmm6 +; X86-SSE42-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE42-NEXT: pxor %xmm5, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2 ; X86-SSE42-NEXT: movapd %xmm2, %xmm0 ; X86-SSE42-NEXT: xorpd %xmm5, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; X86-SSE42-NEXT: movapd %xmm3, %xmm0 +; X86-SSE42-NEXT: xorpd %xmm5, %xmm0 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; X86-SSE42-NEXT: movdqa 
%xmm3, %xmm0 -; X86-SSE42-NEXT: pxor %xmm5, %xmm0 ; X86-SSE42-NEXT: pxor %xmm1, %xmm5 ; X86-SSE42-NEXT: pcmpgtq %xmm5, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 @@ -1178,27 +1157,27 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; ; X86-AVX1-LABEL: test_reduce_v8i64: ; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,2147483648,0,2147483648] ; X86-AVX1-NEXT: ## xmm2 = mem[0,0] -; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm3 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm4 -; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X86-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm3 -; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm4 -; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm5 -; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 -; X86-AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm0 -; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX1-NEXT: vxorps %xmm2, %xmm3, %xmm4 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; X86-AVX1-NEXT: vxorps %xmm2, %xmm5, %xmm6 +; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4 +; X86-AVX1-NEXT: vblendvpd %xmm4, %xmm5, %xmm3, %xmm3 ; X86-AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm4 -; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1 -; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm0, %xmm0 -; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2 -; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm5 +; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm6 +; X86-AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 +; X86-AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm1, %xmm0 +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm1 +; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, 
%xmm1 +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 +; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX1-NEXT: vzeroupper @@ -1216,11 +1195,11 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm4 ; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 -; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vxorpd %xmm2, %xmm3, %xmm2 +; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 +; X86-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX2-NEXT: vzeroupper @@ -1298,17 +1277,14 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X64-SSE42-NEXT: pxor %xmm5, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2 -; X64-SSE42-NEXT: movapd %xmm2, %xmm0 -; X64-SSE42-NEXT: xorpd %xmm5, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 +; X64-SSE42-NEXT: xorpd %xmm2, %xmm5 +; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm5 +; X64-SSE42-NEXT: movdqa %xmm5, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; X64-SSE42-NEXT: movapd %xmm3, %xmm0 -; X64-SSE42-NEXT: xorpd %xmm5, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; X64-SSE42-NEXT: pxor %xmm1, %xmm5 -; X64-SSE42-NEXT: pcmpgtq %xmm5, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; X64-SSE42-NEXT: movq %xmm1, %rax +; X64-SSE42-NEXT: pextrq $1, %xmm3, %rax +; X64-SSE42-NEXT: movq %xmm3, %rcx +; X64-SSE42-NEXT: cmpq 
%rax, %rcx +; X64-SSE42-NEXT: cmovaq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X64-AVX1-LABEL: test_reduce_v8i64: @@ -1329,12 +1305,10 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm1 ; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm1 ; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 -; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm1 -; X64-AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vxorpd %xmm3, %xmm2, %xmm3 -; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1 -; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovaq %rcx, %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -1347,15 +1321,13 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm3 -; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm4 -; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm1 -; X64-AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vxorpd %xmm2, %xmm3, %xmm2 -; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 -; X64-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm2 +; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 +; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovaq %rcx, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; @@ -1383,7 +1355,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ret i64 %10 } -define i32 @test_reduce_v16i32(<16 x i32> %a0) { +define i32 @test_reduce_v16i32(<16 x i32> %a0) nounwind { ; 
X86-SSE2-LABEL: test_reduce_v16i32: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] @@ -1577,7 +1549,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ret i32 %13 } -define i16 @test_reduce_v32i16(<32 x i16> %a0) { +define i16 @test_reduce_v32i16(<32 x i16> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v32i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: psubusw %xmm0, %xmm2 @@ -1592,11 +1564,10 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) { ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-SSE2-NEXT: psubusw %xmm0, %xmm1 ; X86-SSE2-NEXT: paddw %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrld $16, %xmm0 -; X86-SSE2-NEXT: psubusw %xmm1, %xmm0 -; X86-SSE2-NEXT: paddw %xmm1, %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: pextrw $1, %xmm1, %eax +; X86-SSE2-NEXT: movd %xmm1, %ecx +; X86-SSE2-NEXT: cmpw %ax, %cx +; X86-SSE2-NEXT: cmoval %ecx, %eax ; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE2-NEXT: retl ; @@ -1657,11 +1628,10 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) { ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-SSE2-NEXT: psubusw %xmm0, %xmm1 ; X64-SSE2-NEXT: paddw %xmm0, %xmm1 -; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE2-NEXT: psrld $16, %xmm0 -; X64-SSE2-NEXT: psubusw %xmm1, %xmm0 -; X64-SSE2-NEXT: paddw %xmm1, %xmm0 -; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: pextrw $1, %xmm1, %eax +; X64-SSE2-NEXT: movd %xmm1, %ecx +; X64-SSE2-NEXT: cmpw %ax, %cx +; X64-SSE2-NEXT: cmoval %ecx, %eax ; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE2-NEXT: retq ; @@ -1740,7 +1710,7 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) { ret i16 %16 } -define i8 @test_reduce_v64i8(<64 x i8> %a0) { +define i8 @test_reduce_v64i8(<64 x i8> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v64i8: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pmaxub %xmm3, %xmm1 @@ -1920,7 +1890,7 @@ define i8 @test_reduce_v64i8(<64 x 
i8> %a0) { ; Partial Vector Reductions ; -define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { +define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v16i16_v8i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -1929,11 +1899,10 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; X86-SSE2-NEXT: psubusw %xmm1, %xmm0 ; X86-SSE2-NEXT: paddw %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: psubusw %xmm0, %xmm1 -; X86-SSE2-NEXT: paddw %xmm0, %xmm1 -; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: pextrw $1, %xmm0, %eax +; X86-SSE2-NEXT: movd %xmm0, %ecx +; X86-SSE2-NEXT: cmpw %ax, %cx +; X86-SSE2-NEXT: cmoval %ecx, %eax ; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE2-NEXT: retl ; @@ -1966,11 +1935,10 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; X64-SSE2-NEXT: psubusw %xmm1, %xmm0 ; X64-SSE2-NEXT: paddw %xmm1, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: psrld $16, %xmm1 -; X64-SSE2-NEXT: psubusw %xmm0, %xmm1 -; X64-SSE2-NEXT: paddw %xmm0, %xmm1 -; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: pextrw $1, %xmm0, %eax +; X64-SSE2-NEXT: movd %xmm0, %ecx +; X64-SSE2-NEXT: cmpw %ax, %cx +; X64-SSE2-NEXT: cmoval %ecx, %eax ; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE2-NEXT: retq ; @@ -2028,7 +1996,7 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { ret i16 %10 } -define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { +define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v32i16_v8i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -2037,11 +2005,10 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; X86-SSE2-NEXT: 
psubusw %xmm1, %xmm0 ; X86-SSE2-NEXT: paddw %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: psubusw %xmm0, %xmm1 -; X86-SSE2-NEXT: paddw %xmm0, %xmm1 -; X86-SSE2-NEXT: movd %xmm1, %eax +; X86-SSE2-NEXT: pextrw $1, %xmm0, %eax +; X86-SSE2-NEXT: movd %xmm0, %ecx +; X86-SSE2-NEXT: cmpw %ax, %cx +; X86-SSE2-NEXT: cmoval %ecx, %eax ; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE2-NEXT: retl ; @@ -2074,11 +2041,10 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; X64-SSE2-NEXT: psubusw %xmm1, %xmm0 ; X64-SSE2-NEXT: paddw %xmm1, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: psrld $16, %xmm1 -; X64-SSE2-NEXT: psubusw %xmm0, %xmm1 -; X64-SSE2-NEXT: paddw %xmm0, %xmm1 -; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: pextrw $1, %xmm0, %eax +; X64-SSE2-NEXT: movd %xmm0, %ecx +; X64-SSE2-NEXT: cmpw %ax, %cx +; X64-SSE2-NEXT: cmoval %ecx, %eax ; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE2-NEXT: retq ; @@ -2136,7 +2102,7 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { ret i16 %10 } -define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { +define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v32i8_v16i8: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -2261,7 +2227,7 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { ret i8 %13 } -define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) { +define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v64i8_v16i8: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] diff --git a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll index a7ab20f246fb5..835b3c86a1a95 100644 --- a/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll +++ b/llvm/test/CodeGen/X86/horizontal-reduce-umin.ll @@ -13,13 
+13,13 @@ ; 128-bit Vectors ; -define i64 @test_reduce_v2i64(<2 x i64> %a0) { +define i64 @test_reduce_v2i64(<2 x i64> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v2i64: ; X86-SSE2: ## %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm1, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 @@ -40,38 +40,38 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; X86-SSE42-LABEL: test_reduce_v2i64: ; X86-SSE42: ## %bb.0: ; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-SSE42-NEXT: movdqa {{.*#+}} xmm0 = [0,2147483648,0,2147483648] -; X86-SSE42-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE42-NEXT: pxor %xmm0, %xmm3 -; X86-SSE42-NEXT: pxor %xmm2, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; X86-SSE42-NEXT: movd %xmm2, %eax -; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx +; X86-SSE42-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE42-NEXT: pxor %xmm0, %xmm2 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; X86-SSE42-NEXT: pxor %xmm3, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 +; X86-SSE42-NEXT: movd %xmm3, %eax +; X86-SSE42-NEXT: pextrd $1, %xmm3, %edx ; X86-SSE42-NEXT: retl ; ; X86-AVX1-LABEL: test_reduce_v2i64: ; X86-AVX1: ## %bb.0: -; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,2147483648,0,2147483648] -; X86-AVX1-NEXT: ## xmm2 = mem[0,0] -; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm3 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm2 -; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [0,2147483648,0,2147483648] +; 
X86-AVX1-NEXT: ## xmm1 = mem[0,0] +; X86-AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm2 +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vxorps %xmm1, %xmm3, %xmm1 +; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 +; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX1-NEXT: retl ; ; X86-AVX2-LABEL: test_reduce_v2i64: ; X86-AVX2: ## %bb.0: -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,2147483648,0,2147483648] -; X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 -; X86-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 -; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm2 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm3, %xmm1 +; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 +; X86-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX2-NEXT: retl @@ -87,38 +87,26 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ; ; X64-SSE42-LABEL: test_reduce_v2i64: ; X64-SSE42: ## %bb.0: -; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808] -; X64-SSE42-NEXT: movdqa %xmm1, %xmm2 -; X64-SSE42-NEXT: pxor %xmm0, %xmm2 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; X64-SSE42-NEXT: pxor %xmm3, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; X64-SSE42-NEXT: movq %xmm3, %rax +; X64-SSE42-NEXT: pextrq $1, %xmm0, %rax +; X64-SSE42-NEXT: movq %xmm0, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovbq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X64-AVX1-LABEL: test_reduce_v2i64: ; X64-AVX1: ## %bb.0: -; X64-AVX1-NEXT: vmovddup {{.*#+}} 
xmm1 = [9223372036854775808,9223372036854775808] -; X64-AVX1-NEXT: ## xmm1 = mem[0,0] -; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm2 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm1 -; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 -; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovbq %rcx, %rax ; X64-AVX1-NEXT: retq ; ; X64-AVX2-LABEL: test_reduce_v2i64: ; X64-AVX2: ## %bb.0: -; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] -; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm2 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpxor %xmm1, %xmm3, %xmm1 -; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 -; X64-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovbq %rcx, %rax ; X64-AVX2-NEXT: retq ; ; X64-AVX512-LABEL: test_reduce_v2i64: @@ -134,7 +122,7 @@ define i64 @test_reduce_v2i64(<2 x i64> %a0) { ret i64 %4 } -define i32 @test_reduce_v4i32(<4 x i32> %a0) { +define i32 @test_reduce_v4i32(<4 x i32> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v4i32: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,2147483648] @@ -216,7 +204,7 @@ define i32 @test_reduce_v4i32(<4 x i32> %a0) { ret i32 %7 } -define i16 @test_reduce_v8i16(<8 x i16> %a0) { +define i16 @test_reduce_v8i16(<8 x i16> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v8i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -227,12 +215,10 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) { ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 ; X86-SSE2-NEXT: psubw %xmm2, %xmm0 -; 
X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X86-SSE2-NEXT: psubw %xmm2, %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: pextrw $1, %xmm0, %eax +; X86-SSE2-NEXT: movd %xmm0, %ecx +; X86-SSE2-NEXT: cmpw %ax, %cx +; X86-SSE2-NEXT: cmovbl %ecx, %eax ; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE2-NEXT: retl ; @@ -260,12 +246,10 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) { ; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X64-SSE2-NEXT: psubusw %xmm1, %xmm2 ; X64-SSE2-NEXT: psubw %xmm2, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: psrld $16, %xmm1 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X64-SSE2-NEXT: psubw %xmm2, %xmm0 -; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: pextrw $1, %xmm0, %eax +; X64-SSE2-NEXT: movd %xmm0, %ecx +; X64-SSE2-NEXT: cmpw %ax, %cx +; X64-SSE2-NEXT: cmovbl %ecx, %eax ; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE2-NEXT: retq ; @@ -295,7 +279,7 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) { ret i16 %10 } -define i8 @test_reduce_v16i8(<16 x i8> %a0) { +define i8 @test_reduce_v16i8(<16 x i8> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v16i8: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -385,7 +369,7 @@ define i8 @test_reduce_v16i8(<16 x i8> %a0) { ; 256-bit Vectors ; -define i64 @test_reduce_v4i64(<4 x i64> %a0) { +define i64 @test_reduce_v4i64(<4 x i64> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v4i64: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] @@ -404,9 +388,9 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X86-SSE2-NEXT: pand %xmm3, %xmm0 ; X86-SSE2-NEXT: pandn %xmm1, %xmm3 ; X86-SSE2-NEXT: por %xmm0, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 ; X86-SSE2-NEXT: pxor %xmm2, 
%xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm0, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm4 @@ -434,9 +418,9 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X86-SSE42-NEXT: pxor %xmm2, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 +; X86-SSE42-NEXT: movapd %xmm1, %xmm0 +; X86-SSE42-NEXT: xorpd %xmm2, %xmm0 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT: pxor %xmm2, %xmm0 ; X86-SSE42-NEXT: pxor %xmm3, %xmm2 ; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm2 ; X86-SSE42-NEXT: movdqa %xmm2, %xmm0 @@ -454,11 +438,11 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X86-AVX1-NEXT: vxorps %xmm1, %xmm3, %xmm4 ; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0 -; X86-AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X86-AVX1-NEXT: vxorpd %xmm1, %xmm0, %xmm3 -; X86-AVX1-NEXT: vxorpd %xmm1, %xmm2, %xmm1 -; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1 -; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 +; X86-AVX1-NEXT: vxorpd %xmm1, %xmm0, %xmm2 +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vxorpd %xmm1, %xmm3, %xmm1 +; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 +; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX1-NEXT: vzeroupper @@ -466,17 +450,17 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; ; X86-AVX2-LABEL: test_reduce_v4i64: ; X86-AVX2: ## %bb.0: -; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,2147483648,0,2147483648] -; X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 -; X86-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm4 -; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; 
X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 -; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm2 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 +; X86-AVX2-NEXT: vpxor %xmm1, %xmm3, %xmm4 +; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 +; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0 +; X86-AVX2-NEXT: vxorpd %xmm1, %xmm0, %xmm2 +; X86-AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vxorpd %xmm1, %xmm3, %xmm1 +; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 +; X86-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX2-NEXT: vzeroupper @@ -509,21 +493,16 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-SSE42-LABEL: test_reduce_v4i64: ; X64-SSE42: ## %bb.0: ; X64-SSE42-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] -; X64-SSE42-NEXT: movdqa %xmm0, %xmm4 -; X64-SSE42-NEXT: pxor %xmm3, %xmm4 -; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT: pxor %xmm3, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808] +; X64-SSE42-NEXT: movdqa %xmm2, %xmm3 +; X64-SSE42-NEXT: pxor %xmm0, %xmm3 +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 +; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; X64-SSE42-NEXT: movapd %xmm1, %xmm0 -; X64-SSE42-NEXT: xorpd %xmm3, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X64-SSE42-NEXT: pxor %xmm2, %xmm3 -; X64-SSE42-NEXT: pcmpgtq %xmm0, %xmm3 -; X64-SSE42-NEXT: movdqa %xmm3, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; X64-SSE42-NEXT: movq %xmm2, %rax +; X64-SSE42-NEXT: pextrq $1, %xmm1, %rax +; X64-SSE42-NEXT: movq %xmm1, %rcx +; 
X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovbq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X64-AVX1-LABEL: test_reduce_v4i64: @@ -532,15 +511,13 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-AVX1-NEXT: ## xmm1 = mem[0,0] ; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm2 ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; X64-AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm4 -; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 -; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0 -; X64-AVX1-NEXT: vxorpd %xmm1, %xmm0, %xmm2 -; X64-AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vxorpd %xmm1, %xmm3, %xmm1 +; X64-AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm1 ; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovbq %rcx, %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -549,15 +526,13 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] ; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 -; X64-AVX2-NEXT: vpxor %xmm1, %xmm3, %xmm4 -; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 -; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0 -; X64-AVX2-NEXT: vxorpd %xmm1, %xmm0, %xmm2 -; X64-AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vxorpd %xmm1, %xmm3, %xmm1 +; X64-AVX2-NEXT: vpxor %xmm1, %xmm3, %xmm1 ; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; X64-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovbq %rcx, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; @@ -580,7 +555,7 @@ define i64 @test_reduce_v4i64(<4 x i64> %a0) { ret i64 %7 } 
-define i32 @test_reduce_v8i32(<8 x i32> %a0) { +define i32 @test_reduce_v8i32(<8 x i32> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v8i32: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] @@ -725,7 +700,7 @@ define i32 @test_reduce_v8i32(<8 x i32> %a0) { ret i32 %10 } -define i16 @test_reduce_v16i16(<16 x i16> %a0) { +define i16 @test_reduce_v16i16(<16 x i16> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v16i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 @@ -739,12 +714,10 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) { ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 ; X86-SSE2-NEXT: psubw %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X86-SSE2-NEXT: psubw %xmm2, %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: pextrw $1, %xmm0, %eax +; X86-SSE2-NEXT: movd %xmm0, %ecx +; X86-SSE2-NEXT: cmpw %ax, %cx +; X86-SSE2-NEXT: cmovbl %ecx, %eax ; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE2-NEXT: retl ; @@ -789,12 +762,10 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) { ; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X64-SSE2-NEXT: psubusw %xmm1, %xmm2 ; X64-SSE2-NEXT: psubw %xmm2, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: psrld $16, %xmm1 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X64-SSE2-NEXT: psubw %xmm2, %xmm0 -; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: pextrw $1, %xmm0, %eax +; X64-SSE2-NEXT: movd %xmm0, %ecx +; X64-SSE2-NEXT: cmpw %ax, %cx +; X64-SSE2-NEXT: cmovbl %ecx, %eax ; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE2-NEXT: retq ; @@ -851,7 +822,7 @@ define i16 @test_reduce_v16i16(<16 x i16> %a0) { ret i16 %13 } -define i8 @test_reduce_v32i8(<32 x i8> %a0) { +define i8 @test_reduce_v32i8(<32 x i8> %a0) nounwind { ; X86-SSE2-LABEL: 
test_reduce_v32i8: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pminub %xmm1, %xmm0 @@ -990,13 +961,13 @@ define i8 @test_reduce_v32i8(<32 x i8> %a0) { ; 512-bit Vectors ; -define i64 @test_reduce_v8i64(<8 x i64> %a0) { +define i64 @test_reduce_v8i64(<8 x i64> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v8i64: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] -; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm5 ; X86-SSE2-NEXT: pxor %xmm4, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm3, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm6 ; X86-SSE2-NEXT: pxor %xmm4, %xmm6 ; X86-SSE2-NEXT: movdqa %xmm6, %xmm7 ; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7 @@ -1006,42 +977,42 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X86-SSE2-NEXT: pand %xmm5, %xmm6 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] ; X86-SSE2-NEXT: por %xmm6, %xmm5 -; X86-SSE2-NEXT: pand %xmm5, %xmm1 -; X86-SSE2-NEXT: pandn %xmm3, %xmm5 -; X86-SSE2-NEXT: por %xmm1, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pxor %xmm4, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 -; X86-SSE2-NEXT: pxor %xmm4, %xmm3 -; X86-SSE2-NEXT: movdqa %xmm3, %xmm6 -; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm6 -; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm1, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: por %xmm3, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 +; X86-SSE2-NEXT: pand %xmm5, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm5 +; X86-SSE2-NEXT: por %xmm0, %xmm5 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE2-NEXT: pxor %xmm4, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm3, %xmm2 ; X86-SSE2-NEXT: pxor %xmm4, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm3 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; 
X86-SSE2-NEXT: movdqa %xmm2, %xmm6 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm6 ; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm0, %xmm7 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: por %xmm7, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm0 +; X86-SSE2-NEXT: pxor %xmm4, %xmm0 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm3, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pxor %xmm4, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm3 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm3 +; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; X86-SSE2-NEXT: por %xmm2, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; X86-SSE2-NEXT: pand %xmm0, %xmm1 -; X86-SSE2-NEXT: pandn %xmm5, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] ; X86-SSE2-NEXT: por %xmm1, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE2-NEXT: pand %xmm0, %xmm5 +; X86-SSE2-NEXT: pandn %xmm2, %xmm0 +; X86-SSE2-NEXT: por %xmm5, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: pxor %xmm4, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm1, %xmm4 ; X86-SSE2-NEXT: movdqa %xmm4, %xmm3 ; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm3 @@ -1061,32 +1032,32 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; ; X86-SSE42-LABEL: test_reduce_v8i64: ; X86-SSE42: ## %bb.0: -; X86-SSE42-NEXT: movdqa %xmm0, %xmm4 -; X86-SSE42-NEXT: movdqa {{.*#+}} xmm5 = [0,2147483648,0,2147483648] +; X86-SSE42-NEXT: movdqa %xmm0, %xmm5 +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm4 = [0,2147483648,0,2147483648] +; X86-SSE42-NEXT: movdqa %xmm0, %xmm6 +; X86-SSE42-NEXT: pxor %xmm4, %xmm6 
+; X86-SSE42-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE42-NEXT: pxor %xmm4, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm2 +; X86-SSE42-NEXT: movapd %xmm2, %xmm5 +; X86-SSE42-NEXT: xorpd %xmm4, %xmm5 ; X86-SSE42-NEXT: movdqa %xmm1, %xmm6 -; X86-SSE42-NEXT: pxor %xmm5, %xmm6 +; X86-SSE42-NEXT: pxor %xmm4, %xmm6 ; X86-SSE42-NEXT: movdqa %xmm3, %xmm0 -; X86-SSE42-NEXT: pxor %xmm5, %xmm0 +; X86-SSE42-NEXT: pxor %xmm4, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; X86-SSE42-NEXT: movdqa %xmm4, %xmm1 -; X86-SSE42-NEXT: pxor %xmm5, %xmm1 -; X86-SSE42-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE42-NEXT: pxor %xmm5, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2 -; X86-SSE42-NEXT: movapd %xmm2, %xmm1 -; X86-SSE42-NEXT: xorpd %xmm5, %xmm1 ; X86-SSE42-NEXT: movapd %xmm3, %xmm0 -; X86-SSE42-NEXT: xorpd %xmm5, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 +; X86-SSE42-NEXT: xorpd %xmm4, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm5, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; X86-SSE42-NEXT: movapd %xmm3, %xmm0 +; X86-SSE42-NEXT: xorpd %xmm4, %xmm0 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; X86-SSE42-NEXT: movdqa %xmm3, %xmm0 -; X86-SSE42-NEXT: pxor %xmm5, %xmm0 -; X86-SSE42-NEXT: pxor %xmm1, %xmm5 -; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm5 -; X86-SSE42-NEXT: movdqa %xmm5, %xmm0 +; X86-SSE42-NEXT: pxor %xmm1, %xmm4 +; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm4 +; X86-SSE42-NEXT: movdqa %xmm4, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 ; X86-SSE42-NEXT: movd %xmm1, %eax ; X86-SSE42-NEXT: pextrd $1, %xmm1, %edx @@ -1094,27 +1065,27 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; ; X86-AVX1-LABEL: test_reduce_v8i64: ; X86-AVX1: ## %bb.0: -; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,2147483648,0,2147483648] ; X86-AVX1-NEXT: ## xmm2 = mem[0,0] -; X86-AVX1-NEXT: vxorps %xmm2, %xmm3, 
%xmm4 -; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm5, %xmm6 -; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4 -; X86-AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm5, %xmm3 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm4 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm5 -; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 -; X86-AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm0 -; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm3 +; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm4 +; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; X86-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm3 ; X86-AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm4 -; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm5 +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm6 +; X86-AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 +; X86-AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm1, %xmm0 +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm1 +; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm0, %xmm0 +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2 -; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX1-NEXT: vzeroupper @@ -1127,16 +1098,16 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4 ; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 ; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 -; X86-AVX2-NEXT: vextractf128 
$1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm4 -; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 -; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX2-NEXT: vxorpd %xmm2, %xmm3, %xmm4 +; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1 +; X86-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 +; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vxorpd %xmm2, %xmm3, %xmm2 +; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 +; X86-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX2-NEXT: vzeroupper @@ -1198,34 +1169,30 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; ; X64-SSE42-LABEL: test_reduce_v8i64: ; X64-SSE42: ## %bb.0: -; X64-SSE42-NEXT: movdqa %xmm0, %xmm5 -; X64-SSE42-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; X64-SSE42-NEXT: movdqa %xmm0, %xmm4 +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808] ; X64-SSE42-NEXT: movdqa %xmm0, %xmm6 -; X64-SSE42-NEXT: pxor %xmm4, %xmm6 +; X64-SSE42-NEXT: pxor %xmm5, %xmm6 ; X64-SSE42-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE42-NEXT: pxor %xmm4, %xmm0 +; X64-SSE42-NEXT: pxor %xmm5, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm2 -; X64-SSE42-NEXT: movapd %xmm2, %xmm5 -; X64-SSE42-NEXT: xorpd %xmm4, %xmm5 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; X64-SSE42-NEXT: movapd %xmm2, %xmm4 +; X64-SSE42-NEXT: xorpd %xmm5, %xmm4 ; X64-SSE42-NEXT: movdqa %xmm1, %xmm6 -; X64-SSE42-NEXT: 
pxor %xmm4, %xmm6 +; X64-SSE42-NEXT: pxor %xmm5, %xmm6 ; X64-SSE42-NEXT: movdqa %xmm3, %xmm0 -; X64-SSE42-NEXT: pxor %xmm4, %xmm0 +; X64-SSE42-NEXT: pxor %xmm5, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; X64-SSE42-NEXT: movapd %xmm3, %xmm0 -; X64-SSE42-NEXT: xorpd %xmm4, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm5, %xmm0 +; X64-SSE42-NEXT: xorpd %xmm3, %xmm5 +; X64-SSE42-NEXT: pcmpgtq %xmm4, %xmm5 +; X64-SSE42-NEXT: movdqa %xmm5, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; X64-SSE42-NEXT: movapd %xmm3, %xmm0 -; X64-SSE42-NEXT: xorpd %xmm4, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; X64-SSE42-NEXT: pxor %xmm1, %xmm4 -; X64-SSE42-NEXT: pcmpgtq %xmm0, %xmm4 -; X64-SSE42-NEXT: movdqa %xmm4, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; X64-SSE42-NEXT: movq %xmm1, %rax +; X64-SSE42-NEXT: pextrq $1, %xmm3, %rax +; X64-SSE42-NEXT: movq %xmm3, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovbq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X64-AVX1-LABEL: test_reduce_v8i64: @@ -1246,12 +1213,10 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 ; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm1 ; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm0, %xmm0 -; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 -; X64-AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm2 -; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 -; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovbq %rcx, %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -1264,15 +1229,13 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm1 ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3 -; 
X64-AVX2-NEXT: vxorpd %xmm2, %xmm3, %xmm4 -; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1 -; X64-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm1 -; X64-AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm3, %xmm2 ; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; X64-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovbq %rcx, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; @@ -1300,7 +1263,7 @@ define i64 @test_reduce_v8i64(<8 x i64> %a0) { ret i64 %10 } -define i32 @test_reduce_v16i32(<16 x i32> %a0) { +define i32 @test_reduce_v16i32(<16 x i32> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v16i32: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] @@ -1494,7 +1457,7 @@ define i32 @test_reduce_v16i32(<16 x i32> %a0) { ret i32 %13 } -define i16 @test_reduce_v32i16(<32 x i16> %a0) { +define i16 @test_reduce_v32i16(<32 x i16> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v32i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 @@ -1514,12 +1477,10 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) { ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 ; X86-SSE2-NEXT: psubw %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X86-SSE2-NEXT: psubw %xmm2, %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: pextrw $1, %xmm0, %eax +; X86-SSE2-NEXT: movd %xmm0, %ecx +; X86-SSE2-NEXT: cmpw %ax, %cx +; X86-SSE2-NEXT: cmovbl %ecx, %eax ; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE2-NEXT: retl ; @@ -1576,12 +1537,10 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) { ; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 ; 
X64-SSE2-NEXT: psubusw %xmm1, %xmm2 ; X64-SSE2-NEXT: psubw %xmm2, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: psrld $16, %xmm1 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X64-SSE2-NEXT: psubw %xmm2, %xmm0 -; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: pextrw $1, %xmm0, %eax +; X64-SSE2-NEXT: movd %xmm0, %ecx +; X64-SSE2-NEXT: cmpw %ax, %cx +; X64-SSE2-NEXT: cmovbl %ecx, %eax ; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE2-NEXT: retq ; @@ -1649,7 +1608,7 @@ define i16 @test_reduce_v32i16(<32 x i16> %a0) { ret i16 %16 } -define i8 @test_reduce_v64i8(<64 x i8> %a0) { +define i8 @test_reduce_v64i8(<64 x i8> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v64i8: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pminub %xmm3, %xmm1 @@ -1809,7 +1768,7 @@ define i8 @test_reduce_v64i8(<64 x i8> %a0) { ; Partial Vector Reductions ; -define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { +define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v16i16_v8i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -1820,12 +1779,10 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 ; X86-SSE2-NEXT: psubw %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X86-SSE2-NEXT: psubw %xmm2, %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: pextrw $1, %xmm0, %eax +; X86-SSE2-NEXT: movd %xmm0, %ecx +; X86-SSE2-NEXT: cmpw %ax, %cx +; X86-SSE2-NEXT: cmovbl %ecx, %eax ; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE2-NEXT: retl ; @@ -1854,12 +1811,10 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { ; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X64-SSE2-NEXT: psubusw %xmm1, %xmm2 ; X64-SSE2-NEXT: psubw %xmm2, %xmm0 -; X64-SSE2-NEXT: movdqa 
%xmm0, %xmm1 -; X64-SSE2-NEXT: psrld $16, %xmm1 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X64-SSE2-NEXT: psubw %xmm2, %xmm0 -; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: pextrw $1, %xmm0, %eax +; X64-SSE2-NEXT: movd %xmm0, %ecx +; X64-SSE2-NEXT: cmpw %ax, %cx +; X64-SSE2-NEXT: cmovbl %ecx, %eax ; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X64-SSE2-NEXT: retq ; @@ -1890,7 +1845,7 @@ define i16 @test_reduce_v16i16_v8i16(<16 x i16> %a0) { ret i16 %10 } -define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { +define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v32i16_v8i16: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -1901,12 +1856,10 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 ; X86-SSE2-NEXT: psubw %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X86-SSE2-NEXT: psubw %xmm2, %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: pextrw $1, %xmm0, %eax +; X86-SSE2-NEXT: movd %xmm0, %ecx +; X86-SSE2-NEXT: cmpw %ax, %cx +; X86-SSE2-NEXT: cmovbl %ecx, %eax ; X86-SSE2-NEXT: ## kill: def $ax killed $ax killed $eax ; X86-SSE2-NEXT: retl ; @@ -1935,12 +1888,10 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { ; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X64-SSE2-NEXT: psubusw %xmm1, %xmm2 ; X64-SSE2-NEXT: psubw %xmm2, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: psrld $16, %xmm1 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X64-SSE2-NEXT: psubw %xmm2, %xmm0 -; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: pextrw $1, %xmm0, %eax +; X64-SSE2-NEXT: movd %xmm0, %ecx +; X64-SSE2-NEXT: cmpw %ax, %cx +; X64-SSE2-NEXT: cmovbl %ecx, %eax ; X64-SSE2-NEXT: ## kill: def $ax killed $ax killed 
$eax ; X64-SSE2-NEXT: retq ; @@ -1971,7 +1922,7 @@ define i16 @test_reduce_v32i16_v8i16(<32 x i16> %a0) { ret i16 %10 } -define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { +define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v32i8_v16i8: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -2059,7 +2010,7 @@ define i8 @test_reduce_v32i8_v16i8(<32 x i8> %a0) { ret i8 %13 } -define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) { +define i8 @test_reduce_v64i8_v16i8(<64 x i8> %a0) nounwind { ; X86-SSE2-LABEL: test_reduce_v64i8_v16i8: ; X86-SSE2: ## %bb.0: ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] diff --git a/llvm/test/CodeGen/X86/intrinsic-cttz-elts.ll b/llvm/test/CodeGen/X86/intrinsic-cttz-elts.ll index 8159468722596..65231c484db98 100644 --- a/llvm/test/CodeGen/X86/intrinsic-cttz-elts.ll +++ b/llvm/test/CodeGen/X86/intrinsic-cttz-elts.ll @@ -21,11 +21,10 @@ define i8 @ctz_v8i16(<8 x i16> %a) { ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; CHECK-NEXT: psubusw %xmm0, %xmm1 ; CHECK-NEXT: paddw %xmm0, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm0 -; CHECK-NEXT: psrld $16, %xmm0 -; CHECK-NEXT: psubusw %xmm1, %xmm0 -; CHECK-NEXT: paddw %xmm1, %xmm0 -; CHECK-NEXT: movd %xmm0, %ecx +; CHECK-NEXT: pextrw $1, %xmm1, %ecx +; CHECK-NEXT: movd %xmm1, %eax +; CHECK-NEXT: cmpw %cx, %ax +; CHECK-NEXT: cmoval %eax, %ecx ; CHECK-NEXT: movl $8, %eax ; CHECK-NEXT: subl %ecx, %eax ; CHECK-NEXT: # kill: def $al killed $al killed $eax @@ -90,11 +89,10 @@ define i8 @ctz_v8i16_poison(<8 x i16> %a) { ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; CHECK-NEXT: psubusw %xmm0, %xmm1 ; CHECK-NEXT: paddw %xmm0, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm0 -; CHECK-NEXT: psrld $16, %xmm0 -; CHECK-NEXT: psubusw %xmm1, %xmm0 -; CHECK-NEXT: paddw %xmm1, %xmm0 -; CHECK-NEXT: movd %xmm0, %ecx +; CHECK-NEXT: pextrw $1, %xmm1, %ecx +; CHECK-NEXT: movd %xmm1, %eax +; CHECK-NEXT: cmpw %cx, %ax +; CHECK-NEXT: cmoval %eax, %ecx ; 
CHECK-NEXT: movl $8, %eax ; CHECK-NEXT: subl %ecx, %eax ; CHECK-NEXT: # kill: def $al killed $al killed $eax diff --git a/llvm/test/CodeGen/X86/vector-extract-last-active.ll b/llvm/test/CodeGen/X86/vector-extract-last-active.ll index 1ccd1d11fc7aa..19f54edd05ac1 100644 --- a/llvm/test/CodeGen/X86/vector-extract-last-active.ll +++ b/llvm/test/CodeGen/X86/vector-extract-last-active.ll @@ -164,11 +164,10 @@ define i32 @extract_last_active_v8i32(<8 x i32> %a, <8 x i1> %c) { ; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; CHECK-NEXT: psubusw %xmm0, %xmm1 ; CHECK-NEXT: paddw %xmm0, %xmm1 -; CHECK-NEXT: movdqa %xmm1, %xmm0 -; CHECK-NEXT: psrld $16, %xmm0 -; CHECK-NEXT: psubusw %xmm1, %xmm0 -; CHECK-NEXT: paddw %xmm1, %xmm0 -; CHECK-NEXT: movd %xmm0, %ecx +; CHECK-NEXT: pextrw $1, %xmm1, %ecx +; CHECK-NEXT: movd %xmm1, %edx +; CHECK-NEXT: cmpw %cx, %dx +; CHECK-NEXT: cmoval %edx, %ecx ; CHECK-NEXT: andl $7, %ecx ; CHECK-NEXT: orl -40(%rsp,%rcx,4), %eax ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-reduce-smax.ll b/llvm/test/CodeGen/X86/vector-reduce-smax.ll index a302649decee8..a27756f71ca78 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-smax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-smax.ll @@ -19,10 +19,10 @@ define i64 @test_v2i64(<2 x i64> %a0) nounwind { ; X86-SSE2-LABEL: test_v2i64: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm1, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4 @@ -52,10 +52,10 @@ define i64 @test_v2i64(<2 x i64> %a0) nounwind { ; X86-SSE41-LABEL: test_v2i64: ; X86-SSE41: # %bb.0: ; X86-SSE41-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = [2147483648,2147483648] ; 
X86-SSE41-NEXT: movdqa %xmm1, %xmm3 ; X86-SSE41-NEXT: pxor %xmm0, %xmm3 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; X86-SSE41-NEXT: pxor %xmm2, %xmm0 ; X86-SSE41-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm4 @@ -70,25 +70,13 @@ define i64 @test_v2i64(<2 x i64> %a0) nounwind { ; X86-SSE41-NEXT: pextrd $1, %xmm2, %edx ; X86-SSE41-NEXT: retl ; -; X64-SSE41-LABEL: test_v2i64: -; X64-SSE41: # %bb.0: -; X64-SSE41-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = [2147483648,2147483648] -; X64-SSE41-NEXT: movdqa %xmm1, %xmm2 -; X64-SSE41-NEXT: pxor %xmm0, %xmm2 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; X64-SSE41-NEXT: pxor %xmm3, %xmm0 -; X64-SSE41-NEXT: movdqa %xmm2, %xmm4 -; X64-SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; X64-SSE41-NEXT: pmovsxdq %xmm4, %xmm5 -; X64-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm5, %xmm2 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; X64-SSE41-NEXT: por %xmm2, %xmm0 -; X64-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; X64-SSE41-NEXT: movq %xmm3, %rax -; X64-SSE41-NEXT: retq +; X64-SSE4-LABEL: test_v2i64: +; X64-SSE4: # %bb.0: +; X64-SSE4-NEXT: pextrq $1, %xmm0, %rax +; X64-SSE4-NEXT: movq %xmm0, %rcx +; X64-SSE4-NEXT: cmpq %rax, %rcx +; X64-SSE4-NEXT: cmovgq %rcx, %rax +; X64-SSE4-NEXT: retq ; ; X86-SSE42-LABEL: test_v2i64: ; X86-SSE42: # %bb.0: @@ -100,15 +88,6 @@ define i64 @test_v2i64(<2 x i64> %a0) nounwind { ; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx ; X86-SSE42-NEXT: retl ; -; X64-SSE42-LABEL: test_v2i64: -; X64-SSE42: # %bb.0: -; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; X64-SSE42-NEXT: movq %xmm2, %rax -; X64-SSE42-NEXT: retq -; ; X86-AVX-LABEL: test_v2i64: ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -120,10 +99,10 @@ 
define i64 @test_v2i64(<2 x i64> %a0) nounwind { ; ; X64-AVX-LABEL: test_v2i64: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; X64-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX-NEXT: vmovq %xmm0, %rax +; X64-AVX-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX-NEXT: vmovq %xmm0, %rcx +; X64-AVX-NEXT: cmpq %rax, %rcx +; X64-AVX-NEXT: cmovgq %rcx, %rax ; X64-AVX-NEXT: retq ; ; AVX512BW-LABEL: test_v2i64: @@ -164,9 +143,9 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X86-SSE2-NEXT: pand %xmm3, %xmm0 ; X86-SSE2-NEXT: pandn %xmm1, %xmm3 ; X86-SSE2-NEXT: por %xmm0, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm0, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4 @@ -225,11 +204,11 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] ; X86-SSE41-NEXT: por %xmm4, %xmm0 ; X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X86-SSE41-NEXT: movapd %xmm1, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm3, %xmm0 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X86-SSE41-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE41-NEXT: pxor %xmm3, %xmm0 ; X86-SSE41-NEXT: pxor %xmm2, %xmm3 -; X86-SSE41-NEXT: movdqa %xmm0, %xmm4 +; X86-SSE41-NEXT: movapd %xmm0, %xmm4 ; X86-SSE41-NEXT: pcmpgtd %xmm3, %xmm4 ; X86-SSE41-NEXT: pmovsxdq %xmm4, %xmm5 ; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm3 @@ -245,34 +224,23 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X64-SSE41-LABEL: test_v4i64: ; X64-SSE41: # %bb.0: ; X64-SSE41-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] -; X64-SSE41-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE41-NEXT: pxor %xmm3, %xmm0 -; X64-SSE41-NEXT: movdqa %xmm2, %xmm4 -; X64-SSE41-NEXT: pxor %xmm3, %xmm4 -; 
X64-SSE41-NEXT: movdqa %xmm4, %xmm5 -; X64-SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; X64-SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm6, %xmm4 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] -; X64-SSE41-NEXT: por %xmm4, %xmm0 -; X64-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; X64-SSE41-NEXT: movapd %xmm1, %xmm0 -; X64-SSE41-NEXT: xorpd %xmm3, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X64-SSE41-NEXT: pxor %xmm2, %xmm3 -; X64-SSE41-NEXT: movapd %xmm0, %xmm4 +; X64-SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = [2147483648,2147483648] +; X64-SSE41-NEXT: movdqa %xmm1, %xmm3 +; X64-SSE41-NEXT: pxor %xmm0, %xmm3 +; X64-SSE41-NEXT: pxor %xmm2, %xmm0 +; X64-SSE41-NEXT: movdqa %xmm0, %xmm4 ; X64-SSE41-NEXT: pcmpgtd %xmm3, %xmm4 -; X64-SSE41-NEXT: pmovsxdq %xmm4, %xmm5 -; X64-SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; X64-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; X64-SSE41-NEXT: pand %xmm5, %xmm3 ; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; X64-SSE41-NEXT: por %xmm3, %xmm0 -; X64-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; X64-SSE41-NEXT: movq %xmm2, %rax +; X64-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X64-SSE41-NEXT: pextrq $1, %xmm1, %rax +; X64-SSE41-NEXT: movq %xmm1, %rcx +; X64-SSE41-NEXT: cmpq %rax, %rcx +; X64-SSE41-NEXT: cmovgq %rcx, %rax ; X64-SSE41-NEXT: retq ; ; X86-SSE42-LABEL: test_v4i64: @@ -293,11 +261,10 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X64-SSE42-NEXT: movdqa %xmm0, %xmm2 ; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, 
%xmm2 -; X64-SSE42-NEXT: movq %xmm2, %rax +; X64-SSE42-NEXT: pextrq $1, %xmm1, %rax +; X64-SSE42-NEXT: movq %xmm1, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovgq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X86-AVX1-LABEL: test_v4i64: @@ -318,10 +285,10 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovgq %rcx, %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -343,10 +310,10 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovgq %rcx, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; @@ -381,56 +348,56 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm5 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,0,2147483648,0] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm4 ; X86-SSE2-NEXT: pxor %xmm3, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 +; X86-SSE2-NEXT: pxor %xmm3, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm7 +; X86-SSE2-NEXT: pcmpgtd 
%xmm4, %xmm7 +; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm4, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] +; X86-SSE2-NEXT: por %xmm6, %xmm4 +; X86-SSE2-NEXT: pand %xmm4, %xmm1 +; X86-SSE2-NEXT: pandn %xmm5, %xmm4 +; X86-SSE2-NEXT: por %xmm1, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pxor %xmm3, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm5 ; X86-SSE2-NEXT: pxor %xmm3, %xmm5 ; X86-SSE2-NEXT: movdqa %xmm5, %xmm6 -; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm4, %xmm7 -; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: por %xmm7, %xmm4 -; X86-SSE2-NEXT: pand %xmm4, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm4 -; X86-SSE2-NEXT: por %xmm0, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm0 -; X86-SSE2-NEXT: pxor %xmm3, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE2-NEXT: pxor %xmm3, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm6 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm6 -; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm0, %xmm7 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: por %xmm7, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm5, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm6 +; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm5 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm1, %xmm5 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: por %xmm5, 
%xmm1 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm5 +; X86-SSE2-NEXT: pxor %xmm3, %xmm5 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm1 +; X86-SSE2-NEXT: por %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE2-NEXT: pxor %xmm3, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 -; X86-SSE2-NEXT: pxor %xmm3, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm2 +; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm5, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: por %xmm6, %xmm0 ; X86-SSE2-NEXT: pand %xmm0, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] +; X86-SSE2-NEXT: pandn %xmm4, %xmm0 ; X86-SSE2-NEXT: por %xmm1, %xmm0 -; X86-SSE2-NEXT: pand %xmm0, %xmm4 -; X86-SSE2-NEXT: pandn %xmm2, %xmm0 -; X86-SSE2-NEXT: por %xmm4, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: pxor %xmm3, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm1, %xmm3 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 @@ -511,60 +478,60 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X86-SSE41-NEXT: andl $-16, %esp ; X86-SSE41-NEXT: subl $16, %esp ; X86-SSE41-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147483648,2147483648] +; X86-SSE41-NEXT: movdqa 8(%ebp), %xmm4 +; X86-SSE41-NEXT: pmovzxdq {{.*#+}} xmm5 = [2147483648,2147483648] +; X86-SSE41-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE41-NEXT: pxor %xmm5, %xmm0 +; X86-SSE41-NEXT: movdqa %xmm1, %xmm6 +; X86-SSE41-NEXT: pxor %xmm5, %xmm6 +; X86-SSE41-NEXT: 
movdqa %xmm6, %xmm7 +; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm0, %xmm6 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] +; X86-SSE41-NEXT: por %xmm6, %xmm0 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 ; X86-SSE41-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE41-NEXT: pxor %xmm4, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm3, %xmm5 -; X86-SSE41-NEXT: pxor %xmm4, %xmm5 -; X86-SSE41-NEXT: movdqa %xmm5, %xmm6 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] -; X86-SSE41-NEXT: pand %xmm0, %xmm7 -; X86-SSE41-NEXT: movdqa 8(%ebp), %xmm5 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] -; X86-SSE41-NEXT: por %xmm7, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; X86-SSE41-NEXT: movdqa %xmm5, %xmm0 -; X86-SSE41-NEXT: pxor %xmm4, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE41-NEXT: pxor %xmm4, %xmm3 -; X86-SSE41-NEXT: movdqa %xmm3, %xmm6 +; X86-SSE41-NEXT: pxor %xmm5, %xmm0 +; X86-SSE41-NEXT: movdqa %xmm3, %xmm1 +; X86-SSE41-NEXT: pxor %xmm5, %xmm1 +; X86-SSE41-NEXT: movdqa %xmm1, %xmm6 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; X86-SSE41-NEXT: pand %xmm0, %xmm3 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] -; X86-SSE41-NEXT: por %xmm3, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 -; X86-SSE41-NEXT: movapd %xmm5, %xmm0 -; X86-SSE41-NEXT: xorpd %xmm4, %xmm0 -; X86-SSE41-NEXT: movapd %xmm2, %xmm1 -; X86-SSE41-NEXT: xorpd %xmm4, %xmm1 -; X86-SSE41-NEXT: movapd %xmm1, %xmm3 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm3 ; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd 
{{.*#+}} xmm0 = xmm3[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; X86-SSE41-NEXT: pand %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] ; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] -; X86-SSE41-NEXT: movdqa %xmm5, %xmm0 -; X86-SSE41-NEXT: pxor %xmm4, %xmm0 -; X86-SSE41-NEXT: pxor %xmm1, %xmm4 -; X86-SSE41-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE41-NEXT: pcmpgtd %xmm4, %xmm2 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; X86-SSE41-NEXT: movapd %xmm4, %xmm1 +; X86-SSE41-NEXT: xorpd %xmm5, %xmm1 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; X86-SSE41-NEXT: movapd %xmm2, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm5, %xmm0 +; X86-SSE41-NEXT: movapd %xmm0, %xmm3 +; X86-SSE41-NEXT: pcmpgtd %xmm1, %xmm3 +; X86-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm1, %xmm6 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; X86-SSE41-NEXT: por %xmm6, %xmm0 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 +; X86-SSE41-NEXT: movapd %xmm4, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm5, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] +; X86-SSE41-NEXT: pxor %xmm1, %xmm5 +; X86-SSE41-NEXT: movapd %xmm0, %xmm2 +; X86-SSE41-NEXT: pcmpgtd %xmm5, %xmm2 +; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm5 ; X86-SSE41-NEXT: pmovsxdq %xmm2, %xmm0 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] ; X86-SSE41-NEXT: pand %xmm0, %xmm3 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; X86-SSE41-NEXT: por %xmm3, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1 ; X86-SSE41-NEXT: movd %xmm1, %eax ; X86-SSE41-NEXT: pextrd $1, 
%xmm1, %edx ; X86-SSE41-NEXT: movl %ebp, %esp @@ -603,31 +570,20 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; X64-SSE41-NEXT: por %xmm6, %xmm0 ; X64-SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 -; X64-SSE41-NEXT: movapd %xmm2, %xmm0 -; X64-SSE41-NEXT: xorpd %xmm5, %xmm0 -; X64-SSE41-NEXT: movapd %xmm0, %xmm4 -; X64-SSE41-NEXT: pcmpgtd %xmm1, %xmm4 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; X64-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm6, %xmm1 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; X64-SSE41-NEXT: xorpd %xmm2, %xmm5 +; X64-SSE41-NEXT: movapd %xmm5, %xmm0 +; X64-SSE41-NEXT: pcmpgtd %xmm1, %xmm0 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] +; X64-SSE41-NEXT: pcmpeqd %xmm1, %xmm5 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; X64-SSE41-NEXT: pand %xmm4, %xmm1 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; X64-SSE41-NEXT: por %xmm1, %xmm0 ; X64-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; X64-SSE41-NEXT: movapd %xmm3, %xmm0 -; X64-SSE41-NEXT: xorpd %xmm5, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; X64-SSE41-NEXT: pxor %xmm1, %xmm5 -; X64-SSE41-NEXT: movapd %xmm0, %xmm2 -; X64-SSE41-NEXT: pcmpgtd %xmm5, %xmm2 -; X64-SSE41-NEXT: pmovsxdq %xmm2, %xmm4 -; X64-SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm4, %xmm5 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; X64-SSE41-NEXT: por %xmm5, %xmm0 -; X64-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; X64-SSE41-NEXT: movq %xmm1, %rax +; X64-SSE41-NEXT: pextrq $1, %xmm3, %rax +; X64-SSE41-NEXT: movq %xmm3, %rcx +; X64-SSE41-NEXT: cmpq %rax, %rcx +; X64-SSE41-NEXT: cmovgq %rcx, %rax ; X64-SSE41-NEXT: retq ; ; X86-SSE42-LABEL: test_v8i64: @@ -669,11 +625,10 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X64-SSE42-NEXT: movapd 
%xmm2, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; X64-SSE42-NEXT: movdqa %xmm3, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; X64-SSE42-NEXT: movq %xmm1, %rax +; X64-SSE42-NEXT: pextrq $1, %xmm3, %rax +; X64-SSE42-NEXT: movq %xmm3, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovgq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X86-AVX1-LABEL: test_v8i64: @@ -704,10 +659,10 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X64-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 ; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm1 ; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 -; X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovgq %rcx, %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -733,10 +688,10 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovgq %rcx, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; @@ -774,124 +729,128 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $32, %esp -; X86-SSE2-NEXT: movaps %xmm2, (%esp) # 16-byte Spill -; X86-SSE2-NEXT: 
movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: movdqa 40(%ebp), %xmm6 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: movaps %xmm0, (%esp) # 16-byte Spill +; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm5 +; X86-SSE2-NEXT: movdqa 72(%ebp), %xmm1 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,0,2147483648,0] -; X86-SSE2-NEXT: movdqa %xmm6, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 ; X86-SSE2-NEXT: pxor %xmm3, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 -; X86-SSE2-NEXT: pxor %xmm3, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm7 -; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm7 -; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm4, %xmm0 -; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] -; X86-SSE2-NEXT: por %xmm0, %xmm4 -; X86-SSE2-NEXT: movdqa 72(%ebp), %xmm7 -; X86-SSE2-NEXT: pand %xmm4, %xmm1 -; X86-SSE2-NEXT: pandn %xmm6, %xmm4 -; X86-SSE2-NEXT: por %xmm1, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm7, %xmm0 -; X86-SSE2-NEXT: pxor %xmm3, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm1 -; X86-SSE2-NEXT: pxor %xmm3, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm6 -; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2] -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa 24(%ebp), %xmm6 -; X86-SSE2-NEXT: pand %xmm1, %xmm5 -; X86-SSE2-NEXT: pandn %xmm7, %xmm1 -; X86-SSE2-NEXT: por %xmm5, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm6 +; X86-SSE2-NEXT: pxor %xmm3, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm0 +; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm0 +; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 
= xmm0[0,0,2,2] +; X86-SSE2-NEXT: pand %xmm4, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; X86-SSE2-NEXT: por %xmm6, %xmm4 +; X86-SSE2-NEXT: movdqa 40(%ebp), %xmm6 +; X86-SSE2-NEXT: pand %xmm4, %xmm5 +; X86-SSE2-NEXT: pandn %xmm1, %xmm4 +; X86-SSE2-NEXT: por %xmm5, %xmm4 ; X86-SSE2-NEXT: movdqa %xmm6, %xmm0 ; X86-SSE2-NEXT: pxor %xmm3, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 ; X86-SSE2-NEXT: pxor %xmm3, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm7 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm1 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 ; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm5 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] ; X86-SSE2-NEXT: pand %xmm5, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] ; X86-SSE2-NEXT: por %xmm0, %xmm5 -; X86-SSE2-NEXT: movdqa 56(%ebp), %xmm0 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE2-NEXT: pxor %xmm3, %xmm0 ; X86-SSE2-NEXT: pand %xmm5, %xmm2 ; X86-SSE2-NEXT: pandn %xmm6, %xmm5 ; X86-SSE2-NEXT: por %xmm2, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm1 +; X86-SSE2-NEXT: pxor %xmm3, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm0, %xmm6 +; X86-SSE2-NEXT: movdqa 56(%ebp), %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: por %xmm6, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm5 +; X86-SSE2-NEXT: pandn %xmm4, %xmm1 +; X86-SSE2-NEXT: por %xmm5, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm5 +; X86-SSE2-NEXT: pxor %xmm3, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm7, %xmm4 +; X86-SSE2-NEXT: pxor %xmm3, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm6 +; 
X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm6 +; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa 24(%ebp), %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: por %xmm0, %xmm4 +; X86-SSE2-NEXT: pand %xmm4, %xmm7 +; X86-SSE2-NEXT: pandn %xmm5, %xmm4 +; X86-SSE2-NEXT: por %xmm7, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 ; X86-SSE2-NEXT: pxor %xmm3, %xmm0 -; X86-SSE2-NEXT: movdqa (%esp), %xmm7 # 16-byte Reload -; X86-SSE2-NEXT: movdqa %xmm7, %xmm2 +; X86-SSE2-NEXT: movdqa (%esp), %xmm5 # 16-byte Reload +; X86-SSE2-NEXT: movdqa %xmm5, %xmm2 ; X86-SSE2-NEXT: pxor %xmm3, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm6 ; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm6 ; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2] -; X86-SSE2-NEXT: pand %xmm2, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: por %xmm0, %xmm6 -; X86-SSE2-NEXT: pand %xmm6, %xmm7 -; X86-SSE2-NEXT: movdqa 56(%ebp), %xmm2 -; X86-SSE2-NEXT: pandn %xmm2, %xmm6 -; X86-SSE2-NEXT: por %xmm7, %xmm6 -; X86-SSE2-NEXT: movdqa %xmm6, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm0, %xmm7 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: por %xmm7, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm0 ; X86-SSE2-NEXT: pxor %xmm3, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm6 +; X86-SSE2-NEXT: pand %xmm2, %xmm6 +; X86-SSE2-NEXT: movdqa 24(%ebp), %xmm5 +; X86-SSE2-NEXT: pandn %xmm5, %xmm2 +; X86-SSE2-NEXT: por %xmm6, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE2-NEXT: pxor %xmm3, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm6 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm6 +; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm5 +; X86-SSE2-NEXT: 
pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm0, %xmm7 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: por %xmm7, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: pxor %xmm3, %xmm0 +; X86-SSE2-NEXT: pand %xmm5, %xmm2 +; X86-SSE2-NEXT: pandn %xmm4, %xmm5 +; X86-SSE2-NEXT: por %xmm2, %xmm5 ; X86-SSE2-NEXT: movdqa %xmm5, %xmm2 ; X86-SSE2-NEXT: pxor %xmm3, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm7 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 ; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; X86-SSE2-NEXT: pand %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; X86-SSE2-NEXT: por %xmm2, %xmm0 ; X86-SSE2-NEXT: pand %xmm0, %xmm5 -; X86-SSE2-NEXT: pandn %xmm6, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm0 ; X86-SSE2-NEXT: por %xmm5, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE2-NEXT: pxor %xmm3, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm4, %xmm5 -; X86-SSE2-NEXT: pxor %xmm3, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm6 -; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm6 -; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm2, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: por %xmm5, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm4 -; X86-SSE2-NEXT: pandn %xmm1, %xmm2 -; X86-SSE2-NEXT: por %xmm4, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE2-NEXT: pxor %xmm3, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 -; X86-SSE2-NEXT: pxor %xmm3, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm4, %xmm5 -; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm5 -; 
X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm4 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm1, %xmm4 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] -; X86-SSE2-NEXT: por %xmm4, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: pxor %xmm3, %xmm2 -; X86-SSE2-NEXT: pxor %xmm0, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE2-NEXT: pxor %xmm1, %xmm3 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 ; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm3 @@ -900,9 +859,9 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE2-NEXT: pand %xmm2, %xmm3 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] ; X86-SSE2-NEXT: por %xmm3, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 ; X86-SSE2-NEXT: movd %xmm2, %eax ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; X86-SSE2-NEXT: movd %xmm0, %edx @@ -1029,32 +988,31 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE41-NEXT: pushl %ebp ; X86-SSE41-NEXT: movl %esp, %ebp ; X86-SSE41-NEXT: andl $-16, %esp -; X86-SSE41-NEXT: subl $48, %esp -; X86-SSE41-NEXT: movaps %xmm2, (%esp) # 16-byte Spill -; X86-SSE41-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE41-NEXT: movdqa 40(%ebp), %xmm2 -; X86-SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147483648,2147483648] -; X86-SSE41-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm2, %xmm6 -; X86-SSE41-NEXT: pxor %xmm4, %xmm0 +; X86-SSE41-NEXT: subl $32, %esp +; X86-SSE41-NEXT: movdqa %xmm2, %xmm3 ; X86-SSE41-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE41-NEXT: pxor %xmm4, %xmm2 -; 
X86-SSE41-NEXT: movdqa %xmm2, %xmm5 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] -; X86-SSE41-NEXT: pand %xmm0, %xmm7 -; X86-SSE41-NEXT: movdqa 8(%ebp), %xmm2 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] -; X86-SSE41-NEXT: por %xmm7, %xmm0 -; X86-SSE41-NEXT: movdqa 72(%ebp), %xmm5 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6 -; X86-SSE41-NEXT: movapd %xmm6, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X86-SSE41-NEXT: movdqa %xmm5, %xmm0 -; X86-SSE41-NEXT: pxor %xmm4, %xmm0 +; X86-SSE41-NEXT: movaps %xmm0, (%esp) # 16-byte Spill +; X86-SSE41-NEXT: movdqa 8(%ebp), %xmm7 +; X86-SSE41-NEXT: movdqa 72(%ebp), %xmm4 +; X86-SSE41-NEXT: pmovzxdq {{.*#+}} xmm5 = [2147483648,2147483648] +; X86-SSE41-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE41-NEXT: pxor %xmm5, %xmm0 +; X86-SSE41-NEXT: movdqa %xmm7, %xmm6 +; X86-SSE41-NEXT: pxor %xmm5, %xmm6 +; X86-SSE41-NEXT: movdqa %xmm6, %xmm1 +; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; X86-SSE41-NEXT: pand %xmm0, %xmm6 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X86-SSE41-NEXT: por %xmm6, %xmm0 +; X86-SSE41-NEXT: movdqa 40(%ebp), %xmm6 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm4 +; X86-SSE41-NEXT: movdqa %xmm6, %xmm0 +; X86-SSE41-NEXT: pxor %xmm5, %xmm0 ; X86-SSE41-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE41-NEXT: pxor %xmm4, %xmm1 +; X86-SSE41-NEXT: pxor %xmm5, %xmm1 ; X86-SSE41-NEXT: movdqa %xmm1, %xmm7 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm7 ; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm1 @@ -1063,54 +1021,40 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE41-NEXT: pand %xmm0, %xmm1 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: movdqa 24(%ebp), %xmm7 -; 
X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 -; X86-SSE41-NEXT: movdqa %xmm7, %xmm0 -; X86-SSE41-NEXT: pxor %xmm4, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm3, %xmm1 -; X86-SSE41-NEXT: pxor %xmm4, %xmm1 -; X86-SSE41-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; X86-SSE41-NEXT: pand %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; X86-SSE41-NEXT: por %xmm1, %xmm0 +; X86-SSE41-NEXT: movapd %xmm4, %xmm1 +; X86-SSE41-NEXT: xorpd %xmm5, %xmm1 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm6 +; X86-SSE41-NEXT: movapd %xmm6, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm5, %xmm0 +; X86-SSE41-NEXT: movapd %xmm0, %xmm2 +; X86-SSE41-NEXT: pcmpgtd %xmm1, %xmm2 +; X86-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm1, %xmm7 ; X86-SSE41-NEXT: movdqa 56(%ebp), %xmm1 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; X86-SSE41-NEXT: por %xmm7, %xmm0 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm4 ; X86-SSE41-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE41-NEXT: pxor %xmm4, %xmm0 -; X86-SSE41-NEXT: movdqa (%esp), %xmm6 # 16-byte Reload -; X86-SSE41-NEXT: movdqa %xmm6, %xmm2 -; X86-SSE41-NEXT: pxor %xmm4, %xmm2 -; X86-SSE41-NEXT: movdqa %xmm2, %xmm3 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; X86-SSE41-NEXT: pand %xmm0, %xmm2 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; X86-SSE41-NEXT: por %xmm2, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1 -; X86-SSE41-NEXT: movapd %xmm1, %xmm0 -; X86-SSE41-NEXT: xorpd %xmm4, %xmm0 -; X86-SSE41-NEXT: movapd %xmm7, %xmm2 -; 
X86-SSE41-NEXT: xorpd %xmm4, %xmm2 -; X86-SSE41-NEXT: movapd %xmm2, %xmm3 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; X86-SSE41-NEXT: pxor %xmm5, %xmm0 +; X86-SSE41-NEXT: movdqa %xmm3, %xmm2 +; X86-SSE41-NEXT: pxor %xmm5, %xmm2 +; X86-SSE41-NEXT: movdqa %xmm2, %xmm7 +; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm7 ; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; X86-SSE41-NEXT: pand %xmm0, %xmm2 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; X86-SSE41-NEXT: movdqa 24(%ebp), %xmm6 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; X86-SSE41-NEXT: por %xmm2, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 -; X86-SSE41-NEXT: movapd %xmm5, %xmm0 -; X86-SSE41-NEXT: xorpd %xmm4, %xmm0 -; X86-SSE41-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm6 # 16-byte Reload -; X86-SSE41-NEXT: movdqa %xmm6, %xmm2 -; X86-SSE41-NEXT: pxor %xmm4, %xmm2 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 +; X86-SSE41-NEXT: movdqa %xmm6, %xmm0 +; X86-SSE41-NEXT: pxor %xmm5, %xmm0 +; X86-SSE41-NEXT: movdqa (%esp), %xmm7 # 16-byte Reload +; X86-SSE41-NEXT: movdqa %xmm7, %xmm2 +; X86-SSE41-NEXT: pxor %xmm5, %xmm2 ; X86-SSE41-NEXT: movdqa %xmm2, %xmm3 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm3 ; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm2 @@ -1119,33 +1063,46 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE41-NEXT: pand %xmm0, %xmm2 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] ; X86-SSE41-NEXT: por %xmm2, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm5 -; X86-SSE41-NEXT: movapd %xmm5, %xmm0 -; X86-SSE41-NEXT: xorpd %xmm4, %xmm0 ; X86-SSE41-NEXT: movapd %xmm1, %xmm2 -; X86-SSE41-NEXT: xorpd %xmm4, %xmm2 -; X86-SSE41-NEXT: movapd %xmm2, %xmm3 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} 
xmm2 = xmm2[1,1,3,3] -; X86-SSE41-NEXT: pand %xmm0, %xmm2 +; X86-SSE41-NEXT: xorpd %xmm5, %xmm2 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm6 +; X86-SSE41-NEXT: movapd %xmm6, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm5, %xmm0 +; X86-SSE41-NEXT: movapd %xmm0, %xmm3 +; X86-SSE41-NEXT: pcmpgtd %xmm2, %xmm3 +; X86-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm2, %xmm7 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; X86-SSE41-NEXT: por %xmm2, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] -; X86-SSE41-NEXT: movdqa %xmm5, %xmm0 -; X86-SSE41-NEXT: pxor %xmm4, %xmm0 -; X86-SSE41-NEXT: pxor %xmm1, %xmm4 -; X86-SSE41-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE41-NEXT: pcmpgtd %xmm4, %xmm2 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; X86-SSE41-NEXT: por %xmm7, %xmm0 +; X86-SSE41-NEXT: movapd %xmm4, %xmm2 +; X86-SSE41-NEXT: xorpd %xmm5, %xmm2 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1 +; X86-SSE41-NEXT: movapd %xmm1, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm5, %xmm0 +; X86-SSE41-NEXT: movapd %xmm0, %xmm3 +; X86-SSE41-NEXT: pcmpgtd %xmm2, %xmm3 +; X86-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm2, %xmm6 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; X86-SSE41-NEXT: por %xmm6, %xmm0 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 +; X86-SSE41-NEXT: movapd %xmm4, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm5, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] +; X86-SSE41-NEXT: pxor %xmm1, %xmm5 +; X86-SSE41-NEXT: movapd %xmm0, %xmm2 +; X86-SSE41-NEXT: pcmpgtd %xmm5, %xmm2 +; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm5 ; X86-SSE41-NEXT: pmovsxdq %xmm2, %xmm0 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = 
xmm5[1,1,3,3] ; X86-SSE41-NEXT: pand %xmm0, %xmm3 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; X86-SSE41-NEXT: por %xmm3, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1 ; X86-SSE41-NEXT: movd %xmm1, %eax ; X86-SSE41-NEXT: pextrd $1, %xmm1, %edx ; X86-SSE41-NEXT: movl %ebp, %esp @@ -1236,31 +1193,20 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] ; X64-SSE41-NEXT: por %xmm2, %xmm0 ; X64-SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6 -; X64-SSE41-NEXT: movapd %xmm6, %xmm0 -; X64-SSE41-NEXT: xorpd %xmm9, %xmm0 -; X64-SSE41-NEXT: movapd %xmm0, %xmm2 -; X64-SSE41-NEXT: pcmpgtd %xmm1, %xmm2 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; X64-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm3, %xmm1 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; X64-SSE41-NEXT: xorpd %xmm6, %xmm9 +; X64-SSE41-NEXT: movapd %xmm9, %xmm0 +; X64-SSE41-NEXT: pcmpgtd %xmm1, %xmm0 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] +; X64-SSE41-NEXT: pcmpeqd %xmm1, %xmm9 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,3,3] +; X64-SSE41-NEXT: pand %xmm2, %xmm1 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; X64-SSE41-NEXT: por %xmm1, %xmm0 ; X64-SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 -; X64-SSE41-NEXT: movapd %xmm7, %xmm0 -; X64-SSE41-NEXT: xorpd %xmm9, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] -; X64-SSE41-NEXT: pxor %xmm1, %xmm9 -; X64-SSE41-NEXT: movapd %xmm0, %xmm2 -; X64-SSE41-NEXT: pcmpgtd %xmm9, %xmm2 -; X64-SSE41-NEXT: pmovsxdq %xmm2, %xmm3 -; X64-SSE41-NEXT: pcmpeqd %xmm0, %xmm9 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm3, %xmm4 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; X64-SSE41-NEXT: por %xmm4, %xmm0 -; X64-SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 -; X64-SSE41-NEXT: movq %xmm1, 
%rax +; X64-SSE41-NEXT: pextrq $1, %xmm7, %rax +; X64-SSE41-NEXT: movq %xmm7, %rcx +; X64-SSE41-NEXT: cmpq %rax, %rcx +; X64-SSE41-NEXT: cmovgq %rcx, %rax ; X64-SSE41-NEXT: retq ; ; X86-SSE42-LABEL: test_v16i64: @@ -1271,31 +1217,31 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE42-NEXT: subl $16, %esp ; X86-SSE42-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE42-NEXT: movdqa 72(%ebp), %xmm4 -; X86-SSE42-NEXT: movdqa 56(%ebp), %xmm5 -; X86-SSE42-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE42-NEXT: movdqa 8(%ebp), %xmm5 +; X86-SSE42-NEXT: movdqa %xmm5, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm4 +; X86-SSE42-NEXT: movdqa 40(%ebp), %xmm5 +; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm5, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm5 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; X86-SSE42-NEXT: movapd %xmm5, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm4 +; X86-SSE42-NEXT: movdqa 56(%ebp), %xmm1 +; X86-SSE42-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 ; X86-SSE42-NEXT: movdqa 24(%ebp), %xmm2 ; X86-SSE42-NEXT: movdqa %xmm3, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; X86-SSE42-NEXT: movdqa 8(%ebp), %xmm3 -; X86-SSE42-NEXT: movdqa %xmm3, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm4 -; X86-SSE42-NEXT: movdqa 40(%ebp), %xmm3 -; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; X86-SSE42-NEXT: movapd %xmm3, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm4 ; X86-SSE42-NEXT: movapd %xmm2, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm5, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm5 -; X86-SSE42-NEXT: movapd %xmm5, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 +; 
X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X86-SSE42-NEXT: movapd %xmm1, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm4 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm4 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] ; X86-SSE42-NEXT: movdqa %xmm4, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 @@ -1330,11 +1276,10 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X64-SSE42-NEXT: movapd %xmm6, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm7, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] -; X64-SSE42-NEXT: movdqa %xmm7, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm1 -; X64-SSE42-NEXT: movq %xmm1, %rax +; X64-SSE42-NEXT: pextrq $1, %xmm7, %rax +; X64-SSE42-NEXT: movq %xmm7, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovgq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X86-AVX1-LABEL: test_v16i64: @@ -1343,25 +1288,25 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-AVX1-NEXT: movl %esp, %ebp ; X86-AVX1-NEXT: andl $-32, %esp ; X86-AVX1-NEXT: subl $32, %esp -; X86-AVX1-NEXT: vmovdqa 8(%ebp), %xmm3 -; X86-AVX1-NEXT: vmovdqa 24(%ebp), %xmm4 -; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm5 -; X86-AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm3 -; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm5 -; X86-AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm2, %xmm5 -; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm6 -; X86-AVX1-NEXT: vblendvpd %xmm6, %xmm1, %xmm4, %xmm1 -; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; X86-AVX1-NEXT: vmovdqa 8(%ebp), %xmm4 +; X86-AVX1-NEXT: vmovdqa 24(%ebp), %xmm5 +; X86-AVX1-NEXT: vpcmpgtq %xmm5, %xmm3, %xmm6 +; X86-AVX1-NEXT: vblendvpd %xmm6, %xmm3, %xmm5, %xmm3 +; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; X86-AVX1-NEXT: vextractf128 $1, 
%ymm0, %xmm6 +; X86-AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm7 +; X86-AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm5, %xmm5 +; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm6 +; X86-AVX1-NEXT: vblendvpd %xmm6, %xmm5, %xmm3, %xmm3 +; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5 +; X86-AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1 ; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm4 ; X86-AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm2, %xmm0 ; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm1 -; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm5, %xmm3, %xmm1 -; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm1 +; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 @@ -1392,10 +1337,10 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm1 ; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm4, %xmm0 -; X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovgq %rcx, %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -1436,10 +1381,10 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2 -; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; 
X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovgq %rcx, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; @@ -2798,21 +2743,18 @@ define i8 @test_v16i8(<16 x i8> %a0) nounwind { ; X86-SSE2-NEXT: pand %xmm0, %xmm2 ; X86-SSE2-NEXT: pandn %xmm1, %xmm0 ; X86-SSE2-NEXT: por %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movl %ecx, %eax +; X86-SSE2-NEXT: shrl $8, %eax +; X86-SSE2-NEXT: cmpb %al, %cl +; X86-SSE2-NEXT: cmovgl %ecx, %eax ; X86-SSE2-NEXT: # kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; @@ -2837,14 +2779,11 @@ define i8 @test_v16i8(<16 x i8> %a0) nounwind { ; X64-SSE2-NEXT: pand %xmm2, %xmm1 ; X64-SSE2-NEXT: pandn %xmm0, %xmm2 ; X64-SSE2-NEXT: por %xmm1, %xmm2 -; X64-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE2-NEXT: psrlw $8, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X64-SSE2-NEXT: pand %xmm1, %xmm2 -; X64-SSE2-NEXT: pandn %xmm0, %xmm1 -; X64-SSE2-NEXT: por %xmm2, %xmm1 -; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: movd %xmm2, %ecx +; X64-SSE2-NEXT: movl %ecx, %eax +; X64-SSE2-NEXT: shrl $8, %eax 
+; X64-SSE2-NEXT: cmpb %al, %cl +; X64-SSE2-NEXT: cmovgl %ecx, %eax ; X64-SSE2-NEXT: # kill: def $al killed $al killed $eax ; X64-SSE2-NEXT: retq ; @@ -2939,21 +2878,18 @@ define i8 @test_v32i8(<32 x i8> %a0) nounwind { ; X86-SSE2-NEXT: pand %xmm0, %xmm1 ; X86-SSE2-NEXT: pandn %xmm2, %xmm0 ; X86-SSE2-NEXT: por %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movl %ecx, %eax +; X86-SSE2-NEXT: shrl $8, %eax +; X86-SSE2-NEXT: cmpb %al, %cl +; X86-SSE2-NEXT: cmovgl %ecx, %eax ; X86-SSE2-NEXT: # kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; @@ -2983,14 +2919,11 @@ define i8 @test_v32i8(<32 x i8> %a0) nounwind { ; X64-SSE2-NEXT: pand %xmm1, %xmm2 ; X64-SSE2-NEXT: pandn %xmm0, %xmm1 ; X64-SSE2-NEXT: por %xmm2, %xmm1 -; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE2-NEXT: psrlw $8, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 -; X64-SSE2-NEXT: pand %xmm2, %xmm1 -; X64-SSE2-NEXT: pandn %xmm0, %xmm2 -; X64-SSE2-NEXT: por %xmm1, %xmm2 -; X64-SSE2-NEXT: movd %xmm2, %eax +; X64-SSE2-NEXT: movd %xmm1, %ecx +; X64-SSE2-NEXT: movl %ecx, %eax +; X64-SSE2-NEXT: shrl $8, %eax +; X64-SSE2-NEXT: cmpb %al, %cl +; X64-SSE2-NEXT: cmovgl %ecx, %eax ; X64-SSE2-NEXT: # 
kill: def $al killed $al killed $eax ; X64-SSE2-NEXT: retq ; @@ -3142,21 +3075,18 @@ define i8 @test_v64i8(<64 x i8> %a0) nounwind { ; X86-SSE2-NEXT: pand %xmm0, %xmm2 ; X86-SSE2-NEXT: pandn %xmm1, %xmm0 ; X86-SSE2-NEXT: por %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movl %ecx, %eax +; X86-SSE2-NEXT: shrl $8, %eax +; X86-SSE2-NEXT: cmpb %al, %cl +; X86-SSE2-NEXT: cmovgl %ecx, %eax ; X86-SSE2-NEXT: # kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp @@ -3198,14 +3128,11 @@ define i8 @test_v64i8(<64 x i8> %a0) nounwind { ; X64-SSE2-NEXT: pand %xmm2, %xmm1 ; X64-SSE2-NEXT: pandn %xmm0, %xmm2 ; X64-SSE2-NEXT: por %xmm1, %xmm2 -; X64-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE2-NEXT: psrlw $8, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X64-SSE2-NEXT: pand %xmm1, %xmm2 -; X64-SSE2-NEXT: pandn %xmm0, %xmm1 -; X64-SSE2-NEXT: por %xmm2, %xmm1 -; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: movd %xmm2, %ecx +; X64-SSE2-NEXT: movl %ecx, %eax +; X64-SSE2-NEXT: shrl $8, %eax +; X64-SSE2-NEXT: cmpb %al, %cl +; X64-SSE2-NEXT: cmovgl %ecx, %eax ; X64-SSE2-NEXT: # kill: def $al killed $al killed $eax ; 
X64-SSE2-NEXT: retq ; @@ -3403,21 +3330,18 @@ define i8 @test_v128i8(<128 x i8> %a0) nounwind { ; X86-SSE2-NEXT: pand %xmm0, %xmm1 ; X86-SSE2-NEXT: pandn %xmm2, %xmm0 ; X86-SSE2-NEXT: por %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movl %ecx, %eax +; X86-SSE2-NEXT: shrl $8, %eax +; X86-SSE2-NEXT: cmpb %al, %cl +; X86-SSE2-NEXT: cmovgl %ecx, %eax ; X86-SSE2-NEXT: # kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp @@ -3479,14 +3403,11 @@ define i8 @test_v128i8(<128 x i8> %a0) nounwind { ; X64-SSE2-NEXT: pand %xmm1, %xmm2 ; X64-SSE2-NEXT: pandn %xmm0, %xmm1 ; X64-SSE2-NEXT: por %xmm2, %xmm1 -; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE2-NEXT: psrlw $8, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X64-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 -; X64-SSE2-NEXT: pand %xmm2, %xmm1 -; X64-SSE2-NEXT: pandn %xmm0, %xmm2 -; X64-SSE2-NEXT: por %xmm1, %xmm2 -; X64-SSE2-NEXT: movd %xmm2, %eax +; X64-SSE2-NEXT: movd %xmm1, %ecx +; X64-SSE2-NEXT: movl %ecx, %eax +; X64-SSE2-NEXT: shrl $8, %eax +; X64-SSE2-NEXT: cmpb %al, %cl +; X64-SSE2-NEXT: cmovgl %ecx, %eax ; X64-SSE2-NEXT: # kill: def $al killed $al killed $eax ; X64-SSE2-NEXT: retq ; diff --git 
a/llvm/test/CodeGen/X86/vector-reduce-smin.ll b/llvm/test/CodeGen/X86/vector-reduce-smin.ll index c010290c0d60d..2d2397d0f2454 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-smin.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-smin.ll @@ -19,10 +19,10 @@ define i64 @test_v2i64(<2 x i64> %a0) nounwind { ; X86-SSE2-LABEL: test_v2i64: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm1, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 @@ -52,10 +52,10 @@ define i64 @test_v2i64(<2 x i64> %a0) nounwind { ; X86-SSE41-LABEL: test_v2i64: ; X86-SSE41: # %bb.0: ; X86-SSE41-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = [2147483648,2147483648] ; X86-SSE41-NEXT: movdqa %xmm1, %xmm3 ; X86-SSE41-NEXT: pxor %xmm0, %xmm3 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; X86-SSE41-NEXT: pxor %xmm2, %xmm0 ; X86-SSE41-NEXT: movdqa %xmm0, %xmm4 ; X86-SSE41-NEXT: pcmpgtd %xmm3, %xmm4 @@ -70,25 +70,13 @@ define i64 @test_v2i64(<2 x i64> %a0) nounwind { ; X86-SSE41-NEXT: pextrd $1, %xmm2, %edx ; X86-SSE41-NEXT: retl ; -; X64-SSE41-LABEL: test_v2i64: -; X64-SSE41: # %bb.0: -; X64-SSE41-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = [2147483648,2147483648] -; X64-SSE41-NEXT: movdqa %xmm1, %xmm2 -; X64-SSE41-NEXT: pxor %xmm0, %xmm2 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; X64-SSE41-NEXT: pxor %xmm3, %xmm0 -; X64-SSE41-NEXT: movdqa %xmm0, %xmm4 -; X64-SSE41-NEXT: pcmpgtd %xmm2, %xmm4 -; X64-SSE41-NEXT: pmovsxdq %xmm4, %xmm5 -; X64-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm5, %xmm2 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 
= xmm4[1,1,3,3] -; X64-SSE41-NEXT: por %xmm2, %xmm0 -; X64-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; X64-SSE41-NEXT: movq %xmm3, %rax -; X64-SSE41-NEXT: retq +; X64-SSE4-LABEL: test_v2i64: +; X64-SSE4: # %bb.0: +; X64-SSE4-NEXT: pextrq $1, %xmm0, %rax +; X64-SSE4-NEXT: movq %xmm0, %rcx +; X64-SSE4-NEXT: cmpq %rax, %rcx +; X64-SSE4-NEXT: cmovlq %rcx, %rax +; X64-SSE4-NEXT: retq ; ; X86-SSE42-LABEL: test_v2i64: ; X86-SSE42: # %bb.0: @@ -101,16 +89,6 @@ define i64 @test_v2i64(<2 x i64> %a0) nounwind { ; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx ; X86-SSE42-NEXT: retl ; -; X64-SSE42-LABEL: test_v2i64: -; X64-SSE42: # %bb.0: -; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X64-SSE42-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; X64-SSE42-NEXT: movq %xmm2, %rax -; X64-SSE42-NEXT: retq -; ; X86-AVX-LABEL: test_v2i64: ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] @@ -122,10 +100,10 @@ define i64 @test_v2i64(<2 x i64> %a0) nounwind { ; ; X64-AVX-LABEL: test_v2i64: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X64-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX-NEXT: vmovq %xmm0, %rax +; X64-AVX-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX-NEXT: vmovq %xmm0, %rcx +; X64-AVX-NEXT: cmpq %rax, %rcx +; X64-AVX-NEXT: cmovlq %rcx, %rax ; X64-AVX-NEXT: retq ; ; AVX512BW-LABEL: test_v2i64: @@ -166,9 +144,9 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X86-SSE2-NEXT: pand %xmm3, %xmm0 ; X86-SSE2-NEXT: pandn %xmm1, %xmm3 ; X86-SSE2-NEXT: por %xmm0, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm0, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm4 @@ -226,9 
+204,9 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] ; X86-SSE41-NEXT: por %xmm4, %xmm0 ; X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X86-SSE41-NEXT: movapd %xmm1, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm3, %xmm0 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X86-SSE41-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE41-NEXT: pxor %xmm3, %xmm0 ; X86-SSE41-NEXT: pxor %xmm2, %xmm3 ; X86-SSE41-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm4 @@ -246,33 +224,23 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X64-SSE41-LABEL: test_v4i64: ; X64-SSE41: # %bb.0: ; X64-SSE41-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = [2147483648,2147483648] -; X64-SSE41-NEXT: pxor %xmm3, %xmm0 -; X64-SSE41-NEXT: movdqa %xmm1, %xmm4 -; X64-SSE41-NEXT: pxor %xmm3, %xmm4 -; X64-SSE41-NEXT: movdqa %xmm4, %xmm5 -; X64-SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; X64-SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm6, %xmm4 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] -; X64-SSE41-NEXT: por %xmm4, %xmm0 -; X64-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; X64-SSE41-NEXT: movapd %xmm1, %xmm0 -; X64-SSE41-NEXT: xorpd %xmm3, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X64-SSE41-NEXT: pxor %xmm2, %xmm3 -; X64-SSE41-NEXT: movdqa %xmm3, %xmm4 -; X64-SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; X64-SSE41-NEXT: pmovsxdq %xmm4, %xmm5 -; X64-SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; X64-SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = [2147483648,2147483648] +; X64-SSE41-NEXT: movdqa %xmm2, %xmm3 +; X64-SSE41-NEXT: pxor %xmm0, %xmm3 +; X64-SSE41-NEXT: pxor %xmm1, %xmm0 +; X64-SSE41-NEXT: movdqa %xmm0, %xmm4 +; X64-SSE41-NEXT: pcmpgtd %xmm3, %xmm4 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; X64-SSE41-NEXT: pcmpeqd 
%xmm3, %xmm0 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; X64-SSE41-NEXT: pand %xmm5, %xmm3 ; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; X64-SSE41-NEXT: por %xmm3, %xmm0 -; X64-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; X64-SSE41-NEXT: movq %xmm2, %rax +; X64-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X64-SSE41-NEXT: pextrq $1, %xmm1, %rax +; X64-SSE41-NEXT: movq %xmm1, %rcx +; X64-SSE41-NEXT: cmpq %rax, %rcx +; X64-SSE41-NEXT: cmovlq %rcx, %rax ; X64-SSE41-NEXT: retq ; ; X86-SSE42-LABEL: test_v4i64: @@ -295,11 +263,10 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X64-SSE42-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; X64-SSE42-NEXT: movq %xmm2, %rax +; X64-SSE42-NEXT: pextrq $1, %xmm1, %rax +; X64-SSE42-NEXT: movq %xmm1, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovlq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X86-AVX1-LABEL: test_v4i64: @@ -320,10 +287,10 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovlq %rcx, %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -345,10 +312,10 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: 
vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovlq %rcx, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; @@ -383,56 +350,56 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp -; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm5 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,0,2147483648,0] -; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 ; X86-SSE2-NEXT: pxor %xmm3, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm6 -; X86-SSE2-NEXT: pxor %xmm3, %xmm6 -; X86-SSE2-NEXT: movdqa %xmm6, %xmm7 -; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm7 -; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm6 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm4, %xmm6 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] -; X86-SSE2-NEXT: por %xmm6, %xmm4 -; X86-SSE2-NEXT: pand %xmm4, %xmm1 -; X86-SSE2-NEXT: pandn %xmm5, %xmm4 -; X86-SSE2-NEXT: por %xmm1, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pxor %xmm3, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 ; X86-SSE2-NEXT: pxor %xmm3, %xmm5 ; X86-SSE2-NEXT: movdqa %xmm5, %xmm6 -; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm6 -; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm1, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: por %xmm5, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm5 +; X86-SSE2-NEXT: 
pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm4, %xmm7 +; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm5 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: por %xmm7, %xmm4 +; X86-SSE2-NEXT: pand %xmm4, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm4 +; X86-SSE2-NEXT: por %xmm0, %xmm4 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE2-NEXT: pxor %xmm3, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm4, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm2 ; X86-SSE2-NEXT: pxor %xmm3, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm6 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm6 ; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm0, %xmm7 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: por %xmm7, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE2-NEXT: pxor %xmm3, %xmm0 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm5, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pxor %xmm3, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm5 +; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] -; X86-SSE2-NEXT: por %xmm2, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; X86-SSE2-NEXT: pand %xmm0, %xmm1 -; X86-SSE2-NEXT: pandn %xmm4, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] ; X86-SSE2-NEXT: por %xmm1, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE2-NEXT: pand %xmm0, %xmm4 +; X86-SSE2-NEXT: pandn %xmm2, %xmm0 +; X86-SSE2-NEXT: por %xmm4, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 
; X86-SSE2-NEXT: pxor %xmm3, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm1, %xmm3 ; X86-SSE2-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4 @@ -513,60 +480,59 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X86-SSE41-NEXT: andl $-16, %esp ; X86-SSE41-NEXT: subl $16, %esp ; X86-SSE41-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE41-NEXT: movdqa 8(%ebp), %xmm4 -; X86-SSE41-NEXT: pmovzxdq {{.*#+}} xmm5 = [2147483648,2147483648] -; X86-SSE41-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE41-NEXT: pxor %xmm5, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm4, %xmm6 -; X86-SSE41-NEXT: pxor %xmm5, %xmm6 -; X86-SSE41-NEXT: movdqa %xmm6, %xmm7 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm7 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm6 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; X86-SSE41-NEXT: pand %xmm0, %xmm6 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] -; X86-SSE41-NEXT: por %xmm6, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 -; X86-SSE41-NEXT: movdqa %xmm3, %xmm0 -; X86-SSE41-NEXT: pxor %xmm5, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE41-NEXT: pxor %xmm5, %xmm1 -; X86-SSE41-NEXT: movdqa %xmm1, %xmm6 +; X86-SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147483648,2147483648] +; X86-SSE41-NEXT: pxor %xmm4, %xmm0 +; X86-SSE41-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE41-NEXT: pxor %xmm4, %xmm5 +; X86-SSE41-NEXT: movdqa %xmm5, %xmm6 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm5 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-SSE41-NEXT: pand %xmm0, %xmm1 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm0, %xmm7 +; X86-SSE41-NEXT: movdqa 8(%ebp), %xmm5 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] -; X86-SSE41-NEXT: por %xmm1, %xmm0 +; X86-SSE41-NEXT: por %xmm7, %xmm0 ; 
X86-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; X86-SSE41-NEXT: movapd %xmm2, %xmm0 -; X86-SSE41-NEXT: xorpd %xmm5, %xmm0 -; X86-SSE41-NEXT: movapd %xmm4, %xmm1 -; X86-SSE41-NEXT: xorpd %xmm5, %xmm1 -; X86-SSE41-NEXT: movapd %xmm1, %xmm3 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-SSE41-NEXT: pand %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] -; X86-SSE41-NEXT: movdqa %xmm4, %xmm0 -; X86-SSE41-NEXT: pxor %xmm5, %xmm0 -; X86-SSE41-NEXT: pxor %xmm1, %xmm5 -; X86-SSE41-NEXT: movdqa %xmm5, %xmm2 +; X86-SSE41-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE41-NEXT: pxor %xmm4, %xmm0 +; X86-SSE41-NEXT: movdqa %xmm5, %xmm3 +; X86-SSE41-NEXT: pxor %xmm4, %xmm3 +; X86-SSE41-NEXT: movdqa %xmm3, %xmm6 +; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm3 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm0, %xmm3 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; X86-SSE41-NEXT: por %xmm3, %xmm0 +; X86-SSE41-NEXT: movapd %xmm2, %xmm3 +; X86-SSE41-NEXT: xorpd %xmm4, %xmm3 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; X86-SSE41-NEXT: movapd %xmm5, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm4, %xmm0 +; X86-SSE41-NEXT: movapd %xmm0, %xmm1 +; X86-SSE41-NEXT: pcmpgtd %xmm3, %xmm1 +; X86-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm3, %xmm6 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X86-SSE41-NEXT: por %xmm6, %xmm0 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 +; X86-SSE41-NEXT: movapd %xmm5, %xmm0 +; X86-SSE41-NEXT: 
xorpd %xmm4, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] +; X86-SSE41-NEXT: pxor %xmm1, %xmm4 +; X86-SSE41-NEXT: movdqa %xmm4, %xmm2 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm4 ; X86-SSE41-NEXT: pmovsxdq %xmm2, %xmm0 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; X86-SSE41-NEXT: pand %xmm0, %xmm3 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; X86-SSE41-NEXT: por %xmm3, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 ; X86-SSE41-NEXT: movd %xmm1, %eax ; X86-SSE41-NEXT: pextrd $1, %xmm1, %edx ; X86-SSE41-NEXT: movl %ebp, %esp @@ -604,31 +570,20 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; X64-SSE41-NEXT: por %xmm6, %xmm0 ; X64-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; X64-SSE41-NEXT: movapd %xmm3, %xmm0 -; X64-SSE41-NEXT: xorpd %xmm5, %xmm0 -; X64-SSE41-NEXT: movapd %xmm0, %xmm1 -; X64-SSE41-NEXT: pcmpgtd %xmm4, %xmm1 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] -; X64-SSE41-NEXT: pcmpeqd %xmm4, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm6, %xmm4 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X64-SSE41-NEXT: xorpd %xmm3, %xmm5 +; X64-SSE41-NEXT: movapd %xmm5, %xmm0 +; X64-SSE41-NEXT: pcmpgtd %xmm4, %xmm0 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] +; X64-SSE41-NEXT: pcmpeqd %xmm4, %xmm5 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; X64-SSE41-NEXT: pand %xmm1, %xmm4 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; X64-SSE41-NEXT: por %xmm4, %xmm0 ; X64-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; X64-SSE41-NEXT: movapd %xmm3, %xmm0 -; X64-SSE41-NEXT: xorpd %xmm5, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; X64-SSE41-NEXT: pxor %xmm1, %xmm5 -; X64-SSE41-NEXT: 
movdqa %xmm5, %xmm2 -; X64-SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; X64-SSE41-NEXT: pmovsxdq %xmm2, %xmm4 -; X64-SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm4, %xmm5 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; X64-SSE41-NEXT: por %xmm5, %xmm0 -; X64-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; X64-SSE41-NEXT: movq %xmm1, %rax +; X64-SSE41-NEXT: pextrq $1, %xmm3, %rax +; X64-SSE41-NEXT: movq %xmm3, %rcx +; X64-SSE41-NEXT: cmpq %rax, %rcx +; X64-SSE41-NEXT: cmovlq %rcx, %rax ; X64-SSE41-NEXT: retq ; ; X86-SSE42-LABEL: test_v8i64: @@ -670,11 +625,10 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X64-SSE42-NEXT: movapd %xmm3, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; X64-SSE42-NEXT: movq %xmm1, %rax +; X64-SSE42-NEXT: pextrq $1, %xmm3, %rax +; X64-SSE42-NEXT: movq %xmm3, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovlq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X86-AVX1-LABEL: test_v8i64: @@ -705,10 +659,10 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X64-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 ; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm1 ; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm2, %xmm0, %xmm0 -; X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovlq %rcx, %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -734,10 +688,10 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpcmpgtq 
%xmm0, %xmm1, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovlq %rcx, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; @@ -774,127 +728,124 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE2-NEXT: pushl %ebp ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp -; X86-SSE2-NEXT: subl $48, %esp -; X86-SSE2-NEXT: movaps %xmm1, (%esp) # 16-byte Spill +; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa 56(%ebp), %xmm5 +; X86-SSE2-NEXT: movdqa 24(%ebp), %xmm6 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,0,2147483648,0] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 -; X86-SSE2-NEXT: pxor %xmm3, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm5 +; X86-SSE2-NEXT: pxor %xmm3, %xmm5 ; X86-SSE2-NEXT: pxor %xmm3, %xmm6 ; X86-SSE2-NEXT: movdqa %xmm6, %xmm7 -; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm7 -; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm6 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] +; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm4, %xmm0 -; X86-SSE2-NEXT: movdqa 24(%ebp), %xmm6 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] -; X86-SSE2-NEXT: por %xmm0, %xmm4 -; X86-SSE2-NEXT: pand %xmm4, %xmm2 -; X86-SSE2-NEXT: pandn %xmm5, %xmm4 -; X86-SSE2-NEXT: por %xmm2, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm4, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm5, %xmm0 +; 
X86-SSE2-NEXT: movdqa 56(%ebp), %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; X86-SSE2-NEXT: por %xmm0, %xmm5 +; X86-SSE2-NEXT: pand %xmm5, %xmm1 +; X86-SSE2-NEXT: movdqa 24(%ebp), %xmm0 +; X86-SSE2-NEXT: pandn %xmm0, %xmm5 +; X86-SSE2-NEXT: por %xmm1, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 ; X86-SSE2-NEXT: pxor %xmm3, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm6, %xmm2 -; X86-SSE2-NEXT: pxor %xmm3, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm1 +; X86-SSE2-NEXT: pxor %xmm3, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm7 ; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm7 -; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm5, %xmm0 -; X86-SSE2-NEXT: movdqa 72(%ebp), %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] -; X86-SSE2-NEXT: por %xmm0, %xmm2 -; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm7 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm6, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm7, %xmm0 +; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; X86-SSE2-NEXT: por %xmm1, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm0 ; X86-SSE2-NEXT: pxor %xmm3, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm1 +; X86-SSE2-NEXT: pand %xmm7, %xmm2 +; X86-SSE2-NEXT: pandn %xmm6, %xmm7 +; X86-SSE2-NEXT: por %xmm2, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm7, %xmm1 ; X86-SSE2-NEXT: pxor %xmm3, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 ; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-SSE2-NEXT: 
pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] ; X86-SSE2-NEXT: pand %xmm0, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: por %xmm1, %xmm0 ; X86-SSE2-NEXT: movdqa 40(%ebp), %xmm6 -; X86-SSE2-NEXT: pand %xmm0, %xmm7 -; X86-SSE2-NEXT: pandn %xmm5, %xmm0 -; X86-SSE2-NEXT: por %xmm7, %xmm0 -; X86-SSE2-NEXT: movdqa (%esp), %xmm4 # 16-byte Reload +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: por %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm0, %xmm5 +; X86-SSE2-NEXT: pandn %xmm7, %xmm0 +; X86-SSE2-NEXT: por %xmm5, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 ; X86-SSE2-NEXT: pxor %xmm3, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm6, %xmm5 -; X86-SSE2-NEXT: pxor %xmm3, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm2 +; X86-SSE2-NEXT: pxor %xmm3, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm7 ; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm7 -; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm2 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; X86-SSE2-NEXT: pand %xmm5, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; X86-SSE2-NEXT: por %xmm1, %xmm5 -; X86-SSE2-NEXT: pand %xmm5, %xmm4 -; X86-SSE2-NEXT: pandn %xmm6, %xmm5 -; X86-SSE2-NEXT: por %xmm4, %xmm5 +; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm5 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm4 +; X86-SSE2-NEXT: pandn %xmm6, %xmm2 +; X86-SSE2-NEXT: por %xmm4, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm5, %xmm1 ; X86-SSE2-NEXT: pxor %xmm3, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm6 -; X86-SSE2-NEXT: pxor %xmm3, %xmm6 -; X86-SSE2-NEXT: movdqa %xmm6, %xmm7 -; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm7 -; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm6 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; 
X86-SSE2-NEXT: pand %xmm1, %xmm6 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] -; X86-SSE2-NEXT: por %xmm6, %xmm1 +; X86-SSE2-NEXT: movdqa 72(%ebp), %xmm7 +; X86-SSE2-NEXT: movdqa %xmm7, %xmm4 +; X86-SSE2-NEXT: pxor %xmm3, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm6 +; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm6 +; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm1, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: por %xmm4, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE2-NEXT: pxor %xmm3, %xmm4 ; X86-SSE2-NEXT: pand %xmm1, %xmm5 -; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: pandn %xmm7, %xmm1 ; X86-SSE2-NEXT: por %xmm5, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE2-NEXT: pxor %xmm3, %xmm0 -; X86-SSE2-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm4 # 16-byte Reload -; X86-SSE2-NEXT: movdqa %xmm4, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 ; X86-SSE2-NEXT: pxor %xmm3, %xmm5 ; X86-SSE2-NEXT: movdqa %xmm5, %xmm6 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm6 -; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm0, %xmm7 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: por %xmm7, %xmm5 -; X86-SSE2-NEXT: pand %xmm5, %xmm2 -; X86-SSE2-NEXT: pandn %xmm4, %xmm5 -; X86-SSE2-NEXT: por %xmm2, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm0 -; X86-SSE2-NEXT: pxor %xmm3, %xmm0 +; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm5 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm4, %xmm5 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: por %xmm5, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm5 +; X86-SSE2-NEXT: pxor %xmm3, %xmm5 +; 
X86-SSE2-NEXT: pand %xmm4, %xmm2 +; X86-SSE2-NEXT: pandn %xmm1, %xmm4 +; X86-SSE2-NEXT: por %xmm2, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X86-SSE2-NEXT: pxor %xmm3, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm2 +; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm5, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: por %xmm6, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pandn %xmm4, %xmm1 +; X86-SSE2-NEXT: por %xmm0, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 ; X86-SSE2-NEXT: pxor %xmm3, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; X86-SSE2-NEXT: por %xmm2, %xmm0 -; X86-SSE2-NEXT: pand %xmm0, %xmm5 -; X86-SSE2-NEXT: pandn %xmm1, %xmm0 -; X86-SSE2-NEXT: por %xmm5, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: pxor %xmm3, %xmm2 -; X86-SSE2-NEXT: pxor %xmm1, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X86-SSE2-NEXT: pxor %xmm0, %xmm3 ; X86-SSE2-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4 ; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm3 @@ -903,9 +854,9 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE2-NEXT: pand %xmm2, %xmm3 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] ; X86-SSE2-NEXT: por %xmm3, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm0 -; X86-SSE2-NEXT: pandn %xmm1, %xmm2 -; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 ; X86-SSE2-NEXT: movd %xmm2, %eax ; 
X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; X86-SSE2-NEXT: movd %xmm0, %edx @@ -1033,29 +984,26 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE41-NEXT: movl %esp, %ebp ; X86-SSE41-NEXT: andl $-16, %esp ; X86-SSE41-NEXT: subl $48, %esp -; X86-SSE41-NEXT: movaps %xmm1, (%esp) # 16-byte Spill +; X86-SSE41-NEXT: movaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-SSE41-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE41-NEXT: movdqa 56(%ebp), %xmm1 +; X86-SSE41-NEXT: movdqa 24(%ebp), %xmm6 ; X86-SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = [2147483648,2147483648] -; X86-SSE41-NEXT: movdqa %xmm2, %xmm0 ; X86-SSE41-NEXT: pxor %xmm4, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm1, %xmm6 -; X86-SSE41-NEXT: movdqa %xmm1, %xmm5 -; X86-SSE41-NEXT: pxor %xmm4, %xmm6 -; X86-SSE41-NEXT: movdqa %xmm6, %xmm7 +; X86-SSE41-NEXT: movdqa %xmm6, %xmm5 +; X86-SSE41-NEXT: pxor %xmm4, %xmm5 +; X86-SSE41-NEXT: movdqa %xmm5, %xmm7 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm7 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm5 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] ; X86-SSE41-NEXT: pand %xmm0, %xmm1 -; X86-SSE41-NEXT: movdqa 24(%ebp), %xmm6 +; X86-SSE41-NEXT: movdqa 56(%ebp), %xmm5 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 -; X86-SSE41-NEXT: movapd %xmm5, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X86-SSE41-NEXT: movdqa %xmm3, %xmm0 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6 +; X86-SSE41-NEXT: movdqa %xmm2, %xmm0 ; X86-SSE41-NEXT: pxor %xmm4, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm6, %xmm1 +; X86-SSE41-NEXT: movdqa %xmm5, %xmm1 ; X86-SSE41-NEXT: pxor %xmm4, %xmm1 ; X86-SSE41-NEXT: movdqa %xmm1, %xmm7 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm7 @@ -1063,94 +1011,93 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE41-NEXT: pshufd 
{{.*#+}} xmm0 = xmm7[0,0,2,2] ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; X86-SSE41-NEXT: pand %xmm0, %xmm1 -; X86-SSE41-NEXT: movdqa 72(%ebp), %xmm2 +; X86-SSE41-NEXT: movdqa 72(%ebp), %xmm3 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: movdqa 8(%ebp), %xmm7 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6 -; X86-SSE41-NEXT: movdqa %xmm7, %xmm0 -; X86-SSE41-NEXT: pxor %xmm4, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE41-NEXT: pxor %xmm4, %xmm1 -; X86-SSE41-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; X86-SSE41-NEXT: pand %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: movdqa 40(%ebp), %xmm3 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm2 -; X86-SSE41-NEXT: movdqa (%esp), %xmm5 # 16-byte Reload -; X86-SSE41-NEXT: movdqa %xmm5, %xmm0 +; X86-SSE41-NEXT: movapd %xmm6, %xmm1 +; X86-SSE41-NEXT: xorpd %xmm4, %xmm1 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 +; X86-SSE41-NEXT: movapd %xmm5, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm4, %xmm0 +; X86-SSE41-NEXT: movapd %xmm0, %xmm7 +; X86-SSE41-NEXT: pcmpgtd %xmm1, %xmm7 +; X86-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm1, %xmm2 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] +; X86-SSE41-NEXT: por %xmm2, %xmm0 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm5 +; X86-SSE41-NEXT: movapd %xmm5, (%esp) # 16-byte Spill +; X86-SSE41-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-SSE41-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE41-NEXT: pxor %xmm4, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm3, %xmm1 -; X86-SSE41-NEXT: pxor %xmm4, %xmm1 -; X86-SSE41-NEXT: movdqa 
%xmm1, %xmm7 +; X86-SSE41-NEXT: movdqa 40(%ebp), %xmm5 +; X86-SSE41-NEXT: movdqa %xmm5, %xmm2 +; X86-SSE41-NEXT: pxor %xmm4, %xmm2 +; X86-SSE41-NEXT: movdqa %xmm2, %xmm7 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm7 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm2 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; X86-SSE41-NEXT: pand %xmm0, %xmm1 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm0, %xmm2 +; X86-SSE41-NEXT: movdqa 8(%ebp), %xmm6 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] -; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm3 -; X86-SSE41-NEXT: movapd %xmm3, %xmm0 -; X86-SSE41-NEXT: xorpd %xmm4, %xmm0 -; X86-SSE41-NEXT: movapd %xmm2, %xmm1 -; X86-SSE41-NEXT: xorpd %xmm4, %xmm1 -; X86-SSE41-NEXT: movapd %xmm1, %xmm7 +; X86-SSE41-NEXT: por %xmm2, %xmm0 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; X86-SSE41-NEXT: movdqa %xmm6, %xmm0 +; X86-SSE41-NEXT: pxor %xmm4, %xmm0 +; X86-SSE41-NEXT: movdqa %xmm3, %xmm2 +; X86-SSE41-NEXT: pxor %xmm4, %xmm2 +; X86-SSE41-NEXT: movdqa %xmm2, %xmm7 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm7 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm2 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-SSE41-NEXT: pand %xmm0, %xmm1 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm0, %xmm2 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] -; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; X86-SSE41-NEXT: movapd %xmm6, %xmm0 +; X86-SSE41-NEXT: por %xmm2, %xmm0 +; X86-SSE41-NEXT: movapd %xmm5, %xmm2 +; X86-SSE41-NEXT: xorpd %xmm4, %xmm2 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm3 +; X86-SSE41-NEXT: movapd %xmm3, %xmm0 ; X86-SSE41-NEXT: xorpd %xmm4, %xmm0 -; X86-SSE41-NEXT: movdqa 
{{[-0-9]+}}(%e{{[sb]}}p), %xmm5 # 16-byte Reload -; X86-SSE41-NEXT: movdqa %xmm5, %xmm1 -; X86-SSE41-NEXT: pxor %xmm4, %xmm1 -; X86-SSE41-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-SSE41-NEXT: pand %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm5, %xmm1 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1 -; X86-SSE41-NEXT: movapd %xmm1, %xmm0 -; X86-SSE41-NEXT: movapd %xmm1, %xmm5 +; X86-SSE41-NEXT: movapd %xmm0, %xmm6 +; X86-SSE41-NEXT: pcmpgtd %xmm2, %xmm6 +; X86-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm2, %xmm7 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; X86-SSE41-NEXT: por %xmm7, %xmm0 +; X86-SSE41-NEXT: movdqa (%esp), %xmm7 # 16-byte Reload +; X86-SSE41-NEXT: movdqa %xmm7, %xmm2 +; X86-SSE41-NEXT: pxor %xmm4, %xmm2 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm3 +; X86-SSE41-NEXT: movapd %xmm3, %xmm0 ; X86-SSE41-NEXT: xorpd %xmm4, %xmm0 -; X86-SSE41-NEXT: movapd %xmm2, %xmm1 -; X86-SSE41-NEXT: xorpd %xmm4, %xmm1 -; X86-SSE41-NEXT: movapd %xmm1, %xmm3 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-SSE41-NEXT: pand %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; X86-SSE41-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE41-NEXT: pxor %xmm4, %xmm0 +; X86-SSE41-NEXT: movapd %xmm0, %xmm1 +; X86-SSE41-NEXT: pcmpgtd %xmm2, %xmm1 +; X86-SSE41-NEXT: pcmpeqd %xmm2, 
%xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm2, %xmm6 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X86-SSE41-NEXT: por %xmm6, %xmm0 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm3 +; X86-SSE41-NEXT: movapd %xmm3, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm4, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; X86-SSE41-NEXT: pxor %xmm1, %xmm4 -; X86-SSE41-NEXT: movdqa %xmm4, %xmm3 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; X86-SSE41-NEXT: movdqa %xmm4, %xmm2 +; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm2 ; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; X86-SSE41-NEXT: pmovsxdq %xmm3, %xmm0 +; X86-SSE41-NEXT: pmovsxdq %xmm2, %xmm0 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; X86-SSE41-NEXT: pand %xmm0, %xmm4 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; X86-SSE41-NEXT: por %xmm4, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 ; X86-SSE41-NEXT: movd %xmm1, %eax ; X86-SSE41-NEXT: pextrd $1, %xmm1, %edx ; X86-SSE41-NEXT: movl %ebp, %esp @@ -1240,31 +1187,20 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] ; X64-SSE41-NEXT: por %xmm1, %xmm0 ; X64-SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7 -; X64-SSE41-NEXT: movapd %xmm7, %xmm0 -; X64-SSE41-NEXT: xorpd %xmm9, %xmm0 -; X64-SSE41-NEXT: movapd %xmm0, %xmm1 -; X64-SSE41-NEXT: pcmpgtd %xmm2, %xmm1 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] -; X64-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm3, %xmm2 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X64-SSE41-NEXT: xorpd %xmm7, %xmm9 +; X64-SSE41-NEXT: movapd %xmm9, %xmm0 +; X64-SSE41-NEXT: pcmpgtd %xmm2, %xmm0 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] +; X64-SSE41-NEXT: pcmpeqd 
%xmm2, %xmm9 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,3,3] +; X64-SSE41-NEXT: pand %xmm1, %xmm2 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; X64-SSE41-NEXT: por %xmm2, %xmm0 ; X64-SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 -; X64-SSE41-NEXT: movapd %xmm7, %xmm0 -; X64-SSE41-NEXT: xorpd %xmm9, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] -; X64-SSE41-NEXT: pxor %xmm1, %xmm9 -; X64-SSE41-NEXT: movdqa %xmm9, %xmm2 -; X64-SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; X64-SSE41-NEXT: pmovsxdq %xmm2, %xmm3 -; X64-SSE41-NEXT: pcmpeqd %xmm0, %xmm9 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm3, %xmm4 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; X64-SSE41-NEXT: por %xmm4, %xmm0 -; X64-SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 -; X64-SSE41-NEXT: movq %xmm1, %rax +; X64-SSE41-NEXT: pextrq $1, %xmm7, %rax +; X64-SSE41-NEXT: movq %xmm7, %rcx +; X64-SSE41-NEXT: cmpq %rax, %rcx +; X64-SSE41-NEXT: cmovlq %rcx, %rax ; X64-SSE41-NEXT: retq ; ; X86-SSE42-LABEL: test_v16i64: @@ -1274,26 +1210,26 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE42-NEXT: andl $-16, %esp ; X86-SSE42-NEXT: subl $16, %esp ; X86-SSE42-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE42-NEXT: movdqa 8(%ebp), %xmm5 -; X86-SSE42-NEXT: movdqa 40(%ebp), %xmm4 +; X86-SSE42-NEXT: movdqa 24(%ebp), %xmm4 ; X86-SSE42-NEXT: movdqa %xmm4, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm4 -; X86-SSE42-NEXT: movdqa 72(%ebp), %xmm1 -; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm5, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm1 -; X86-SSE42-NEXT: movdqa 24(%ebp), %xmm5 -; X86-SSE42-NEXT: movdqa %xmm5, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm5 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm4 ; X86-SSE42-NEXT: movdqa 56(%ebp), %xmm3 ; X86-SSE42-NEXT: movdqa %xmm3, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 ; 
X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 +; X86-SSE42-NEXT: movdqa 8(%ebp), %xmm2 ; X86-SSE42-NEXT: movapd %xmm3, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm5, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm3 +; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm3 +; X86-SSE42-NEXT: movdqa 40(%ebp), %xmm4 +; X86-SSE42-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm4 +; X86-SSE42-NEXT: movdqa 72(%ebp), %xmm1 +; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 ; X86-SSE42-NEXT: movapd %xmm1, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm1 @@ -1334,11 +1270,10 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X64-SSE42-NEXT: movapd %xmm7, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] -; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm7, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm1 -; X64-SSE42-NEXT: movq %xmm1, %rax +; X64-SSE42-NEXT: pextrq $1, %xmm7, %rax +; X64-SSE42-NEXT: movq %xmm7, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovlq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X86-AVX1-LABEL: test_v16i64: @@ -1347,25 +1282,25 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-AVX1-NEXT: movl %esp, %ebp ; X86-AVX1-NEXT: andl $-32, %esp ; X86-AVX1-NEXT: subl $32, %esp -; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 -; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5 -; X86-AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm3 -; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; X86-AVX1-NEXT: vmovdqa 8(%ebp), %xmm5 -; X86-AVX1-NEXT: vmovdqa 24(%ebp), %xmm6 -; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm7 -; X86-AVX1-NEXT: vblendvpd %xmm7, %xmm4, %xmm6, %xmm4 -; 
X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm6 -; X86-AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm2, %xmm0 +; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm3 +; X86-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm3 +; X86-AVX1-NEXT: vmovdqa 8(%ebp), %xmm4 +; X86-AVX1-NEXT: vmovdqa 24(%ebp), %xmm5 +; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm6 +; X86-AVX1-NEXT: vblendvpd %xmm6, %xmm1, %xmm4, %xmm4 +; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm6 +; X86-AVX1-NEXT: vblendvpd %xmm6, %xmm3, %xmm4, %xmm3 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 +; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm4 +; X86-AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm2, %xmm0 +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm5, %xmm2 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm5, %xmm1 ; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm1 -; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm4, %xmm1 -; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm1 +; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm0, %xmm0 ; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 @@ -1396,10 +1331,10 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm1 ; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm4, %xmm0, %xmm0 -; X64-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovlq %rcx, %rax ; X64-AVX1-NEXT: vzeroupper ; 
X64-AVX1-NEXT: retq ; @@ -1440,10 +1375,10 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2 -; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovlq %rcx, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; @@ -2802,21 +2737,18 @@ define i8 @test_v16i8(<16 x i8> %a0) nounwind { ; X86-SSE2-NEXT: pand %xmm0, %xmm2 ; X86-SSE2-NEXT: pandn %xmm1, %xmm0 ; X86-SSE2-NEXT: por %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movl %ecx, %eax +; X86-SSE2-NEXT: shrl $8, %eax +; X86-SSE2-NEXT: cmpb %al, %cl +; X86-SSE2-NEXT: cmovll %ecx, %eax ; X86-SSE2-NEXT: # kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; @@ -2841,14 +2773,11 @@ define i8 @test_v16i8(<16 x i8> %a0) nounwind { ; X64-SSE2-NEXT: pand %xmm2, %xmm1 ; X64-SSE2-NEXT: 
pandn %xmm0, %xmm2 ; X64-SSE2-NEXT: por %xmm1, %xmm2 -; X64-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE2-NEXT: psrlw $8, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X64-SSE2-NEXT: pand %xmm1, %xmm2 -; X64-SSE2-NEXT: pandn %xmm0, %xmm1 -; X64-SSE2-NEXT: por %xmm2, %xmm1 -; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: movd %xmm2, %ecx +; X64-SSE2-NEXT: movl %ecx, %eax +; X64-SSE2-NEXT: shrl $8, %eax +; X64-SSE2-NEXT: cmpb %al, %cl +; X64-SSE2-NEXT: cmovll %ecx, %eax ; X64-SSE2-NEXT: # kill: def $al killed $al killed $eax ; X64-SSE2-NEXT: retq ; @@ -2943,21 +2872,18 @@ define i8 @test_v32i8(<32 x i8> %a0) nounwind { ; X86-SSE2-NEXT: pand %xmm0, %xmm1 ; X86-SSE2-NEXT: pandn %xmm2, %xmm0 ; X86-SSE2-NEXT: por %xmm1, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movl %ecx, %eax +; X86-SSE2-NEXT: shrl $8, %eax +; X86-SSE2-NEXT: cmpb %al, %cl +; X86-SSE2-NEXT: cmovll %ecx, %eax ; X86-SSE2-NEXT: # kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: retl ; @@ -2987,14 +2913,11 @@ define i8 @test_v32i8(<32 x i8> %a0) nounwind { ; X64-SSE2-NEXT: pand %xmm1, %xmm2 ; X64-SSE2-NEXT: pandn %xmm0, %xmm1 ; X64-SSE2-NEXT: por %xmm2, 
%xmm1 -; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE2-NEXT: psrlw $8, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; X64-SSE2-NEXT: pand %xmm2, %xmm1 -; X64-SSE2-NEXT: pandn %xmm0, %xmm2 -; X64-SSE2-NEXT: por %xmm1, %xmm2 -; X64-SSE2-NEXT: movd %xmm2, %eax +; X64-SSE2-NEXT: movd %xmm1, %ecx +; X64-SSE2-NEXT: movl %ecx, %eax +; X64-SSE2-NEXT: shrl $8, %eax +; X64-SSE2-NEXT: cmpb %al, %cl +; X64-SSE2-NEXT: cmovll %ecx, %eax ; X64-SSE2-NEXT: # kill: def $al killed $al killed $eax ; X64-SSE2-NEXT: retq ; @@ -3146,21 +3069,18 @@ define i8 @test_v64i8(<64 x i8> %a0) nounwind { ; X86-SSE2-NEXT: pand %xmm0, %xmm2 ; X86-SSE2-NEXT: pandn %xmm1, %xmm0 ; X86-SSE2-NEXT: por %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movl %ecx, %eax +; X86-SSE2-NEXT: shrl $8, %eax +; X86-SSE2-NEXT: cmpb %al, %cl +; X86-SSE2-NEXT: cmovll %ecx, %eax ; X86-SSE2-NEXT: # kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp @@ -3202,14 +3122,11 @@ define i8 @test_v64i8(<64 x i8> %a0) nounwind { ; X64-SSE2-NEXT: pand %xmm2, %xmm1 ; X64-SSE2-NEXT: pandn %xmm0, %xmm2 ; X64-SSE2-NEXT: por %xmm1, %xmm2 -; 
X64-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE2-NEXT: psrlw $8, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X64-SSE2-NEXT: pand %xmm1, %xmm2 -; X64-SSE2-NEXT: pandn %xmm0, %xmm1 -; X64-SSE2-NEXT: por %xmm2, %xmm1 -; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: movd %xmm2, %ecx +; X64-SSE2-NEXT: movl %ecx, %eax +; X64-SSE2-NEXT: shrl $8, %eax +; X64-SSE2-NEXT: cmpb %al, %cl +; X64-SSE2-NEXT: cmovll %ecx, %eax ; X64-SSE2-NEXT: # kill: def $al killed $al killed $eax ; X64-SSE2-NEXT: retq ; @@ -3407,21 +3324,18 @@ define i8 @test_v128i8(<128 x i8> %a0) nounwind { ; X86-SSE2-NEXT: pand %xmm0, %xmm2 ; X86-SSE2-NEXT: pandn %xmm1, %xmm0 ; X86-SSE2-NEXT: por %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psrld $16, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrlw $8, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: pcmpgtb %xmm1, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movd %xmm2, %eax +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrld $16, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtb %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: movd %xmm2, %ecx +; X86-SSE2-NEXT: movl %ecx, %eax +; X86-SSE2-NEXT: shrl $8, %eax +; X86-SSE2-NEXT: cmpb %al, %cl +; X86-SSE2-NEXT: cmovll %ecx, %eax ; X86-SSE2-NEXT: # kill: def $al killed $al killed $eax ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp @@ -3483,14 +3397,11 @@ define i8 @test_v128i8(<128 x i8> %a0) nounwind { ; X64-SSE2-NEXT: pand %xmm2, %xmm1 ; X64-SSE2-NEXT: pandn %xmm0, %xmm2 ; X64-SSE2-NEXT: por %xmm1, %xmm2 -; 
X64-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE2-NEXT: psrlw $8, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: pcmpgtb %xmm2, %xmm1 -; X64-SSE2-NEXT: pand %xmm1, %xmm2 -; X64-SSE2-NEXT: pandn %xmm0, %xmm1 -; X64-SSE2-NEXT: por %xmm2, %xmm1 -; X64-SSE2-NEXT: movd %xmm1, %eax +; X64-SSE2-NEXT: movd %xmm2, %ecx +; X64-SSE2-NEXT: movl %ecx, %eax +; X64-SSE2-NEXT: shrl $8, %eax +; X64-SSE2-NEXT: cmpb %al, %cl +; X64-SSE2-NEXT: cmovll %ecx, %eax ; X64-SSE2-NEXT: # kill: def $al killed $al killed $eax ; X64-SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-reduce-umax.ll b/llvm/test/CodeGen/X86/vector-reduce-umax.ll index 145c27e5eb976..0e78b804d9b8f 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-umax.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-umax.ll @@ -19,10 +19,10 @@ define i64 @test_v2i64(<2 x i64> %a0) nounwind { ; X86-SSE2-LABEL: test_v2i64: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm1, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4 @@ -52,10 +52,10 @@ define i64 @test_v2i64(<2 x i64> %a0) nounwind { ; X86-SSE41-LABEL: test_v2i64: ; X86-SSE41: # %bb.0: ; X86-SSE41-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648] ; X86-SSE41-NEXT: movdqa %xmm1, %xmm3 ; X86-SSE41-NEXT: pxor %xmm0, %xmm3 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; X86-SSE41-NEXT: pxor %xmm2, %xmm0 ; X86-SSE41-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm4 @@ -70,97 +70,66 @@ define i64 @test_v2i64(<2 x i64> %a0) nounwind { ; X86-SSE41-NEXT: pextrd $1, %xmm2, %edx ; X86-SSE41-NEXT: retl ; -; X64-SSE41-LABEL: test_v2i64: -; 
X64-SSE41: # %bb.0: -; X64-SSE41-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] -; X64-SSE41-NEXT: movdqa %xmm1, %xmm2 -; X64-SSE41-NEXT: pxor %xmm0, %xmm2 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; X64-SSE41-NEXT: pxor %xmm3, %xmm0 -; X64-SSE41-NEXT: movdqa %xmm2, %xmm4 -; X64-SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; X64-SSE41-NEXT: pmovsxdq %xmm4, %xmm5 -; X64-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm5, %xmm2 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; X64-SSE41-NEXT: por %xmm2, %xmm0 -; X64-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; X64-SSE41-NEXT: movq %xmm3, %rax -; X64-SSE41-NEXT: retq +; X64-SSE4-LABEL: test_v2i64: +; X64-SSE4: # %bb.0: +; X64-SSE4-NEXT: pextrq $1, %xmm0, %rax +; X64-SSE4-NEXT: movq %xmm0, %rcx +; X64-SSE4-NEXT: cmpq %rax, %rcx +; X64-SSE4-NEXT: cmovaq %rcx, %rax +; X64-SSE4-NEXT: retq ; ; X86-SSE42-LABEL: test_v2i64: ; X86-SSE42: # %bb.0: ; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X86-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648] -; X86-SSE42-NEXT: pxor %xmm3, %xmm0 -; X86-SSE42-NEXT: pxor %xmm2, %xmm3 -; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; X86-SSE42-NEXT: movd %xmm2, %eax -; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] +; X86-SSE42-NEXT: pxor %xmm2, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; X86-SSE42-NEXT: pxor %xmm3, %xmm2 +; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 +; X86-SSE42-NEXT: movd %xmm3, %eax +; X86-SSE42-NEXT: pextrd $1, %xmm3, %edx ; X86-SSE42-NEXT: retl ; -; X64-SSE42-LABEL: test_v2i64: -; X64-SSE42: # %bb.0: -; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: movdqa {{.*#+}} xmm2 = 
[9223372036854775808,9223372036854775808] -; X64-SSE42-NEXT: pxor %xmm2, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; X64-SSE42-NEXT: pxor %xmm3, %xmm2 -; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; X64-SSE42-NEXT: movq %xmm3, %rax -; X64-SSE42-NEXT: retq -; ; X86-AVX1-LABEL: test_v2i64: ; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,2147483648,0,2147483648] -; X86-AVX1-NEXT: # xmm2 = mem[0,0] -; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm3 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm2 -; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [0,2147483648,0,2147483648] +; X86-AVX1-NEXT: # xmm1 = mem[0,0] +; X86-AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm2 +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vxorps %xmm1, %xmm3, %xmm1 +; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 +; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX1-NEXT: retl ; ; X64-AVX1-LABEL: test_v2i64: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] -; X64-AVX1-NEXT: # xmm1 = mem[0,0] -; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm2 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm1 -; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 -; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovaq %rcx, %rax ; X64-AVX1-NEXT: retq ; ; X86-AVX2-LABEL: test_v2i64: ; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,2147483648,0,2147483648] -; 
X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 -; X86-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 -; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm2 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm3, %xmm1 +; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 +; X86-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX2-NEXT: retl ; ; X64-AVX2-LABEL: test_v2i64: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] -; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm2 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpxor %xmm1, %xmm3, %xmm1 -; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 -; X64-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovaq %rcx, %rax ; X64-AVX2-NEXT: retq ; ; AVX512BW-LABEL: test_v2i64: @@ -201,9 +170,9 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X86-SSE2-NEXT: pand %xmm3, %xmm0 ; X86-SSE2-NEXT: pandn %xmm1, %xmm3 ; X86-SSE2-NEXT: por %xmm0, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm0, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4 @@ -262,11 +231,11 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] ; X86-SSE41-NEXT: por %xmm4, %xmm0 ; X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X86-SSE41-NEXT: movapd %xmm1, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm3, %xmm0 ; 
X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X86-SSE41-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE41-NEXT: pxor %xmm3, %xmm0 ; X86-SSE41-NEXT: pxor %xmm2, %xmm3 -; X86-SSE41-NEXT: movdqa %xmm0, %xmm4 +; X86-SSE41-NEXT: movapd %xmm0, %xmm4 ; X86-SSE41-NEXT: pcmpgtd %xmm3, %xmm4 ; X86-SSE41-NEXT: pmovsxdq %xmm4, %xmm5 ; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm3 @@ -282,34 +251,23 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X64-SSE41-LABEL: test_v4i64: ; X64-SSE41: # %bb.0: ; X64-SSE41-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] -; X64-SSE41-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE41-NEXT: pxor %xmm3, %xmm0 -; X64-SSE41-NEXT: movdqa %xmm2, %xmm4 -; X64-SSE41-NEXT: pxor %xmm3, %xmm4 -; X64-SSE41-NEXT: movdqa %xmm4, %xmm5 -; X64-SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; X64-SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm6, %xmm4 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] -; X64-SSE41-NEXT: por %xmm4, %xmm0 -; X64-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; X64-SSE41-NEXT: movapd %xmm1, %xmm0 -; X64-SSE41-NEXT: xorpd %xmm3, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X64-SSE41-NEXT: pxor %xmm2, %xmm3 -; X64-SSE41-NEXT: movapd %xmm0, %xmm4 +; X64-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] +; X64-SSE41-NEXT: movdqa %xmm1, %xmm3 +; X64-SSE41-NEXT: pxor %xmm0, %xmm3 +; X64-SSE41-NEXT: pxor %xmm2, %xmm0 +; X64-SSE41-NEXT: movdqa %xmm0, %xmm4 ; X64-SSE41-NEXT: pcmpgtd %xmm3, %xmm4 -; X64-SSE41-NEXT: pmovsxdq %xmm4, %xmm5 -; X64-SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; X64-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; X64-SSE41-NEXT: pand %xmm5, %xmm3 ; X64-SSE41-NEXT: 
pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; X64-SSE41-NEXT: por %xmm3, %xmm0 -; X64-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; X64-SSE41-NEXT: movq %xmm2, %rax +; X64-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X64-SSE41-NEXT: pextrq $1, %xmm1, %rax +; X64-SSE41-NEXT: movq %xmm1, %rcx +; X64-SSE41-NEXT: cmpq %rax, %rcx +; X64-SSE41-NEXT: cmovaq %rcx, %rax ; X64-SSE41-NEXT: retq ; ; X86-SSE42-LABEL: test_v4i64: @@ -321,9 +279,9 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X86-SSE42-NEXT: pxor %xmm3, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X86-SSE42-NEXT: movapd %xmm1, %xmm0 +; X86-SSE42-NEXT: xorpd %xmm3, %xmm0 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT: pxor %xmm3, %xmm0 ; X86-SSE42-NEXT: pxor %xmm2, %xmm3 ; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 @@ -334,19 +292,16 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X64-SSE42-LABEL: test_v4i64: ; X64-SSE42: # %bb.0: ; X64-SSE42-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] -; X64-SSE42-NEXT: movdqa %xmm1, %xmm4 -; X64-SSE42-NEXT: pxor %xmm3, %xmm4 -; X64-SSE42-NEXT: pxor %xmm3, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; X64-SSE42-NEXT: movapd %xmm1, %xmm0 -; X64-SSE42-NEXT: xorpd %xmm3, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X64-SSE42-NEXT: pxor %xmm2, %xmm3 +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808] +; X64-SSE42-NEXT: movdqa %xmm1, %xmm3 +; X64-SSE42-NEXT: pxor %xmm0, %xmm3 +; X64-SSE42-NEXT: pxor %xmm2, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; X64-SSE42-NEXT: movq %xmm2, %rax +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X64-SSE42-NEXT: pextrq $1, %xmm1, %rax +; X64-SSE42-NEXT: movq %xmm1, %rcx +; 
X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovaq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X86-AVX1-LABEL: test_v4i64: @@ -358,11 +313,11 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm4 ; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; X86-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2 -; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 +; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX1-NEXT: vzeroupper @@ -374,15 +329,13 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X64-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; X64-AVX1-NEXT: # xmm2 = mem[0,0] ; X64-AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm3 -; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm4 -; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X64-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 -; X64-AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm2 -; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 -; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm2 +; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 +; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovaq %rcx, %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -394,11 +347,11 @@ define i64 @test_v4i64(<4 x 
i64> %a0) nounwind { ; X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm4 ; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 -; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vxorpd %xmm2, %xmm3, %xmm2 +; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 +; X86-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX2-NEXT: vzeroupper @@ -409,15 +362,13 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808] ; X64-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm3 -; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm4 -; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm1 -; X64-AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vxorpd %xmm2, %xmm3, %xmm2 -; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 -; X64-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm2 +; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 +; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovaq %rcx, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; @@ -452,56 +403,56 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm5 ; 
X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm4 ; X86-SSE2-NEXT: pxor %xmm3, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 +; X86-SSE2-NEXT: pxor %xmm3, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm7 +; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm7 +; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm4, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] +; X86-SSE2-NEXT: por %xmm6, %xmm4 +; X86-SSE2-NEXT: pand %xmm4, %xmm1 +; X86-SSE2-NEXT: pandn %xmm5, %xmm4 +; X86-SSE2-NEXT: por %xmm1, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pxor %xmm3, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm5 ; X86-SSE2-NEXT: pxor %xmm3, %xmm5 ; X86-SSE2-NEXT: movdqa %xmm5, %xmm6 -; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm6 -; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm4, %xmm7 -; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: por %xmm7, %xmm4 -; X86-SSE2-NEXT: pand %xmm4, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm4 -; X86-SSE2-NEXT: por %xmm0, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm0 -; X86-SSE2-NEXT: pxor %xmm3, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE2-NEXT: pxor %xmm3, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm6 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm6 -; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm0, %xmm7 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: por %xmm7, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm5, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: 
movdqa %xmm2, %xmm0 +; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm6 +; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm5 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm1, %xmm5 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: por %xmm5, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm5 +; X86-SSE2-NEXT: pxor %xmm3, %xmm5 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm1 +; X86-SSE2-NEXT: por %xmm0, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE2-NEXT: pxor %xmm3, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 -; X86-SSE2-NEXT: pxor %xmm3, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm5 -; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm2 +; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm5, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: por %xmm6, %xmm0 ; X86-SSE2-NEXT: pand %xmm0, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] +; X86-SSE2-NEXT: pandn %xmm4, %xmm0 ; X86-SSE2-NEXT: por %xmm1, %xmm0 -; X86-SSE2-NEXT: pand %xmm0, %xmm4 -; X86-SSE2-NEXT: pandn %xmm2, %xmm0 -; X86-SSE2-NEXT: por %xmm4, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: pxor %xmm3, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm1, %xmm3 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 @@ -582,60 +533,60 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X86-SSE41-NEXT: andl $-16, %esp ; X86-SSE41-NEXT: subl $16, %esp ; X86-SSE41-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE41-NEXT: 
movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; X86-SSE41-NEXT: movdqa 8(%ebp), %xmm4 +; X86-SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] +; X86-SSE41-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE41-NEXT: pxor %xmm5, %xmm0 +; X86-SSE41-NEXT: movdqa %xmm1, %xmm6 +; X86-SSE41-NEXT: pxor %xmm5, %xmm6 +; X86-SSE41-NEXT: movdqa %xmm6, %xmm7 +; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm7 +; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm0, %xmm6 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] +; X86-SSE41-NEXT: por %xmm6, %xmm0 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 ; X86-SSE41-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE41-NEXT: pxor %xmm4, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm3, %xmm5 -; X86-SSE41-NEXT: pxor %xmm4, %xmm5 -; X86-SSE41-NEXT: movdqa %xmm5, %xmm6 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] -; X86-SSE41-NEXT: pand %xmm0, %xmm7 -; X86-SSE41-NEXT: movdqa 8(%ebp), %xmm5 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] -; X86-SSE41-NEXT: por %xmm7, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; X86-SSE41-NEXT: movdqa %xmm5, %xmm0 -; X86-SSE41-NEXT: pxor %xmm4, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE41-NEXT: pxor %xmm4, %xmm3 -; X86-SSE41-NEXT: movdqa %xmm3, %xmm6 +; X86-SSE41-NEXT: pxor %xmm5, %xmm0 +; X86-SSE41-NEXT: movdqa %xmm3, %xmm1 +; X86-SSE41-NEXT: pxor %xmm5, %xmm1 +; X86-SSE41-NEXT: movdqa %xmm1, %xmm6 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] -; X86-SSE41-NEXT: pand %xmm0, %xmm3 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] -; 
X86-SSE41-NEXT: por %xmm3, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 -; X86-SSE41-NEXT: movapd %xmm5, %xmm0 -; X86-SSE41-NEXT: xorpd %xmm4, %xmm0 -; X86-SSE41-NEXT: movapd %xmm2, %xmm1 -; X86-SSE41-NEXT: xorpd %xmm4, %xmm1 -; X86-SSE41-NEXT: movapd %xmm1, %xmm3 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm3 ; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; X86-SSE41-NEXT: pand %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] ; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] -; X86-SSE41-NEXT: movdqa %xmm5, %xmm0 -; X86-SSE41-NEXT: pxor %xmm4, %xmm0 -; X86-SSE41-NEXT: pxor %xmm1, %xmm4 -; X86-SSE41-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE41-NEXT: pcmpgtd %xmm4, %xmm2 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; X86-SSE41-NEXT: movapd %xmm4, %xmm1 +; X86-SSE41-NEXT: xorpd %xmm5, %xmm1 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 +; X86-SSE41-NEXT: movapd %xmm2, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm5, %xmm0 +; X86-SSE41-NEXT: movapd %xmm0, %xmm3 +; X86-SSE41-NEXT: pcmpgtd %xmm1, %xmm3 +; X86-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm1, %xmm6 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; X86-SSE41-NEXT: por %xmm6, %xmm0 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 +; X86-SSE41-NEXT: movapd %xmm4, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm5, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] +; X86-SSE41-NEXT: pxor %xmm1, %xmm5 +; X86-SSE41-NEXT: movapd %xmm0, %xmm2 +; X86-SSE41-NEXT: pcmpgtd %xmm5, %xmm2 +; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm5 ; X86-SSE41-NEXT: pmovsxdq %xmm2, %xmm0 -; X86-SSE41-NEXT: 
pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] ; X86-SSE41-NEXT: pand %xmm0, %xmm3 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; X86-SSE41-NEXT: por %xmm3, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1 ; X86-SSE41-NEXT: movd %xmm1, %eax ; X86-SSE41-NEXT: pextrd $1, %xmm1, %edx ; X86-SSE41-NEXT: movl %ebp, %esp @@ -674,31 +625,20 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; X64-SSE41-NEXT: por %xmm6, %xmm0 ; X64-SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm2 -; X64-SSE41-NEXT: movapd %xmm2, %xmm0 -; X64-SSE41-NEXT: xorpd %xmm5, %xmm0 -; X64-SSE41-NEXT: movapd %xmm0, %xmm4 -; X64-SSE41-NEXT: pcmpgtd %xmm1, %xmm4 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2] -; X64-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm6, %xmm1 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; X64-SSE41-NEXT: xorpd %xmm2, %xmm5 +; X64-SSE41-NEXT: movapd %xmm5, %xmm0 +; X64-SSE41-NEXT: pcmpgtd %xmm1, %xmm0 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] +; X64-SSE41-NEXT: pcmpeqd %xmm1, %xmm5 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; X64-SSE41-NEXT: pand %xmm4, %xmm1 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; X64-SSE41-NEXT: por %xmm1, %xmm0 ; X64-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; X64-SSE41-NEXT: movapd %xmm3, %xmm0 -; X64-SSE41-NEXT: xorpd %xmm5, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; X64-SSE41-NEXT: pxor %xmm1, %xmm5 -; X64-SSE41-NEXT: movapd %xmm0, %xmm2 -; X64-SSE41-NEXT: pcmpgtd %xmm5, %xmm2 -; X64-SSE41-NEXT: pmovsxdq %xmm2, %xmm4 -; X64-SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm4, %xmm5 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; X64-SSE41-NEXT: por %xmm5, %xmm0 -; 
X64-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; X64-SSE41-NEXT: movq %xmm1, %rax +; X64-SSE41-NEXT: pextrq $1, %xmm3, %rax +; X64-SSE41-NEXT: movq %xmm3, %rcx +; X64-SSE41-NEXT: cmpq %rax, %rcx +; X64-SSE41-NEXT: cmovaq %rcx, %rax ; X64-SSE41-NEXT: retq ; ; X86-SSE42-LABEL: test_v8i64: @@ -710,26 +650,27 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X86-SSE42-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE42-NEXT: movdqa 8(%ebp), %xmm4 ; X86-SSE42-NEXT: movdqa {{.*#+}} xmm5 = [0,2147483648,0,2147483648] -; X86-SSE42-NEXT: movdqa %xmm2, %xmm6 +; X86-SSE42-NEXT: movdqa %xmm4, %xmm6 ; X86-SSE42-NEXT: pxor %xmm5, %xmm6 -; X86-SSE42-NEXT: pxor %xmm5, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; X86-SSE42-NEXT: movdqa %xmm4, %xmm3 -; X86-SSE42-NEXT: pxor %xmm5, %xmm3 ; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE42-NEXT: pxor %xmm5, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm4 ; X86-SSE42-NEXT: movapd %xmm4, %xmm1 ; X86-SSE42-NEXT: xorpd %xmm5, %xmm1 +; X86-SSE42-NEXT: movdqa %xmm2, %xmm6 +; X86-SSE42-NEXT: pxor %xmm5, %xmm6 +; X86-SSE42-NEXT: movdqa %xmm3, %xmm0 +; X86-SSE42-NEXT: pxor %xmm5, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm2 ; X86-SSE42-NEXT: movapd %xmm2, %xmm0 ; X86-SSE42-NEXT: xorpd %xmm5, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm4 +; X86-SSE42-NEXT: movapd %xmm4, %xmm0 +; X86-SSE42-NEXT: xorpd %xmm5, %xmm0 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] -; X86-SSE42-NEXT: movdqa %xmm4, %xmm0 -; X86-SSE42-NEXT: pxor %xmm5, %xmm0 ; X86-SSE42-NEXT: pxor %xmm1, %xmm5 ; X86-SSE42-NEXT: pcmpgtq %xmm5, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm1 @@ -757,42 +698,39 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X64-SSE42-NEXT: pxor %xmm5, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 ; X64-SSE42-NEXT: blendvpd 
%xmm0, %xmm4, %xmm2 -; X64-SSE42-NEXT: movapd %xmm2, %xmm0 -; X64-SSE42-NEXT: xorpd %xmm5, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 +; X64-SSE42-NEXT: xorpd %xmm2, %xmm5 +; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm5 +; X64-SSE42-NEXT: movdqa %xmm5, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; X64-SSE42-NEXT: movapd %xmm3, %xmm0 -; X64-SSE42-NEXT: xorpd %xmm5, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; X64-SSE42-NEXT: pxor %xmm1, %xmm5 -; X64-SSE42-NEXT: pcmpgtq %xmm5, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; X64-SSE42-NEXT: movq %xmm1, %rax +; X64-SSE42-NEXT: pextrq $1, %xmm3, %rax +; X64-SSE42-NEXT: movq %xmm3, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovaq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X86-AVX1-LABEL: test_v8i64: ; X86-AVX1: # %bb.0: +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 ; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,2147483648,0,2147483648] ; X86-AVX1-NEXT: # xmm2 = mem[0,0] -; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm3 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm4 -; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X86-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm3 -; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm4 -; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm5 -; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 -; X86-AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm0 -; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX1-NEXT: vxorps %xmm2, %xmm3, %xmm4 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; X86-AVX1-NEXT: vxorps %xmm2, %xmm5, %xmm6 +; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4 +; X86-AVX1-NEXT: vblendvpd %xmm4, %xmm5, %xmm3, %xmm3 ; X86-AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm4 -; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1 -; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm0, %xmm0 -; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; 
X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2 -; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm5 +; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm6 +; X86-AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 +; X86-AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm1, %xmm0 +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm1 +; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 +; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX1-NEXT: vzeroupper @@ -816,12 +754,10 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm1 ; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm1 ; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 -; X64-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm1 -; X64-AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vxorpd %xmm3, %xmm2, %xmm3 -; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1 -; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovaq %rcx, %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -837,11 +773,11 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm4 ; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 ; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 -; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; 
X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vxorpd %xmm2, %xmm3, %xmm2 +; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 +; X86-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX2-NEXT: vzeroupper @@ -856,15 +792,13 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm3 -; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm4 -; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X64-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm1 -; X64-AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vxorpd %xmm2, %xmm3, %xmm2 -; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 -; X64-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm2 +; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 +; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovaq %rcx, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; @@ -902,124 +836,128 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $32, %esp -; X86-SSE2-NEXT: movaps %xmm2, (%esp) # 16-byte Spill -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: movdqa 40(%ebp), %xmm6 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: movaps %xmm0, (%esp) # 16-byte Spill +; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm5 +; X86-SSE2-NEXT: movdqa 72(%ebp), %xmm1 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; X86-SSE2-NEXT: movdqa %xmm6, %xmm4 +; 
X86-SSE2-NEXT: movdqa %xmm1, %xmm4 ; X86-SSE2-NEXT: pxor %xmm3, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 -; X86-SSE2-NEXT: pxor %xmm3, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm7 -; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm7 -; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm4, %xmm0 -; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] -; X86-SSE2-NEXT: por %xmm0, %xmm4 -; X86-SSE2-NEXT: movdqa 72(%ebp), %xmm7 -; X86-SSE2-NEXT: pand %xmm4, %xmm1 -; X86-SSE2-NEXT: pandn %xmm6, %xmm4 -; X86-SSE2-NEXT: por %xmm1, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm7, %xmm0 -; X86-SSE2-NEXT: pxor %xmm3, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm1 -; X86-SSE2-NEXT: pxor %xmm3, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm6 -; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2] -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa 24(%ebp), %xmm6 -; X86-SSE2-NEXT: pand %xmm1, %xmm5 -; X86-SSE2-NEXT: pandn %xmm7, %xmm1 -; X86-SSE2-NEXT: por %xmm5, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm6 +; X86-SSE2-NEXT: pxor %xmm3, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm0 +; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm0 +; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2] +; X86-SSE2-NEXT: pand %xmm4, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] +; X86-SSE2-NEXT: por %xmm6, %xmm4 +; X86-SSE2-NEXT: movdqa 40(%ebp), %xmm6 +; X86-SSE2-NEXT: pand %xmm4, %xmm5 +; X86-SSE2-NEXT: pandn %xmm1, %xmm4 +; X86-SSE2-NEXT: por %xmm5, %xmm4 ; X86-SSE2-NEXT: movdqa %xmm6, %xmm0 ; X86-SSE2-NEXT: pxor %xmm3, %xmm0 ; X86-SSE2-NEXT: 
movdqa %xmm2, %xmm5 ; X86-SSE2-NEXT: pxor %xmm3, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm7 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm1 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm1 ; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm5 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,0,2,2] ; X86-SSE2-NEXT: pand %xmm5, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3] ; X86-SSE2-NEXT: por %xmm0, %xmm5 -; X86-SSE2-NEXT: movdqa 56(%ebp), %xmm0 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE2-NEXT: pxor %xmm3, %xmm0 ; X86-SSE2-NEXT: pand %xmm5, %xmm2 ; X86-SSE2-NEXT: pandn %xmm6, %xmm5 ; X86-SSE2-NEXT: por %xmm2, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm1 +; X86-SSE2-NEXT: pxor %xmm3, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 +; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm0, %xmm6 +; X86-SSE2-NEXT: movdqa 56(%ebp), %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: por %xmm6, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm5 +; X86-SSE2-NEXT: pandn %xmm4, %xmm1 +; X86-SSE2-NEXT: por %xmm5, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm5 +; X86-SSE2-NEXT: pxor %xmm3, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm7, %xmm4 +; X86-SSE2-NEXT: pxor %xmm3, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm6 +; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm6 +; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: movdqa 24(%ebp), %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: por %xmm0, %xmm4 +; X86-SSE2-NEXT: pand %xmm4, %xmm7 
+; X86-SSE2-NEXT: pandn %xmm5, %xmm4 +; X86-SSE2-NEXT: por %xmm7, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 ; X86-SSE2-NEXT: pxor %xmm3, %xmm0 -; X86-SSE2-NEXT: movdqa (%esp), %xmm7 # 16-byte Reload -; X86-SSE2-NEXT: movdqa %xmm7, %xmm2 +; X86-SSE2-NEXT: movdqa (%esp), %xmm5 # 16-byte Reload +; X86-SSE2-NEXT: movdqa %xmm5, %xmm2 ; X86-SSE2-NEXT: pxor %xmm3, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm6 ; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm6 ; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2] -; X86-SSE2-NEXT: pand %xmm2, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: por %xmm0, %xmm6 -; X86-SSE2-NEXT: pand %xmm6, %xmm7 -; X86-SSE2-NEXT: movdqa 56(%ebp), %xmm2 -; X86-SSE2-NEXT: pandn %xmm2, %xmm6 -; X86-SSE2-NEXT: por %xmm7, %xmm6 -; X86-SSE2-NEXT: movdqa %xmm6, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm0, %xmm7 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: por %xmm7, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE2-NEXT: pxor %xmm3, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm6 +; X86-SSE2-NEXT: pand %xmm2, %xmm6 +; X86-SSE2-NEXT: movdqa 24(%ebp), %xmm5 +; X86-SSE2-NEXT: pandn %xmm5, %xmm2 +; X86-SSE2-NEXT: por %xmm6, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE2-NEXT: pxor %xmm3, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm6 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm6 +; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm5 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm0, %xmm7 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: por %xmm7, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE2-NEXT: pxor %xmm3, %xmm0 +; X86-SSE2-NEXT: pand %xmm5, %xmm2 +; X86-SSE2-NEXT: pandn %xmm4, %xmm5 +; X86-SSE2-NEXT: por 
%xmm2, %xmm5 ; X86-SSE2-NEXT: movdqa %xmm5, %xmm2 ; X86-SSE2-NEXT: pxor %xmm3, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm7 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 ; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; X86-SSE2-NEXT: pand %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; X86-SSE2-NEXT: por %xmm2, %xmm0 ; X86-SSE2-NEXT: pand %xmm0, %xmm5 -; X86-SSE2-NEXT: pandn %xmm6, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm0 ; X86-SSE2-NEXT: por %xmm5, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE2-NEXT: pxor %xmm3, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm4, %xmm5 -; X86-SSE2-NEXT: pxor %xmm3, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm6 -; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm6 -; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm2, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: por %xmm5, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm4 -; X86-SSE2-NEXT: pandn %xmm1, %xmm2 -; X86-SSE2-NEXT: por %xmm4, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE2-NEXT: pxor %xmm3, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 -; X86-SSE2-NEXT: pxor %xmm3, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm4, %xmm5 -; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm5 -; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm4 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm1, %xmm4 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] -; X86-SSE2-NEXT: por %xmm4, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 -; X86-SSE2-NEXT: 
pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] -; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: pxor %xmm3, %xmm2 -; X86-SSE2-NEXT: pxor %xmm0, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE2-NEXT: pxor %xmm1, %xmm3 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 ; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm3 @@ -1028,9 +966,9 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE2-NEXT: pand %xmm2, %xmm3 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] ; X86-SSE2-NEXT: por %xmm3, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm0, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm0 +; X86-SSE2-NEXT: pandn %xmm1, %xmm2 +; X86-SSE2-NEXT: por %xmm0, %xmm2 ; X86-SSE2-NEXT: movd %xmm2, %eax ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; X86-SSE2-NEXT: movd %xmm0, %edx @@ -1157,32 +1095,31 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE41-NEXT: pushl %ebp ; X86-SSE41-NEXT: movl %esp, %ebp ; X86-SSE41-NEXT: andl $-16, %esp -; X86-SSE41-NEXT: subl $48, %esp -; X86-SSE41-NEXT: movaps %xmm2, (%esp) # 16-byte Spill -; X86-SSE41-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE41-NEXT: movdqa 40(%ebp), %xmm2 -; X86-SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] -; X86-SSE41-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm2, %xmm6 -; X86-SSE41-NEXT: pxor %xmm4, %xmm0 +; X86-SSE41-NEXT: subl $32, %esp +; X86-SSE41-NEXT: movdqa %xmm2, %xmm3 ; X86-SSE41-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE41-NEXT: pxor %xmm4, %xmm2 -; X86-SSE41-NEXT: movdqa %xmm2, %xmm5 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] -; X86-SSE41-NEXT: pand %xmm0, %xmm7 -; X86-SSE41-NEXT: movdqa 8(%ebp), %xmm2 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] -; X86-SSE41-NEXT: 
por %xmm7, %xmm0 -; X86-SSE41-NEXT: movdqa 72(%ebp), %xmm5 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm6 -; X86-SSE41-NEXT: movapd %xmm6, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X86-SSE41-NEXT: movdqa %xmm5, %xmm0 -; X86-SSE41-NEXT: pxor %xmm4, %xmm0 +; X86-SSE41-NEXT: movaps %xmm0, (%esp) # 16-byte Spill +; X86-SSE41-NEXT: movdqa 8(%ebp), %xmm7 +; X86-SSE41-NEXT: movdqa 72(%ebp), %xmm4 +; X86-SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] +; X86-SSE41-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE41-NEXT: pxor %xmm5, %xmm0 +; X86-SSE41-NEXT: movdqa %xmm7, %xmm6 +; X86-SSE41-NEXT: pxor %xmm5, %xmm6 +; X86-SSE41-NEXT: movdqa %xmm6, %xmm1 +; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm1 +; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,2,2] +; X86-SSE41-NEXT: pand %xmm0, %xmm6 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X86-SSE41-NEXT: por %xmm6, %xmm0 +; X86-SSE41-NEXT: movdqa 40(%ebp), %xmm6 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm4 +; X86-SSE41-NEXT: movdqa %xmm6, %xmm0 +; X86-SSE41-NEXT: pxor %xmm5, %xmm0 ; X86-SSE41-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE41-NEXT: pxor %xmm4, %xmm1 +; X86-SSE41-NEXT: pxor %xmm5, %xmm1 ; X86-SSE41-NEXT: movdqa %xmm1, %xmm7 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm7 ; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm1 @@ -1191,54 +1128,40 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE41-NEXT: pand %xmm0, %xmm1 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: movdqa 24(%ebp), %xmm7 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 -; X86-SSE41-NEXT: movdqa %xmm7, %xmm0 -; X86-SSE41-NEXT: pxor %xmm4, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm3, %xmm1 -; X86-SSE41-NEXT: pxor %xmm4, %xmm1 -; X86-SSE41-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = 
xmm1[1,1,3,3] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] -; X86-SSE41-NEXT: pand %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; X86-SSE41-NEXT: por %xmm1, %xmm0 +; X86-SSE41-NEXT: movapd %xmm4, %xmm1 +; X86-SSE41-NEXT: xorpd %xmm5, %xmm1 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm6 +; X86-SSE41-NEXT: movapd %xmm6, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm5, %xmm0 +; X86-SSE41-NEXT: movapd %xmm0, %xmm2 +; X86-SSE41-NEXT: pcmpgtd %xmm1, %xmm2 +; X86-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm1, %xmm7 ; X86-SSE41-NEXT: movdqa 56(%ebp), %xmm1 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm7 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; X86-SSE41-NEXT: por %xmm7, %xmm0 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm4 ; X86-SSE41-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE41-NEXT: pxor %xmm4, %xmm0 -; X86-SSE41-NEXT: movdqa (%esp), %xmm6 # 16-byte Reload -; X86-SSE41-NEXT: movdqa %xmm6, %xmm2 -; X86-SSE41-NEXT: pxor %xmm4, %xmm2 -; X86-SSE41-NEXT: movdqa %xmm2, %xmm3 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; X86-SSE41-NEXT: pand %xmm0, %xmm2 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; X86-SSE41-NEXT: por %xmm2, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1 -; X86-SSE41-NEXT: movapd %xmm1, %xmm0 -; X86-SSE41-NEXT: xorpd %xmm4, %xmm0 -; X86-SSE41-NEXT: movapd %xmm7, %xmm2 -; X86-SSE41-NEXT: xorpd %xmm4, %xmm2 -; X86-SSE41-NEXT: movapd %xmm2, %xmm3 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; X86-SSE41-NEXT: pxor %xmm5, %xmm0 +; X86-SSE41-NEXT: movdqa %xmm3, %xmm2 +; X86-SSE41-NEXT: pxor %xmm5, %xmm2 +; X86-SSE41-NEXT: movdqa %xmm2, %xmm7 +; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm7 ; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; X86-SSE41-NEXT: 
pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] ; X86-SSE41-NEXT: pand %xmm0, %xmm2 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; X86-SSE41-NEXT: movdqa 24(%ebp), %xmm6 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; X86-SSE41-NEXT: por %xmm2, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 -; X86-SSE41-NEXT: movapd %xmm5, %xmm0 -; X86-SSE41-NEXT: xorpd %xmm4, %xmm0 -; X86-SSE41-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm6 # 16-byte Reload -; X86-SSE41-NEXT: movdqa %xmm6, %xmm2 -; X86-SSE41-NEXT: pxor %xmm4, %xmm2 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 +; X86-SSE41-NEXT: movdqa %xmm6, %xmm0 +; X86-SSE41-NEXT: pxor %xmm5, %xmm0 +; X86-SSE41-NEXT: movdqa (%esp), %xmm7 # 16-byte Reload +; X86-SSE41-NEXT: movdqa %xmm7, %xmm2 +; X86-SSE41-NEXT: pxor %xmm5, %xmm2 ; X86-SSE41-NEXT: movdqa %xmm2, %xmm3 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm3 ; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm2 @@ -1247,33 +1170,46 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE41-NEXT: pand %xmm0, %xmm2 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] ; X86-SSE41-NEXT: por %xmm2, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm5 -; X86-SSE41-NEXT: movapd %xmm5, %xmm0 -; X86-SSE41-NEXT: xorpd %xmm4, %xmm0 ; X86-SSE41-NEXT: movapd %xmm1, %xmm2 -; X86-SSE41-NEXT: xorpd %xmm4, %xmm2 -; X86-SSE41-NEXT: movapd %xmm2, %xmm3 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm2 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; X86-SSE41-NEXT: pand %xmm0, %xmm2 +; X86-SSE41-NEXT: xorpd %xmm5, %xmm2 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm6 +; X86-SSE41-NEXT: movapd %xmm6, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm5, %xmm0 +; X86-SSE41-NEXT: movapd %xmm0, %xmm3 +; X86-SSE41-NEXT: pcmpgtd %xmm2, %xmm3 +; X86-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; X86-SSE41-NEXT: pshufd 
{{.*#+}} xmm2 = xmm3[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm2, %xmm7 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; X86-SSE41-NEXT: por %xmm2, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] -; X86-SSE41-NEXT: movdqa %xmm5, %xmm0 -; X86-SSE41-NEXT: pxor %xmm4, %xmm0 -; X86-SSE41-NEXT: pxor %xmm1, %xmm4 -; X86-SSE41-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE41-NEXT: pcmpgtd %xmm4, %xmm2 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm4 +; X86-SSE41-NEXT: por %xmm7, %xmm0 +; X86-SSE41-NEXT: movapd %xmm4, %xmm2 +; X86-SSE41-NEXT: xorpd %xmm5, %xmm2 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1 +; X86-SSE41-NEXT: movapd %xmm1, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm5, %xmm0 +; X86-SSE41-NEXT: movapd %xmm0, %xmm3 +; X86-SSE41-NEXT: pcmpgtd %xmm2, %xmm3 +; X86-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm2, %xmm6 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; X86-SSE41-NEXT: por %xmm6, %xmm0 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 +; X86-SSE41-NEXT: movapd %xmm4, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm5, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] +; X86-SSE41-NEXT: pxor %xmm1, %xmm5 +; X86-SSE41-NEXT: movapd %xmm0, %xmm2 +; X86-SSE41-NEXT: pcmpgtd %xmm5, %xmm2 +; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm5 ; X86-SSE41-NEXT: pmovsxdq %xmm2, %xmm0 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] ; X86-SSE41-NEXT: pand %xmm0, %xmm3 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; X86-SSE41-NEXT: por %xmm3, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1 ; X86-SSE41-NEXT: movd %xmm1, %eax ; X86-SSE41-NEXT: pextrd $1, %xmm1, %edx ; X86-SSE41-NEXT: movl %ebp, %esp @@ -1364,31 +1300,20 @@ define 
i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] ; X64-SSE41-NEXT: por %xmm2, %xmm0 ; X64-SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm6 -; X64-SSE41-NEXT: movapd %xmm6, %xmm0 -; X64-SSE41-NEXT: xorpd %xmm9, %xmm0 -; X64-SSE41-NEXT: movapd %xmm0, %xmm2 -; X64-SSE41-NEXT: pcmpgtd %xmm1, %xmm2 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,2,2] -; X64-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm3, %xmm1 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; X64-SSE41-NEXT: xorpd %xmm6, %xmm9 +; X64-SSE41-NEXT: movapd %xmm9, %xmm0 +; X64-SSE41-NEXT: pcmpgtd %xmm1, %xmm0 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] +; X64-SSE41-NEXT: pcmpeqd %xmm1, %xmm9 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,3,3] +; X64-SSE41-NEXT: pand %xmm2, %xmm1 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; X64-SSE41-NEXT: por %xmm1, %xmm0 ; X64-SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 -; X64-SSE41-NEXT: movapd %xmm7, %xmm0 -; X64-SSE41-NEXT: xorpd %xmm9, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] -; X64-SSE41-NEXT: pxor %xmm1, %xmm9 -; X64-SSE41-NEXT: movapd %xmm0, %xmm2 -; X64-SSE41-NEXT: pcmpgtd %xmm9, %xmm2 -; X64-SSE41-NEXT: pmovsxdq %xmm2, %xmm3 -; X64-SSE41-NEXT: pcmpeqd %xmm0, %xmm9 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm3, %xmm4 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; X64-SSE41-NEXT: por %xmm4, %xmm0 -; X64-SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 -; X64-SSE41-NEXT: movq %xmm1, %rax +; X64-SSE41-NEXT: pextrq $1, %xmm7, %rax +; X64-SSE41-NEXT: movq %xmm7, %rcx +; X64-SSE41-NEXT: cmpq %rax, %rcx +; X64-SSE41-NEXT: cmovaq %rcx, %rax ; X64-SSE41-NEXT: retq ; ; X86-SSE42-LABEL: test_v16i64: @@ -1399,60 +1324,59 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE42-NEXT: subl $16, %esp ; X86-SSE42-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE42-NEXT: 
movdqa 8(%ebp), %xmm7 +; X86-SSE42-NEXT: movdqa 72(%ebp), %xmm4 +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm5 = [0,2147483648,0,2147483648] +; X86-SSE42-NEXT: movdqa %xmm4, %xmm6 +; X86-SSE42-NEXT: pxor %xmm5, %xmm6 +; X86-SSE42-NEXT: movdqa %xmm7, %xmm0 +; X86-SSE42-NEXT: pxor %xmm5, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 ; X86-SSE42-NEXT: movdqa 40(%ebp), %xmm6 -; X86-SSE42-NEXT: movdqa {{.*#+}} xmm4 = [0,2147483648,0,2147483648] -; X86-SSE42-NEXT: movdqa %xmm6, %xmm5 -; X86-SSE42-NEXT: pxor %xmm4, %xmm5 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm4 +; X86-SSE42-NEXT: movdqa %xmm6, %xmm7 +; X86-SSE42-NEXT: pxor %xmm5, %xmm7 ; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT: pxor %xmm4, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm5, %xmm0 -; X86-SSE42-NEXT: movdqa 72(%ebp), %xmm5 +; X86-SSE42-NEXT: pxor %xmm5, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm7, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm6 -; X86-SSE42-NEXT: movdqa %xmm5, %xmm1 -; X86-SSE42-NEXT: pxor %xmm4, %xmm1 -; X86-SSE42-NEXT: movdqa %xmm7, %xmm0 -; X86-SSE42-NEXT: pxor %xmm4, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 -; X86-SSE42-NEXT: movdqa 24(%ebp), %xmm7 -; X86-SSE42-NEXT: movapd 8(%ebp), %xmm1 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm5 -; X86-SSE42-NEXT: movdqa %xmm7, %xmm1 -; X86-SSE42-NEXT: pxor %xmm4, %xmm1 -; X86-SSE42-NEXT: movdqa %xmm3, %xmm0 -; X86-SSE42-NEXT: pxor %xmm4, %xmm0 +; X86-SSE42-NEXT: movapd %xmm4, %xmm1 +; X86-SSE42-NEXT: xorpd %xmm5, %xmm1 +; X86-SSE42-NEXT: movapd %xmm6, %xmm0 +; X86-SSE42-NEXT: xorpd %xmm5, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 ; X86-SSE42-NEXT: movdqa 56(%ebp), %xmm1 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm7 -; X86-SSE42-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE42-NEXT: pxor %xmm4, %xmm3 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm4 +; X86-SSE42-NEXT: movdqa %xmm1, %xmm6 +; X86-SSE42-NEXT: pxor %xmm5, %xmm6 ; X86-SSE42-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE42-NEXT: pxor %xmm4, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 +; 
X86-SSE42-NEXT: pxor %xmm5, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 +; X86-SSE42-NEXT: movdqa 24(%ebp), %xmm6 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; X86-SSE42-NEXT: movapd %xmm1, %xmm2 -; X86-SSE42-NEXT: xorpd %xmm4, %xmm2 -; X86-SSE42-NEXT: movapd %xmm7, %xmm0 -; X86-SSE42-NEXT: xorpd %xmm4, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm6, %xmm2 +; X86-SSE42-NEXT: pxor %xmm5, %xmm2 +; X86-SSE42-NEXT: movdqa %xmm3, %xmm0 +; X86-SSE42-NEXT: pxor %xmm5, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm1 -; X86-SSE42-NEXT: movapd %xmm5, %xmm2 -; X86-SSE42-NEXT: xorpd %xmm4, %xmm2 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm6 +; X86-SSE42-NEXT: movapd %xmm1, %xmm2 +; X86-SSE42-NEXT: xorpd %xmm5, %xmm2 ; X86-SSE42-NEXT: movapd %xmm6, %xmm0 -; X86-SSE42-NEXT: xorpd %xmm4, %xmm0 +; X86-SSE42-NEXT: xorpd %xmm5, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm5 -; X86-SSE42-NEXT: movapd %xmm5, %xmm2 -; X86-SSE42-NEXT: xorpd %xmm4, %xmm2 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm1 +; X86-SSE42-NEXT: movapd %xmm4, %xmm2 +; X86-SSE42-NEXT: xorpd %xmm5, %xmm2 ; X86-SSE42-NEXT: movapd %xmm1, %xmm0 -; X86-SSE42-NEXT: xorpd %xmm4, %xmm0 +; X86-SSE42-NEXT: xorpd %xmm5, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm5 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] -; X86-SSE42-NEXT: movdqa %xmm5, %xmm0 -; X86-SSE42-NEXT: pxor %xmm4, %xmm0 -; X86-SSE42-NEXT: pxor %xmm1, %xmm4 -; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm1 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm4 +; X86-SSE42-NEXT: movapd %xmm4, %xmm0 +; X86-SSE42-NEXT: xorpd %xmm5, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] +; X86-SSE42-NEXT: pxor %xmm1, %xmm5 +; X86-SSE42-NEXT: pcmpgtq %xmm5, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm1 ; X86-SSE42-NEXT: movd %xmm1, %eax ; X86-SSE42-NEXT: pextrd $1, %xmm1, 
%edx ; X86-SSE42-NEXT: movl %ebp, %esp @@ -1501,17 +1425,14 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X64-SSE42-NEXT: xorpd %xmm9, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm6 -; X64-SSE42-NEXT: movapd %xmm6, %xmm0 -; X64-SSE42-NEXT: xorpd %xmm9, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 +; X64-SSE42-NEXT: xorpd %xmm6, %xmm9 +; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm9 +; X64-SSE42-NEXT: movdqa %xmm9, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7 -; X64-SSE42-NEXT: movapd %xmm7, %xmm0 -; X64-SSE42-NEXT: xorpd %xmm9, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] -; X64-SSE42-NEXT: pxor %xmm1, %xmm9 -; X64-SSE42-NEXT: pcmpgtq %xmm9, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm1 -; X64-SSE42-NEXT: movq %xmm1, %rax +; X64-SSE42-NEXT: pextrq $1, %xmm7, %rax +; X64-SSE42-NEXT: movq %xmm7, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovaq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X86-AVX1-LABEL: test_v16i64: @@ -1519,47 +1440,51 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-AVX1-NEXT: pushl %ebp ; X86-AVX1-NEXT: movl %esp, %ebp ; X86-AVX1-NEXT: andl $-32, %esp -; X86-AVX1-NEXT: subl $32, %esp -; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4 +; X86-AVX1-NEXT: subl $96, %esp +; X86-AVX1-NEXT: vmovaps %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) # 32-byte Spill +; X86-AVX1-NEXT: vmovaps %ymm0, (%esp) # 32-byte Spill ; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [0,2147483648,0,2147483648] ; X86-AVX1-NEXT: # xmm3 = mem[0,0] +; X86-AVX1-NEXT: vmovaps 24(%ebp), %xmm4 ; X86-AVX1-NEXT: vxorps %xmm3, %xmm4, %xmm5 -; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6 ; X86-AVX1-NEXT: vxorps %xmm3, %xmm6, %xmm7 ; X86-AVX1-NEXT: vpcmpgtq %xmm5, %xmm7, %xmm5 ; X86-AVX1-NEXT: vblendvpd %xmm5, %xmm6, %xmm4, %xmm4 -; X86-AVX1-NEXT: vxorps 24(%ebp), %xmm3, %xmm6 -; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7 -; X86-AVX1-NEXT: vxorps %xmm3, %xmm7, 
%xmm5 -; X86-AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm5 -; X86-AVX1-NEXT: vmovapd 24(%ebp), %xmm6 -; X86-AVX1-NEXT: vblendvpd %xmm5, %xmm7, %xmm6, %xmm5 -; X86-AVX1-NEXT: vxorps %xmm3, %xmm2, %xmm6 -; X86-AVX1-NEXT: vxorps %xmm3, %xmm0, %xmm7 -; X86-AVX1-NEXT: vpcmpgtq %xmm6, %xmm7, %xmm6 -; X86-AVX1-NEXT: vmovaps 8(%ebp), %xmm7 -; X86-AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm2, %xmm0 -; X86-AVX1-NEXT: vxorps %xmm3, %xmm7, %xmm2 -; X86-AVX1-NEXT: vxorps %xmm3, %xmm1, %xmm6 -; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm6, %xmm2 -; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm7, %xmm1 -; X86-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5 +; X86-AVX1-NEXT: vxorps %xmm3, %xmm5, %xmm0 +; X86-AVX1-NEXT: vmovaps (%esp), %ymm2 # 32-byte Reload +; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm7 +; X86-AVX1-NEXT: vxorps %xmm3, %xmm7, %xmm6 +; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm6, %xmm0 +; X86-AVX1-NEXT: vblendvpd %xmm0, %xmm7, %xmm5, %xmm0 +; X86-AVX1-NEXT: vxorpd %xmm3, %xmm4, %xmm5 ; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm6 -; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm6, %xmm2 -; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X86-AVX1-NEXT: vxorpd %xmm3, %xmm5, %xmm1 -; X86-AVX1-NEXT: vxorpd %xmm3, %xmm4, %xmm2 -; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 -; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm4, %xmm5, %xmm1 -; X86-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm2 -; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm4 -; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 -; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 +; X86-AVX1-NEXT: vmovaps 8(%ebp), %xmm6 +; X86-AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm4 +; X86-AVX1-NEXT: vxorps %xmm3, %xmm6, %xmm0 +; X86-AVX1-NEXT: vxorps %xmm3, %xmm1, %xmm5 +; X86-AVX1-NEXT: vpcmpgtq %xmm0, %xmm5, %xmm0 +; X86-AVX1-NEXT: vblendvpd %xmm0, %xmm1, %xmm6, %xmm0 +; X86-AVX1-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %ymm6 # 32-byte 
Reload +; X86-AVX1-NEXT: vxorps %xmm3, %xmm6, %xmm1 +; X86-AVX1-NEXT: vxorps %xmm3, %xmm2, %xmm5 +; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm5, %xmm1 +; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm2, %xmm6, %xmm1 ; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2 -; X86-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm3 -; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm5 +; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm5, %xmm2 +; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 +; X86-AVX1-NEXT: vxorpd %xmm3, %xmm4, %xmm1 +; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2 +; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 +; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm4, %xmm0 +; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm1 +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vxorpd %xmm3, %xmm2, %xmm3 +; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1 +; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX1-NEXT: movl %ebp, %esp @@ -1603,12 +1528,10 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X64-AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm1 ; X64-AVX1-NEXT: vpcmpgtq %xmm6, %xmm1, %xmm1 ; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm5, %xmm0 -; X64-AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm1 -; X64-AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vxorpd %xmm4, %xmm2, %xmm3 -; X64-AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1 -; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovaq %rcx, %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -1620,28 +1543,28 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-AVX2-NEXT: subl $32, %esp ; X86-AVX2-NEXT: vmovdqa 8(%ebp), %ymm4 ; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = 
[0,2147483648,0,2147483648,0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vpxor %ymm3, %ymm4, %ymm5 +; X86-AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm6 +; X86-AVX2-NEXT: vpcmpgtq %ymm5, %ymm6, %ymm5 +; X86-AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1 +; X86-AVX2-NEXT: vxorpd %ymm3, %ymm1, %ymm4 ; X86-AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm5 ; X86-AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm6 ; X86-AVX2-NEXT: vpcmpgtq %ymm5, %ymm6, %ymm5 ; X86-AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm2, %ymm0 -; X86-AVX2-NEXT: vpxor %ymm3, %ymm4, %ymm2 -; X86-AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm5 -; X86-AVX2-NEXT: vpcmpgtq %ymm2, %ymm5, %ymm2 -; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm1, %ymm4, %ymm1 -; X86-AVX2-NEXT: vxorpd %ymm3, %ymm1, %ymm2 -; X86-AVX2-NEXT: vxorpd %ymm3, %ymm0, %ymm4 -; X86-AVX2-NEXT: vpcmpgtq %ymm2, %ymm4, %ymm2 +; X86-AVX2-NEXT: vxorpd %ymm3, %ymm0, %ymm2 +; X86-AVX2-NEXT: vpcmpgtq %ymm4, %ymm2, %ymm2 ; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-AVX2-NEXT: vxorpd %xmm3, %xmm1, %xmm2 ; X86-AVX2-NEXT: vxorpd %xmm3, %xmm0, %xmm4 ; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 ; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vxorpd %xmm3, %xmm0, %xmm2 -; X86-AVX2-NEXT: vxorpd %xmm3, %xmm1, %xmm3 -; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vxorpd %xmm3, %xmm0, %xmm1 +; X86-AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vxorpd %xmm3, %xmm2, %xmm3 +; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1 +; X86-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX2-NEXT: movl %ebp, %esp @@ -1669,12 +1592,10 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X64-AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm3 ; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 ; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, 
%xmm1, %xmm0 -; X64-AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm1 -; X64-AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vxorpd %xmm4, %xmm2, %xmm3 -; X64-AVX2-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1 -; X64-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovaq %rcx, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; @@ -2434,11 +2355,10 @@ define i16 @test_v8i16(<8 x i16> %a0) nounwind { ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; SSE2-NEXT: psubusw %xmm1, %xmm0 ; SSE2-NEXT: paddw %xmm1, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: psubusw %xmm0, %xmm1 -; SSE2-NEXT: paddw %xmm0, %xmm1 -; SSE2-NEXT: movd %xmm1, %eax +; SSE2-NEXT: pextrw $1, %xmm0, %eax +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: cmpw %ax, %cx +; SSE2-NEXT: cmoval %ecx, %eax ; SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; SSE2-NEXT: ret{{[l|q]}} ; @@ -2506,11 +2426,10 @@ define i16 @test_v16i16(<16 x i16> %a0) nounwind { ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE2-NEXT: psubusw %xmm0, %xmm1 ; SSE2-NEXT: paddw %xmm0, %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrld $16, %xmm0 -; SSE2-NEXT: psubusw %xmm1, %xmm0 -; SSE2-NEXT: paddw %xmm1, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $1, %xmm1, %eax +; SSE2-NEXT: movd %xmm1, %ecx +; SSE2-NEXT: cmpw %ax, %cx +; SSE2-NEXT: cmoval %ecx, %eax ; SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; SSE2-NEXT: ret{{[l|q]}} ; @@ -2598,11 +2517,10 @@ define i16 @test_v32i16(<32 x i16> %a0) nounwind { ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-SSE2-NEXT: psubusw %xmm0, %xmm1 ; X86-SSE2-NEXT: paddw %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrld $16, %xmm0 -; X86-SSE2-NEXT: psubusw %xmm1, %xmm0 -; X86-SSE2-NEXT: paddw %xmm1, %xmm0 -; X86-SSE2-NEXT: movd 
%xmm0, %eax +; X86-SSE2-NEXT: pextrw $1, %xmm1, %eax +; X86-SSE2-NEXT: movd %xmm1, %ecx +; X86-SSE2-NEXT: cmpw %ax, %cx +; X86-SSE2-NEXT: cmoval %ecx, %eax ; X86-SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp @@ -2622,11 +2540,10 @@ define i16 @test_v32i16(<32 x i16> %a0) nounwind { ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-SSE2-NEXT: psubusw %xmm0, %xmm1 ; X64-SSE2-NEXT: paddw %xmm0, %xmm1 -; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE2-NEXT: psrld $16, %xmm0 -; X64-SSE2-NEXT: psubusw %xmm1, %xmm0 -; X64-SSE2-NEXT: paddw %xmm1, %xmm0 -; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: pextrw $1, %xmm1, %eax +; X64-SSE2-NEXT: movd %xmm1, %ecx +; X64-SSE2-NEXT: cmpw %ax, %cx +; X64-SSE2-NEXT: cmoval %ecx, %eax ; X64-SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; X64-SSE2-NEXT: retq ; @@ -2755,11 +2672,10 @@ define i16 @test_v64i16(<64 x i16> %a0) nounwind { ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X86-SSE2-NEXT: psubusw %xmm0, %xmm1 ; X86-SSE2-NEXT: paddw %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE2-NEXT: psrld $16, %xmm0 -; X86-SSE2-NEXT: psubusw %xmm1, %xmm0 -; X86-SSE2-NEXT: paddw %xmm1, %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: pextrw $1, %xmm1, %eax +; X86-SSE2-NEXT: movd %xmm1, %ecx +; X86-SSE2-NEXT: cmpw %ax, %cx +; X86-SSE2-NEXT: cmoval %ecx, %eax ; X86-SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp @@ -2787,11 +2703,10 @@ define i16 @test_v64i16(<64 x i16> %a0) nounwind { ; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; X64-SSE2-NEXT: psubusw %xmm0, %xmm1 ; X64-SSE2-NEXT: paddw %xmm0, %xmm1 -; X64-SSE2-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE2-NEXT: psrld $16, %xmm0 -; X64-SSE2-NEXT: psubusw %xmm1, %xmm0 -; X64-SSE2-NEXT: paddw %xmm1, %xmm0 -; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: pextrw $1, %xmm1, %eax +; X64-SSE2-NEXT: movd %xmm1, %ecx +; 
X64-SSE2-NEXT: cmpw %ax, %cx +; X64-SSE2-NEXT: cmoval %ecx, %eax ; X64-SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; X64-SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-reduce-umin.ll b/llvm/test/CodeGen/X86/vector-reduce-umin.ll index 84315e6c60895..bdf4a88aa3918 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-umin.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-umin.ll @@ -19,10 +19,10 @@ define i64 @test_v2i64(<2 x i64> %a0) nounwind { ; X86-SSE2-LABEL: test_v2i64: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648] ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE2-NEXT: pxor %xmm2, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm1, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm3, %xmm4 @@ -52,10 +52,10 @@ define i64 @test_v2i64(<2 x i64> %a0) nounwind { ; X86-SSE41-LABEL: test_v2i64: ; X86-SSE41: # %bb.0: ; X86-SSE41-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648] ; X86-SSE41-NEXT: movdqa %xmm1, %xmm3 ; X86-SSE41-NEXT: pxor %xmm0, %xmm3 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; X86-SSE41-NEXT: pxor %xmm2, %xmm0 ; X86-SSE41-NEXT: movdqa %xmm0, %xmm4 ; X86-SSE41-NEXT: pcmpgtd %xmm3, %xmm4 @@ -70,99 +70,67 @@ define i64 @test_v2i64(<2 x i64> %a0) nounwind { ; X86-SSE41-NEXT: pextrd $1, %xmm2, %edx ; X86-SSE41-NEXT: retl ; -; X64-SSE41-LABEL: test_v2i64: -; X64-SSE41: # %bb.0: -; X64-SSE41-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] -; X64-SSE41-NEXT: movdqa %xmm1, %xmm2 -; X64-SSE41-NEXT: pxor %xmm0, %xmm2 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; X64-SSE41-NEXT: pxor %xmm3, %xmm0 -; X64-SSE41-NEXT: movdqa %xmm0, %xmm4 -; X64-SSE41-NEXT: pcmpgtd %xmm2, %xmm4 -; 
X64-SSE41-NEXT: pmovsxdq %xmm4, %xmm5 -; X64-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm5, %xmm2 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; X64-SSE41-NEXT: por %xmm2, %xmm0 -; X64-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; X64-SSE41-NEXT: movq %xmm3, %rax -; X64-SSE41-NEXT: retq +; X64-SSE4-LABEL: test_v2i64: +; X64-SSE4: # %bb.0: +; X64-SSE4-NEXT: pextrq $1, %xmm0, %rax +; X64-SSE4-NEXT: movq %xmm0, %rcx +; X64-SSE4-NEXT: cmpq %rax, %rcx +; X64-SSE4-NEXT: cmovbq %rcx, %rax +; X64-SSE4-NEXT: retq ; ; X86-SSE42-LABEL: test_v2i64: ; X86-SSE42: # %bb.0: ; X86-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; X86-SSE42-NEXT: movdqa {{.*#+}} xmm0 = [0,2147483648,0,2147483648] -; X86-SSE42-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE42-NEXT: pxor %xmm0, %xmm3 -; X86-SSE42-NEXT: pxor %xmm2, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; X86-SSE42-NEXT: movd %xmm2, %eax -; X86-SSE42-NEXT: pextrd $1, %xmm2, %edx +; X86-SSE42-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE42-NEXT: pxor %xmm0, %xmm2 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] +; X86-SSE42-NEXT: pxor %xmm3, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 +; X86-SSE42-NEXT: movd %xmm3, %eax +; X86-SSE42-NEXT: pextrd $1, %xmm3, %edx ; X86-SSE42-NEXT: retl ; -; X64-SSE42-LABEL: test_v2i64: -; X64-SSE42: # %bb.0: -; X64-SSE42-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE42-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808] -; X64-SSE42-NEXT: movdqa %xmm1, %xmm2 -; X64-SSE42-NEXT: pxor %xmm0, %xmm2 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; X64-SSE42-NEXT: pxor %xmm3, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; X64-SSE42-NEXT: movq %xmm3, %rax -; X64-SSE42-NEXT: retq -; ; X86-AVX1-LABEL: test_v2i64: ; X86-AVX1: # %bb.0: 
-; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,2147483648,0,2147483648] -; X86-AVX1-NEXT: # xmm2 = mem[0,0] -; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm3 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm2 -; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [0,2147483648,0,2147483648] +; X86-AVX1-NEXT: # xmm1 = mem[0,0] +; X86-AVX1-NEXT: vxorps %xmm1, %xmm0, %xmm2 +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vxorps %xmm1, %xmm3, %xmm1 +; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 +; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX1-NEXT: retl ; ; X64-AVX1-LABEL: test_v2i64: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] -; X64-AVX1-NEXT: # xmm1 = mem[0,0] -; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm2 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm1 -; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 -; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovbq %rcx, %rax ; X64-AVX1-NEXT: retq ; ; X86-AVX2-LABEL: test_v2i64: ; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,2147483648,0,2147483648] -; X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 -; X86-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm2 -; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm2 +; X86-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] +; 
X86-AVX2-NEXT: vpxor %xmm1, %xmm3, %xmm1 +; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 +; X86-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX2-NEXT: retl ; ; X64-AVX2-LABEL: test_v2i64: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] -; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm2 -; X64-AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vpxor %xmm1, %xmm3, %xmm1 -; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 -; X64-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovbq %rcx, %rax ; X64-AVX2-NEXT: retq ; ; AVX512BW-LABEL: test_v2i64: @@ -203,9 +171,9 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X86-SSE2-NEXT: pand %xmm3, %xmm0 ; X86-SSE2-NEXT: pandn %xmm1, %xmm3 ; X86-SSE2-NEXT: por %xmm0, %xmm3 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; X86-SSE2-NEXT: movdqa %xmm3, %xmm1 ; X86-SSE2-NEXT: pxor %xmm2, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm0, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm4 @@ -263,9 +231,9 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] ; X86-SSE41-NEXT: por %xmm4, %xmm0 ; X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X86-SSE41-NEXT: movapd %xmm1, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm3, %xmm0 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X86-SSE41-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE41-NEXT: pxor %xmm3, %xmm0 ; X86-SSE41-NEXT: pxor %xmm2, %xmm3 ; X86-SSE41-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm4 @@ -283,33 +251,23 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X64-SSE41-LABEL: test_v4i64: ; X64-SSE41: # %bb.0: ; 
X64-SSE41-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE41-NEXT: movdqa {{.*#+}} xmm3 = [9223372039002259456,9223372039002259456] -; X64-SSE41-NEXT: pxor %xmm3, %xmm0 -; X64-SSE41-NEXT: movdqa %xmm1, %xmm4 -; X64-SSE41-NEXT: pxor %xmm3, %xmm4 -; X64-SSE41-NEXT: movdqa %xmm4, %xmm5 -; X64-SSE41-NEXT: pcmpgtd %xmm0, %xmm5 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2] -; X64-SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm6, %xmm4 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] -; X64-SSE41-NEXT: por %xmm4, %xmm0 -; X64-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; X64-SSE41-NEXT: movapd %xmm1, %xmm0 -; X64-SSE41-NEXT: xorpd %xmm3, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X64-SSE41-NEXT: pxor %xmm2, %xmm3 -; X64-SSE41-NEXT: movdqa %xmm3, %xmm4 -; X64-SSE41-NEXT: pcmpgtd %xmm0, %xmm4 -; X64-SSE41-NEXT: pmovsxdq %xmm4, %xmm5 -; X64-SSE41-NEXT: pcmpeqd %xmm0, %xmm3 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; X64-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456] +; X64-SSE41-NEXT: movdqa %xmm2, %xmm3 +; X64-SSE41-NEXT: pxor %xmm0, %xmm3 +; X64-SSE41-NEXT: pxor %xmm1, %xmm0 +; X64-SSE41-NEXT: movdqa %xmm0, %xmm4 +; X64-SSE41-NEXT: pcmpgtd %xmm3, %xmm4 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] +; X64-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] ; X64-SSE41-NEXT: pand %xmm5, %xmm3 ; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] ; X64-SSE41-NEXT: por %xmm3, %xmm0 -; X64-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; X64-SSE41-NEXT: movq %xmm2, %rax +; X64-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X64-SSE41-NEXT: pextrq $1, %xmm1, %rax +; X64-SSE41-NEXT: movq %xmm1, %rcx +; X64-SSE41-NEXT: cmpq %rax, %rcx +; X64-SSE41-NEXT: cmovbq %rcx, %rax ; X64-SSE41-NEXT: retq ; ; X86-SSE42-LABEL: test_v4i64: @@ -322,9 +280,9 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; 
X86-SSE42-NEXT: pxor %xmm2, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 +; X86-SSE42-NEXT: movapd %xmm1, %xmm0 +; X86-SSE42-NEXT: xorpd %xmm2, %xmm0 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3] -; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE42-NEXT: pxor %xmm2, %xmm0 ; X86-SSE42-NEXT: pxor %xmm3, %xmm2 ; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm2 ; X86-SSE42-NEXT: movdqa %xmm2, %xmm0 @@ -336,21 +294,16 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X64-SSE42-LABEL: test_v4i64: ; X64-SSE42: # %bb.0: ; X64-SSE42-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808] -; X64-SSE42-NEXT: movdqa %xmm0, %xmm4 -; X64-SSE42-NEXT: pxor %xmm3, %xmm4 -; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 -; X64-SSE42-NEXT: pxor %xmm3, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808] +; X64-SSE42-NEXT: movdqa %xmm2, %xmm3 +; X64-SSE42-NEXT: pxor %xmm0, %xmm3 +; X64-SSE42-NEXT: pxor %xmm1, %xmm0 +; X64-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm1 -; X64-SSE42-NEXT: movapd %xmm1, %xmm0 -; X64-SSE42-NEXT: xorpd %xmm3, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] -; X64-SSE42-NEXT: pxor %xmm2, %xmm3 -; X64-SSE42-NEXT: pcmpgtq %xmm0, %xmm3 -; X64-SSE42-NEXT: movdqa %xmm3, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; X64-SSE42-NEXT: movq %xmm2, %rax +; X64-SSE42-NEXT: pextrq $1, %xmm1, %rax +; X64-SSE42-NEXT: movq %xmm1, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovbq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X86-AVX1-LABEL: test_v4i64: @@ -362,11 +315,11 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X86-AVX1-NEXT: vxorps %xmm1, %xmm3, %xmm4 ; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0 -; X86-AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X86-AVX1-NEXT: vxorpd 
%xmm1, %xmm0, %xmm3 -; X86-AVX1-NEXT: vxorpd %xmm1, %xmm2, %xmm1 -; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1 -; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 +; X86-AVX1-NEXT: vxorpd %xmm1, %xmm0, %xmm2 +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vxorpd %xmm1, %xmm3, %xmm1 +; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 +; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX1-NEXT: vzeroupper @@ -378,31 +331,29 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X64-AVX1-NEXT: # xmm1 = mem[0,0] ; X64-AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm2 ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; X64-AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm4 -; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 -; X64-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0 -; X64-AVX1-NEXT: vxorpd %xmm1, %xmm0, %xmm2 -; X64-AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vxorpd %xmm1, %xmm3, %xmm1 +; X64-AVX1-NEXT: vpxor %xmm1, %xmm3, %xmm1 ; X64-AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovbq %rcx, %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; ; X86-AVX2-LABEL: test_v4i64: ; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,2147483648,0,2147483648] -; X86-AVX2-NEXT: vpxor %xmm2, %xmm0, %xmm3 -; X86-AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm4 -; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 -; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 
+; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm2 +; X86-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 +; X86-AVX2-NEXT: vpxor %xmm1, %xmm3, %xmm4 +; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 +; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0 +; X86-AVX2-NEXT: vxorpd %xmm1, %xmm0, %xmm2 +; X86-AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vxorpd %xmm1, %xmm3, %xmm1 +; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 +; X86-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX2-NEXT: vzeroupper @@ -413,15 +364,13 @@ define i64 @test_v4i64(<4 x i64> %a0) nounwind { ; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [9223372036854775808,9223372036854775808] ; X64-AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm2 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 -; X64-AVX2-NEXT: vpxor %xmm1, %xmm3, %xmm4 -; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 -; X64-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm3, %xmm0 -; X64-AVX2-NEXT: vxorpd %xmm1, %xmm0, %xmm2 -; X64-AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vxorpd %xmm1, %xmm3, %xmm1 +; X64-AVX2-NEXT: vpxor %xmm1, %xmm3, %xmm1 ; X64-AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1 ; X64-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovbq %rcx, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; @@ -456,56 +405,56 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp ; X86-SSE2-NEXT: subl $16, %esp -; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm5 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm4 ; X86-SSE2-NEXT: pxor %xmm3, %xmm4 -; 
X86-SSE2-NEXT: movdqa %xmm5, %xmm6 -; X86-SSE2-NEXT: pxor %xmm3, %xmm6 -; X86-SSE2-NEXT: movdqa %xmm6, %xmm7 -; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm7 -; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm6 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm4, %xmm6 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] -; X86-SSE2-NEXT: por %xmm6, %xmm4 -; X86-SSE2-NEXT: pand %xmm4, %xmm1 -; X86-SSE2-NEXT: pandn %xmm5, %xmm4 -; X86-SSE2-NEXT: por %xmm1, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: pxor %xmm3, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 ; X86-SSE2-NEXT: pxor %xmm3, %xmm5 ; X86-SSE2-NEXT: movdqa %xmm5, %xmm6 -; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm6 -; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm1, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: por %xmm5, %xmm1 -; X86-SSE2-NEXT: pand %xmm1, %xmm0 -; X86-SSE2-NEXT: pandn %xmm2, %xmm1 -; X86-SSE2-NEXT: por %xmm0, %xmm1 +; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm5 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm4, %xmm7 +; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm5 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: por %xmm7, %xmm4 +; X86-SSE2-NEXT: pand %xmm4, %xmm0 +; X86-SSE2-NEXT: pandn %xmm2, %xmm4 +; X86-SSE2-NEXT: por %xmm0, %xmm4 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE2-NEXT: pxor %xmm3, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm4, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm2 ; X86-SSE2-NEXT: pxor %xmm3, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm5 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm6 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm6 ; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm2 +; X86-SSE2-NEXT: pshufd 
{{.*#+}} xmm0 = xmm6[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm0, %xmm7 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: por %xmm7, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE2-NEXT: pxor %xmm3, %xmm0 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm5, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm1 +; X86-SSE2-NEXT: pxor %xmm3, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm5 +; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] -; X86-SSE2-NEXT: por %xmm2, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; X86-SSE2-NEXT: pand %xmm0, %xmm1 -; X86-SSE2-NEXT: pandn %xmm4, %xmm0 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,3,3] ; X86-SSE2-NEXT: por %xmm1, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-SSE2-NEXT: pand %xmm0, %xmm4 +; X86-SSE2-NEXT: pandn %xmm2, %xmm0 +; X86-SSE2-NEXT: por %xmm4, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: pxor %xmm3, %xmm2 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] ; X86-SSE2-NEXT: pxor %xmm1, %xmm3 ; X86-SSE2-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4 @@ -586,60 +535,59 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X86-SSE41-NEXT: andl $-16, %esp ; X86-SSE41-NEXT: subl $16, %esp ; X86-SSE41-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE41-NEXT: movdqa 8(%ebp), %xmm4 -; X86-SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648] -; X86-SSE41-NEXT: movdqa %xmm1, %xmm0 -; X86-SSE41-NEXT: pxor %xmm5, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm4, %xmm6 -; X86-SSE41-NEXT: pxor %xmm5, %xmm6 -; X86-SSE41-NEXT: movdqa %xmm6, %xmm7 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm7 -; X86-SSE41-NEXT: pcmpeqd 
%xmm0, %xmm6 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; X86-SSE41-NEXT: pand %xmm0, %xmm6 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] -; X86-SSE41-NEXT: por %xmm6, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm4 -; X86-SSE41-NEXT: movdqa %xmm3, %xmm0 -; X86-SSE41-NEXT: pxor %xmm5, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE41-NEXT: pxor %xmm5, %xmm1 -; X86-SSE41-NEXT: movdqa %xmm1, %xmm6 +; X86-SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] +; X86-SSE41-NEXT: pxor %xmm4, %xmm0 +; X86-SSE41-NEXT: movdqa %xmm2, %xmm5 +; X86-SSE41-NEXT: pxor %xmm4, %xmm5 +; X86-SSE41-NEXT: movdqa %xmm5, %xmm6 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm6 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm5 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-SSE41-NEXT: pand %xmm0, %xmm1 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm0, %xmm7 +; X86-SSE41-NEXT: movdqa 8(%ebp), %xmm5 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] -; X86-SSE41-NEXT: por %xmm1, %xmm0 +; X86-SSE41-NEXT: por %xmm7, %xmm0 ; X86-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; X86-SSE41-NEXT: movapd %xmm2, %xmm0 -; X86-SSE41-NEXT: xorpd %xmm5, %xmm0 -; X86-SSE41-NEXT: movapd %xmm4, %xmm1 -; X86-SSE41-NEXT: xorpd %xmm5, %xmm1 -; X86-SSE41-NEXT: movapd %xmm1, %xmm3 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-SSE41-NEXT: pand %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm4 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] -; X86-SSE41-NEXT: movdqa %xmm4, %xmm0 -; X86-SSE41-NEXT: pxor %xmm5, 
%xmm0 -; X86-SSE41-NEXT: pxor %xmm1, %xmm5 -; X86-SSE41-NEXT: movdqa %xmm5, %xmm2 +; X86-SSE41-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE41-NEXT: pxor %xmm4, %xmm0 +; X86-SSE41-NEXT: movdqa %xmm5, %xmm3 +; X86-SSE41-NEXT: pxor %xmm4, %xmm3 +; X86-SSE41-NEXT: movdqa %xmm3, %xmm6 +; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm6 +; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm3 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm0, %xmm3 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; X86-SSE41-NEXT: por %xmm3, %xmm0 +; X86-SSE41-NEXT: movapd %xmm2, %xmm3 +; X86-SSE41-NEXT: xorpd %xmm4, %xmm3 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; X86-SSE41-NEXT: movapd %xmm5, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm4, %xmm0 +; X86-SSE41-NEXT: movapd %xmm0, %xmm1 +; X86-SSE41-NEXT: pcmpgtd %xmm3, %xmm1 +; X86-SSE41-NEXT: pcmpeqd %xmm3, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm3, %xmm6 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X86-SSE41-NEXT: por %xmm6, %xmm0 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 +; X86-SSE41-NEXT: movapd %xmm5, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm4, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] +; X86-SSE41-NEXT: pxor %xmm1, %xmm4 +; X86-SSE41-NEXT: movdqa %xmm4, %xmm2 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm5 +; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm4 ; X86-SSE41-NEXT: pmovsxdq %xmm2, %xmm0 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm4[1,1,3,3] ; X86-SSE41-NEXT: pand %xmm0, %xmm3 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; X86-SSE41-NEXT: por %xmm3, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm4, %xmm1 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm1 ; X86-SSE41-NEXT: movd %xmm1, %eax ; X86-SSE41-NEXT: pextrd $1, %xmm1, %edx ; X86-SSE41-NEXT: 
movl %ebp, %esp @@ -677,31 +625,20 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; X64-SSE41-NEXT: por %xmm6, %xmm0 ; X64-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; X64-SSE41-NEXT: movapd %xmm3, %xmm0 -; X64-SSE41-NEXT: xorpd %xmm5, %xmm0 -; X64-SSE41-NEXT: movapd %xmm0, %xmm1 -; X64-SSE41-NEXT: pcmpgtd %xmm4, %xmm1 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,0,2,2] -; X64-SSE41-NEXT: pcmpeqd %xmm4, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm6, %xmm4 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X64-SSE41-NEXT: xorpd %xmm3, %xmm5 +; X64-SSE41-NEXT: movapd %xmm5, %xmm0 +; X64-SSE41-NEXT: pcmpgtd %xmm4, %xmm0 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] +; X64-SSE41-NEXT: pcmpeqd %xmm4, %xmm5 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm5[1,1,3,3] +; X64-SSE41-NEXT: pand %xmm1, %xmm4 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; X64-SSE41-NEXT: por %xmm4, %xmm0 ; X64-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; X64-SSE41-NEXT: movapd %xmm3, %xmm0 -; X64-SSE41-NEXT: xorpd %xmm5, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; X64-SSE41-NEXT: pxor %xmm1, %xmm5 -; X64-SSE41-NEXT: movdqa %xmm5, %xmm2 -; X64-SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; X64-SSE41-NEXT: pmovsxdq %xmm2, %xmm4 -; X64-SSE41-NEXT: pcmpeqd %xmm0, %xmm5 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm4, %xmm5 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; X64-SSE41-NEXT: por %xmm5, %xmm0 -; X64-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; X64-SSE41-NEXT: movq %xmm1, %rax +; X64-SSE41-NEXT: pextrq $1, %xmm3, %rax +; X64-SSE41-NEXT: movq %xmm3, %rcx +; X64-SSE41-NEXT: cmpq %rax, %rcx +; X64-SSE41-NEXT: cmovbq %rcx, %rax ; X64-SSE41-NEXT: retq ; ; X86-SSE42-LABEL: test_v8i64: @@ -710,33 +647,33 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X86-SSE42-NEXT: movl %esp, %ebp ; X86-SSE42-NEXT: 
andl $-16, %esp ; X86-SSE42-NEXT: subl $16, %esp -; X86-SSE42-NEXT: movdqa %xmm0, %xmm3 +; X86-SSE42-NEXT: movdqa %xmm0, %xmm4 ; X86-SSE42-NEXT: movdqa 8(%ebp), %xmm5 -; X86-SSE42-NEXT: movdqa {{.*#+}} xmm4 = [0,2147483648,0,2147483648] +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648] +; X86-SSE42-NEXT: movdqa %xmm0, %xmm6 +; X86-SSE42-NEXT: pxor %xmm3, %xmm6 +; X86-SSE42-NEXT: movdqa %xmm2, %xmm0 +; X86-SSE42-NEXT: pxor %xmm3, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; X86-SSE42-NEXT: movapd %xmm2, %xmm4 +; X86-SSE42-NEXT: xorpd %xmm3, %xmm4 ; X86-SSE42-NEXT: movdqa %xmm1, %xmm6 -; X86-SSE42-NEXT: pxor %xmm4, %xmm6 +; X86-SSE42-NEXT: pxor %xmm3, %xmm6 ; X86-SSE42-NEXT: movdqa %xmm5, %xmm0 -; X86-SSE42-NEXT: pxor %xmm4, %xmm0 +; X86-SSE42-NEXT: pxor %xmm3, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm5 -; X86-SSE42-NEXT: movdqa %xmm3, %xmm1 -; X86-SSE42-NEXT: pxor %xmm4, %xmm1 -; X86-SSE42-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE42-NEXT: pxor %xmm4, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; X86-SSE42-NEXT: movapd %xmm2, %xmm1 -; X86-SSE42-NEXT: xorpd %xmm4, %xmm1 ; X86-SSE42-NEXT: movapd %xmm5, %xmm0 -; X86-SSE42-NEXT: xorpd %xmm4, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 +; X86-SSE42-NEXT: xorpd %xmm3, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm5 +; X86-SSE42-NEXT: movapd %xmm5, %xmm0 +; X86-SSE42-NEXT: xorpd %xmm3, %xmm0 ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] -; X86-SSE42-NEXT: movdqa %xmm5, %xmm0 -; X86-SSE42-NEXT: pxor %xmm4, %xmm0 -; X86-SSE42-NEXT: pxor %xmm1, %xmm4 -; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm4 -; X86-SSE42-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE42-NEXT: pxor %xmm1, %xmm3 +; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm3 +; X86-SSE42-NEXT: movdqa %xmm3, %xmm0 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm1 ; X86-SSE42-NEXT: 
movd %xmm1, %eax ; X86-SSE42-NEXT: pextrd $1, %xmm1, %edx @@ -746,59 +683,55 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; ; X64-SSE42-LABEL: test_v8i64: ; X64-SSE42: # %bb.0: -; X64-SSE42-NEXT: movdqa %xmm0, %xmm5 -; X64-SSE42-NEXT: movdqa {{.*#+}} xmm4 = [9223372036854775808,9223372036854775808] +; X64-SSE42-NEXT: movdqa %xmm0, %xmm4 +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808] ; X64-SSE42-NEXT: movdqa %xmm0, %xmm6 -; X64-SSE42-NEXT: pxor %xmm4, %xmm6 +; X64-SSE42-NEXT: pxor %xmm5, %xmm6 ; X64-SSE42-NEXT: movdqa %xmm2, %xmm0 -; X64-SSE42-NEXT: pxor %xmm4, %xmm0 +; X64-SSE42-NEXT: pxor %xmm5, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm2 -; X64-SSE42-NEXT: movapd %xmm2, %xmm5 -; X64-SSE42-NEXT: xorpd %xmm4, %xmm5 +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm2 +; X64-SSE42-NEXT: movapd %xmm2, %xmm4 +; X64-SSE42-NEXT: xorpd %xmm5, %xmm4 ; X64-SSE42-NEXT: movdqa %xmm1, %xmm6 -; X64-SSE42-NEXT: pxor %xmm4, %xmm6 +; X64-SSE42-NEXT: pxor %xmm5, %xmm6 ; X64-SSE42-NEXT: movdqa %xmm3, %xmm0 -; X64-SSE42-NEXT: pxor %xmm4, %xmm0 +; X64-SSE42-NEXT: pxor %xmm5, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm3 -; X64-SSE42-NEXT: movapd %xmm3, %xmm0 -; X64-SSE42-NEXT: xorpd %xmm4, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm5, %xmm0 +; X64-SSE42-NEXT: xorpd %xmm3, %xmm5 +; X64-SSE42-NEXT: pcmpgtq %xmm4, %xmm5 +; X64-SSE42-NEXT: movdqa %xmm5, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm3 -; X64-SSE42-NEXT: movapd %xmm3, %xmm0 -; X64-SSE42-NEXT: xorpd %xmm4, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] -; X64-SSE42-NEXT: pxor %xmm1, %xmm4 -; X64-SSE42-NEXT: pcmpgtq %xmm0, %xmm4 -; X64-SSE42-NEXT: movdqa %xmm4, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1 -; X64-SSE42-NEXT: movq %xmm1, %rax +; X64-SSE42-NEXT: pextrq $1, %xmm3, %rax +; X64-SSE42-NEXT: movq %xmm3, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: 
cmovbq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X86-AVX1-LABEL: test_v8i64: ; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 ; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,2147483648,0,2147483648] ; X86-AVX1-NEXT: # xmm2 = mem[0,0] -; X86-AVX1-NEXT: vxorps %xmm2, %xmm3, %xmm4 -; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm5, %xmm6 -; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm6, %xmm4 -; X86-AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm5, %xmm3 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm4 -; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm5 -; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 -; X86-AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm1, %xmm0 -; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm3 +; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm4 +; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 +; X86-AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm3 ; X86-AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm4 -; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1 +; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X86-AVX1-NEXT: vxorps %xmm2, %xmm0, %xmm5 +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; X86-AVX1-NEXT: vxorps %xmm2, %xmm1, %xmm6 +; X86-AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 +; X86-AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm1, %xmm0 +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm1 +; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm0, %xmm0 +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm2 +; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; X86-AVX1-NEXT: vxorpd %xmm2, %xmm1, %xmm2 -; X86-AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax ; X86-AVX1-NEXT: vpextrd 
$1, %xmm0, %edx ; X86-AVX1-NEXT: vzeroupper @@ -822,12 +755,10 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 ; X64-AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm1 ; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm0, %xmm0 -; X64-AVX1-NEXT: vxorpd %xmm2, %xmm0, %xmm1 -; X64-AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vxorpd %xmm2, %xmm3, %xmm2 -; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 -; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovbq %rcx, %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -838,16 +769,16 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X86-AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm4 ; X86-AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm3 ; X86-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 -; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm4 -; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3 -; X86-AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm3 -; X86-AVX2-NEXT: vxorpd %xmm2, %xmm1, %xmm2 -; X86-AVX2-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm2 -; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X86-AVX2-NEXT: vxorpd %xmm2, %xmm3, %xmm4 +; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1 +; X86-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 +; X86-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm1 +; X86-AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vxorpd %xmm2, %xmm3, %xmm2 +; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 +; X86-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: vpextrd $1, 
%xmm0, %edx ; X86-AVX2-NEXT: vzeroupper @@ -862,15 +793,13 @@ define i64 @test_v8i64(<8 x i64> %a0) nounwind { ; X64-AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm1 ; X64-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3 -; X64-AVX2-NEXT: vxorpd %xmm2, %xmm3, %xmm4 -; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1 -; X64-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX2-NEXT: vxorpd %xmm2, %xmm0, %xmm1 -; X64-AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm0[2,3,2,3] ; X64-AVX2-NEXT: vxorpd %xmm2, %xmm3, %xmm2 ; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 ; X64-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm3, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovbq %rcx, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; @@ -907,127 +836,124 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE2-NEXT: pushl %ebp ; X86-SSE2-NEXT: movl %esp, %ebp ; X86-SSE2-NEXT: andl $-16, %esp -; X86-SSE2-NEXT: subl $48, %esp -; X86-SSE2-NEXT: movaps %xmm1, (%esp) # 16-byte Spill +; X86-SSE2-NEXT: subl $16, %esp +; X86-SSE2-NEXT: movdqa %xmm1, %xmm4 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: movdqa 56(%ebp), %xmm5 +; X86-SSE2-NEXT: movdqa 24(%ebp), %xmm6 ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648] -; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 -; X86-SSE2-NEXT: pxor %xmm3, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm5 +; X86-SSE2-NEXT: pxor %xmm3, %xmm5 ; X86-SSE2-NEXT: pxor %xmm3, %xmm6 ; X86-SSE2-NEXT: movdqa %xmm6, %xmm7 -; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm7 -; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm6 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,0,2,2] +; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm7 +; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] -; 
X86-SSE2-NEXT: pand %xmm4, %xmm0 -; X86-SSE2-NEXT: movdqa 24(%ebp), %xmm6 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm7[1,1,3,3] -; X86-SSE2-NEXT: por %xmm0, %xmm4 -; X86-SSE2-NEXT: pand %xmm4, %xmm2 -; X86-SSE2-NEXT: pandn %xmm5, %xmm4 -; X86-SSE2-NEXT: por %xmm2, %xmm4 -; X86-SSE2-NEXT: movdqa %xmm4, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm5, %xmm0 +; X86-SSE2-NEXT: movdqa 56(%ebp), %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] +; X86-SSE2-NEXT: por %xmm0, %xmm5 +; X86-SSE2-NEXT: pand %xmm5, %xmm1 +; X86-SSE2-NEXT: movdqa 24(%ebp), %xmm0 +; X86-SSE2-NEXT: pandn %xmm0, %xmm5 +; X86-SSE2-NEXT: por %xmm1, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 ; X86-SSE2-NEXT: pxor %xmm3, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm6, %xmm2 -; X86-SSE2-NEXT: pxor %xmm3, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm1 +; X86-SSE2-NEXT: pxor %xmm3, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm7 ; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm7 -; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm5, %xmm0 -; X86-SSE2-NEXT: movdqa 72(%ebp), %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] -; X86-SSE2-NEXT: por %xmm0, %xmm2 -; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm7 -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pandn %xmm6, %xmm2 -; X86-SSE2-NEXT: por %xmm1, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm7, %xmm0 +; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm7[1,1,3,3] +; X86-SSE2-NEXT: por %xmm1, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm5, %xmm0 ; X86-SSE2-NEXT: pxor %xmm3, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm1 +; X86-SSE2-NEXT: pand %xmm7, %xmm2 +; X86-SSE2-NEXT: pandn %xmm6, %xmm7 +; 
X86-SSE2-NEXT: por %xmm2, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm7, %xmm1 ; X86-SSE2-NEXT: pxor %xmm3, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm1, %xmm6 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm6 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm2 ; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,2,2] ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] ; X86-SSE2-NEXT: pand %xmm0, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: por %xmm1, %xmm0 ; X86-SSE2-NEXT: movdqa 40(%ebp), %xmm6 -; X86-SSE2-NEXT: pand %xmm0, %xmm7 -; X86-SSE2-NEXT: pandn %xmm5, %xmm0 -; X86-SSE2-NEXT: por %xmm7, %xmm0 -; X86-SSE2-NEXT: movdqa (%esp), %xmm4 # 16-byte Reload +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: por %xmm1, %xmm0 +; X86-SSE2-NEXT: pand %xmm0, %xmm5 +; X86-SSE2-NEXT: pandn %xmm7, %xmm0 +; X86-SSE2-NEXT: por %xmm5, %xmm0 ; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 ; X86-SSE2-NEXT: pxor %xmm3, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm6, %xmm5 -; X86-SSE2-NEXT: pxor %xmm3, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm7 +; X86-SSE2-NEXT: movdqa %xmm6, %xmm2 +; X86-SSE2-NEXT: pxor %xmm3, %xmm2 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm7 ; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm7 -; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] +; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm2 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] ; X86-SSE2-NEXT: pand %xmm5, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm7[1,1,3,3] -; X86-SSE2-NEXT: por %xmm1, %xmm5 -; X86-SSE2-NEXT: pand %xmm5, %xmm4 -; X86-SSE2-NEXT: pandn %xmm6, %xmm5 -; X86-SSE2-NEXT: por %xmm4, %xmm5 +; X86-SSE2-NEXT: movdqa 8(%ebp), %xmm5 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,1,3,3] +; X86-SSE2-NEXT: por %xmm1, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm4 +; X86-SSE2-NEXT: pandn %xmm6, %xmm2 
+; X86-SSE2-NEXT: por %xmm4, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm5, %xmm1 ; X86-SSE2-NEXT: pxor %xmm3, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm6 -; X86-SSE2-NEXT: pxor %xmm3, %xmm6 -; X86-SSE2-NEXT: movdqa %xmm6, %xmm7 -; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm7 -; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm6 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm1, %xmm6 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm7[1,1,3,3] -; X86-SSE2-NEXT: por %xmm6, %xmm1 +; X86-SSE2-NEXT: movdqa 72(%ebp), %xmm7 +; X86-SSE2-NEXT: movdqa %xmm7, %xmm4 +; X86-SSE2-NEXT: pxor %xmm3, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm6 +; X86-SSE2-NEXT: pcmpgtd %xmm1, %xmm6 +; X86-SSE2-NEXT: pcmpeqd %xmm1, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm1, %xmm4 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: por %xmm4, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE2-NEXT: pxor %xmm3, %xmm4 ; X86-SSE2-NEXT: pand %xmm1, %xmm5 -; X86-SSE2-NEXT: pandn %xmm0, %xmm1 +; X86-SSE2-NEXT: pandn %xmm7, %xmm1 ; X86-SSE2-NEXT: por %xmm5, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE2-NEXT: pxor %xmm3, %xmm0 -; X86-SSE2-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm4 # 16-byte Reload -; X86-SSE2-NEXT: movdqa %xmm4, %xmm5 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm5 ; X86-SSE2-NEXT: pxor %xmm3, %xmm5 ; X86-SSE2-NEXT: movdqa %xmm5, %xmm6 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm6 -; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm5 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm7 = xmm5[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm0, %xmm7 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm6[1,1,3,3] -; X86-SSE2-NEXT: por %xmm7, %xmm5 -; X86-SSE2-NEXT: pand %xmm5, %xmm2 -; X86-SSE2-NEXT: pandn %xmm4, %xmm5 -; X86-SSE2-NEXT: por %xmm2, %xmm5 -; X86-SSE2-NEXT: movdqa %xmm5, %xmm0 -; X86-SSE2-NEXT: pxor 
%xmm3, %xmm0 +; X86-SSE2-NEXT: pcmpgtd %xmm4, %xmm6 +; X86-SSE2-NEXT: pcmpeqd %xmm4, %xmm5 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm4, %xmm5 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm6[1,1,3,3] +; X86-SSE2-NEXT: por %xmm5, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm5 +; X86-SSE2-NEXT: pxor %xmm3, %xmm5 +; X86-SSE2-NEXT: pand %xmm4, %xmm2 +; X86-SSE2-NEXT: pandn %xmm1, %xmm4 +; X86-SSE2-NEXT: por %xmm2, %xmm4 +; X86-SSE2-NEXT: movdqa %xmm4, %xmm1 +; X86-SSE2-NEXT: pxor %xmm3, %xmm1 +; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X86-SSE2-NEXT: pcmpgtd %xmm5, %xmm2 +; X86-SSE2-NEXT: pcmpeqd %xmm5, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,0,2,2] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,3,3] +; X86-SSE2-NEXT: pand %xmm5, %xmm6 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3] +; X86-SSE2-NEXT: por %xmm6, %xmm1 +; X86-SSE2-NEXT: pand %xmm1, %xmm0 +; X86-SSE2-NEXT: pandn %xmm4, %xmm1 +; X86-SSE2-NEXT: por %xmm0, %xmm1 ; X86-SSE2-NEXT: movdqa %xmm1, %xmm2 ; X86-SSE2-NEXT: pxor %xmm3, %xmm2 -; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 -; X86-SSE2-NEXT: pcmpgtd %xmm0, %xmm4 -; X86-SSE2-NEXT: pcmpeqd %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[0,0,2,2] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] -; X86-SSE2-NEXT: pand %xmm0, %xmm2 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3] -; X86-SSE2-NEXT: por %xmm2, %xmm0 -; X86-SSE2-NEXT: pand %xmm0, %xmm5 -; X86-SSE2-NEXT: pandn %xmm1, %xmm0 -; X86-SSE2-NEXT: por %xmm5, %xmm0 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: pxor %xmm3, %xmm2 -; X86-SSE2-NEXT: pxor %xmm1, %xmm3 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] +; X86-SSE2-NEXT: pxor %xmm0, %xmm3 ; X86-SSE2-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE2-NEXT: pcmpgtd %xmm2, %xmm4 ; X86-SSE2-NEXT: pcmpeqd %xmm2, %xmm3 @@ -1036,9 +962,9 @@ define i64 @test_v16i64(<16 x i64> 
%a0) nounwind { ; X86-SSE2-NEXT: pand %xmm2, %xmm3 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] ; X86-SSE2-NEXT: por %xmm3, %xmm2 -; X86-SSE2-NEXT: pand %xmm2, %xmm0 -; X86-SSE2-NEXT: pandn %xmm1, %xmm2 -; X86-SSE2-NEXT: por %xmm0, %xmm2 +; X86-SSE2-NEXT: pand %xmm2, %xmm1 +; X86-SSE2-NEXT: pandn %xmm0, %xmm2 +; X86-SSE2-NEXT: por %xmm1, %xmm2 ; X86-SSE2-NEXT: movd %xmm2, %eax ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1] ; X86-SSE2-NEXT: movd %xmm0, %edx @@ -1166,29 +1092,26 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE41-NEXT: movl %esp, %ebp ; X86-SSE41-NEXT: andl $-16, %esp ; X86-SSE41-NEXT: subl $48, %esp -; X86-SSE41-NEXT: movaps %xmm1, (%esp) # 16-byte Spill +; X86-SSE41-NEXT: movaps %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill ; X86-SSE41-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE41-NEXT: movdqa 56(%ebp), %xmm1 +; X86-SSE41-NEXT: movdqa 24(%ebp), %xmm6 ; X86-SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648] -; X86-SSE41-NEXT: movdqa %xmm2, %xmm0 ; X86-SSE41-NEXT: pxor %xmm4, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm1, %xmm6 -; X86-SSE41-NEXT: movdqa %xmm1, %xmm5 -; X86-SSE41-NEXT: pxor %xmm4, %xmm6 -; X86-SSE41-NEXT: movdqa %xmm6, %xmm7 +; X86-SSE41-NEXT: movdqa %xmm6, %xmm5 +; X86-SSE41-NEXT: pxor %xmm4, %xmm5 +; X86-SSE41-NEXT: movdqa %xmm5, %xmm7 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm7 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm6 +; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm5 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,3,3] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3] ; X86-SSE41-NEXT: pand %xmm0, %xmm1 -; X86-SSE41-NEXT: movdqa 24(%ebp), %xmm6 +; X86-SSE41-NEXT: movdqa 56(%ebp), %xmm5 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 -; X86-SSE41-NEXT: movapd %xmm5, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill -; X86-SSE41-NEXT: movdqa %xmm3, %xmm0 +; 
X86-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6 +; X86-SSE41-NEXT: movdqa %xmm2, %xmm0 ; X86-SSE41-NEXT: pxor %xmm4, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm6, %xmm1 +; X86-SSE41-NEXT: movdqa %xmm5, %xmm1 ; X86-SSE41-NEXT: pxor %xmm4, %xmm1 ; X86-SSE41-NEXT: movdqa %xmm1, %xmm7 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm7 @@ -1196,94 +1119,93 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] ; X86-SSE41-NEXT: pand %xmm0, %xmm1 -; X86-SSE41-NEXT: movdqa 72(%ebp), %xmm2 +; X86-SSE41-NEXT: movdqa 72(%ebp), %xmm3 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] ; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: movdqa 8(%ebp), %xmm7 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm6 -; X86-SSE41-NEXT: movdqa %xmm7, %xmm0 -; X86-SSE41-NEXT: pxor %xmm4, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm2, %xmm1 -; X86-SSE41-NEXT: pxor %xmm4, %xmm1 -; X86-SSE41-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; X86-SSE41-NEXT: pand %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: movdqa 40(%ebp), %xmm3 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm2 -; X86-SSE41-NEXT: movdqa (%esp), %xmm5 # 16-byte Reload -; X86-SSE41-NEXT: movdqa %xmm5, %xmm0 +; X86-SSE41-NEXT: movapd %xmm6, %xmm1 +; X86-SSE41-NEXT: xorpd %xmm4, %xmm1 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm5 +; X86-SSE41-NEXT: movapd %xmm5, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm4, %xmm0 +; X86-SSE41-NEXT: movapd %xmm0, %xmm7 +; X86-SSE41-NEXT: pcmpgtd %xmm1, %xmm7 +; X86-SSE41-NEXT: pcmpeqd %xmm1, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm1, %xmm2 +; X86-SSE41-NEXT: pshufd {{.*#+}} 
xmm0 = xmm7[1,1,3,3] +; X86-SSE41-NEXT: por %xmm2, %xmm0 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm5 +; X86-SSE41-NEXT: movapd %xmm5, (%esp) # 16-byte Spill +; X86-SSE41-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload +; X86-SSE41-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE41-NEXT: pxor %xmm4, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm3, %xmm1 -; X86-SSE41-NEXT: pxor %xmm4, %xmm1 -; X86-SSE41-NEXT: movdqa %xmm1, %xmm7 +; X86-SSE41-NEXT: movdqa 40(%ebp), %xmm5 +; X86-SSE41-NEXT: movdqa %xmm5, %xmm2 +; X86-SSE41-NEXT: pxor %xmm4, %xmm2 +; X86-SSE41-NEXT: movdqa %xmm2, %xmm7 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm7 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm2 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; X86-SSE41-NEXT: pand %xmm0, %xmm1 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm0, %xmm2 +; X86-SSE41-NEXT: movdqa 8(%ebp), %xmm6 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3] -; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm3 -; X86-SSE41-NEXT: movapd %xmm3, %xmm0 -; X86-SSE41-NEXT: xorpd %xmm4, %xmm0 -; X86-SSE41-NEXT: movapd %xmm2, %xmm1 -; X86-SSE41-NEXT: xorpd %xmm4, %xmm1 -; X86-SSE41-NEXT: movapd %xmm1, %xmm7 +; X86-SSE41-NEXT: por %xmm2, %xmm0 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm1, %xmm5 +; X86-SSE41-NEXT: movdqa %xmm6, %xmm0 +; X86-SSE41-NEXT: pxor %xmm4, %xmm0 +; X86-SSE41-NEXT: movdqa %xmm3, %xmm2 +; X86-SSE41-NEXT: pxor %xmm4, %xmm2 +; X86-SSE41-NEXT: movdqa %xmm2, %xmm7 ; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm7 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm1 +; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm2 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-SSE41-NEXT: pand %xmm0, %xmm1 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm0, %xmm2 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = 
xmm7[1,1,3,3] -; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm2 -; X86-SSE41-NEXT: movapd %xmm6, %xmm0 +; X86-SSE41-NEXT: por %xmm2, %xmm0 +; X86-SSE41-NEXT: movapd %xmm5, %xmm2 +; X86-SSE41-NEXT: xorpd %xmm4, %xmm2 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm3 +; X86-SSE41-NEXT: movapd %xmm3, %xmm0 ; X86-SSE41-NEXT: xorpd %xmm4, %xmm0 -; X86-SSE41-NEXT: movdqa {{[-0-9]+}}(%e{{[sb]}}p), %xmm5 # 16-byte Reload -; X86-SSE41-NEXT: movdqa %xmm5, %xmm1 -; X86-SSE41-NEXT: pxor %xmm4, %xmm1 -; X86-SSE41-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-SSE41-NEXT: pand %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: movdqa %xmm5, %xmm1 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm1 -; X86-SSE41-NEXT: movapd %xmm1, %xmm0 -; X86-SSE41-NEXT: movapd %xmm1, %xmm5 +; X86-SSE41-NEXT: movapd %xmm0, %xmm6 +; X86-SSE41-NEXT: pcmpgtd %xmm2, %xmm6 +; X86-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm7 = xmm0[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm2, %xmm7 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3] +; X86-SSE41-NEXT: por %xmm7, %xmm0 +; X86-SSE41-NEXT: movdqa (%esp), %xmm7 # 16-byte Reload +; X86-SSE41-NEXT: movdqa %xmm7, %xmm2 +; X86-SSE41-NEXT: pxor %xmm4, %xmm2 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm3 +; X86-SSE41-NEXT: movapd %xmm3, %xmm0 ; X86-SSE41-NEXT: xorpd %xmm4, %xmm0 -; X86-SSE41-NEXT: movapd %xmm2, %xmm1 -; X86-SSE41-NEXT: xorpd %xmm4, %xmm1 -; X86-SSE41-NEXT: movapd %xmm1, %xmm3 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm3 -; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,2,2] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X86-SSE41-NEXT: pand 
%xmm0, %xmm1 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] -; X86-SSE41-NEXT: por %xmm1, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm2 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] -; X86-SSE41-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE41-NEXT: pxor %xmm4, %xmm0 +; X86-SSE41-NEXT: movapd %xmm0, %xmm1 +; X86-SSE41-NEXT: pcmpgtd %xmm2, %xmm1 +; X86-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,0,2,2] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm6 = xmm0[1,1,3,3] +; X86-SSE41-NEXT: pand %xmm2, %xmm6 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X86-SSE41-NEXT: por %xmm6, %xmm0 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm3 +; X86-SSE41-NEXT: movapd %xmm3, %xmm0 +; X86-SSE41-NEXT: xorpd %xmm4, %xmm0 +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3] ; X86-SSE41-NEXT: pxor %xmm1, %xmm4 -; X86-SSE41-NEXT: movdqa %xmm4, %xmm3 -; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm3 +; X86-SSE41-NEXT: movdqa %xmm4, %xmm2 +; X86-SSE41-NEXT: pcmpgtd %xmm0, %xmm2 ; X86-SSE41-NEXT: pcmpeqd %xmm0, %xmm4 -; X86-SSE41-NEXT: pmovsxdq %xmm3, %xmm0 +; X86-SSE41-NEXT: pmovsxdq %xmm2, %xmm0 ; X86-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3] ; X86-SSE41-NEXT: pand %xmm0, %xmm4 -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] ; X86-SSE41-NEXT: por %xmm4, %xmm0 -; X86-SSE41-NEXT: blendvpd %xmm0, %xmm2, %xmm1 +; X86-SSE41-NEXT: blendvpd %xmm0, %xmm3, %xmm1 ; X86-SSE41-NEXT: movd %xmm1, %eax ; X86-SSE41-NEXT: pextrd $1, %xmm1, %edx ; X86-SSE41-NEXT: movl %ebp, %esp @@ -1373,31 +1295,20 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3] ; X64-SSE41-NEXT: por %xmm1, %xmm0 ; X64-SSE41-NEXT: blendvpd %xmm0, %xmm5, %xmm7 -; X64-SSE41-NEXT: movapd %xmm7, %xmm0 -; X64-SSE41-NEXT: xorpd %xmm9, %xmm0 -; X64-SSE41-NEXT: movapd %xmm0, %xmm1 -; X64-SSE41-NEXT: pcmpgtd %xmm2, %xmm1 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm3 = 
xmm1[0,0,2,2] -; X64-SSE41-NEXT: pcmpeqd %xmm2, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm3, %xmm2 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] +; X64-SSE41-NEXT: xorpd %xmm7, %xmm9 +; X64-SSE41-NEXT: movapd %xmm9, %xmm0 +; X64-SSE41-NEXT: pcmpgtd %xmm2, %xmm0 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2] +; X64-SSE41-NEXT: pcmpeqd %xmm2, %xmm9 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm9[1,1,3,3] +; X64-SSE41-NEXT: pand %xmm1, %xmm2 +; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] ; X64-SSE41-NEXT: por %xmm2, %xmm0 ; X64-SSE41-NEXT: blendvpd %xmm0, %xmm6, %xmm7 -; X64-SSE41-NEXT: movapd %xmm7, %xmm0 -; X64-SSE41-NEXT: xorpd %xmm9, %xmm0 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] -; X64-SSE41-NEXT: pxor %xmm1, %xmm9 -; X64-SSE41-NEXT: movdqa %xmm9, %xmm2 -; X64-SSE41-NEXT: pcmpgtd %xmm0, %xmm2 -; X64-SSE41-NEXT: pmovsxdq %xmm2, %xmm3 -; X64-SSE41-NEXT: pcmpeqd %xmm0, %xmm9 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm9[1,1,3,3] -; X64-SSE41-NEXT: pand %xmm3, %xmm4 -; X64-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] -; X64-SSE41-NEXT: por %xmm4, %xmm0 -; X64-SSE41-NEXT: blendvpd %xmm0, %xmm7, %xmm1 -; X64-SSE41-NEXT: movq %xmm1, %rax +; X64-SSE41-NEXT: pextrq $1, %xmm7, %rax +; X64-SSE41-NEXT: movq %xmm7, %rcx +; X64-SSE41-NEXT: cmpq %rax, %rcx +; X64-SSE41-NEXT: cmovbq %rcx, %rax ; X64-SSE41-NEXT: retq ; ; X86-SSE42-LABEL: test_v16i64: @@ -1405,65 +1316,63 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-SSE42-NEXT: pushl %ebp ; X86-SSE42-NEXT: movl %esp, %ebp ; X86-SSE42-NEXT: andl $-16, %esp -; X86-SSE42-NEXT: subl $32, %esp -; X86-SSE42-NEXT: movaps %xmm1, (%esp) # 16-byte Spill -; X86-SSE42-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE42-NEXT: movdqa 72(%ebp), %xmm5 -; X86-SSE42-NEXT: movdqa 56(%ebp), %xmm6 -; X86-SSE42-NEXT: movdqa {{.*#+}} xmm4 = [0,2147483648,0,2147483648] -; X86-SSE42-NEXT: movdqa %xmm2, %xmm7 -; X86-SSE42-NEXT: pxor %xmm4, %xmm7 +; 
X86-SSE42-NEXT: subl $16, %esp +; X86-SSE42-NEXT: movdqa %xmm0, %xmm4 +; X86-SSE42-NEXT: movdqa 24(%ebp), %xmm6 +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648] +; X86-SSE42-NEXT: movdqa %xmm0, %xmm5 +; X86-SSE42-NEXT: pxor %xmm3, %xmm5 ; X86-SSE42-NEXT: movdqa %xmm6, %xmm0 -; X86-SSE42-NEXT: pxor %xmm4, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm7, %xmm0 -; X86-SSE42-NEXT: movdqa 24(%ebp), %xmm7 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm6 -; X86-SSE42-NEXT: movdqa %xmm3, %xmm2 -; X86-SSE42-NEXT: pxor %xmm4, %xmm2 -; X86-SSE42-NEXT: movdqa %xmm7, %xmm0 -; X86-SSE42-NEXT: pxor %xmm4, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 -; X86-SSE42-NEXT: movdqa 8(%ebp), %xmm1 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm7 -; X86-SSE42-NEXT: movdqa %xmm1, %xmm2 -; X86-SSE42-NEXT: pxor %xmm4, %xmm2 +; X86-SSE42-NEXT: pxor %xmm3, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm5, %xmm0 +; X86-SSE42-NEXT: movdqa 56(%ebp), %xmm5 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm6 +; X86-SSE42-NEXT: movdqa %xmm2, %xmm4 +; X86-SSE42-NEXT: pxor %xmm3, %xmm4 ; X86-SSE42-NEXT: movdqa %xmm5, %xmm0 -; X86-SSE42-NEXT: pxor %xmm4, %xmm0 +; X86-SSE42-NEXT: pxor %xmm3, %xmm0 +; X86-SSE42-NEXT: pcmpgtq %xmm4, %xmm0 +; X86-SSE42-NEXT: movdqa 72(%ebp), %xmm4 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm5 +; X86-SSE42-NEXT: movapd %xmm6, %xmm2 +; X86-SSE42-NEXT: xorpd %xmm3, %xmm2 +; X86-SSE42-NEXT: movapd %xmm5, %xmm0 +; X86-SSE42-NEXT: xorpd %xmm3, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 ; X86-SSE42-NEXT: movdqa 40(%ebp), %xmm2 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm5 -; X86-SSE42-NEXT: movdqa (%esp), %xmm1 # 16-byte Reload -; X86-SSE42-NEXT: movdqa %xmm1, %xmm3 -; X86-SSE42-NEXT: pxor %xmm4, %xmm3 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm5 +; X86-SSE42-NEXT: movdqa %xmm1, %xmm6 +; X86-SSE42-NEXT: pxor %xmm3, %xmm6 ; X86-SSE42-NEXT: movdqa %xmm2, %xmm0 -; X86-SSE42-NEXT: pxor %xmm4, %xmm0 -; X86-SSE42-NEXT: pcmpgtq %xmm3, %xmm0 +; X86-SSE42-NEXT: pxor %xmm3, %xmm0 +; 
X86-SSE42-NEXT: pcmpgtq %xmm6, %xmm0 +; X86-SSE42-NEXT: movdqa 8(%ebp), %xmm6 ; X86-SSE42-NEXT: blendvpd %xmm0, %xmm1, %xmm2 -; X86-SSE42-NEXT: movapd %xmm2, %xmm1 -; X86-SSE42-NEXT: xorpd %xmm4, %xmm1 -; X86-SSE42-NEXT: movapd %xmm5, %xmm0 -; X86-SSE42-NEXT: xorpd %xmm4, %xmm0 +; X86-SSE42-NEXT: movdqa %xmm6, %xmm1 +; X86-SSE42-NEXT: pxor %xmm3, %xmm1 +; X86-SSE42-NEXT: movdqa %xmm4, %xmm0 +; X86-SSE42-NEXT: pxor %xmm3, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm5 -; X86-SSE42-NEXT: movapd %xmm7, %xmm1 -; X86-SSE42-NEXT: xorpd %xmm4, %xmm1 -; X86-SSE42-NEXT: movapd %xmm6, %xmm0 -; X86-SSE42-NEXT: xorpd %xmm4, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm4 +; X86-SSE42-NEXT: movapd %xmm2, %xmm1 +; X86-SSE42-NEXT: xorpd %xmm3, %xmm1 +; X86-SSE42-NEXT: movapd %xmm4, %xmm0 +; X86-SSE42-NEXT: xorpd %xmm3, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm6 -; X86-SSE42-NEXT: movapd %xmm6, %xmm1 -; X86-SSE42-NEXT: xorpd %xmm4, %xmm1 -; X86-SSE42-NEXT: movapd %xmm5, %xmm0 -; X86-SSE42-NEXT: xorpd %xmm4, %xmm0 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm2, %xmm4 +; X86-SSE42-NEXT: movapd %xmm5, %xmm1 +; X86-SSE42-NEXT: xorpd %xmm3, %xmm1 +; X86-SSE42-NEXT: movapd %xmm4, %xmm0 +; X86-SSE42-NEXT: xorpd %xmm3, %xmm0 ; X86-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm5 -; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3] -; X86-SSE42-NEXT: movdqa %xmm5, %xmm0 -; X86-SSE42-NEXT: pxor %xmm4, %xmm0 -; X86-SSE42-NEXT: pxor %xmm1, %xmm4 -; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm4 -; X86-SSE42-NEXT: movdqa %xmm4, %xmm0 -; X86-SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm1 +; X86-SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm4 +; X86-SSE42-NEXT: movapd %xmm4, %xmm0 +; X86-SSE42-NEXT: xorpd %xmm3, %xmm0 +; X86-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm4[2,3,2,3] +; X86-SSE42-NEXT: pxor %xmm1, %xmm3 +; X86-SSE42-NEXT: pcmpgtq %xmm0, %xmm3 +; X86-SSE42-NEXT: movdqa %xmm3, %xmm0 +; 
X86-SSE42-NEXT: blendvpd %xmm0, %xmm4, %xmm1 ; X86-SSE42-NEXT: movd %xmm1, %eax ; X86-SSE42-NEXT: pextrd $1, %xmm1, %edx ; X86-SSE42-NEXT: movl %ebp, %esp @@ -1512,18 +1421,14 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X64-SSE42-NEXT: xorpd %xmm8, %xmm0 ; X64-SSE42-NEXT: pcmpgtq %xmm1, %xmm0 ; X64-SSE42-NEXT: blendvpd %xmm0, %xmm5, %xmm7 -; X64-SSE42-NEXT: movapd %xmm7, %xmm0 -; X64-SSE42-NEXT: xorpd %xmm8, %xmm0 -; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7 -; X64-SSE42-NEXT: movapd %xmm7, %xmm0 -; X64-SSE42-NEXT: xorpd %xmm8, %xmm0 -; X64-SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,3,2,3] -; X64-SSE42-NEXT: pxor %xmm1, %xmm8 -; X64-SSE42-NEXT: pcmpgtq %xmm0, %xmm8 +; X64-SSE42-NEXT: xorpd %xmm7, %xmm8 +; X64-SSE42-NEXT: pcmpgtq %xmm2, %xmm8 ; X64-SSE42-NEXT: movdqa %xmm8, %xmm0 -; X64-SSE42-NEXT: blendvpd %xmm0, %xmm7, %xmm1 -; X64-SSE42-NEXT: movq %xmm1, %rax +; X64-SSE42-NEXT: blendvpd %xmm0, %xmm6, %xmm7 +; X64-SSE42-NEXT: pextrq $1, %xmm7, %rax +; X64-SSE42-NEXT: movq %xmm7, %rcx +; X64-SSE42-NEXT: cmpq %rax, %rcx +; X64-SSE42-NEXT: cmovbq %rcx, %rax ; X64-SSE42-NEXT: retq ; ; X86-AVX1-LABEL: test_v16i64: @@ -1534,44 +1439,44 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-AVX1-NEXT: subl $32, %esp ; X86-AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [0,2147483648,0,2147483648] ; X86-AVX1-NEXT: # xmm3 = mem[0,0] -; X86-AVX1-NEXT: vmovaps 8(%ebp), %xmm4 -; X86-AVX1-NEXT: vxorps %xmm3, %xmm4, %xmm6 +; X86-AVX1-NEXT: vxorps %xmm3, %xmm0, %xmm4 +; X86-AVX1-NEXT: vxorps %xmm3, %xmm2, %xmm5 +; X86-AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm4 +; X86-AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm2, %xmm4 +; X86-AVX1-NEXT: vmovaps 8(%ebp), %xmm5 +; X86-AVX1-NEXT: vxorps %xmm3, %xmm5, %xmm6 ; X86-AVX1-NEXT: vxorps %xmm3, %xmm1, %xmm7 ; X86-AVX1-NEXT: vpcmpgtq %xmm7, %xmm6, %xmm6 -; X86-AVX1-NEXT: vblendvpd %xmm6, %xmm1, %xmm4, %xmm4 -; X86-AVX1-NEXT: vxorps %xmm3, %xmm0, %xmm6 -; X86-AVX1-NEXT: vxorps %xmm3, %xmm2, %xmm7 -; 
X86-AVX1-NEXT: vpcmpgtq %xmm6, %xmm7, %xmm6 -; X86-AVX1-NEXT: vxorps 24(%ebp), %xmm3, %xmm7 -; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 -; X86-AVX1-NEXT: vxorps %xmm3, %xmm1, %xmm5 -; X86-AVX1-NEXT: vpcmpgtq %xmm5, %xmm7, %xmm5 -; X86-AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm2, %xmm6 -; X86-AVX1-NEXT: vmovapd 24(%ebp), %xmm7 -; X86-AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm7, %xmm1 +; X86-AVX1-NEXT: vxorpd %xmm3, %xmm4, %xmm7 +; X86-AVX1-NEXT: vblendvpd %xmm6, %xmm1, %xmm5, %xmm5 +; X86-AVX1-NEXT: vxorpd %xmm3, %xmm5, %xmm6 +; X86-AVX1-NEXT: vpcmpgtq %xmm7, %xmm6, %xmm6 +; X86-AVX1-NEXT: vblendvpd %xmm6, %xmm4, %xmm5, %xmm4 ; X86-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; X86-AVX1-NEXT: vxorps %xmm3, %xmm0, %xmm5 ; X86-AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 -; X86-AVX1-NEXT: vxorps %xmm3, %xmm2, %xmm7 -; X86-AVX1-NEXT: vpcmpgtq %xmm5, %xmm7, %xmm5 +; X86-AVX1-NEXT: vxorps %xmm3, %xmm2, %xmm6 +; X86-AVX1-NEXT: vpcmpgtq %xmm5, %xmm6, %xmm5 +; X86-AVX1-NEXT: vmovaps 24(%ebp), %xmm6 ; X86-AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm2, %xmm0 +; X86-AVX1-NEXT: vxorps %xmm3, %xmm6, %xmm2 +; X86-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; X86-AVX1-NEXT: vxorps %xmm3, %xmm1, %xmm5 +; X86-AVX1-NEXT: vpcmpgtq %xmm5, %xmm2, %xmm2 +; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm6, %xmm1 ; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm2 ; X86-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm5 ; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm5, %xmm2 ; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X86-AVX1-NEXT: vxorpd %xmm3, %xmm6, %xmm1 -; X86-AVX1-NEXT: vxorpd %xmm3, %xmm4, %xmm2 -; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 -; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm6, %xmm4, %xmm1 -; X86-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm2 -; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm4 -; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 -; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 -; X86-AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vxorpd %xmm3, %xmm4, %xmm1 ; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, 
%xmm2 -; X86-AVX1-NEXT: vxorpd %xmm3, %xmm1, %xmm3 -; X86-AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; X86-AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 +; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm4, %xmm0, %xmm0 +; X86-AVX1-NEXT: vxorpd %xmm3, %xmm0, %xmm1 +; X86-AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3] +; X86-AVX1-NEXT: vxorpd %xmm3, %xmm2, %xmm3 +; X86-AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1 +; X86-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 ; X86-AVX1-NEXT: vmovd %xmm0, %eax ; X86-AVX1-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX1-NEXT: movl %ebp, %esp @@ -1615,12 +1520,10 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X64-AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm1 ; X64-AVX1-NEXT: vpcmpgtq %xmm6, %xmm1, %xmm1 ; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm5, %xmm0, %xmm0 -; X64-AVX1-NEXT: vxorpd %xmm4, %xmm0, %xmm1 -; X64-AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X64-AVX1-NEXT: vxorpd %xmm4, %xmm2, %xmm3 -; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1 -; X64-AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 -; X64-AVX1-NEXT: vmovq %xmm0, %rax +; X64-AVX1-NEXT: vpextrq $1, %xmm0, %rax +; X64-AVX1-NEXT: vmovq %xmm0, %rcx +; X64-AVX1-NEXT: cmpq %rax, %rcx +; X64-AVX1-NEXT: cmovbq %rcx, %rax ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; @@ -1632,28 +1535,28 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X86-AVX2-NEXT: subl $32, %esp ; X86-AVX2-NEXT: vmovdqa 8(%ebp), %ymm4 ; X86-AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] +; X86-AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm5 +; X86-AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm6 +; X86-AVX2-NEXT: vpcmpgtq %ymm5, %ymm6, %ymm5 +; X86-AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm2, %ymm0 +; X86-AVX2-NEXT: vxorpd %ymm3, %ymm0, %ymm2 ; X86-AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm5 ; X86-AVX2-NEXT: vpxor %ymm3, %ymm4, %ymm6 ; X86-AVX2-NEXT: vpcmpgtq %ymm5, %ymm6, %ymm5 ; X86-AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1 -; 
X86-AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm4 -; X86-AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm5 -; X86-AVX2-NEXT: vpcmpgtq %ymm4, %ymm5, %ymm4 -; X86-AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm2, %ymm0 -; X86-AVX2-NEXT: vxorpd %ymm3, %ymm0, %ymm2 ; X86-AVX2-NEXT: vxorpd %ymm3, %ymm1, %ymm4 ; X86-AVX2-NEXT: vpcmpgtq %ymm2, %ymm4, %ymm2 ; X86-AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0 -; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X86-AVX2-NEXT: vxorpd %xmm3, %xmm0, %xmm2 -; X86-AVX2-NEXT: vxorpd %xmm3, %xmm1, %xmm4 -; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2 -; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 -; X86-AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm0[2,3,2,3] -; X86-AVX2-NEXT: vxorpd %xmm3, %xmm0, %xmm2 -; X86-AVX2-NEXT: vxorpd %xmm3, %xmm1, %xmm3 -; X86-AVX2-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2 -; X86-AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 +; X86-AVX2-NEXT: vxorpd %xmm3, %xmm0, %xmm1 +; X86-AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X86-AVX2-NEXT: vxorpd %xmm3, %xmm2, %xmm4 +; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1 +; X86-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 +; X86-AVX2-NEXT: vxorpd %xmm3, %xmm0, %xmm1 +; X86-AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3] +; X86-AVX2-NEXT: vxorpd %xmm3, %xmm2, %xmm3 +; X86-AVX2-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1 +; X86-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 ; X86-AVX2-NEXT: vmovd %xmm0, %eax ; X86-AVX2-NEXT: vpextrd $1, %xmm0, %edx ; X86-AVX2-NEXT: movl %ebp, %esp @@ -1681,12 +1584,10 @@ define i64 @test_v16i64(<16 x i64> %a0) nounwind { ; X64-AVX2-NEXT: vxorpd %xmm4, %xmm2, %xmm3 ; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1 ; X64-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 -; X64-AVX2-NEXT: vxorpd %xmm4, %xmm0, %xmm1 -; X64-AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm0[2,3,2,3] -; X64-AVX2-NEXT: vxorpd %xmm4, %xmm2, %xmm3 -; X64-AVX2-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1 -; X64-AVX2-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 -; X64-AVX2-NEXT: vmovq %xmm0, %rax +; X64-AVX2-NEXT: vpextrq $1, 
%xmm0, %rax +; X64-AVX2-NEXT: vmovq %xmm0, %rcx +; X64-AVX2-NEXT: cmpq %rax, %rcx +; X64-AVX2-NEXT: cmovbq %rcx, %rax ; X64-AVX2-NEXT: vzeroupper ; X64-AVX2-NEXT: retq ; @@ -2451,12 +2352,10 @@ define i16 @test_v8i16(<8 x i16> %a0) nounwind { ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psubusw %xmm1, %xmm2 ; SSE2-NEXT: psubw %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psubusw %xmm1, %xmm2 -; SSE2-NEXT: psubw %xmm2, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $1, %xmm0, %eax +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: cmpw %ax, %cx +; SSE2-NEXT: cmovbl %ecx, %eax ; SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; SSE2-NEXT: ret{{[l|q]}} ; @@ -2491,12 +2390,10 @@ define i16 @test_v16i16(<16 x i16> %a0) nounwind { ; SSE2-NEXT: movdqa %xmm0, %xmm2 ; SSE2-NEXT: psubusw %xmm1, %xmm2 ; SSE2-NEXT: psubw %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: psrld $16, %xmm1 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psubusw %xmm1, %xmm2 -; SSE2-NEXT: psubw %xmm2, %xmm0 -; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pextrw $1, %xmm0, %eax +; SSE2-NEXT: movd %xmm0, %ecx +; SSE2-NEXT: cmpw %ax, %cx +; SSE2-NEXT: cmovbl %ecx, %eax ; SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; SSE2-NEXT: ret{{[l|q]}} ; @@ -2565,12 +2462,10 @@ define i16 @test_v32i16(<32 x i16> %a0) nounwind { ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 ; X86-SSE2-NEXT: psubw %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X86-SSE2-NEXT: psubw %xmm2, %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: pextrw $1, %xmm0, %eax +; X86-SSE2-NEXT: movd %xmm0, %ecx +; X86-SSE2-NEXT: cmpw %ax, %cx +; X86-SSE2-NEXT: cmovbl %ecx, %eax ; X86-SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp @@ 
-2595,12 +2490,10 @@ define i16 @test_v32i16(<32 x i16> %a0) nounwind { ; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X64-SSE2-NEXT: psubusw %xmm1, %xmm2 ; X64-SSE2-NEXT: psubw %xmm2, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: psrld $16, %xmm1 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X64-SSE2-NEXT: psubw %xmm2, %xmm0 -; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: pextrw $1, %xmm0, %eax +; X64-SSE2-NEXT: movd %xmm0, %ecx +; X64-SSE2-NEXT: cmpw %ax, %cx +; X64-SSE2-NEXT: cmovbl %ecx, %eax ; X64-SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; X64-SSE2-NEXT: retq ; @@ -2706,12 +2599,10 @@ define i16 @test_v64i16(<64 x i16> %a0) nounwind { ; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 ; X86-SSE2-NEXT: psubw %xmm2, %xmm0 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X86-SSE2-NEXT: psrld $16, %xmm1 -; X86-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X86-SSE2-NEXT: psubw %xmm2, %xmm0 -; X86-SSE2-NEXT: movd %xmm0, %eax +; X86-SSE2-NEXT: pextrw $1, %xmm0, %eax +; X86-SSE2-NEXT: movd %xmm0, %ecx +; X86-SSE2-NEXT: cmpw %ax, %cx +; X86-SSE2-NEXT: cmovbl %ecx, %eax ; X86-SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; X86-SSE2-NEXT: movl %ebp, %esp ; X86-SSE2-NEXT: popl %ebp @@ -2748,12 +2639,10 @@ define i16 @test_v64i16(<64 x i16> %a0) nounwind { ; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 ; X64-SSE2-NEXT: psubusw %xmm1, %xmm2 ; X64-SSE2-NEXT: psubw %xmm2, %xmm0 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 -; X64-SSE2-NEXT: psrld $16, %xmm1 -; X64-SSE2-NEXT: movdqa %xmm0, %xmm2 -; X64-SSE2-NEXT: psubusw %xmm1, %xmm2 -; X64-SSE2-NEXT: psubw %xmm2, %xmm0 -; X64-SSE2-NEXT: movd %xmm0, %eax +; X64-SSE2-NEXT: pextrw $1, %xmm0, %eax +; X64-SSE2-NEXT: movd %xmm0, %ecx +; X64-SSE2-NEXT: cmpw %ax, %cx +; X64-SSE2-NEXT: cmovbl %ecx, %eax ; X64-SSE2-NEXT: # kill: def $ax killed $ax killed $eax ; X64-SSE2-NEXT: retq ; From 73a48a6a7d8dbfb361f9119ceff895860a8391ad Mon Sep 17 00:00:00 2001 
From: Paul Walker Date: Thu, 14 May 2026 11:36:13 +0100 Subject: [PATCH 52/95] [LLVM][CodeGen] When expanding ISD::LRINT, non-deterministic results should be frozen. (#197435) --- llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 9e4f169cd4f3f..dca1a0b58c16a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -4636,9 +4636,11 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) { SDValue Arg = Node->getOperand(0); EVT ArgVT = Arg.getValueType(); EVT ResVT = Node->getValueType(0); - SDLoc dl(Node); - SDValue RoundNode = DAG.getNode(ISD::FRINT, dl, ArgVT, Arg); - Results.push_back(DAG.getNode(ISD::FP_TO_SINT, dl, ResVT, RoundNode)); + SDLoc DL(Node); + SDValue RoundNode = DAG.getNode(ISD::FRINT, DL, ArgVT, Arg); + SDValue ConvertNode = DAG.getNode(ISD::FP_TO_SINT, DL, ResVT, RoundNode); + // Non-deterministic results are equivalent to freeze poison. 
+ Results.push_back(DAG.getFreeze(ConvertNode)); break; } case ISD::ADDRSPACECAST: From a2098f23395402f57de37b27170fcf2f3e08774d Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Thu, 14 May 2026 03:41:52 -0700 Subject: [PATCH 53/95] [AMDGPU] Fix disasm roundtrip for forced fp64 literal (#197583) --- .../Disassembler/AMDGPUDisassembler.cpp | 8 +- llvm/test/MC/AMDGPU/literals.s | 81 ++++++++----------- 2 files changed, 40 insertions(+), 49 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index ea928386b80c6..9bd665bdf76aa 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -1681,7 +1681,9 @@ AMDGPUDisassembler::decodeLiteralConstant(const MCInstrDesc &Desc, case AMDGPU::OPERAND_REG_IMM_FP64: case AMDGPU::OPERAND_REG_INLINE_C_FP64: case AMDGPU::OPERAND_REG_INLINE_AC_FP64: - Val <<= 32; + UseLit = AMDGPU::isInlinableLiteral64(Val << 32, HasInv2Pi); + if (!UseLit) + Val <<= 32; break; case AMDGPU::OPERAND_REG_IMM_INT64: case AMDGPU::OPERAND_REG_INLINE_C_INT64: @@ -1713,6 +1715,10 @@ MCOperand AMDGPUDisassembler::decodeLiteral64Constant() const { } bool UseLit64 = Hi_32(Literal) == 0; + + UseLit64 |= AMDGPU::isInlinableLiteral64( + Literal, STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm)); + return UseLit64 ? 
MCOperand::createExpr(AMDGPUMCExpr::createLit( LitModifier::Lit64, Literal, getContext())) : MCOperand::createImm(Literal); diff --git a/llvm/test/MC/AMDGPU/literals.s b/llvm/test/MC/AMDGPU/literals.s index e14d55ff62757..3508ed6c49130 100644 --- a/llvm/test/MC/AMDGPU/literals.s +++ b/llvm/test/MC/AMDGPU/literals.s @@ -194,13 +194,9 @@ v_fract_f64_e32 v[0:1], 1.0 // GFX89: v_fract_f64_e32 v[0:1], 1.0 ; encoding: [0xf2,0x64,0x00,0x7e] // SICI: v_fract_f64_e32 v[0:1], 1.0 ; encoding: [0xf2,0x7c,0x00,0x7e] - -// FIXME: Forced lit() encoding is not preserved after disasm v_fract_f64_e32 v[0:1], lit(1.0) // GFX11: v_fract_f64_e32 v[0:1], lit(0x3ff00000) ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf0,0x3f] -// GFX12: v_fract_f64_e32 v[0:1], lit(0x3ff00000) ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf0,0x3f] -// GFX1250-ASM: v_fract_f64_e32 v[0:1], lit(0x3ff00000) ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf0,0x3f] -// GFX1250-DIS: v_fract_f64_e32 v[0:1], 1.0 ; encoding: [0xf2,0x7c,0x00,0x7e] +// GFX12XX: v_fract_f64_e32 v[0:1], lit(0x3ff00000) ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf0,0x3f] // GFX89: v_fract_f64_e32 v[0:1], lit(0x3ff00000) ; encoding: [0xff,0x64,0x00,0x7e,0x00,0x00,0xf0,0x3f] // SICI: v_fract_f64_e32 v[0:1], lit(0x3ff00000) ; encoding: [0xff,0x7c,0x00,0x7e,0x00,0x00,0xf0,0x3f] @@ -535,8 +531,8 @@ v_fract_f64_e32 v[0:1], 1 v_fract_f64_e32 v[0:1], lit(1) // GFX11: v_fract_f64_e32 v[0:1], lit(0x1) ; encoding: [0xff,0x7c,0x00,0x7e,0x01,0x00,0x00,0x00] // GFX12: v_fract_f64_e32 v[0:1], lit(0x1) ; encoding: [0xff,0x7c,0x00,0x7e,0x01,0x00,0x00,0x00] -// GFX1250-ASM: v_fract_f64_e32 v[0:1], lit(0x1) ; encoding: [0xff,0x7c,0x00,0x7e,0x01,0x00,0x00,0x00] -// GFX1250-DIS: v_fract_f64_e32 v[0:1], 0x1 ; encoding: [0xff,0x7c,0x00,0x7e,0x01,0x00,0x00,0x00] +// GFX1250-ASM: v_fract_f64_e32 v[0:1], lit(0x1) ; encoding: [0xff,0x7c,0x00,0x7e,0x01,0x00,0x00,0x00] +// GFX1250-DIS: v_fract_f64_e32 v[0:1], 0x1 ; encoding: [0xff,0x7c,0x00,0x7e,0x01,0x00,0x00,0x00] // 
GFX89: v_fract_f64_e32 v[0:1], lit(0x1) ; encoding: [0xff,0x64,0x00,0x7e,0x01,0x00,0x00,0x00] // SICI: v_fract_f64_e32 v[0:1], lit(0x1) ; encoding: [0xff,0x7c,0x00,0x7e,0x01,0x00,0x00,0x00] @@ -949,11 +945,7 @@ s_mov_b64 s[0:1], 1 // SICI: s_mov_b64 s[0:1], 1 ; encoding: [0x81,0x04,0x80,0xbe] s_mov_b64 s[0:1], lit(1) -// GFX11: s_mov_b64 s[0:1], lit(0x1) ; encoding: [0xff,0x01,0x80,0xbe,0x01,0x00,0x00,0x00] -// GFX12: s_mov_b64 s[0:1], lit(0x1) ; encoding: [0xff,0x01,0x80,0xbe,0x01,0x00,0x00,0x00] -// GFX1250-ASM: s_mov_b64 s[0:1], lit(0x1) ; encoding: [0xff,0x01,0x80,0xbe,0x01,0x00,0x00,0x00] -// GFX1250-DIS: s_mov_b64 s[0:1], lit(0x1) ; encoding: [0xff,0x01,0x80,0xbe,0x01,0x00,0x00,0x00] -// GFX89: s_mov_b64 s[0:1], lit(0x1) ; encoding: [0xff,0x01,0x80,0xbe,0x01,0x00,0x00,0x00] +// GFX8PLUS: s_mov_b64 s[0:1], lit(0x1) ; encoding: [0xff,0x01,0x80,0xbe,0x01,0x00,0x00,0x00] // SICI: s_mov_b64 s[0:1], lit(0x1) ; encoding: [0xff,0x04,0x80,0xbe,0x01,0x00,0x00,0x00] v_and_b32_e32 v0, 1, v1 @@ -1908,8 +1900,8 @@ v_sqrt_f64 v[2:3], lit(123.0) v_sqrt_f64 v[2:3], lit(123) // GFX11: v_sqrt_f64_e32 v[2:3], lit(0x7b) ; encoding: [0xff,0x68,0x04,0x7e,0x7b,0x00,0x00,0x00] // GFX12: v_sqrt_f64_e32 v[2:3], lit(0x7b) ; encoding: [0xff,0x68,0x04,0x7e,0x7b,0x00,0x00,0x00] -// GFX1250-ASM: v_sqrt_f64_e32 v[2:3], lit(0x7b) ; encoding: [0xff,0x68,0x04,0x7e,0x7b,0x00,0x00,0x00] -// GFX1250-DIS: v_sqrt_f64_e32 v[2:3], 0x7b ; encoding: [0xff,0x68,0x04,0x7e,0x7b,0x00,0x00,0x00] +// GFX1250-ASM: v_sqrt_f64_e32 v[2:3], lit(0x7b) ; encoding: [0xff,0x68,0x04,0x7e,0x7b,0x00,0x00,0x00] +// GFX1250-DIS: v_sqrt_f64_e32 v[2:3], 0x7b ; encoding: [0xff,0x68,0x04,0x7e,0x7b,0x00,0x00,0x00] // GFX89: v_sqrt_f64_e32 v[2:3], lit(0x7b) ; encoding: [0xff,0x50,0x04,0x7e,0x7b,0x00,0x00,0x00] // SICI: v_sqrt_f64_e32 v[2:3], lit(0x7b) ; encoding: [0xff,0x68,0x04,0x7e,0x7b,0x00,0x00,0x00] @@ -1938,59 +1930,52 @@ v_madak_f32 v4, lit(lit(0x7e8)), v8, lit(0x7e8) // NOGFX89: :[[@LINE-4]]:24: error: not a valid 
operand. // NOSICI: :[[@LINE-5]]:24: error: not a valid operand. -// FIXME: Forced lit() encoding is not preserved after disasm v_fract_f64_e32 v[0:1], lit64(1.0) -// NOGFX11: :[[@LINE-1]]:25: error: lit64 is not supported on this GPU -// NOGFX12: :[[@LINE-2]]:25: error: lit64 is not supported on this GPU -// GFX1250-ASM: v_fract_f64_e32 v[0:1], lit64(0x3ff0000000000000) ; encoding: [0xfe,0x7c,0x00,0x7e,0x00,0x00,0x00,0x00,0x00,0x00,0xf0,0x3f] -// GFX1250-DIS: v_fract_f64_e32 v[0:1], 1.0 ; encoding: [0xf2,0x7c,0x00,0x7e] -// NOGFX89: :[[@LINE-5]]:25: error: lit64 is not supported on this GPU -// NOSICI: :[[@LINE-6]]:25: error: lit64 is not supported on this GPU +// GFX1250: v_fract_f64_e32 v[0:1], lit64(0x3ff0000000000000) ; encoding: [0xfe,0x7c,0x00,0x7e,0x00,0x00,0x00,0x00,0x00,0x00,0xf0,0x3f] +// NOGFX11: :[[@LINE-2]]:25: error: lit64 is not supported on this GPU +// NOGFX12: :[[@LINE-3]]:25: error: lit64 is not supported on this GPU +// NOGFX89: :[[@LINE-4]]:25: error: lit64 is not supported on this GPU +// NOSICI: :[[@LINE-5]]:25: error: lit64 is not supported on this GPU v_fract_f64_e32 v[0:1], lit64(0x123456789) -// NOGFX11: :[[@LINE-1]]:25: error: lit64 is not supported on this GPU -// NOGFX12: :[[@LINE-2]]:25: error: lit64 is not supported on this GPU // GFX1250-ASM: v_fract_f64_e32 v[0:1], lit64(0x123456789) ; encoding: [0xfe,0x7c,0x00,0x7e,0x89,0x67,0x45,0x23,0x01,0x00,0x00,0x00] -// GFX1250-DIS: v_fract_f64_e32 v[0:1], 0x123456789 ; encoding: [0xfe,0x7c,0x00,0x7e,0x89,0x67,0x45,0x23,0x01,0x00,0x00,0x00] +// GFX1250-DIS: v_fract_f64_e32 v[0:1], 0x123456789 ; encoding: [0xfe,0x7c,0x00,0x7e,0x89,0x67,0x45,0x23,0x01,0x00,0x00,0x00] +// NOGFX11: :[[@LINE-3]]:25: error: lit64 is not supported on this GPU +// NOGFX12: :[[@LINE-4]]:25: error: lit64 is not supported on this GPU // NOGFX89: :[[@LINE-5]]:25: error: lit64 is not supported on this GPU // NOSICI: :[[@LINE-6]]:25: error: lit64 is not supported on this GPU v_fract_f64_e32 v[0:1], lit64(1e52) -// 
NOGFX11: :[[@LINE-1]]:25: error: lit64 is not supported on this GPU -// NOGFX12: :[[@LINE-2]]:25: error: lit64 is not supported on this GPU // GFX1250-ASM: v_fract_f64_e32 v[0:1], lit64(0x4ababa4714957d30) ; encoding: [0xfe,0x7c,0x00,0x7e,0x30,0x7d,0x95,0x14,0x47,0xba,0xba,0x4a] -// GFX1250-DIS: v_fract_f64_e32 v[0:1], 0x4ababa4714957d30 ; encoding: [0xfe,0x7c,0x00,0x7e,0x30,0x7d,0x95,0x14,0x47,0xba,0xba,0x4a] +// GFX1250-DIS: v_fract_f64_e32 v[0:1], 0x4ababa4714957d30 ; encoding: [0xfe,0x7c,0x00,0x7e,0x30,0x7d,0x95,0x14,0x47,0xba,0xba,0x4a] +// NOGFX11: :[[@LINE-3]]:25: error: lit64 is not supported on this GPU +// NOGFX12: :[[@LINE-4]]:25: error: lit64 is not supported on this GPU // NOGFX89: :[[@LINE-5]]:25: error: lit64 is not supported on this GPU // NOSICI: :[[@LINE-6]]:25: error: lit64 is not supported on this GPU v_add_nc_u64 v[0:1], v[0:1], lit(1) -// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU (gfx1100): v_add_nc_u64 -// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU (gfx1200): v_add_nc_u64 -// GFX1250-ASM: v_add_nc_u64_e64 v[0:1], v[0:1], lit(0x1) ; encoding: [0x00,0x00,0x28,0xd5,0x00,0xff,0x01,0x02,0x01,0x00,0x00,0x00] -// GFX1250-DIS: v_add_nc_u64_e64 v[0:1], v[0:1], lit(0x1) ; encoding: [0x00,0x00,0x28,0xd5,0x00,0xff,0x01,0x02,0x01,0x00,0x00,0x00] -// NOGFX89: :[[@LINE-5]]:1: error: instruction not supported on this GPU -// NOSICI: :[[@LINE-6]]:1: error: instruction not supported on this GPU +// GFX1250: v_add_nc_u64_e64 v[0:1], v[0:1], lit(0x1) ; encoding: [0x00,0x00,0x28,0xd5,0x00,0xff,0x01,0x02,0x01,0x00,0x00,0x00] +// NOCI: :[[@LINE-2]]:1: error: instruction not supported on this GPU (bonaire): v_add_nc_u64 +// NOGFX11: :[[@LINE-3]]:1: error: instruction not supported on this GPU (gfx1100): v_add_nc_u64 +// NOGFX12: :[[@LINE-4]]:1: error: instruction not supported on this GPU (gfx1200): v_add_nc_u64 +// NOGFX9: :[[@LINE-5]]:1: error: instruction not supported on this GPU (gfx900): v_add_nc_u64 +// 
NOSI: :[[@LINE-6]]:1: error: instruction not supported on this GPU (tahiti): v_add_nc_u64 +// NOVI: :[[@LINE-7]]:1: error: instruction not supported on this GPU (tonga): v_add_nc_u64 v_add_nc_u64 v[0:1], v[0:1], lit64(1) -// NOGFX11: :[[@LINE-1]]:1: error: instruction not supported on this GPU (gfx1100): v_add_nc_u64 -// NOGFX12: :[[@LINE-2]]:1: error: instruction not supported on this GPU (gfx1200): v_add_nc_u64 -// NOGFX1250: :[[@LINE-3]]:36: error: invalid operand for instruction -// NOGFX89: :[[@LINE-4]]:1: error: instruction not supported on this GPU -// NOSICI: :[[@LINE-5]]:1: error: instruction not supported on this GPU +// NOCI: :[[@LINE-1]]:1: error: instruction not supported on this GPU (bonaire): v_add_nc_u64 +// NOGFX11: :[[@LINE-2]]:1: error: instruction not supported on this GPU (gfx1100): v_add_nc_u64 +// NOGFX12: :[[@LINE-3]]:1: error: instruction not supported on this GPU (gfx1200): v_add_nc_u64 +// NOGFX1250: :[[@LINE-4]]:36: error: invalid operand for instruction +// NOGFX9: :[[@LINE-5]]:1: error: instruction not supported on this GPU (gfx900): v_add_nc_u64 +// NOSI: :[[@LINE-6]]:1: error: instruction not supported on this GPU (tahiti): v_add_nc_u64 +// NOVI: :[[@LINE-7]]:1: error: instruction not supported on this GPU (tonga): v_add_nc_u64 v_add_f64 v[0:1], v[0:1], lit(1) -// NOGFX11: :[[@LINE-1]]:31: error: invalid operand for instruction -// NOGFX12: :[[@LINE-2]]:31: error: invalid operand for instruction -// NOGFX1250: :[[@LINE-3]]:31: error: invalid operand for instruction -// NOGFX89: :[[@LINE-4]]:31: error: invalid operand for instruction -// NOSICI: :[[@LINE-5]]:31: error: invalid operand for instruction +// NOGCN: :[[@LINE-1]]:31: error: invalid operand for instruction v_add_f64 v[0:1], v[0:1], lit(1.0) -// NOGFX11: :[[@LINE-1]]:31: error: invalid operand for instruction -// NOGFX12: :[[@LINE-2]]:31: error: invalid operand for instruction -// NOGFX1250: :[[@LINE-3]]:31: error: invalid operand for instruction -// NOGFX89: :[[@LINE-4]]:31: 
error: invalid operand for instruction -// NOSICI: :[[@LINE-5]]:31: error: invalid operand for instruction +// NOGCN: :[[@LINE-1]]:31: error: invalid operand for instruction v_add_f64 v[0:1], v[0:1], lit64(1.0) // NOGFX11: :[[@LINE-1]]:27: error: lit64 is not supported on this GPU From b80e53d7a6e5fd269be5ac27d5cc7653eca4bc80 Mon Sep 17 00:00:00 2001 From: Alexandros Lamprineas Date: Thu, 14 May 2026 11:45:00 +0100 Subject: [PATCH 54/95] [BOLT][AArch64] Account for hugify alignment in AArch64 long jump layout (#195272) When --hugify is used for a PIE, the final section allocation in RewriteInstance::mapCodeSections aligns the address after the last non-cold text section before laying out the following sections: for (BinarySection *Section : CodeSections) { Address = alignTo(Address, Section->getAlignment()); Section->setOutputAddress(Address); Address += Section->getOutputSize(); if (opts::Hugify && !BC->HasFixedLoadAddress && Section->getName() == LastNonColdSectionName) Address = alignTo(Address, Section->getAlignment()); } The AArch64 long-jump pass doesn't model that gap in its tentative layout, so a CBZ could be considered in range during stub insertion and later become out of range when JITLink applied the final layout. This patch mirrors the hugify alignment before assigning cold fragment addresses so that range checks see the same hot-to-cold distance as the final layout. 
Assisted-by: Codex --- bolt/lib/Passes/LongJmp.cpp | 5 ++++ .../long-jmp-hugify-fixup-out-of-range.s | 27 ++++++++++++++----- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/bolt/lib/Passes/LongJmp.cpp b/bolt/lib/Passes/LongJmp.cpp index 7744cf08defa9..f085500fccbd3 100644 --- a/bolt/lib/Passes/LongJmp.cpp +++ b/bolt/lib/Passes/LongJmp.cpp @@ -367,6 +367,11 @@ LongJmpPass::tentativeLayoutRelocMode(const BinaryContext &BC, CurrentIndex = 0; bool ColdLayoutDone = false; auto runColdLayout = [&]() { + // Mirror the extra hugify alignment inserted by final section allocation + // after the last non-cold section. Account for it before assigning cold + // fragment addresses so range checks see the hot-to-cold gap. + if (opts::Hugify && !BC.HasFixedLoadAddress && !opts::HotFunctionsAtEnd) + DotAddress = alignTo(DotAddress, opts::AlignText); DotAddress = tentativeLayoutRelocColdPart(BC, SortedFunctions, DotAddress); ColdLayoutDone = true; if (opts::HotFunctionsAtEnd) diff --git a/bolt/test/AArch64/long-jmp-hugify-fixup-out-of-range.s b/bolt/test/AArch64/long-jmp-hugify-fixup-out-of-range.s index 3763583887bd4..03c35e962e99e 100644 --- a/bolt/test/AArch64/long-jmp-hugify-fixup-out-of-range.s +++ b/bolt/test/AArch64/long-jmp-hugify-fixup-out-of-range.s @@ -1,13 +1,17 @@ -# Check that branches considered in-range during longjump -# may go out of range at JITLink if hugify moves hot code. +# The longjump pass may consider branch targets in range during tentative +# layout and decide not to insert stubs for them. Later, final section +# allocation may insert alignment padding after the last non-cold text section +# when hugify is enabled. This moves the following cold section farther away, +# resulting in relocation fixups going out of range at JITLink. Check that the +# longjump pass accounts for this padding and inserts stubs when needed. 
-# REQUIRES: system-linux, asserts +# REQUIRES: system-linux, asserts, bolt-runtime, target=aarch64{{.*}} # RUN: %clang %cflags -Wl,-q %s -o %t # RUN: link_fdata --no-lbr %s %t %t.fdata # RUN: llvm-strip --strip-unneeded %t -# RUN: not llvm-bolt %t -o %t.bolt --data %t.fdata -split-functions --hugify 2>&1 \ -# RUN: | FileCheck %s +# RUN: llvm-bolt %t -o %t.bolt --data %t.fdata -split-functions --hugify +# RUN: llvm-objdump -d %t.bolt | FileCheck %s .globl foo .type foo, %function @@ -34,4 +38,15 @@ _start: ## Force relocation mode. .reloc 0, R_AARCH64_NONE -# CHECK: BOLT-ERROR: JITLink failed: In graph in-memory object file, section .text: relocation target {{0x[0-9a-f]+}} {{.*}} is out of range of CondBranch19PCRel fixup at address {{0x[0-9a-f]+}} {{.*}} +# CHECK: Disassembly of section .text: + +# CHECK: : +# CHECK-NEXT: {{.*}} cbnz x0, 0x[[ADDR0:[0-9a-f]+]] <{{.*}}> +# CHECK-NEXT: {{.*}} b 0x[[ADDR1:[0-9a-f]+]] <{{.*}}> +# CHECK-NEXT: [[ADDR0]]: {{.*}} b 0x[[ADDR2:[0-9a-f]+]] <{{.*}}> + +# CHECK: Disassembly of section .text.cold: + +# CHECK: : +# CHECK-NEXT: [[ADDR2]]: {{.*}} mov x0, #0x1 // =1 +# CHECK-NEXT: [[ADDR1]]: {{.*}} ret From 9a4faee1068c09efbf837cfb7b0f5693b24635f4 Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Thu, 14 May 2026 11:45:37 +0100 Subject: [PATCH 55/95] [LLVM][Constants] Remove the option to disable vector ConstantFP support. 
(#197427) Removes the command line options: -use-constant-fp-for-fixed-length-splat -use-constant-fp-for-scalable-splat --- llvm/docs/ReleaseNotes.md | 3 ++ llvm/lib/IR/Constants.cpp | 30 +++++----------- llvm/test/Bitcode/constant-splat.ll | 4 --- .../complex-deinterleaving-splat-scalable.ll | 2 +- llvm/test/CodeGen/AArch64/neon-mov.ll | 8 ++--- .../AArch64/sve-fp-immediates-merging.ll | 1 - ...treaming-mode-fixed-length-splat-vector.ll | 2 +- ...scalarize-static-array-of-float-vectors.ll | 2 -- llvm/test/CodeGen/NVPTX/globals_init.ll | 2 +- llvm/test/CodeGen/PowerPC/vec_constants.ll | 6 ++-- .../test/CodeGen/X86/combine-concatvectors.ll | 2 -- llvm/test/CodeGen/X86/pr131389.ll | 1 - llvm/test/CodeGen/X86/sse2.ll | 2 +- .../CodeGen/X86/vector-shuffle-128-v16.ll | 2 +- llvm/test/CodeGen/X86/win_cst_pool.ll | 2 +- .../test-interp-vec-insertelement.ll | 2 +- .../NumericalStabilitySanitizer/basic.ll | 3 -- llvm/test/Transforms/Attributor/nofpclass.ll | 9 ++--- .../Transforms/InstCombine/X86/blend_x86.ll | 4 +-- llvm/test/Transforms/InstCombine/bitcast.ll | 2 +- llvm/test/Transforms/InstCombine/cast.ll | 4 +-- .../Transforms/InstCombine/clamp-to-minmax.ll | 2 +- .../InstCombine/constant-vector-insert.ll | 4 +-- .../Transforms/InstCombine/extractelement.ll | 8 ++--- .../Transforms/InstCombine/fabs-fneg-fold.ll | 1 - llvm/test/Transforms/InstCombine/fadd.ll | 1 - llvm/test/Transforms/InstCombine/fdiv.ll | 1 - llvm/test/Transforms/InstCombine/fmul.ll | 1 - llvm/test/Transforms/InstCombine/fneg.ll | 1 - llvm/test/Transforms/InstCombine/fpextend.ll | 1 - .../InstCombine/load-store-forward.ll | 36 +++++++------------ .../InstCombine/memcmp-constant-fold.ll | 4 +-- .../InstSimplify/bitcast-vector-fold.ll | 2 +- .../InstSimplify/constant-fold-fp-denormal.ll | 1 - .../InstSimplify/extract-element.ll | 2 +- .../store-to-memset-constant-splat.ll | 2 +- ...switch-to-lookup-table-vector-constants.ll | 1 - mlir/test/Target/LLVMIR/Import/constant.ll | 2 +- 38 files changed, 56 
insertions(+), 107 deletions(-) diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md index 108923bde1629..fffd696e59baf 100644 --- a/llvm/docs/ReleaseNotes.md +++ b/llvm/docs/ReleaseNotes.md @@ -125,6 +125,9 @@ Makes programs 10x faster by doing Special New Thing. in bitcode, e.g. `malloc`. Not yet supported on MachO or when using distributed ThinLTO. +* ``ConstantFP`` now supports vector types and is the canonical form returned by + ``ConstantVector::getSplat(C)`` when ``C`` is a scalar ``ConstantFP``. + ### Changes to building LLVM ### Changes to TableGen diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp index 7d5ad8cbe1188..179fa15f6b5c5 100644 --- a/llvm/lib/IR/Constants.cpp +++ b/llvm/lib/IR/Constants.cpp @@ -39,15 +39,9 @@ using namespace PatternMatch; static cl::opt UseConstantIntForFixedLengthSplat( "use-constant-int-for-fixed-length-splat", cl::init(false), cl::Hidden, cl::desc("Use ConstantInt's native fixed-length vector splat support.")); -static cl::opt UseConstantFPForFixedLengthSplat( - "use-constant-fp-for-fixed-length-splat", cl::init(true), cl::Hidden, - cl::desc("Use ConstantFP's native fixed-length vector splat support.")); static cl::opt UseConstantIntForScalableSplat( "use-constant-int-for-scalable-splat", cl::init(false), cl::Hidden, cl::desc("Use ConstantInt's native scalable vector splat support.")); -static cl::opt UseConstantFPForScalableSplat( - "use-constant-fp-for-scalable-splat", cl::init(true), cl::Hidden, - cl::desc("Use ConstantFP's native scalable vector splat support.")); static cl::opt UseConstantPtrNullForFixedLengthSplat( "use-constant-ptrnull-for-fixed-length-splat", cl::init(true), cl::Hidden, cl::desc("Use ConstantPointerNull's native fixed-length vector splat " @@ -1614,7 +1608,7 @@ Constant *ConstantVector::getImpl(ArrayRef V) { bool isZero = C->isNullValue(); bool isUndef = isa(C); bool isPoison = isa(C); - bool isSplatFP = UseConstantFPForFixedLengthSplat && isa(C); + bool isSplatFP = 
isa(C); bool isSplatInt = UseConstantIntForFixedLengthSplat && isa(C); bool isSplatByte = isa(C); bool isSplatPtrNull = @@ -1665,24 +1659,23 @@ Constant *ConstantVector::getSplat(ElementCount EC, Constant *V) { return ConstantPointerNull::get(VTy); } + if (auto *CB = dyn_cast(V)) + return ConstantByte::get(V->getContext(), EC, CB->getValue()); + + if (auto *CFP = dyn_cast(V)) + return ConstantFP::get(V->getContext(), EC, CFP->getValue()); + if (!EC.isScalable()) { // Maintain special handling of zero. if (!V->isNullValue()) { if (UseConstantIntForFixedLengthSplat && isa(V)) return ConstantInt::get(V->getContext(), EC, cast(V)->getValue()); - if (isa(V)) - return ConstantByte::get(V->getContext(), EC, - cast(V)->getValue()); } - if (UseConstantFPForFixedLengthSplat && isa(V)) - return ConstantFP::get(V->getContext(), EC, - cast(V)->getValue()); - // If this splat is compatible with ConstantDataVector, use it instead of // ConstantVector. - if ((isa(V) || isa(V) || isa(V)) && + if (isa(V) && ConstantDataSequential::isElementTypeCompatible(V->getType())) return ConstantDataVector::getSplat(EC.getKnownMinValue(), V); @@ -1695,15 +1688,8 @@ Constant *ConstantVector::getSplat(ElementCount EC, Constant *V) { if (UseConstantIntForScalableSplat && isa(V)) return ConstantInt::get(V->getContext(), EC, cast(V)->getValue()); - if (isa(V)) - return ConstantByte::get(V->getContext(), EC, - cast(V)->getValue()); } - if (UseConstantFPForScalableSplat && isa(V)) - return ConstantFP::get(V->getContext(), EC, - cast(V)->getValue()); - Type *VTy = VectorType::get(V->getType(), EC); if (V->isNullValue()) diff --git a/llvm/test/Bitcode/constant-splat.ll b/llvm/test/Bitcode/constant-splat.ll index 6bc2b7cdb99ea..4647255d0d34f 100644 --- a/llvm/test/Bitcode/constant-splat.ll +++ b/llvm/test/Bitcode/constant-splat.ll @@ -1,11 +1,7 @@ ; RUN: llvm-as -use-constant-int-for-fixed-length-splat \ -; RUN: -use-constant-fp-for-fixed-length-splat \ ; RUN: -use-constant-int-for-scalable-splat \ -; 
RUN: -use-constant-fp-for-scalable-splat \ ; RUN: < %s | llvm-dis -use-constant-int-for-fixed-length-splat \ -; RUN: -use-constant-fp-for-fixed-length-splat \ ; RUN: -use-constant-int-for-scalable-splat \ -; RUN: -use-constant-fp-for-scalable-splat \ ; RUN: | FileCheck %s ; CHECK: @constant.splat.i1 = constant <1 x i1> splat (i1 true) diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll index e7a00fc90e31d..a1f3070957fb0 100644 --- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll +++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-splat-scalable.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s --mattr=+sve -o - | FileCheck %s -; RUN: llc -use-constant-int-for-scalable-splat -use-constant-fp-for-scalable-splat < %s --mattr=+sve -o - | FileCheck %s +; RUN: llc -use-constant-int-for-scalable-splat < %s --mattr=+sve -o - | FileCheck %s target triple = "aarch64" diff --git a/llvm/test/CodeGen/AArch64/neon-mov.ll b/llvm/test/CodeGen/AArch64/neon-mov.ll index ca5af2c7c452e..b315c40778e7d 100644 --- a/llvm/test/CodeGen/AArch64/neon-mov.ll +++ b/llvm/test/CodeGen/AArch64/neon-mov.ll @@ -5,10 +5,10 @@ ; RUN: llc -mattr=+neon,+fullfp16 -global-isel %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-FP16,CHECK-FP16-GI ; This are copies of the above RUN lines but with vector constants enabled. 
-; RUN: llc -use-constant-int-for-fixed-length-splat -use-constant-fp-for-fixed-length-splat < %s -verify-machineinstrs -mattr=+neon | FileCheck %s --check-prefixes=CHECK,CHECK-NOFP16,CHECK-NOFP16-SD -; RUN: llc -use-constant-int-for-fixed-length-splat -use-constant-fp-for-fixed-length-splat < %s -verify-machineinstrs -mattr=+neon,+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-FP16,CHECK-FP16-SD -; RUN: llc -use-constant-int-for-fixed-length-splat -use-constant-fp-for-fixed-length-splat -mattr=+neon -global-isel %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NOFP16,CHECK-NOFP16-GI -; RUN: llc -use-constant-int-for-fixed-length-splat -use-constant-fp-for-fixed-length-splat -mattr=+neon,+fullfp16 -global-isel %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-FP16,CHECK-FP16-GI +; RUN: llc -use-constant-int-for-fixed-length-splat < %s -verify-machineinstrs -mattr=+neon | FileCheck %s --check-prefixes=CHECK,CHECK-NOFP16,CHECK-NOFP16-SD +; RUN: llc -use-constant-int-for-fixed-length-splat < %s -verify-machineinstrs -mattr=+neon,+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-FP16,CHECK-FP16-SD +; RUN: llc -use-constant-int-for-fixed-length-splat -mattr=+neon -global-isel %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NOFP16,CHECK-NOFP16-GI +; RUN: llc -use-constant-int-for-fixed-length-splat -mattr=+neon,+fullfp16 -global-isel %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-FP16,CHECK-FP16-GI target triple = "aarch64-none-linux-gnu" diff --git a/llvm/test/CodeGen/AArch64/sve-fp-immediates-merging.ll b/llvm/test/CodeGen/AArch64/sve-fp-immediates-merging.ll index 905d110e001c8..e1d883b0e7899 100644 --- a/llvm/test/CodeGen/AArch64/sve-fp-immediates-merging.ll +++ b/llvm/test/CodeGen/AArch64/sve-fp-immediates-merging.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s | FileCheck %s -; RUN: llc -use-constant-fp-for-scalable-splat < %s | FileCheck %s target triple = 
"aarch64-unknown-linux-gnu" diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll index d7b08e6fbd270..ff6e4771fd9f4 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-splat-vector.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mattr=+sve -force-streaming-compatible < %s | FileCheck %s ; RUN: llc -force-streaming-compatible < %s | FileCheck %s --check-prefix=NONEON-NOSVE -; RUN: llc -force-streaming-compatible -use-constant-int-for-fixed-length-splat -use-constant-fp-for-fixed-length-splat < %s | FileCheck %s --check-prefix=NONEON-NOSVE +; RUN: llc -force-streaming-compatible -use-constant-int-for-fixed-length-splat < %s | FileCheck %s --check-prefix=NONEON-NOSVE target triple = "aarch64-unknown-linux-gnu" diff --git a/llvm/test/CodeGen/DirectX/scalarize-static-array-of-float-vectors.ll b/llvm/test/CodeGen/DirectX/scalarize-static-array-of-float-vectors.ll index c77a3043303e5..420b77645f0b7 100644 --- a/llvm/test/CodeGen/DirectX/scalarize-static-array-of-float-vectors.ll +++ b/llvm/test/CodeGen/DirectX/scalarize-static-array-of-float-vectors.ll @@ -1,7 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -S -passes='dxil-data-scalarization,dxil-flatten-arrays,function(scalarizer),dxil-op-lower' -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s -; RUN: opt -S -passes='dxil-data-scalarization,dxil-flatten-arrays,function(scalarizer),dxil-op-lower' -mtriple=dxil-pc-shadermodel6.3-library -use-constant-fp-for-fixed-length-splat %s | FileCheck %s - @StaticArr = internal constant [8 x <3 x float>] [<3 x float> zeroinitializer, <3 x float> splat (float 5.000000e-01), <3 x float> , <3 x float> , <3 x float> , <3 x float> , <3 x float> , 
<3 x float> ], align 16 diff --git a/llvm/test/CodeGen/NVPTX/globals_init.ll b/llvm/test/CodeGen/NVPTX/globals_init.ll index 06d103b582996..45cb9e42bf033 100644 --- a/llvm/test/CodeGen/NVPTX/globals_init.ll +++ b/llvm/test/CodeGen/NVPTX/globals_init.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} -; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -use-constant-int-for-fixed-length-splat -use-constant-fp-for-fixed-length-splat | FileCheck %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 -use-constant-int-for-fixed-length-splat | FileCheck %s ; Make sure the globals constant initializers are not prone to host endianess ; issues. diff --git a/llvm/test/CodeGen/PowerPC/vec_constants.ll b/llvm/test/CodeGen/PowerPC/vec_constants.ll index 2b448fd05aeb5..07eae4cc3c1f8 100644 --- a/llvm/test/CodeGen/PowerPC/vec_constants.ll +++ b/llvm/test/CodeGen/PowerPC/vec_constants.ll @@ -3,9 +3,9 @@ ; RUN: llc -verify-machineinstrs -O0 -mcpu=pwr7 -mtriple=powerpc64-ibm-aix-xcoff -vec-extabi < %s | FileCheck %s --check-prefixes=CHECK,BE ; RUN: llc -verify-machineinstrs -O0 -mcpu=pwr7 -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,LE -; RUN: llc -verify-machineinstrs -O0 -mcpu=pwr7 -mtriple=powerpc64-unknown-linux-gnu -use-constant-int-for-fixed-length-splat -use-constant-fp-for-fixed-length-splat < %s | FileCheck %s --check-prefixes=CHECK,BE -; RUN: llc -verify-machineinstrs -O0 -mcpu=pwr7 -mtriple=powerpc64-ibm-aix-xcoff -vec-extabi -use-constant-int-for-fixed-length-splat -use-constant-fp-for-fixed-length-splat < %s | FileCheck %s --check-prefixes=CHECK,BE -; RUN: llc -verify-machineinstrs -O0 -mcpu=pwr7 -mtriple=powerpc64le-unknown-linux-gnu -use-constant-int-for-fixed-length-splat -use-constant-fp-for-fixed-length-splat < %s | FileCheck %s --check-prefixes=CHECK,LE +; RUN: llc -verify-machineinstrs -O0 -mcpu=pwr7 
-mtriple=powerpc64-unknown-linux-gnu -use-constant-int-for-fixed-length-splat < %s | FileCheck %s --check-prefixes=CHECK,BE +; RUN: llc -verify-machineinstrs -O0 -mcpu=pwr7 -mtriple=powerpc64-ibm-aix-xcoff -vec-extabi -use-constant-int-for-fixed-length-splat < %s | FileCheck %s --check-prefixes=CHECK,BE +; RUN: llc -verify-machineinstrs -O0 -mcpu=pwr7 -mtriple=powerpc64le-unknown-linux-gnu -use-constant-int-for-fixed-length-splat < %s | FileCheck %s --check-prefixes=CHECK,LE define void @test1(ptr %P1, ptr %P2, ptr %P3) nounwind { ; BE-LABEL: test1: diff --git a/llvm/test/CodeGen/X86/combine-concatvectors.ll b/llvm/test/CodeGen/X86/combine-concatvectors.ll index bfc1a3c82de65..7237b02ca6b66 100644 --- a/llvm/test/CodeGen/X86/combine-concatvectors.ll +++ b/llvm/test/CodeGen/X86/combine-concatvectors.ll @@ -1,8 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx -use-constant-fp-for-fixed-length-splat | FileCheck %s --check-prefixes=CHECK,AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 -use-constant-fp-for-fixed-length-splat | FileCheck %s --check-prefixes=CHECK,AVX2 define void @PR32957(ptr %in, ptr %out) { ; CHECK-LABEL: PR32957: diff --git a/llvm/test/CodeGen/X86/pr131389.ll b/llvm/test/CodeGen/X86/pr131389.ll index e1a538925b8cf..e53536b084b20 100644 --- a/llvm/test/CodeGen/X86/pr131389.ll +++ b/llvm/test/CodeGen/X86/pr131389.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-- -use-constant-fp-for-fixed-length-splat | FileCheck %s define void @PR131389(ptr %p) { ; CHECK-LABEL: PR131389: diff --git 
a/llvm/test/CodeGen/X86/sse2.ll b/llvm/test/CodeGen/X86/sse2.ll index 6e77d3e4fd134..1bbd7853c9895 100644 --- a/llvm/test/CodeGen/X86/sse2.ll +++ b/llvm/test/CodeGen/X86/sse2.ll @@ -6,7 +6,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,X64-AVX,AVX1,X64-AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX,X64-AVX,AVX512,X64-AVX512 -; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse2 -use-constant-int-for-fixed-length-splat -use-constant-fp-for-fixed-length-splat | FileCheck %s --check-prefixes=SSE,X86-SSE +; RUN: llc < %s -mtriple=i386-unknown-unknown -mattr=+sse2 -use-constant-int-for-fixed-length-splat | FileCheck %s --check-prefixes=SSE,X86-SSE ; Tests for SSE2 and below, without SSE3+. diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll index 89cc7a638fa01..575f65e5e7fb3 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -13,7 +13,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,XOP,XOPAVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=ALL,AVX,AVX1OR2,XOP,XOPAVX2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -use-constant-int-for-fixed-length-splat -use-constant-fp-for-fixed-length-splat | FileCheck %s --check-prefixes=ALL,SSE,SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -use-constant-int-for-fixed-length-splat | FileCheck %s --check-prefixes=ALL,SSE,SSE2 define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<16 x i8> %a, <16 x i8> %b) { ; SSE2-LABEL: shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: diff --git a/llvm/test/CodeGen/X86/win_cst_pool.ll b/llvm/test/CodeGen/X86/win_cst_pool.ll index 097fe2a39abb6..ae9d9be4ed2cf 
100644 --- a/llvm/test/CodeGen/X86/win_cst_pool.ll +++ b/llvm/test/CodeGen/X86/win_cst_pool.ll @@ -2,7 +2,7 @@ ; RUN: llc < %s -mattr=sse2 -mattr=avx | FileCheck %s ; RUN: llc < %s -mtriple=x86_64-win32 -mattr=sse2 -mattr=avx | FileCheck %s ; RUN: llc < %s -mtriple=x86_64-windows-msvc -mattr=sse2 -mattr=avx | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-windows-msvc -mattr=sse2 -mattr=avx --use-constant-int-for-fixed-length-splat -use-constant-fp-for-fixed-length-splat | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-windows-msvc -mattr=sse2 -mattr=avx --use-constant-int-for-fixed-length-splat | FileCheck %s ; GNU environment. ; RUN: llc < %s -mtriple=x86_64-win32-gnu -mattr=sse2 -mattr=avx | FileCheck -check-prefix=MINGW %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/ExecutionEngine/Interpreter/test-interp-vec-insertelement.ll b/llvm/test/ExecutionEngine/Interpreter/test-interp-vec-insertelement.ll index 35e23c61f9ec1..77ea1284531e5 100644 --- a/llvm/test/ExecutionEngine/Interpreter/test-interp-vec-insertelement.ll +++ b/llvm/test/ExecutionEngine/Interpreter/test-interp-vec-insertelement.ll @@ -1,5 +1,5 @@ ; RUN: %lli -jit-kind=mcjit -force-interpreter=true %s > /dev/null - ; RUN: %lli -jit-kind=mcjit -force-interpreter=true -use-constant-int-for-fixed-length-splat -use-constant-fp-for-fixed-length-splat %s > /dev/null + ; RUN: %lli -jit-kind=mcjit -force-interpreter=true -use-constant-int-for-fixed-length-splat %s > /dev/null define i32 @main() { %v0 = insertelement <2 x i8> zeroinitializer, i8 1, i32 1 diff --git a/llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll b/llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll index 572b6adf5e75c..66c477609c234 100644 --- a/llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll +++ b/llvm/test/Instrumentation/NumericalStabilitySanitizer/basic.ll @@ -2,9 +2,6 @@ ; RUN: opt -passes=nsan -nsan-shadow-type-mapping=dqq -nsan-truncate-fcmp-eq=false -S %s | 
FileCheck %s --check-prefixes=CHECK,DQQ ; RUN: opt -passes=nsan -nsan-shadow-type-mapping=dlq -nsan-truncate-fcmp-eq=false -S %s | FileCheck %s --check-prefixes=CHECK,DLQ -; RUN: opt -passes=nsan -nsan-shadow-type-mapping=dqq -nsan-truncate-fcmp-eq=false -use-constant-fp-for-fixed-length-splat -S %s | FileCheck %s --check-prefixes=CHECK,DQQ -; RUN: opt -passes=nsan -nsan-shadow-type-mapping=dlq -nsan-truncate-fcmp-eq=false -use-constant-fp-for-fixed-length-splat -S %s | FileCheck %s --check-prefixes=CHECK,DLQ - target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" declare float @declaration_only(float %a) sanitize_numerical_stability diff --git a/llvm/test/Transforms/Attributor/nofpclass.ll b/llvm/test/Transforms/Attributor/nofpclass.ll index 788558638a5d2..51e25c1ef09be 100644 --- a/llvm/test/Transforms/Attributor/nofpclass.ll +++ b/llvm/test/Transforms/Attributor/nofpclass.ll @@ -1,9 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --version 2 -; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-annotate-decl-cs -use-constant-fp-for-scalable-splat=false -S < %s | FileCheck %s --check-prefixes=CHECK,CHECK-CV,TUNIT,TUNIT-CV -; RUN: opt -aa-pipeline=basic-aa -passes=attributor-cgscc -attributor-manifest-internal -attributor-annotate-decl-cs -use-constant-fp-for-scalable-splat=false -S < %s | FileCheck %s --check-prefixes=CHECK,CHECK-CV,CGSCC,CGSCC-CV - -; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-annotate-decl-cs -use-constant-fp-for-scalable-splat -S < %s | FileCheck %s --check-prefixes=CHECK,CHECK-CI,TUNIT,TUNIT-CI -; RUN: opt -aa-pipeline=basic-aa -passes=attributor-cgscc -attributor-manifest-internal -attributor-annotate-decl-cs -use-constant-fp-for-scalable-splat -S < %s | FileCheck %s 
--check-prefixes=CHECK,CHECK-CI,CGSCC,CGSCC-CI +; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,TUNIT,TUNIT-CV +; RUN: opt -aa-pipeline=basic-aa -passes=attributor-cgscc -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,CGSCC,CGSCC-CV declare nofpclass(nan) float @ret_nofpclass_nan() declare [2 x [3 x float]] @ret_array() @@ -3974,7 +3971,5 @@ attributes #9 = { denormal_fpenv(ieee|dynamic) } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; CGSCC-CI: {{.*}} ; CGSCC-CV: {{.*}} -; CHECK-CI: {{.*}} -; CHECK-CV: {{.*}} ; TUNIT-CI: {{.*}} ; TUNIT-CV: {{.*}} diff --git a/llvm/test/Transforms/InstCombine/X86/blend_x86.ll b/llvm/test/Transforms/InstCombine/X86/blend_x86.ll index fca6840f166ab..36479f7b4e709 100644 --- a/llvm/test/Transforms/InstCombine/X86/blend_x86.ll +++ b/llvm/test/Transforms/InstCombine/X86/blend_x86.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=instcombine -mtriple=x86_64-apple-macosx -mcpu=core-avx2 -use-constant-int-for-fixed-length-splat=false -use-constant-fp-for-fixed-length-splat=false -S | FileCheck %s -; RUN: opt < %s -passes=instcombine -mtriple=x86_64-apple-macosx -mcpu=core-avx2 -use-constant-int-for-fixed-length-splat -use-constant-fp-for-fixed-length-splat -S | FileCheck %s +; RUN: opt < %s -passes=instcombine -mtriple=x86_64-apple-macosx -mcpu=core-avx2 -use-constant-int-for-fixed-length-splat=false -S | FileCheck %s +; RUN: opt < %s -passes=instcombine -mtriple=x86_64-apple-macosx -mcpu=core-avx2 -use-constant-int-for-fixed-length-splat -S | FileCheck %s define <2 x double> @constant_blendvpd(<2 x double> %xy, <2 x double> %ab) { ; CHECK-LABEL: @constant_blendvpd( diff --git a/llvm/test/Transforms/InstCombine/bitcast.ll 
b/llvm/test/Transforms/InstCombine/bitcast.ll index d4c14c442190a..a81eb5000f63c 100644 --- a/llvm/test/Transforms/InstCombine/bitcast.ll +++ b/llvm/test/Transforms/InstCombine/bitcast.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -passes=instcombine -S | FileCheck %s -; RUN: opt < %s -passes=instcombine -use-constant-fp-for-fixed-length-splat -use-constant-int-for-fixed-length-splat -S | FileCheck %s +; RUN: opt < %s -passes=instcombine -use-constant-int-for-fixed-length-splat -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" target triple = "x86_64-apple-darwin10.0.0" diff --git a/llvm/test/Transforms/InstCombine/cast.ll b/llvm/test/Transforms/InstCombine/cast.ll index b82cb23acc29b..8fdf9033b4e4a 100644 --- a/llvm/test/Transforms/InstCombine/cast.ll +++ b/llvm/test/Transforms/InstCombine/cast.ll @@ -2,8 +2,8 @@ ; Tests to make sure elimination of casts is working correctly ; RUN: opt < %s -passes=instcombine -S -data-layout="E-p:64:64:64-p1:32:32:32-p2:64:64:64-p3:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128-n8:16:32:64" | FileCheck %s --check-prefixes=ALL,BE ; RUN: opt < %s -passes=instcombine -S -data-layout="e-p:64:64:64-p1:32:32:32-p2:64:64:64-p3:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128-n8:16:32:64" | FileCheck %s --check-prefixes=ALL,LE -; RUN: opt < %s -passes=instcombine -S -data-layout="E-p:64:64:64-p1:32:32:32-p2:64:64:64-p3:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128-n8:16:32:64" -use-constant-fp-for-fixed-length-splat -use-constant-int-for-fixed-length-splat | FileCheck %s --check-prefixes=ALL,BE -; RUN: opt < %s -passes=instcombine -S 
-data-layout="e-p:64:64:64-p1:32:32:32-p2:64:64:64-p3:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128-n8:16:32:64" -use-constant-fp-for-fixed-length-splat -use-constant-int-for-fixed-length-splat | FileCheck %s --check-prefixes=ALL,LE +; RUN: opt < %s -passes=instcombine -S -data-layout="E-p:64:64:64-p1:32:32:32-p2:64:64:64-p3:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128-n8:16:32:64" -use-constant-int-for-fixed-length-splat | FileCheck %s --check-prefixes=ALL,BE +; RUN: opt < %s -passes=instcombine -S -data-layout="e-p:64:64:64-p1:32:32:32-p2:64:64:64-p3:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128-n8:16:32:64" -use-constant-int-for-fixed-length-splat | FileCheck %s --check-prefixes=ALL,LE declare void @use_i8(i8) declare void @use_i32(i32) diff --git a/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll b/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll index 8f121c6ca6cf2..e117d30d029f0 100644 --- a/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll +++ b/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -passes=instcombine -S | FileCheck %s -; RUN: opt < %s -passes=instcombine -use-constant-fp-for-fixed-length-splat -use-constant-int-for-fixed-length-splat -S | FileCheck %s +; RUN: opt < %s -passes=instcombine -use-constant-int-for-fixed-length-splat -S | FileCheck %s ; (X < C1) ? 
C1 : MIN(X, C2) define float @clamp_float_fast_ordered_strict_maxmin(float %x) { diff --git a/llvm/test/Transforms/InstCombine/constant-vector-insert.ll b/llvm/test/Transforms/InstCombine/constant-vector-insert.ll index 268854054bd7f..3f694d1d11e01 100644 --- a/llvm/test/Transforms/InstCombine/constant-vector-insert.ll +++ b/llvm/test/Transforms/InstCombine/constant-vector-insert.ll @@ -2,9 +2,7 @@ ; RUN: opt -S -passes=instcombine %s | FileCheck %s ; RUN: opt -S -passes=instcombine %s \ ; RUN: -use-constant-int-for-fixed-length-splat \ -; RUN -use-constant-fp-for-fixed-length-splat \ -; RUN: -use-constant-int-for-scalable-splat \ -; RUN: -use-constant-fp-for-scalable-splat | FileCheck %s +; RUN: -use-constant-int-for-scalable-splat | FileCheck %s define @insert_div() { ; CHECK-LABEL: @insert_div( diff --git a/llvm/test/Transforms/InstCombine/extractelement.ll b/llvm/test/Transforms/InstCombine/extractelement.ll index 04a35e19fb0bb..5bd3978b39d82 100644 --- a/llvm/test/Transforms/InstCombine/extractelement.ll +++ b/llvm/test/Transforms/InstCombine/extractelement.ll @@ -4,10 +4,10 @@ ; RUN: opt < %s -passes=instcombine -S -data-layout="E-n64" | FileCheck %s --check-prefixes=ANY,ANYBE,BE64 ; RUN: opt < %s -passes=instcombine -S -data-layout="E-n128" | FileCheck %s --check-prefixes=ANY,ANYBE,BE128 -; RUN: opt < %s -passes=instcombine -S -data-layout="e-n64" -use-constant-fp-for-fixed-length-splat -use-constant-int-for-fixed-length-splat | FileCheck %s --check-prefixes=ANY,ANYLE,LE64 -; RUN: opt < %s -passes=instcombine -S -data-layout="e-n128" -use-constant-fp-for-fixed-length-splat -use-constant-int-for-fixed-length-splat | FileCheck %s --check-prefixes=ANY,ANYLE,LE128 -; RUN: opt < %s -passes=instcombine -S -data-layout="E-n64" -use-constant-fp-for-fixed-length-splat -use-constant-int-for-fixed-length-splat | FileCheck %s --check-prefixes=ANY,ANYBE,BE64 -; RUN: opt < %s -passes=instcombine -S -data-layout="E-n128" -use-constant-fp-for-fixed-length-splat 
-use-constant-int-for-fixed-length-splat | FileCheck %s --check-prefixes=ANY,ANYBE,BE128 +; RUN: opt < %s -passes=instcombine -S -data-layout="e-n64" -use-constant-int-for-fixed-length-splat | FileCheck %s --check-prefixes=ANY,ANYLE,LE64 +; RUN: opt < %s -passes=instcombine -S -data-layout="e-n128" -use-constant-int-for-fixed-length-splat | FileCheck %s --check-prefixes=ANY,ANYLE,LE128 +; RUN: opt < %s -passes=instcombine -S -data-layout="E-n64" -use-constant-int-for-fixed-length-splat | FileCheck %s --check-prefixes=ANY,ANYBE,BE64 +; RUN: opt < %s -passes=instcombine -S -data-layout="E-n128" -use-constant-int-for-fixed-length-splat | FileCheck %s --check-prefixes=ANY,ANYBE,BE128 define i32 @extractelement_out_of_range(<2 x i32> %x) { ; ANY-LABEL: @extractelement_out_of_range( diff --git a/llvm/test/Transforms/InstCombine/fabs-fneg-fold.ll b/llvm/test/Transforms/InstCombine/fabs-fneg-fold.ll index b77d6b51f9220..dd8d0aed3210e 100644 --- a/llvm/test/Transforms/InstCombine/fabs-fneg-fold.ll +++ b/llvm/test/Transforms/InstCombine/fabs-fneg-fold.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -S -passes=instcombine %s | FileCheck %s -; RUN: opt -S -passes=instcombine -use-constant-fp-for-fixed-length-splat %s | FileCheck %s define float @fabs_fneg_basic(float %x) { ; CHECK-LABEL: define float @fabs_fneg_basic( diff --git a/llvm/test/Transforms/InstCombine/fadd.ll b/llvm/test/Transforms/InstCombine/fadd.ll index b6d9360e5def9..0291fdcdd17cd 100644 --- a/llvm/test/Transforms/InstCombine/fadd.ll +++ b/llvm/test/Transforms/InstCombine/fadd.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -passes=instcombine -S | FileCheck %s -; RUN: opt < %s -passes=instcombine -use-constant-fp-for-fixed-length-splat -S | FileCheck %s declare void @use(float) declare void @use_vec(<2 x float>) diff --git a/llvm/test/Transforms/InstCombine/fdiv.ll 
b/llvm/test/Transforms/InstCombine/fdiv.ll index 256bafd70ec9a..e992ba59b8574 100644 --- a/llvm/test/Transforms/InstCombine/fdiv.ll +++ b/llvm/test/Transforms/InstCombine/fdiv.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -passes=instcombine < %s | FileCheck %s -; RUN: opt -S -passes=instcombine -use-constant-fp-for-fixed-length-splat < %s | FileCheck %s declare float @llvm.fabs.f32(float) nounwind readnone declare float @llvm.pow.f32(float, float) nounwind readnone diff --git a/llvm/test/Transforms/InstCombine/fmul.ll b/llvm/test/Transforms/InstCombine/fmul.ll index ef802e111ce1e..c93982f814e43 100644 --- a/llvm/test/Transforms/InstCombine/fmul.ll +++ b/llvm/test/Transforms/InstCombine/fmul.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -passes=instcombine < %s | FileCheck %s -; RUN: opt -S -passes=instcombine -use-constant-fp-for-fixed-length-splat < %s | FileCheck %s ; (-0.0 - X) * C => X * -C define float @neg_constant(float %x) { diff --git a/llvm/test/Transforms/InstCombine/fneg.ll b/llvm/test/Transforms/InstCombine/fneg.ll index dfcd7e992a18d..ee947130d9080 100644 --- a/llvm/test/Transforms/InstCombine/fneg.ll +++ b/llvm/test/Transforms/InstCombine/fneg.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -passes=instcombine -S | FileCheck %s -; RUN: opt < %s -passes=instcombine -use-constant-fp-for-fixed-length-splat -S | FileCheck %s declare float @llvm.ldexp.f32.i32(float, i32) declare <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float>, <2 x i32>) diff --git a/llvm/test/Transforms/InstCombine/fpextend.ll b/llvm/test/Transforms/InstCombine/fpextend.ll index ffbdf3c033874..28599613fd159 100644 --- a/llvm/test/Transforms/InstCombine/fpextend.ll +++ b/llvm/test/Transforms/InstCombine/fpextend.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; 
RUN: opt < %s -passes=instcombine -S | FileCheck %s -; RUN: opt < %s -passes=instcombine -use-constant-fp-for-fixed-length-splat -S | FileCheck %s define float @test(float %x) nounwind { ; CHECK-LABEL: @test( diff --git a/llvm/test/Transforms/InstCombine/load-store-forward.ll b/llvm/test/Transforms/InstCombine/load-store-forward.ll index 6c10c096143fa..6a0897ff75036 100644 --- a/llvm/test/Transforms/InstCombine/load-store-forward.ll +++ b/llvm/test/Transforms/InstCombine/load-store-forward.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -passes=instcombine -use-constant-int-for-scalable-splat=false -use-constant-fp-for-scalable-splat=false < %s | FileCheck %s --check-prefixes=CHECK,CHECK-CV,LITTLE,LITTLE-CV -; RUN: opt -S -passes=instcombine -use-constant-int-for-scalable-splat -use-constant-fp-for-scalable-splat < %s | FileCheck %s --check-prefixes=CHECK,CHECK-CI,LITTLE,LITTLE-CI -; RUN: opt -S -passes=instcombine -data-layout="E" -use-constant-int-for-scalable-splat=false -use-constant-fp-for-scalable-splat=false < %s | FileCheck %s --check-prefixes=CHECK,CHECK-CV,BIG,BIG-CV -; RUN: opt -S -passes=instcombine -data-layout="E" -use-constant-int-for-scalable-splat -use-constant-fp-for-scalable-splat < %s | FileCheck %s --check-prefixes=CHECK,CHECK-CI,BIG,BIG-CI +; RUN: opt -S -passes=instcombine -use-constant-int-for-scalable-splat=false < %s | FileCheck %s --check-prefixes=CHECK,CHECK-CV,LITTLE,LITTLE-CV +; RUN: opt -S -passes=instcombine -use-constant-int-for-scalable-splat < %s | FileCheck %s --check-prefixes=CHECK,CHECK-CI,LITTLE,LITTLE-CI +; RUN: opt -S -passes=instcombine -data-layout="E" -use-constant-int-for-scalable-splat=false < %s | FileCheck %s --check-prefixes=CHECK,CHECK-CV,BIG,BIG-CV +; RUN: opt -S -passes=instcombine -data-layout="E" -use-constant-int-for-scalable-splat < %s | FileCheck %s --check-prefixes=CHECK,CHECK-CI,BIG,BIG-CI define i8 @load_smaller_int(ptr %p) { ; LITTLE-LABEL: 
@load_smaller_int( @@ -160,16 +160,10 @@ entry: } define float @load_f32_store_nxv4f32(ptr %a) { -; CHECK-CV-LABEL: @load_f32_store_nxv4f32( -; CHECK-CV-NEXT: entry: -; CHECK-CV-NEXT: store splat (float 1.000000e+00), ptr [[A:%.*]], align 16 -; CHECK-CV-NEXT: [[TMP0:%.*]] = load float, ptr [[A]], align 4 -; CHECK-CV-NEXT: ret float [[TMP0]] -; -; CHECK-CI-LABEL: @load_f32_store_nxv4f32( -; CHECK-CI-NEXT: entry: -; CHECK-CI-NEXT: store splat (float 1.000000e+00), ptr [[A:%.*]], align 16 -; CHECK-CI-NEXT: ret float 1.000000e+00 +; CHECK-LABEL: @load_f32_store_nxv4f32( +; CHECK-NEXT: entry: +; CHECK-NEXT: store splat (float 1.000000e+00), ptr [[A:%.*]], align 16 +; CHECK-NEXT: ret float 1.000000e+00 ; entry: store splat (float 1.0), ptr %a, align 16 @@ -178,16 +172,10 @@ entry: } define i32 @load_i32_store_nxv4f32(ptr %a) { -; CHECK-CV-LABEL: @load_i32_store_nxv4f32( -; CHECK-CV-NEXT: entry: -; CHECK-CV-NEXT: store splat (float 1.000000e+00), ptr [[A:%.*]], align 16 -; CHECK-CV-NEXT: [[LOAD:%.*]] = load i32, ptr [[A]], align 4 -; CHECK-CV-NEXT: ret i32 [[LOAD]] -; -; CHECK-CI-LABEL: @load_i32_store_nxv4f32( -; CHECK-CI-NEXT: entry: -; CHECK-CI-NEXT: store splat (float 1.000000e+00), ptr [[A:%.*]], align 16 -; CHECK-CI-NEXT: ret i32 1065353216 +; CHECK-LABEL: @load_i32_store_nxv4f32( +; CHECK-NEXT: entry: +; CHECK-NEXT: store splat (float 1.000000e+00), ptr [[A:%.*]], align 16 +; CHECK-NEXT: ret i32 1065353216 ; entry: store splat (float 1.0), ptr %a, align 16 diff --git a/llvm/test/Transforms/InstCombine/memcmp-constant-fold.ll b/llvm/test/Transforms/InstCombine/memcmp-constant-fold.ll index 4744be247d9bf..9e566270999db 100644 --- a/llvm/test/Transforms/InstCombine/memcmp-constant-fold.ll +++ b/llvm/test/Transforms/InstCombine/memcmp-constant-fold.ll @@ -2,8 +2,8 @@ ; RUN: opt < %s -passes=instcombine -S -data-layout=e-n32 | FileCheck %s --check-prefix=ALL --check-prefix=LE ; RUN: opt < %s -passes=instcombine -S -data-layout=E-n32 | FileCheck %s --check-prefix=ALL 
--check-prefix=BE -; RUN: opt < %s -passes=instcombine -S -data-layout=e-n32 -use-constant-int-for-fixed-length-splat -use-constant-fp-for-fixed-length-splat | FileCheck %s --check-prefix=ALL --check-prefix=LE -; RUN: opt < %s -passes=instcombine -S -data-layout=E-n32 -use-constant-int-for-fixed-length-splat -use-constant-fp-for-fixed-length-splat | FileCheck %s --check-prefix=ALL --check-prefix=BE +; RUN: opt < %s -passes=instcombine -S -data-layout=e-n32 -use-constant-int-for-fixed-length-splat | FileCheck %s --check-prefix=ALL --check-prefix=LE +; RUN: opt < %s -passes=instcombine -S -data-layout=E-n32 -use-constant-int-for-fixed-length-splat | FileCheck %s --check-prefix=ALL --check-prefix=BE declare i32 @memcmp(ptr, ptr, i64) diff --git a/llvm/test/Transforms/InstSimplify/bitcast-vector-fold.ll b/llvm/test/Transforms/InstSimplify/bitcast-vector-fold.ll index 36ac86d4bb49d..88f34cabc6188 100644 --- a/llvm/test/Transforms/InstSimplify/bitcast-vector-fold.ll +++ b/llvm/test/Transforms/InstSimplify/bitcast-vector-fold.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -passes=instsimplify -S -data-layout="e-p:32:32:32-i1:8:8-i8:8:8-f64:32:64-v64:64:64-v128:128:128" | FileCheck %s --check-prefixes=CHECK,LE -; RUN: opt < %s -passes=instsimplify -use-constant-fp-for-fixed-length-splat -use-constant-int-for-fixed-length-splat -S -data-layout="e-p:32:32:32-i1:8:8-i8:8:8-f64:32:64-v64:64:64-v128:128:128" | FileCheck %s --check-prefixes=CHECK,LE +; RUN: opt < %s -passes=instsimplify -use-constant-int-for-fixed-length-splat -S -data-layout="e-p:32:32:32-i1:8:8-i8:8:8-f64:32:64-v64:64:64-v128:128:128" | FileCheck %s --check-prefixes=CHECK,LE ; RUN: opt < %s -passes=instsimplify -S -data-layout="E-p:32:32:32-i1:8:8-i8:8:8-f64:32:64-v64:64:64-v128:128:128" | FileCheck %s --check-prefixes=CHECK,BE define <2 x i64> @test1() { diff --git a/llvm/test/Transforms/InstSimplify/constant-fold-fp-denormal.ll 
b/llvm/test/Transforms/InstSimplify/constant-fold-fp-denormal.ll index 0162c803e2460..24da8b2a03231 100644 --- a/llvm/test/Transforms/InstSimplify/constant-fold-fp-denormal.ll +++ b/llvm/test/Transforms/InstSimplify/constant-fold-fp-denormal.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -S -passes=instsimplify < %s | FileCheck %s -; RUN: opt -S -passes=instsimplify -use-constant-fp-for-fixed-length-splat < %s | FileCheck %s ; Test cases for denormal handling mode when constant folding floating point ; operations. Input and output modes are checked separately. diff --git a/llvm/test/Transforms/InstSimplify/extract-element.ll b/llvm/test/Transforms/InstSimplify/extract-element.ll index 7d30805f4fdc7..fa0970a0de4e8 100644 --- a/llvm/test/Transforms/InstSimplify/extract-element.ll +++ b/llvm/test/Transforms/InstSimplify/extract-element.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -passes=instsimplify -S | FileCheck %s -; RUN: opt < %s -passes=instsimplify -use-constant-fp-for-fixed-length-splat -use-constant-int-for-fixed-length-splat -S | FileCheck %s +; RUN: opt < %s -passes=instsimplify -use-constant-int-for-fixed-length-splat -S | FileCheck %s ; Weird Types diff --git a/llvm/test/Transforms/MemCpyOpt/store-to-memset-constant-splat.ll b/llvm/test/Transforms/MemCpyOpt/store-to-memset-constant-splat.ll index 55c68fad8dfaa..733d2b32753cc 100644 --- a/llvm/test/Transforms/MemCpyOpt/store-to-memset-constant-splat.ll +++ b/llvm/test/Transforms/MemCpyOpt/store-to-memset-constant-splat.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -S -passes=memcpyopt < %s | FileCheck %s -; RUN: opt -S -passes=memcpyopt -use-constant-int-for-fixed-length-splat -use-constant-fp-for-fixed-length-splat < %s | FileCheck %s +; RUN: opt -S -passes=memcpyopt -use-constant-int-for-fixed-length-splat 
< %s | FileCheck %s define void @store_to_memst_vec_constant_int(ptr %p) { ; CHECK-LABEL: define void @store_to_memst_vec_constant_int( diff --git a/llvm/test/Transforms/SimplifyCFG/AArch64/switch-to-lookup-table-vector-constants.ll b/llvm/test/Transforms/SimplifyCFG/AArch64/switch-to-lookup-table-vector-constants.ll index 40d37bb943dcc..6e71f72efa380 100644 --- a/llvm/test/Transforms/SimplifyCFG/AArch64/switch-to-lookup-table-vector-constants.ll +++ b/llvm/test/Transforms/SimplifyCFG/AArch64/switch-to-lookup-table-vector-constants.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 ; RUN: opt -passes='simplifycfg' -S < %s | FileCheck %s -; RUN: opt -passes='simplifycfg' -use-constant-fp-for-fixed-length-splat -S < %s | FileCheck %s target triple = "aarch64-unknown-linux-gnu" diff --git a/mlir/test/Target/LLVMIR/Import/constant.ll b/mlir/test/Target/LLVMIR/Import/constant.ll index 042792d7d5c8a..cd4f79ce741bc 100644 --- a/mlir/test/Target/LLVMIR/Import/constant.ll +++ b/mlir/test/Target/LLVMIR/Import/constant.ll @@ -1,5 +1,5 @@ ; RUN: mlir-translate -import-llvm -split-input-file %s | FileCheck %s -; RUN: mlir-translate -import-llvm -split-input-file --use-constant-int-for-fixed-length-splat --use-constant-fp-for-fixed-length-splat %s | FileCheck %s +; RUN: mlir-translate -import-llvm -split-input-file --use-constant-int-for-fixed-length-splat %s | FileCheck %s ; CHECK-LABEL: @int_constants define void @int_constants(i16 %arg0, i32 %arg1, i1 %arg2) { From adb5802b494c98003155635fd33432f788557227 Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Thu, 14 May 2026 11:47:03 +0100 Subject: [PATCH 56/95] [LV] Avoid crashing for vector calls with scalar byte types (#197417) If a parameter to a vector function variant is uniform or linear, check whether the type is SCEVable first. Byte types aren't, so would cause an assert. We could improve this later if needed. 
--- .../Transforms/Vectorize/LoopVectorize.cpp | 6 +- .../byte-type-function-variants.ll | 212 ++++++++++++++++++ 2 files changed, 216 insertions(+), 2 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/byte-type-function-variants.ll diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 7213d4ae795ec..9632e03331411 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5011,7 +5011,8 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) { case VFParamKind::OMP_Uniform: { Value *ScalarParam = CI->getArgOperand(Param.ParamPos); // Make sure the scalar parameter in the loop is invariant. - if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam), + if (!PSE.getSE()->isSCEVable(ScalarParam->getType()) || + !PSE.getSE()->isLoopInvariant(PSE.getSCEV(ScalarParam), TheLoop)) ParamsOk = false; break; @@ -5023,7 +5024,8 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) { // TODO: do we need to figure out the cost of an extract to get the // first lane? Or do we hope that it will be folded away? 
ScalarEvolution *SE = PSE.getSE(); - if (!match(SE->getSCEV(ScalarParam), + if (!SE->isSCEVable(ScalarParam->getType()) || + !match(SE->getSCEV(ScalarParam), m_scev_AffineAddRec( m_SCEV(), m_scev_SpecificSInt(Param.LinearStepOrPos), m_SpecificLoop(TheLoop)))) diff --git a/llvm/test/Transforms/LoopVectorize/byte-type-function-variants.ll b/llvm/test/Transforms/LoopVectorize/byte-type-function-variants.ll new file mode 100644 index 0000000000000..80f98522d2c8f --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/byte-type-function-variants.ll @@ -0,0 +1,212 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 +; RUN: opt < %s -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -S | FileCheck %s + +; Safe to vectorize normally. +define void @byte_type_vector(ptr noalias %dst, ptr readonly %src, i64 %n) { +; CHECK-LABEL: define void @byte_type_vector( +; CHECK-SAME: ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr b64, ptr [[SRC]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x b64>, ptr [[TMP0]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x b64> @foo_vector(<2 x b64> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds b64, ptr [[DST]], i64 [[INDEX]] +; CHECK-NEXT: store <2 x b64> [[TMP1]], ptr [[TMP2]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq 
i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr b64, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[DATA:%.*]] = load b64, ptr [[GEP_SRC]], align 8 +; CHECK-NEXT: [[CALL:%.*]] = call b64 @foo(b64 [[DATA]]) #[[ATTR0:[0-9]+]] +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds b64, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: store b64 [[CALL]], ptr [[GEP_DST]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %gep.src = getelementptr b64, ptr %src, i64 %iv + %data = load b64, ptr %gep.src, align 8 + %call = call b64 @foo(b64 %data) #0 + %gep.dst = getelementptr inbounds b64, ptr %dst, i64 %iv + store b64 %call, ptr %gep.dst + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cond = icmp eq i64 %iv.next, %n + br i1 %exit.cond, label %exit, label %for.body + +exit: + ret void +} + +; Currently scalarized unnecessarily, since we can't form a SCEV from a byte type. +; TODO: Perform the loop-invariance check differently; e.g. the uniform term +; here is a function parameter. 
+define void @byte_type_uniform(ptr noalias %dst, ptr readonly %src, b64 %uniform, i64 %n) { +; CHECK-LABEL: define void @byte_type_uniform( +; CHECK-SAME: ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], b64 [[UNIFORM:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr b64, ptr [[SRC]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x b64>, ptr [[TMP0]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x b64> [[WIDE_LOAD]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x b64> [[WIDE_LOAD]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = call b64 @bar(b64 [[TMP1]], b64 [[UNIFORM]]) #[[ATTR1:[0-9]+]] +; CHECK-NEXT: [[TMP4:%.*]] = call b64 @bar(b64 [[TMP2]], b64 [[UNIFORM]]) #[[ATTR1]] +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x b64> poison, b64 [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x b64> [[TMP5]], b64 [[TMP4]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds b64, ptr [[DST]], i64 [[INDEX]] +; CHECK-NEXT: store <2 x b64> [[TMP6]], ptr [[TMP7]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi 
i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr b64, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[DATA:%.*]] = load b64, ptr [[GEP_SRC]], align 8 +; CHECK-NEXT: [[CALL:%.*]] = call b64 @bar(b64 [[DATA]], b64 [[UNIFORM]]) #[[ATTR1]] +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds b64, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: store b64 [[CALL]], ptr [[GEP_DST]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %gep.src = getelementptr b64, ptr %src, i64 %iv + %data = load b64, ptr %gep.src, align 8 + %call = call b64 @bar(b64 %data, b64 %uniform) #1 + %gep.dst = getelementptr inbounds b64, ptr %dst, i64 %iv + store b64 %call, ptr %gep.dst + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cond = icmp eq i64 %iv.next, %n + br i1 %exit.cond, label %exit, label %for.body + +exit: + ret void +} + +; Must be scalarized, since we can't form a SCEV from a byte type. +; We can't do arithmetic operations directly on byte types, but we can cast +; from integers. 
+define void @byte_type_linear(ptr noalias %dst, ptr readonly %src, i64 %n) { +; CHECK-LABEL: define void @byte_type_linear( +; CHECK-SAME: ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 2 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr b64, ptr [[SRC]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x b64>, ptr [[TMP0]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x b64> [[WIDE_LOAD]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x b64> [[WIDE_LOAD]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i64> [[VEC_IND]] to <2 x b64> +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x b64> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x b64> [[TMP3]], i64 1 +; CHECK-NEXT: [[TMP6:%.*]] = call b64 @baz(b64 [[TMP1]], b64 [[TMP4]]) #[[ATTR2:[0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = call b64 @baz(b64 [[TMP2]], b64 [[TMP5]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x b64> poison, b64 [[TMP6]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x b64> [[TMP8]], b64 [[TMP7]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds b64, ptr [[DST]], i64 [[INDEX]] +; CHECK-NEXT: store <2 x b64> [[TMP9]], ptr [[TMP10]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw <2 x i64> [[VEC_IND]], splat (i64 2) +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq 
i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP11]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[FOR_BODY:.*]] +; CHECK: [[FOR_BODY]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr b64, ptr [[SRC]], i64 [[IV]] +; CHECK-NEXT: [[DATA:%.*]] = load b64, ptr [[GEP_SRC]], align 8 +; CHECK-NEXT: [[LINEAR:%.*]] = bitcast i64 [[IV]] to b64 +; CHECK-NEXT: [[CALL:%.*]] = call b64 @baz(b64 [[DATA]], b64 [[LINEAR]]) #[[ATTR2]] +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds b64, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: store b64 [[CALL]], ptr [[GEP_DST]], align 4 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %gep.src = getelementptr b64, ptr %src, i64 %iv + %data = load b64, ptr %gep.src, align 8 + %linear = bitcast i64 %iv to b64 + %call = call b64 @baz(b64 %data, b64 %linear) #2 + %gep.dst = getelementptr inbounds b64, ptr %dst, i64 %iv + store b64 %call, ptr %gep.dst + %iv.next = add nuw nsw i64 %iv, 1 + %exit.cond = icmp eq i64 %iv.next, %n + br i1 %exit.cond, label %exit, label %for.body + +exit: + ret void +} + +; Scalar functions +declare b64 @foo(b64) +declare b64 @bar(b64, b64) +declare b64 @baz(b64, b64) + +; Vector variants +declare <2 x b64> @foo_vector(<2 x b64>) +declare <2 x b64> 
@bar_vector_uniform(<2 x b64>, b64) +declare <2 x b64> @baz_vector_linear(<2 x b64>, b64) + +; Mappings +attributes #0 = { nounwind "vector-function-abi-variant"="_ZGV_LLVM_N2v_foo(foo_vector)" } +attributes #1 = { nounwind "vector-function-abi-variant"="_ZGV_LLVM_N2vu_bar(bar_vector_uniform)" } +attributes #2 = { nounwind "vector-function-abi-variant"="_ZGV_LLVM_N2vl1_baz(baz_vector_linear)" } From 41416303ea0b2733db81d96c9e4d4cd36949c463 Mon Sep 17 00:00:00 2001 From: Jonathan Thackray Date: Thu, 14 May 2026 11:47:56 +0100 Subject: [PATCH 57/95] [AArch64][clang] Improve -march= error message with many feature flags (#197441) When calling `clang` with a large number of feature flags, the entire argument is printed as an error message if one of the feature flags is invalid. For example, before this change, when providing a large number of features to `-march=` with one of them invalid, an error message such as this is printed: ``` clang: error: unsupported argument 'armv9.6a+sme2+sme2p1+sve2+sve2p1+profile +crypto+aes+sha2+sha3+sm4+memtag+ssbs+bf16+i8mm+dotprod+ls64+rcpc3+brbe+gcs +faminmax+fp8+fp8fma+fp8dot4+fp8dot2+sme-f8f32+the+lut+lsui+pops+occmo +rme-gpc3+d128+invalidfeature' ``` and a user doesn't know which of the `+feature` flags is actually invalid. After this change, the following error message is printed: ``` clang: error: unsupported argument '+invalidfeature' to option '-march=' ``` clearly printing out the first invalid feature flag found. 
--- clang/lib/Driver/ToolChains/Arch/AArch64.cpp | 39 ++++++++++++++------ clang/test/Driver/aarch64-march.c | 7 ++++ 2 files changed, 34 insertions(+), 12 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Arch/AArch64.cpp b/clang/lib/Driver/ToolChains/Arch/AArch64.cpp index 93fdbd17d1a43..f42465da14a71 100644 --- a/clang/lib/Driver/ToolChains/Arch/AArch64.cpp +++ b/clang/lib/Driver/ToolChains/Arch/AArch64.cpp @@ -137,8 +137,10 @@ aarch64::getAArch64TargetTuneCPU(const llvm::opt::ArgList &Args, } // Decode AArch64 features from string like +[no]featureA+[no]featureB+... -static bool DecodeAArch64Features(const Driver &D, StringRef text, - llvm::AArch64::ExtensionSet &Extensions) { +static bool +DecodeAArch64Features(const Driver &D, StringRef text, + llvm::AArch64::ExtensionSet &Extensions, + std::optional *InvalidArg = nullptr) { SmallVector Split; text.split(Split, StringRef("+"), -1, false); @@ -147,8 +149,11 @@ static bool DecodeAArch64Features(const Driver &D, StringRef text, D.Diag(clang::diag::err_drv_no_neon_modifier); continue; } - if (!Extensions.parseModifier(Feature)) + if (!Extensions.parseModifier(Feature)) { + if (InvalidArg) + InvalidArg->emplace(("+" + Feature).str()); return false; + } } return true; @@ -203,7 +208,8 @@ static bool DecodeAArch64Mcpu(const Driver &D, StringRef Mcpu, static bool getAArch64ArchFeaturesFromMarch(const Driver &D, StringRef March, const ArgList &Args, - llvm::AArch64::ExtensionSet &Extensions) { + llvm::AArch64::ExtensionSet &Extensions, + std::optional &InvalidArg) { std::string MarchLowerCase = March.lower(); std::pair Split = StringRef(MarchLowerCase).split("+"); @@ -212,13 +218,15 @@ getAArch64ArchFeaturesFromMarch(const Driver &D, StringRef March, const llvm::AArch64::ArchInfo *ArchInfo = llvm::AArch64::parseArch(Split.first); - if (!ArchInfo) + if (!ArchInfo) { + InvalidArg.emplace(Split.first.str()); return false; + } Extensions.addArchDefaults(*ArchInfo); if ((Split.second.size() && - 
!DecodeAArch64Features(D, Split.second, Extensions))) + !DecodeAArch64Features(D, Split.second, Extensions, &InvalidArg))) return false; return true; @@ -253,6 +261,7 @@ void aarch64::getAArch64TargetFeatures(const Driver &D, bool ForAS, bool ForMultilib) { Arg *A; bool success = true; + std::optional InvalidArg; llvm::StringRef WaMArch; llvm::AArch64::ExtensionSet Extensions; if (ForAS) @@ -265,10 +274,11 @@ void aarch64::getAArch64TargetFeatures(const Driver &D, // "-Xassembler -march" is detected. Otherwise it may return false // and causes Clang to error out. if (!WaMArch.empty()) - success = getAArch64ArchFeaturesFromMarch(D, WaMArch, Args, Extensions); + success = getAArch64ArchFeaturesFromMarch(D, WaMArch, Args, Extensions, + InvalidArg); else if ((A = Args.getLastArg(options::OPT_march_EQ))) - success = - getAArch64ArchFeaturesFromMarch(D, A->getValue(), Args, Extensions); + success = getAArch64ArchFeaturesFromMarch(D, A->getValue(), Args, + Extensions, InvalidArg); else if ((A = Args.getLastArg(options::OPT_mcpu_EQ))) success = getAArch64ArchFeaturesFromMcpu(D, A->getValue(), Args, Extensions); @@ -277,7 +287,8 @@ void aarch64::getAArch64TargetFeatures(const Driver &D, D, getAArch64TargetCPUByTriple(Triple), Args, Extensions); else // Default to 'A' profile if the architecture is not specified. - success = getAArch64ArchFeaturesFromMarch(D, "armv8-a", Args, Extensions); + success = getAArch64ArchFeaturesFromMarch(D, "armv8-a", Args, Extensions, + InvalidArg); if (success && (A = Args.getLastArg(options::OPT_mtune_EQ))) success = getAArch64MicroArchFeaturesFromMtune(D, A->getValue(), Args); @@ -292,10 +303,14 @@ void aarch64::getAArch64TargetFeatures(const Driver &D, auto Diag = D.Diag(diag::err_drv_unsupported_option_argument); // If "-Wa,-march=" is used, 'WaMArch' will contain the argument's value, // while 'A' is uninitialized. Only dereference 'A' in the other case. 
- if (!WaMArch.empty()) + if (!WaMArch.empty() && InvalidArg) + Diag << "-march=" << *InvalidArg; + else if (!WaMArch.empty()) Diag << "-march=" << WaMArch; - else + else if (!InvalidArg) Diag << A->getSpelling() << A->getValue(); + else + Diag << A->getSpelling() << *InvalidArg; } // -mgeneral-regs-only disables all floating-point features. diff --git a/clang/test/Driver/aarch64-march.c b/clang/test/Driver/aarch64-march.c index 132cea0e2c624..d2cea0e50def3 100644 --- a/clang/test/Driver/aarch64-march.c +++ b/clang/test/Driver/aarch64-march.c @@ -29,3 +29,10 @@ // RUN: %clang --target=aarch64_be -mbig-endian -march=ARMv8.1a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV81A-BE %s // RUN: %clang --target=aarch64_be -mbig-endian -march=ARMV8.1-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV81A-BE %s // GENERICV81A-BE: "-cc1"{{.*}} "-triple" "aarch64_be{{.*}}" "-target-cpu" "generic" "-target-feature" "+v8.1a" + +// ================== Check whether -march diagnoses the first invalid argument. 
+// RUN: not %clang --target=aarch64 -march=armv9.6-a+sme2+sme2p1+sve2+sve2p1+badfeature+aes+sha2+memtag+bf16 %s -### -c 2>&1 | FileCheck -check-prefix=INVALID-EXT %s +// RUN: not %clang --target=aarch64 -march=notanarch+sme2 %s -### -c 2>&1 | FileCheck -check-prefix=INVALID-ARCH %s +// INVALID-EXT: error: unsupported argument '+badfeature' to option '-march=' +// INVALID-EXT-NOT: armv9.6-a+sme2+sme2p1+sve2+sve2p1+badfeature+aes+sha2+memtag+bf16 +// INVALID-ARCH: error: unsupported argument 'notanarch' to option '-march=' From cc9176311ce572ccdfced32da3e483196bae85ad Mon Sep 17 00:00:00 2001 From: Jeff Bailey Date: Thu, 14 May 2026 10:47:59 +0000 Subject: [PATCH 58/95] Fix misspelling of 'llvm' (#197649) --- .github/CODEOWNERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 6599dea8a3b34..6537af30fb970 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -11,7 +11,7 @@ # See https://llvm.org/docs/DeveloperPolicy.html#maintainers as well as the # Maintainers.* files in the the respective subproject directories. 
-/libc/ @llbm/reviewers-libc +/libc/ @llvm/reviewers-libc /libcxx/ @llvm/reviewers-libcxx /libcxxabi/ @llvm/reviewers-libcxxabi /libunwind/ @llvm/reviewers-libunwind From 146533e064441415210092c23832064ac4b38e80 Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Thu, 14 May 2026 12:51:27 +0200 Subject: [PATCH 59/95] [libc] Fix for SYS_mmap2 offset computation (#197413) The comment implies that the offset argument is a multiple of page size, but [this](https://github.com/torvalds/linux/blob/1d5dcaa3bd65f2e8c9baa14a393d3a2dc5db7524/arch/csky/kernel/syscall.c#L25) [is](https://github.com/torvalds/linux/blob/1d5dcaa3bd65f2e8c9baa14a393d3a2dc5db7524/arch/parisc/kernel/sys_parisc.c#L193) [not](https://github.com/torvalds/linux/blob/1d5dcaa3bd65f2e8c9baa14a393d3a2dc5db7524/arch/microblaze/kernel/sys_microblaze.c#L50) [the](https://github.com/torvalds/linux/blob/1d5dcaa3bd65f2e8c9baa14a393d3a2dc5db7524/arch/riscv/kernel/sys_riscv.c#L48) [case](https://github.com/torvalds/linux/blob/1d5dcaa3bd65f2e8c9baa14a393d3a2dc5db7524/arch/arm64/kernel/sys32.c#L47) [for](https://github.com/torvalds/linux/blob/1d5dcaa3bd65f2e8c9baa14a393d3a2dc5db7524/arch/sparc/kernel/sys_sparc_32.c#L113) [almost](https://github.com/torvalds/linux/blob/1d5dcaa3bd65f2e8c9baa14a393d3a2dc5db7524/arch/m68k/kernel/sys_m68k.c#L44) [every](https://github.com/torvalds/linux/blob/1d5dcaa3bd65f2e8c9baa14a393d3a2dc5db7524/arch/powerpc/kernel/syscalls.c#L56) [architecture](https://github.com/torvalds/linux/blob/1d5dcaa3bd65f2e8c9baa14a393d3a2dc5db7524/arch/sh/kernel/sys_sh.c#L46) [supported](https://github.com/torvalds/linux/blob/1d5dcaa3bd65f2e8c9baa14a393d3a2dc5db7524/arch/powerpc/kernel/syscalls.c#L56) [by](https://github.com/torvalds/linux/blob/1d5dcaa3bd65f2e8c9baa14a393d3a2dc5db7524/arch/mips/kernel/syscall.c#L76) [linux](https://github.com/torvalds/linux/blob/1d5dcaa3bd65f2e8c9baa14a393d3a2dc5db7524/arch/arm/kernel/entry-common.S#L410). Most architectures just use fixed 4k units instead. 
x86 code does not reference the page sizes anywhere, but x86 always has a 4k (base) page size. The same is probably true for Nios II, though information is a bit scarce on that one. I believe openrisc uses fixed 8k units, but this should be confirmed when/if we're porting to that architecture. Itanium had configurable page sizes and did not use fixed mmap2 scaling, but itanium support was removed from the kernel two years ago. Hexagon and ARC may be the only currently supported architectures that do the same, but this is also best confirmed during porting. This patch is a no-op for any architecture we currently support, but it makes it clear that this behavior is deliberate, and not something to be fixed. This code was introduced way back in [2019](https://reviews.llvm.org/D71634) and while the review contained a discussion on page sizes, it revolved around the right way to obtain it, rather than the question of whether it is actually necessary. --- .../__support/OSUtil/linux/syscall_wrappers/mmap.h | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/libc/src/__support/OSUtil/linux/syscall_wrappers/mmap.h b/libc/src/__support/OSUtil/linux/syscall_wrappers/mmap.h index 983973e70e205..15a93de9e40bf 100644 --- a/libc/src/__support/OSUtil/linux/syscall_wrappers/mmap.h +++ b/libc/src/__support/OSUtil/linux/syscall_wrappers/mmap.h @@ -14,7 +14,6 @@ #include "src/__support/common.h" #include "src/__support/error_or.h" #include "src/__support/macros/config.h" -#include // For EXEC_PAGESIZE #include // For syscall numbers namespace LIBC_NAMESPACE_DECL { @@ -25,12 +24,12 @@ LIBC_INLINE ErrorOr mmap(void *addr, size_t size, int prot, int flags, // TODO: Perform POSIX-prescribed argument validation not done by the // linux syscall. - // EXEC_PAGESIZE is used for the page size. While this is OK for x86_64, - // it might not be correct in general. - // TODO: Use pagesize read from the ELF aux vector instead of - // EXEC_PAGESIZE.
#ifdef SYS_mmap2 - offset /= EXEC_PAGESIZE; + // The mmap2 syscall uses 4k units, regardless of the actual page size, on + // almost every architecture. If porting to a new architecture (Openrisc, + // hexagon?), please confirm this code is correct. + constexpr off_t MMAP2_FACTOR = 4096; + offset /= MMAP2_FACTOR; long syscall_number = SYS_mmap2; #elif defined(SYS_mmap) long syscall_number = SYS_mmap; From c9e71400df2b84a0bb4e94eb9547ebfdcd41ac39 Mon Sep 17 00:00:00 2001 From: Jonathan Thackray Date: Thu, 14 May 2026 11:59:56 +0100 Subject: [PATCH 60/95] [AArch64][clang] Improve -mcpu= and -mtune= error messages too (#197640) Similar to my previous change improving the error message for `-march=` in #197441, this changes `-mcpu=` and `-mtune=` arguments to only report the first invalid feature flag, rather than the entire string. This is a much clearer error message for the user. --- clang/lib/Driver/ToolChains/Arch/AArch64.cpp | 60 +++++++++++--------- clang/test/Driver/aarch64-march.c | 13 +++++ 2 files changed, 47 insertions(+), 26 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Arch/AArch64.cpp b/clang/lib/Driver/ToolChains/Arch/AArch64.cpp index f42465da14a71..7ed4002e53420 100644 --- a/clang/lib/Driver/ToolChains/Arch/AArch64.cpp +++ b/clang/lib/Driver/ToolChains/Arch/AArch64.cpp @@ -137,10 +137,9 @@ aarch64::getAArch64TargetTuneCPU(const llvm::opt::ArgList &Args, } // Decode AArch64 features from string like +[no]featureA+[no]featureB+...
-static bool -DecodeAArch64Features(const Driver &D, StringRef text, - llvm::AArch64::ExtensionSet &Extensions, - std::optional *InvalidArg = nullptr) { +static bool DecodeAArch64Features(const Driver &D, StringRef text, + llvm::AArch64::ExtensionSet &Extensions, + std::optional &InvalidArg) { SmallVector Split; text.split(Split, StringRef("+"), -1, false); @@ -150,8 +149,7 @@ DecodeAArch64Features(const Driver &D, StringRef text, continue; } if (!Extensions.parseModifier(Feature)) { - if (InvalidArg) - InvalidArg->emplace(("+" + Feature).str()); + InvalidArg.emplace(("+" + Feature).str()); return false; } } @@ -180,7 +178,8 @@ static bool DecodeAArch64HostFeatures(llvm::AArch64::ExtensionSet &Extensions) { // Check if the CPU name and feature modifiers in -mcpu are legal. If yes, // decode CPU and feature. static bool DecodeAArch64Mcpu(const Driver &D, StringRef Mcpu, - llvm::AArch64::ExtensionSet &Extensions) { + llvm::AArch64::ExtensionSet &Extensions, + std::optional &InvalidArg) { std::pair Split = Mcpu.split("+"); StringRef CPU = Split.first; const bool IsNative = CPU == "native"; @@ -190,8 +189,10 @@ static bool DecodeAArch64Mcpu(const Driver &D, StringRef Mcpu, const std::optional CpuInfo = llvm::AArch64::parseCpu(CPU); - if (!CpuInfo) + if (!CpuInfo) { + InvalidArg.emplace(Split.first.str()); return false; + } Extensions.addCPUDefaults(*CpuInfo); @@ -199,7 +200,7 @@ static bool DecodeAArch64Mcpu(const Driver &D, StringRef Mcpu, return false; if (Split.second.size() && - !DecodeAArch64Features(D, Split.second, Extensions)) + !DecodeAArch64Features(D, Split.second, Extensions, InvalidArg)) return false; return true; @@ -214,7 +215,7 @@ getAArch64ArchFeaturesFromMarch(const Driver &D, StringRef March, std::pair Split = StringRef(MarchLowerCase).split("+"); if (Split.first == "native") - return DecodeAArch64Mcpu(D, MarchLowerCase, Extensions); + return DecodeAArch64Mcpu(D, MarchLowerCase, Extensions, InvalidArg); const llvm::AArch64::ArchInfo *ArchInfo = 
llvm::AArch64::parseArch(Split.first); @@ -226,7 +227,7 @@ getAArch64ArchFeaturesFromMarch(const Driver &D, StringRef March, Extensions.addArchDefaults(*ArchInfo); if ((Split.second.size() && - !DecodeAArch64Features(D, Split.second, Extensions, &InvalidArg))) + !DecodeAArch64Features(D, Split.second, Extensions, InvalidArg))) return false; return true; @@ -235,23 +236,27 @@ getAArch64ArchFeaturesFromMarch(const Driver &D, StringRef March, static bool getAArch64ArchFeaturesFromMcpu(const Driver &D, StringRef Mcpu, const ArgList &Args, - llvm::AArch64::ExtensionSet &Extensions) { + llvm::AArch64::ExtensionSet &Extensions, + std::optional &InvalidArg) { std::string McpuLowerCase = Mcpu.lower(); - return DecodeAArch64Mcpu(D, McpuLowerCase, Extensions); + return DecodeAArch64Mcpu(D, McpuLowerCase, Extensions, InvalidArg); } -static bool getAArch64MicroArchFeaturesFromMtune(const Driver &D, - StringRef Mtune, - const ArgList &Args) { +static bool +getAArch64MicroArchFeaturesFromMtune(const Driver &D, StringRef Mtune, + const ArgList &Args, + std::optional &InvalidArg) { // Check CPU name is valid, but ignore any extensions on it. 
std::string MtuneLowerCase = Mtune.lower(); llvm::AArch64::ExtensionSet Extensions; - return DecodeAArch64Mcpu(D, MtuneLowerCase, Extensions); + return DecodeAArch64Mcpu(D, MtuneLowerCase, Extensions, InvalidArg); } -static bool getAArch64MicroArchFeaturesFromMcpu(const Driver &D, StringRef Mcpu, - const ArgList &Args) { - return getAArch64MicroArchFeaturesFromMtune(D, Mcpu, Args); +static bool +getAArch64MicroArchFeaturesFromMcpu(const Driver &D, StringRef Mcpu, + const ArgList &Args, + std::optional &InvalidArg) { + return getAArch64MicroArchFeaturesFromMtune(D, Mcpu, Args, InvalidArg); } void aarch64::getAArch64TargetFeatures(const Driver &D, @@ -280,23 +285,26 @@ void aarch64::getAArch64TargetFeatures(const Driver &D, success = getAArch64ArchFeaturesFromMarch(D, A->getValue(), Args, Extensions, InvalidArg); else if ((A = Args.getLastArg(options::OPT_mcpu_EQ))) - success = - getAArch64ArchFeaturesFromMcpu(D, A->getValue(), Args, Extensions); + success = getAArch64ArchFeaturesFromMcpu(D, A->getValue(), Args, Extensions, + InvalidArg); else if (isCPUDeterminedByTriple(Triple)) success = getAArch64ArchFeaturesFromMcpu( - D, getAArch64TargetCPUByTriple(Triple), Args, Extensions); + D, getAArch64TargetCPUByTriple(Triple), Args, Extensions, InvalidArg); else // Default to 'A' profile if the architecture is not specified. 
success = getAArch64ArchFeaturesFromMarch(D, "armv8-a", Args, Extensions, InvalidArg); if (success && (A = Args.getLastArg(options::OPT_mtune_EQ))) - success = getAArch64MicroArchFeaturesFromMtune(D, A->getValue(), Args); + success = getAArch64MicroArchFeaturesFromMtune(D, A->getValue(), Args, + InvalidArg); else if (success && (A = Args.getLastArg(options::OPT_mcpu_EQ))) - success = getAArch64MicroArchFeaturesFromMcpu(D, A->getValue(), Args); + success = + getAArch64MicroArchFeaturesFromMcpu(D, A->getValue(), Args, InvalidArg); else if (success) { if (auto TuneCPU = getAArch64TargetTuneCPUByTriple(Triple)) - success = getAArch64MicroArchFeaturesFromMtune(D, *TuneCPU, Args); + success = + getAArch64MicroArchFeaturesFromMtune(D, *TuneCPU, Args, InvalidArg); } if (!success) { diff --git a/clang/test/Driver/aarch64-march.c b/clang/test/Driver/aarch64-march.c index d2cea0e50def3..5a0dba3a7e7b7 100644 --- a/clang/test/Driver/aarch64-march.c +++ b/clang/test/Driver/aarch64-march.c @@ -36,3 +36,16 @@ // INVALID-EXT: error: unsupported argument '+badfeature' to option '-march=' // INVALID-EXT-NOT: armv9.6-a+sme2+sme2p1+sve2+sve2p1+badfeature+aes+sha2+memtag+bf16 // INVALID-ARCH: error: unsupported argument 'notanarch' to option '-march=' + +// ================== Check whether -mcpu diagnoses the first invalid argument. +// RUN: not %clang --target=aarch64 -mcpu=generic+sme2+badfeature+aes %s -### -c 2>&1 | FileCheck -check-prefix=INVALID-MCPU-EXT %s +// RUN: not %clang --target=aarch64 -mcpu=notacpu+sme2 %s -### -c 2>&1 | FileCheck -check-prefix=INVALID-MCPU %s +// INVALID-MCPU-EXT: error: unsupported argument '+badfeature' to option '-mcpu=' +// INVALID-MCPU-EXT-NOT: generic+sme2+badfeature+aes +// INVALID-MCPU: error: unsupported argument 'notacpu' to option '-mcpu=' + +// ================== Check whether -mtune diagnoses the first invalid argument. 
+// RUN: not %clang --target=aarch64 -mtune=generic+sme2+badfeature+aes %s -### -c 2>&1 | FileCheck -check-prefix=INVALID-MTUNE-EXT %s +// RUN: not %clang --target=aarch64 -mtune=notacpu+sme2 %s -### -c 2>&1 | FileCheck -check-prefix=INVALID-MTUNE %s +// INVALID-MTUNE-EXT: error: unsupported argument '+badfeature' to option '-mtune=' +// INVALID-MTUNE: error: unsupported argument 'notacpu' to option '-mtune=' From 8257855f77d55895619a9e67184f2de6df79931f Mon Sep 17 00:00:00 2001 From: Jeff Bailey Date: Thu, 14 May 2026 11:05:55 +0000 Subject: [PATCH 61/95] [libc] Remove sysconf from Scudo integration test entrypoints (#197639) sysconf moved behind LLVM_LIBC_ENABLE_EXPERIMENTAL_ENTRYPOINTS in commit 8146920541c4. When that flag is OFF (the default), the target is not built as an OBJECT library, so referencing it via add_entrypoint_library causes a CMake generation error on the libc-x86_64-debian-fullbuild bot. Scudo does not use sysconf, so this is a safe removal. Assisted-by: Automated tooling, human reviewed. --- libc/test/integration/scudo/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/libc/test/integration/scudo/CMakeLists.txt b/libc/test/integration/scudo/CMakeLists.txt index b4011e501b96b..9f2856d424567 100644 --- a/libc/test/integration/scudo/CMakeLists.txt +++ b/libc/test/integration/scudo/CMakeLists.txt @@ -28,7 +28,6 @@ add_entrypoint_library( libc.src.unistd.__llvm_libc_syscall libc.src.unistd.close libc.src.unistd.read - libc.src.unistd.sysconf libc.src.unistd.write ) From 9a8fd8cc3bb7b2dc91a324b34e3ebb76c691b195 Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Thu, 14 May 2026 13:24:56 +0200 Subject: [PATCH 62/95] [libc] Implement the linux-specific memfd_create syscall wrapper (#197439) I'm using the MFD constants from the kernel header as that's what was done for mmap, though it would be relatively simple to declare these ourselves, as they are not architecture-dependent. 
--- libc/config/linux/aarch64/entrypoints.txt | 1 + libc/config/linux/arm/entrypoints.txt | 1 + libc/config/linux/riscv/entrypoints.txt | 1 + libc/config/linux/x86_64/entrypoints.txt | 1 + .../llvm-libc-macros/sys-mman-macros.h | 4 ++ libc/include/sys/mman.yaml | 7 +++ .../linux/syscall_wrappers/CMakeLists.txt | 12 +++++ .../linux/syscall_wrappers/memfd_create.h | 36 ++++++++++++++ libc/src/sys/mman/CMakeLists.txt | 7 +++ libc/src/sys/mman/linux/CMakeLists.txt | 11 +++++ libc/src/sys/mman/linux/memfd_create.cpp | 31 ++++++++++++ libc/src/sys/mman/memfd_create.h | 25 ++++++++++ libc/test/src/sys/mman/linux/CMakeLists.txt | 21 ++++++++ .../src/sys/mman/linux/memfd_create_test.cpp | 48 +++++++++++++++++++ 14 files changed, 206 insertions(+) create mode 100644 libc/src/__support/OSUtil/linux/syscall_wrappers/memfd_create.h create mode 100644 libc/src/sys/mman/linux/memfd_create.cpp create mode 100644 libc/src/sys/mman/memfd_create.h create mode 100644 libc/test/src/sys/mman/linux/memfd_create_test.cpp diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index e61b127e42102..29ce95c65ac87 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -256,6 +256,7 @@ set(TARGET_LIBC_ENTRYPOINTS # sys/mman.h entrypoints libc.src.sys.mman.madvise + libc.src.sys.mman.memfd_create libc.src.sys.mman.mincore libc.src.sys.mman.mlock libc.src.sys.mman.mlock2 diff --git a/libc/config/linux/arm/entrypoints.txt b/libc/config/linux/arm/entrypoints.txt index 1c4dd1a4cb879..906f36d45e337 100644 --- a/libc/config/linux/arm/entrypoints.txt +++ b/libc/config/linux/arm/entrypoints.txt @@ -177,6 +177,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.sys.ioctl.ioctl # sys/mman.h entrypoints + libc.src.sys.mman.memfd_create libc.src.sys.mman.mmap libc.src.sys.mman.munmap diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index 7a34cc5fba201..4eb436c0bbb83 100644 --- 
a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -256,6 +256,7 @@ set(TARGET_LIBC_ENTRYPOINTS # sys/mman.h entrypoints libc.src.sys.mman.madvise + libc.src.sys.mman.memfd_create libc.src.sys.mman.mincore libc.src.sys.mman.mlock libc.src.sys.mman.mlock2 diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 00c94e1e9b5a0..129e2a467af7c 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -262,6 +262,7 @@ set(TARGET_LIBC_ENTRYPOINTS # sys/mman.h entrypoints libc.src.sys.mman.madvise + libc.src.sys.mman.memfd_create libc.src.sys.mman.mincore libc.src.sys.mman.mlock libc.src.sys.mman.mlock2 diff --git a/libc/include/llvm-libc-macros/sys-mman-macros.h b/libc/include/llvm-libc-macros/sys-mman-macros.h index a6dc6d96b5b79..bc9f783d84a40 100644 --- a/libc/include/llvm-libc-macros/sys-mman-macros.h +++ b/libc/include/llvm-libc-macros/sys-mman-macros.h @@ -18,6 +18,10 @@ #error "cannot use without proper system headers." #endif +#if __has_include() +#include +#endif + // Some posix standard flags may not be defined in system headers. // Posix mmap flags. 
#ifndef MAP_FAILED diff --git a/libc/include/sys/mman.yaml b/libc/include/sys/mman.yaml index 6adc2c2d5bfa6..f14a90436b791 100644 --- a/libc/include/sys/mman.yaml +++ b/libc/include/sys/mman.yaml @@ -19,6 +19,13 @@ functions: - type: void * - type: size_t - type: int + - name: memfd_create + standards: + - Linux + return_type: int + arguments: + - type: const char * + - type: unsigned int - name: mincore standards: - Linux diff --git a/libc/src/__support/OSUtil/linux/syscall_wrappers/CMakeLists.txt b/libc/src/__support/OSUtil/linux/syscall_wrappers/CMakeLists.txt index bbe76fece3bdd..8d0acca011bec 100644 --- a/libc/src/__support/OSUtil/linux/syscall_wrappers/CMakeLists.txt +++ b/libc/src/__support/OSUtil/linux/syscall_wrappers/CMakeLists.txt @@ -198,6 +198,18 @@ add_header_library( libc.include.sys_syscall ) +add_header_library( + memfd_create + HDRS + memfd_create.h + DEPENDS + libc.src.__support.OSUtil.osutil + libc.src.__support.common + libc.src.__support.error_or + libc.src.__support.macros.config + libc.include.sys_syscall +) + add_header_library( mmap HDRS diff --git a/libc/src/__support/OSUtil/linux/syscall_wrappers/memfd_create.h b/libc/src/__support/OSUtil/linux/syscall_wrappers/memfd_create.h new file mode 100644 index 0000000000000..e748c3c6fb80d --- /dev/null +++ b/libc/src/__support/OSUtil/linux/syscall_wrappers/memfd_create.h @@ -0,0 +1,36 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// ErrorOr-returning syscall wrapper for memfd_create. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_MEMFD_CREATE_H +#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_MEMFD_CREATE_H + +#include "src/__support/OSUtil/linux/syscall.h" // syscall_impl +#include "src/__support/common.h" +#include "src/__support/error_or.h" +#include "src/__support/macros/config.h" +#include // For syscall numbers + +namespace LIBC_NAMESPACE_DECL { +namespace linux_syscalls { + +LIBC_INLINE ErrorOr memfd_create(const char *name, unsigned int flags) { + int ret = syscall_impl(SYS_memfd_create, name, flags); + if (ret < 0) + return Error(-ret); + return ret; +} + +} // namespace linux_syscalls +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_MEMFD_CREATE_H diff --git a/libc/src/sys/mman/CMakeLists.txt b/libc/src/sys/mman/CMakeLists.txt index c7be1eddacb5e..ca18fe36449da 100644 --- a/libc/src/sys/mman/CMakeLists.txt +++ b/libc/src/sys/mman/CMakeLists.txt @@ -9,6 +9,13 @@ add_entrypoint_object( .${LIBC_TARGET_OS}.madvise ) +add_entrypoint_object( + memfd_create + ALIAS + DEPENDS + .${LIBC_TARGET_OS}.memfd_create +) + add_entrypoint_object( mmap ALIAS diff --git a/libc/src/sys/mman/linux/CMakeLists.txt b/libc/src/sys/mman/linux/CMakeLists.txt index 6cafa9ba45c1a..e8d9ed384c9b0 100644 --- a/libc/src/sys/mman/linux/CMakeLists.txt +++ b/libc/src/sys/mman/linux/CMakeLists.txt @@ -18,6 +18,17 @@ add_entrypoint_object( libc.src.errno.errno ) +add_entrypoint_object( + memfd_create + SRCS + memfd_create.cpp + HDRS + ../memfd_create.h + DEPENDS + libc.src.__support.OSUtil.linux.syscall_wrappers.memfd_create + libc.src.errno.errno +) + add_entrypoint_object( mmap SRCS diff --git a/libc/src/sys/mman/linux/memfd_create.cpp b/libc/src/sys/mman/linux/memfd_create.cpp new file mode 100644 index 0000000000000..e3e1e00034e1a --- /dev/null +++ b/libc/src/sys/mman/linux/memfd_create.cpp @@ -0,0 +1,31 @@ 
+//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Linux implementation of the memfd_create function. +/// +//===----------------------------------------------------------------------===// + +#include "src/sys/mman/memfd_create.h" +#include "src/__support/OSUtil/linux/syscall_wrappers/memfd_create.h" +#include "src/__support/common.h" +#include "src/__support/libc_errno.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, memfd_create, (const char *name, unsigned int flags)) { + ErrorOr result = linux_syscalls::memfd_create(name, flags); + if (!result) { + libc_errno = result.error(); + return -1; + } + return result.value(); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/sys/mman/memfd_create.h b/libc/src/sys/mman/memfd_create.h new file mode 100644 index 0000000000000..eefdae3e29d7e --- /dev/null +++ b/libc/src/sys/mman/memfd_create.h @@ -0,0 +1,25 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Implementation header for memfd_create function. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_SYS_MMAN_MEMFD_CREATE_H +#define LLVM_LIBC_SRC_SYS_MMAN_MEMFD_CREATE_H + +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int memfd_create(const char *name, unsigned int flags); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_SYS_MMAN_MEMFD_CREATE_H diff --git a/libc/test/src/sys/mman/linux/CMakeLists.txt b/libc/test/src/sys/mman/linux/CMakeLists.txt index b73212d0c322f..29d97cbfe7018 100644 --- a/libc/test/src/sys/mman/linux/CMakeLists.txt +++ b/libc/test/src/sys/mman/linux/CMakeLists.txt @@ -229,3 +229,24 @@ add_libc_unittest( libc.test.UnitTest.ErrnoCheckingTest libc.test.UnitTest.ErrnoSetterMatcher ) + +add_libc_unittest( + memfd_create_test + SUITE + libc_sys_mman_unittests + SRCS + memfd_create_test.cpp + DEPENDS + libc.hdr.fcntl_macros + libc.hdr.sys_mman_macros + libc.include.llvm-libc-macros.file_seek_macros + libc.include.sys_mman + libc.src.__support.CPP.scope + libc.src.errno.errno + libc.src.fcntl.fcntl + libc.src.sys.mman.memfd_create + libc.src.unistd.close + libc.src.unistd.lseek + libc.test.UnitTest.ErrnoCheckingTest + libc.test.UnitTest.ErrnoSetterMatcher +) diff --git a/libc/test/src/sys/mman/linux/memfd_create_test.cpp b/libc/test/src/sys/mman/linux/memfd_create_test.cpp new file mode 100644 index 0000000000000..13c1c920e4009 --- /dev/null +++ b/libc/test/src/sys/mman/linux/memfd_create_test.cpp @@ -0,0 +1,48 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Unittests for memfd_create. 
+/// +//===----------------------------------------------------------------------===// + +#include "hdr/fcntl_macros.h" +#include "hdr/sys_mman_macros.h" +#include "include/llvm-libc-macros/file-seek-macros.h" +#include "src/__support/CPP/scope.h" +#include "src/fcntl/fcntl.h" +#include "src/sys/mman/memfd_create.h" +#include "src/unistd/close.h" +#include "src/unistd/lseek.h" +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" +#include "test/UnitTest/Test.h" + +using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher; +using LlvmLibcMemfdCreateTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +TEST_F(LlvmLibcMemfdCreateTest, Basic) { + int fd; + ASSERT_THAT(fd = LIBC_NAMESPACE::memfd_create("test_memfd", MFD_CLOEXEC), + returns(GE(0)).with_errno(EQ(0))); + LIBC_NAMESPACE::cpp::scope_exit close_fd( + [&] { ASSERT_THAT(LIBC_NAMESPACE::close(fd), Succeeds(0)); }); + + int flag; + ASSERT_THAT(flag = LIBC_NAMESPACE::fcntl(fd, F_GETFD), + returns(GE(0)).with_errno(EQ(0))); + EXPECT_NE(flag & FD_CLOEXEC, 0); + + ASSERT_THAT(LIBC_NAMESPACE::lseek(fd, 0, SEEK_END), Succeeds(off_t(0))); +} + +TEST_F(LlvmLibcMemfdCreateTest, ErrorHandling) { + // Passing invalid flags should cause EINVAL + ASSERT_THAT(LIBC_NAMESPACE::memfd_create("test_memfd", 0x80000000), + Fails(EINVAL)); +} From 11faa7fd9506f6bbc5d51b5620111cbaa4e9935b Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Thu, 14 May 2026 13:41:26 +0200 Subject: [PATCH 63/95] [libc] Implement getsockname and getpeername (#197196) This patch implements getsockname and getpeername functions for Linux, and adds unit tests to verify them under connected, unbound, and invalid socket states. The implementations do not have the socketcall fallbacks, as those are being removed in #197189. 
--- libc/config/linux/aarch64/entrypoints.txt | 2 + libc/config/linux/riscv/entrypoints.txt | 2 + libc/config/linux/x86_64/entrypoints.txt | 2 + libc/include/sys/socket.yaml | 16 ++ .../linux/syscall_wrappers/CMakeLists.txt | 28 ++++ .../linux/syscall_wrappers/getpeername.h | 39 +++++ .../linux/syscall_wrappers/getsockname.h | 39 +++++ libc/src/sys/socket/CMakeLists.txt | 14 ++ libc/src/sys/socket/getpeername.h | 28 ++++ libc/src/sys/socket/getsockname.h | 28 ++++ libc/src/sys/socket/linux/CMakeLists.txt | 30 ++++ libc/src/sys/socket/linux/getpeername.cpp | 35 ++++ libc/src/sys/socket/linux/getsockname.cpp | 35 ++++ libc/test/src/sys/socket/linux/CMakeLists.txt | 26 +++ .../src/sys/socket/linux/sockname_test.cpp | 156 ++++++++++++++++++ 15 files changed, 480 insertions(+) create mode 100644 libc/src/__support/OSUtil/linux/syscall_wrappers/getpeername.h create mode 100644 libc/src/__support/OSUtil/linux/syscall_wrappers/getsockname.h create mode 100644 libc/src/sys/socket/getpeername.h create mode 100644 libc/src/sys/socket/getsockname.h create mode 100644 libc/src/sys/socket/linux/getpeername.cpp create mode 100644 libc/src/sys/socket/linux/getsockname.cpp create mode 100644 libc/test/src/sys/socket/linux/sockname_test.cpp diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index 29ce95c65ac87..b7c9cabd934b4 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -288,6 +288,8 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.sys.socket.accept4 libc.src.sys.socket.bind libc.src.sys.socket.connect + libc.src.sys.socket.getpeername + libc.src.sys.socket.getsockname libc.src.sys.socket.getsockopt libc.src.sys.socket.listen libc.src.sys.socket.recv diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index 4eb436c0bbb83..c0adf2fb116aa 100644 --- a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -288,6 
+288,8 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.sys.socket.accept4 libc.src.sys.socket.bind libc.src.sys.socket.connect + libc.src.sys.socket.getpeername + libc.src.sys.socket.getsockname libc.src.sys.socket.getsockopt libc.src.sys.socket.listen libc.src.sys.socket.recv diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 129e2a467af7c..9970f079abc08 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -304,6 +304,8 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.sys.socket.accept4 libc.src.sys.socket.bind libc.src.sys.socket.connect + libc.src.sys.socket.getpeername + libc.src.sys.socket.getsockname libc.src.sys.socket.getsockopt libc.src.sys.socket.listen libc.src.sys.socket.recv diff --git a/libc/include/sys/socket.yaml b/libc/include/sys/socket.yaml index b1a1a2adb7ebe..e488bb9e43353 100644 --- a/libc/include/sys/socket.yaml +++ b/libc/include/sys/socket.yaml @@ -51,6 +51,22 @@ functions: - type: int - type: const struct sockaddr * - type: socklen_t + - name: getpeername + standards: + - POSIX + return_type: int + arguments: + - type: int + - type: struct sockaddr *__restrict + - type: socklen_t *__restrict + - name: getsockname + standards: + - POSIX + return_type: int + arguments: + - type: int + - type: struct sockaddr *__restrict + - type: socklen_t *__restrict - name: getsockopt standards: - POSIX diff --git a/libc/src/__support/OSUtil/linux/syscall_wrappers/CMakeLists.txt b/libc/src/__support/OSUtil/linux/syscall_wrappers/CMakeLists.txt index 8d0acca011bec..13fbcc1de2aaa 100644 --- a/libc/src/__support/OSUtil/linux/syscall_wrappers/CMakeLists.txt +++ b/libc/src/__support/OSUtil/linux/syscall_wrappers/CMakeLists.txt @@ -94,6 +94,34 @@ add_header_library( libc.include.sys_syscall ) +add_header_library( + getpeername + HDRS + getpeername.h + DEPENDS + libc.src.__support.common + libc.src.__support.error_or + libc.src.__support.OSUtil.osutil + 
libc.src.__support.macros.config + libc.hdr.types.socklen_t + libc.hdr.types.struct_sockaddr + libc.include.sys_syscall +) + +add_header_library( + getsockname + HDRS + getsockname.h + DEPENDS + libc.src.__support.common + libc.src.__support.error_or + libc.src.__support.OSUtil.osutil + libc.src.__support.macros.config + libc.hdr.types.socklen_t + libc.hdr.types.struct_sockaddr + libc.include.sys_syscall +) + add_header_library( listen HDRS diff --git a/libc/src/__support/OSUtil/linux/syscall_wrappers/getpeername.h b/libc/src/__support/OSUtil/linux/syscall_wrappers/getpeername.h new file mode 100644 index 0000000000000..20c51d51da946 --- /dev/null +++ b/libc/src/__support/OSUtil/linux/syscall_wrappers/getpeername.h @@ -0,0 +1,39 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Syscall wrapper for getpeername. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_GETPEERNAME_H +#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_GETPEERNAME_H + +#include "hdr/types/socklen_t.h" +#include "hdr/types/struct_sockaddr.h" +#include "src/__support/OSUtil/linux/syscall.h" // syscall_impl +#include "src/__support/common.h" +#include "src/__support/error_or.h" +#include "src/__support/macros/config.h" +#include // For syscall numbers + +namespace LIBC_NAMESPACE_DECL { +namespace linux_syscalls { + +LIBC_INLINE ErrorOr getpeername(int sockfd, struct sockaddr *addr, + socklen_t *addrlen) { + int ret = syscall_impl(SYS_getpeername, sockfd, addr, addrlen); + if (ret < 0) + return Error(-static_cast(ret)); + return ret; +} + +} // namespace linux_syscalls +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_GETPEERNAME_H diff --git a/libc/src/__support/OSUtil/linux/syscall_wrappers/getsockname.h b/libc/src/__support/OSUtil/linux/syscall_wrappers/getsockname.h new file mode 100644 index 0000000000000..6069fc58c5816 --- /dev/null +++ b/libc/src/__support/OSUtil/linux/syscall_wrappers/getsockname.h @@ -0,0 +1,39 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Syscall wrapper for getsockname. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_GETSOCKNAME_H +#define LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_GETSOCKNAME_H + +#include "hdr/types/socklen_t.h" +#include "hdr/types/struct_sockaddr.h" +#include "src/__support/OSUtil/linux/syscall.h" // syscall_impl +#include "src/__support/common.h" +#include "src/__support/error_or.h" +#include "src/__support/macros/config.h" +#include // For syscall numbers + +namespace LIBC_NAMESPACE_DECL { +namespace linux_syscalls { + +LIBC_INLINE ErrorOr getsockname(int sockfd, struct sockaddr *addr, + socklen_t *addrlen) { + int ret = syscall_impl(SYS_getsockname, sockfd, addr, addrlen); + if (ret < 0) + return Error(-static_cast(ret)); + return ret; +} + +} // namespace linux_syscalls +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_OSUTIL_SYSCALL_WRAPPERS_GETSOCKNAME_H diff --git a/libc/src/sys/socket/CMakeLists.txt b/libc/src/sys/socket/CMakeLists.txt index b602ab99c8a9d..a100e90fe3f2d 100644 --- a/libc/src/sys/socket/CMakeLists.txt +++ b/libc/src/sys/socket/CMakeLists.txt @@ -37,6 +37,20 @@ add_entrypoint_object( .${LIBC_TARGET_OS}.connect ) +add_entrypoint_object( + getpeername + ALIAS + DEPENDS + .${LIBC_TARGET_OS}.getpeername +) + +add_entrypoint_object( + getsockname + ALIAS + DEPENDS + .${LIBC_TARGET_OS}.getsockname +) + add_entrypoint_object( getsockopt ALIAS diff --git a/libc/src/sys/socket/getpeername.h b/libc/src/sys/socket/getpeername.h new file mode 100644 index 0000000000000..4e7fddd738337 --- /dev/null +++ b/libc/src/sys/socket/getpeername.h @@ -0,0 +1,28 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Implementation header for getpeername. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_SYS_SOCKET_GETPEERNAME_H +#define LLVM_LIBC_SRC_SYS_SOCKET_GETPEERNAME_H + +#include "hdr/types/socklen_t.h" +#include "hdr/types/struct_sockaddr.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int getpeername(int sockfd, struct sockaddr *__restrict addr, + socklen_t *__restrict addrlen); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_SYS_SOCKET_GETPEERNAME_H diff --git a/libc/src/sys/socket/getsockname.h b/libc/src/sys/socket/getsockname.h new file mode 100644 index 0000000000000..e9520932bec8f --- /dev/null +++ b/libc/src/sys/socket/getsockname.h @@ -0,0 +1,28 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Implementation header for getsockname. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_SYS_SOCKET_GETSOCKNAME_H +#define LLVM_LIBC_SRC_SYS_SOCKET_GETSOCKNAME_H + +#include "hdr/types/socklen_t.h" +#include "hdr/types/struct_sockaddr.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +int getsockname(int sockfd, struct sockaddr *__restrict addr, + socklen_t *__restrict addrlen); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_SYS_SOCKET_GETSOCKNAME_H diff --git a/libc/src/sys/socket/linux/CMakeLists.txt b/libc/src/sys/socket/linux/CMakeLists.txt index e39345410e879..c86991a3e42ea 100644 --- a/libc/src/sys/socket/linux/CMakeLists.txt +++ b/libc/src/sys/socket/linux/CMakeLists.txt @@ -66,6 +66,36 @@ add_entrypoint_object( libc.src.errno.errno ) +add_entrypoint_object( + getpeername + SRCS + getpeername.cpp + HDRS + ../getpeername.h + DEPENDS + libc.include.sys_socket + libc.src.__support.OSUtil.osutil + libc.hdr.types.struct_sockaddr + libc.hdr.types.socklen_t + libc.src.__support.OSUtil.linux.syscall_wrappers.getpeername + libc.src.errno.errno +) + +add_entrypoint_object( + getsockname + SRCS + getsockname.cpp + HDRS + ../getsockname.h + DEPENDS + libc.include.sys_socket + libc.src.__support.OSUtil.osutil + libc.hdr.types.struct_sockaddr + libc.hdr.types.socklen_t + libc.src.__support.OSUtil.linux.syscall_wrappers.getsockname + libc.src.errno.errno +) + add_entrypoint_object( getsockopt SRCS diff --git a/libc/src/sys/socket/linux/getpeername.cpp b/libc/src/sys/socket/linux/getpeername.cpp new file mode 100644 index 0000000000000..8e19f5bbc3f94 --- /dev/null +++ b/libc/src/sys/socket/linux/getpeername.cpp @@ -0,0 +1,35 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Linux implementation of getpeername. +/// +//===----------------------------------------------------------------------===// + +#include "src/sys/socket/getpeername.h" +#include "hdr/types/socklen_t.h" +#include "hdr/types/struct_sockaddr.h" +#include "src/__support/OSUtil/linux/syscall_wrappers/getpeername.h" +#include "src/__support/common.h" +#include "src/__support/libc_errno.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, getpeername, + (int sockfd, struct sockaddr *__restrict addr, + socklen_t *__restrict addrlen)) { + auto result = linux_syscalls::getpeername(sockfd, addr, addrlen); + if (!result.has_value()) { + libc_errno = result.error(); + return -1; + } + + return result.value(); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/sys/socket/linux/getsockname.cpp b/libc/src/sys/socket/linux/getsockname.cpp new file mode 100644 index 0000000000000..e5734b307bf71 --- /dev/null +++ b/libc/src/sys/socket/linux/getsockname.cpp @@ -0,0 +1,35 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Linux implementation of getsockname. 
+/// +//===----------------------------------------------------------------------===// + +#include "src/sys/socket/getsockname.h" +#include "hdr/types/socklen_t.h" +#include "hdr/types/struct_sockaddr.h" +#include "src/__support/OSUtil/linux/syscall_wrappers/getsockname.h" +#include "src/__support/common.h" +#include "src/__support/libc_errno.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, getsockname, + (int sockfd, struct sockaddr *__restrict addr, + socklen_t *__restrict addrlen)) { + auto result = linux_syscalls::getsockname(sockfd, addr, addrlen); + if (!result.has_value()) { + libc_errno = result.error(); + return -1; + } + + return result.value(); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/test/src/sys/socket/linux/CMakeLists.txt b/libc/test/src/sys/socket/linux/CMakeLists.txt index e29140ee06b24..5b10ed2e5df0e 100644 --- a/libc/test/src/sys/socket/linux/CMakeLists.txt +++ b/libc/test/src/sys/socket/linux/CMakeLists.txt @@ -138,6 +138,32 @@ add_libc_unittest( libc.test.UnitTest.ErrnoSetterMatcher ) +add_libc_unittest( + sockname_test + SUITE + libc_sys_socket_unittests + SRCS + sockname_test.cpp + DEPENDS + .socket_test_support + libc.include.sys_socket + libc.hdr.sys_socket_macros + libc.hdr.types.struct_sockaddr_un + libc.src.errno.errno + libc.src.sys.socket.accept + libc.src.sys.socket.bind + libc.src.sys.socket.connect + libc.src.sys.socket.getpeername + libc.src.sys.socket.getsockname + libc.src.sys.socket.listen + libc.src.sys.socket.socket + libc.src.stdio.remove + libc.src.unistd.close + libc.src.__support.CPP.scope + libc.test.UnitTest.ErrnoCheckingTest + libc.test.UnitTest.ErrnoSetterMatcher +) + add_libc_unittest( send_recv_test SUITE diff --git a/libc/test/src/sys/socket/linux/sockname_test.cpp b/libc/test/src/sys/socket/linux/sockname_test.cpp new file mode 100644 index 0000000000000..0ee7b5b75bbeb --- /dev/null +++ b/libc/test/src/sys/socket/linux/sockname_test.cpp @@ -0,0 +1,156 @@ 
+//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// Unittests for getsockname and getpeername. +/// +//===----------------------------------------------------------------------===// + +#include "hdr/sys_socket_macros.h" +#include "hdr/types/struct_sockaddr_un.h" +#include "src/__support/CPP/scope.h" +#include "src/stdio/remove.h" +#include "src/sys/socket/accept.h" +#include "src/sys/socket/bind.h" +#include "src/sys/socket/connect.h" +#include "src/sys/socket/getpeername.h" +#include "src/sys/socket/getsockname.h" +#include "src/sys/socket/listen.h" +#include "src/sys/socket/socket.h" +#include "src/unistd/close.h" +#include "test/UnitTest/ErrnoCheckingTest.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" +#include "test/src/sys/socket/linux/socket_test_support.h" + +using LIBC_NAMESPACE::testing::make_sockaddr_un; +using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; +using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; +using LlvmLibcSockNameTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; +using LIBC_NAMESPACE::cpp::scope_exit; + +TEST_F(LlvmLibcSockNameTest, GetSockNameAndPeerName) { + // 1. Invalid Socket + struct sockaddr_un addr; + socklen_t addr_len = sizeof(addr); + ASSERT_THAT(LIBC_NAMESPACE::getsockname( + -1, reinterpret_cast(&addr), &addr_len), + Fails(EBADF)); + ASSERT_THAT(LIBC_NAMESPACE::getpeername( + -1, reinterpret_cast(&addr), &addr_len), + Fails(EBADF)); + + // 2. 
Unbound Socket + int sock = LIBC_NAMESPACE::socket(AF_UNIX, SOCK_STREAM, 0); + ASSERT_GE(sock, 0); + ASSERT_ERRNO_SUCCESS(); + scope_exit close_sock( + [&] { ASSERT_THAT(LIBC_NAMESPACE::close(sock), Succeeds(0)); }); + + // getsockname on unbound socket should succeed + addr_len = sizeof(addr); + ASSERT_THAT(LIBC_NAMESPACE::getsockname( + sock, reinterpret_cast(&addr), &addr_len), + Succeeds(0)); + ASSERT_GE(addr_len, static_cast(sizeof(sa_family_t))); + ASSERT_EQ(addr.sun_family, static_cast(AF_UNIX)); + + // getpeername on unbound/unconnected socket should fail with ENOTCONN + addr_len = sizeof(addr); + ASSERT_THAT(LIBC_NAMESPACE::getpeername( + sock, reinterpret_cast(&addr), &addr_len), + Fails(ENOTCONN)); + + // 3. Connected Sockets + const char *client_file = "getsockname_client.test"; + const auto client_path = libc_make_test_file_path(client_file); + struct sockaddr_un client_addr; + ASSERT_TRUE(make_sockaddr_un(client_path, client_addr)); + + const char *server_file = "getsockname_server.test"; + const auto server_path = libc_make_test_file_path(server_file); + struct sockaddr_un server_addr; + ASSERT_TRUE(make_sockaddr_un(server_path, server_addr)); + + int server_sock = LIBC_NAMESPACE::socket(AF_UNIX, SOCK_STREAM, 0); + ASSERT_GE(server_sock, 0); + ASSERT_ERRNO_SUCCESS(); + scope_exit close_server_sock( + [&] { ASSERT_THAT(LIBC_NAMESPACE::close(server_sock), Succeeds(0)); }); + + ASSERT_THAT(LIBC_NAMESPACE::bind( + server_sock, + reinterpret_cast(&server_addr), + sizeof(struct sockaddr_un)), + Succeeds(0)); + scope_exit remove_server_path( + [&] { ASSERT_THAT(LIBC_NAMESPACE::remove(server_path), Succeeds(0)); }); + + ASSERT_THAT(LIBC_NAMESPACE::listen(server_sock, 1), Succeeds(0)); + + int client_sock = LIBC_NAMESPACE::socket(AF_UNIX, SOCK_STREAM, 0); + ASSERT_GE(client_sock, 0); + ASSERT_ERRNO_SUCCESS(); + scope_exit close_client_sock( + [&] { ASSERT_THAT(LIBC_NAMESPACE::close(client_sock), Succeeds(0)); }); + + ASSERT_THAT(LIBC_NAMESPACE::bind( + 
client_sock, + reinterpret_cast(&client_addr), + sizeof(struct sockaddr_un)), + Succeeds(0)); + scope_exit remove_client_path( + [&] { ASSERT_THAT(LIBC_NAMESPACE::remove(client_path), Succeeds(0)); }); + + ASSERT_THAT(LIBC_NAMESPACE::connect( + client_sock, + reinterpret_cast(&server_addr), + sizeof(struct sockaddr_un)), + Succeeds(0)); + + int accepted_sock = LIBC_NAMESPACE::accept(server_sock, nullptr, nullptr); + ASSERT_GE(accepted_sock, 0); + ASSERT_ERRNO_SUCCESS(); + scope_exit close_accepted_sock( + [&] { ASSERT_THAT(LIBC_NAMESPACE::close(accepted_sock), Succeeds(0)); }); + + // Test getsockname on client_sock (should be client_path) + addr_len = sizeof(addr); + ASSERT_THAT( + LIBC_NAMESPACE::getsockname( + client_sock, reinterpret_cast(&addr), &addr_len), + Succeeds(0)); + ASSERT_THAT((LIBC_NAMESPACE::testing::SocketAddress{addr, addr_len}), + LIBC_NAMESPACE::testing::MatchesAddress(client_path)); + + // Test getpeername on client_sock (should be server_path) + addr_len = sizeof(addr); + ASSERT_THAT( + LIBC_NAMESPACE::getpeername( + client_sock, reinterpret_cast(&addr), &addr_len), + Succeeds(0)); + ASSERT_THAT((LIBC_NAMESPACE::testing::SocketAddress{addr, addr_len}), + LIBC_NAMESPACE::testing::MatchesAddress(server_path)); + + // Test getsockname on accepted_sock (should be server_path) + addr_len = sizeof(addr); + ASSERT_THAT( + LIBC_NAMESPACE::getsockname( + accepted_sock, reinterpret_cast(&addr), &addr_len), + Succeeds(0)); + ASSERT_THAT((LIBC_NAMESPACE::testing::SocketAddress{addr, addr_len}), + LIBC_NAMESPACE::testing::MatchesAddress(server_path)); + + // Test getpeername on accepted_sock (should be client_path) + addr_len = sizeof(addr); + ASSERT_THAT( + LIBC_NAMESPACE::getpeername( + accepted_sock, reinterpret_cast(&addr), &addr_len), + Succeeds(0)); + ASSERT_THAT((LIBC_NAMESPACE::testing::SocketAddress{addr, addr_len}), + LIBC_NAMESPACE::testing::MatchesAddress(client_path)); +} From a74ac064d23486630180ae75e769ab36f3c70e3e Mon Sep 17 00:00:00 2001 
From: Duncan McBain Date: Thu, 14 May 2026 12:47:02 +0100 Subject: [PATCH 64/95] [lldb] Fix TestBuiltinDebugTrap.py on ARM platform (#197502) The workaround for x86_64 which changed the handling of SIGILL signals is also required on 32-bit ARM platforms, because __builtin_trap(), which compiles to a UDF instruction, also generates SIGILL. If we don't change the handling here, the inferior is killed and the stop reason is "none". The failure is here: https://lab.llvm.org/buildbot/#/builders/18/builds/25948 --- .../builtin-debugtrap/TestBuiltinDebugTrap.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/lldb/test/API/functionalities/builtin-debugtrap/TestBuiltinDebugTrap.py b/lldb/test/API/functionalities/builtin-debugtrap/TestBuiltinDebugTrap.py index 0b1e754a68d0b..159325d2d488f 100644 --- a/lldb/test/API/functionalities/builtin-debugtrap/TestBuiltinDebugTrap.py +++ b/lldb/test/API/functionalities/builtin-debugtrap/TestBuiltinDebugTrap.py @@ -54,10 +54,13 @@ def test(self): # "global" is now 10. self.assertEqual(global_value.GetValueAsUnsigned(), 10) - # Change the handling of SIGILL on x86-64 Linux - do not pass it - # to the inferior, but stop and notify lldb. If we don't do this, - # the inferior will receive the SIGILL and be terminated. - if self.getArchitecture() == "x86_64" and platform == "linux": + # Change the handling of SIGILL on Linux - do not pass it to the + # inferior, but stop and notify lldb. If we don't do this, the + # inferior will receive the SIGILL and be terminated. On x86_64, + # __builtin_trap() emits UD2; on ARM it emits UDF — both generate + # SIGILL. AArch64 uses BRK #1 (SIGTRAP, already suppressed by + # default), so this is harmless there. 
+ if platform == "linux": self.runCmd("process handle -p false SIGILL") # We should be at the same point as before -- cannot advance From b49a13a910d957a10779adee0a558f59d7f31cb6 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Thu, 14 May 2026 12:54:55 +0100 Subject: [PATCH 65/95] [VPlan] Use llvm.masked.{u,s}{div,rem} for predicated division (#191377) Fixes #129538 To allow speculatively executing lanes that are predicated off in the original scalar loop, we currently widen divisions by using a "safe divisor" when we can't prove it's safe otherwise. We previously tried to optimize these to VP intrinsics which can be speculated with a mask in #154076, but this was fragile as it relied on assuming that the safe-divisor lanes wouldn't be read. #189705 and #191240 added new intrinsics to explicitly represent semantics where the masked off lanes are poison. This PR teaches the loop vectorizer to use them. We can then safely convert these to VP intrinsics in optimizeMasksToEVL for RISC-V. And on AArch64 we see better codegen after https://github.com/llvm/llvm-project/pull/191164. 
--- .../Transforms/Vectorize/LoopVectorize.cpp | 88 +++--- .../Transforms/Vectorize/VPRecipeBuilder.h | 4 +- .../Transforms/Vectorize/VPlanTransforms.cpp | 62 ++++- .../AArch64/conditional-branches-cost.ll | 174 +++++++++--- .../AArch64/conditional-scalar-assignment.ll | 3 +- .../LoopVectorize/AArch64/invalid-costs.ll | 54 +++- .../LoopVectorize/AArch64/predicated-costs.ll | 43 +-- .../AArch64/scalable-predicate-instruction.ll | 4 +- .../AArch64/sve-predicated-costs.ll | 41 +-- .../LoopVectorize/AArch64/sve-tail-folding.ll | 3 +- .../Transforms/LoopVectorize/RISCV/divrem.ll | 57 ++-- .../LoopVectorize/RISCV/pr154103.ll | 2 +- .../LoopVectorize/RISCV/tail-folding-div.ll | 21 +- ...-order-recurrence-sink-replicate-region.ll | 2 +- .../LoopVectorize/VPlan/vplan-printing.ll | 2 +- .../VPlan/vplan-sink-scalars-and-merge.ll | 2 +- .../LoopVectorize/cse-replicate-regions.ll | 2 +- .../Transforms/LoopVectorize/div-exact.ll | 252 ++++++++++++++++++ ...find-last-iv-sinkable-expr-tail-folding.ll | 9 +- .../LoopVectorize/first-order-recurrence.ll | 6 +- .../LoopVectorize/if-pred-non-void.ll | 4 +- .../LoopVectorize/if-pred-stores.ll | 13 +- .../Transforms/LoopVectorize/induction.ll | 10 +- ...eref-pred-poison-ub-ops-feeding-pointer.ll | 2 +- ...r154045-dont-fold-extractelement-livein.ll | 2 +- .../LoopVectorize/pr44488-predication.ll | 2 +- .../predicatedinst-loop-invariant.ll | 3 +- .../preserve-dbg-loc-and-loop-metadata.ll | 4 +- .../LoopVectorize/reduction-small-size.ll | 10 +- .../Transforms/LoopVectorize/struct-return.ll | 73 +++-- .../LoopVectorize/tail-folding-div.ll | 18 +- .../LoopVectorize/vector-to-scalar-cast.ll | 2 +- 32 files changed, 706 insertions(+), 268 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/div-exact.ll diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 9632e03331411..1fa1ccf9037f7 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ 
b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -364,10 +364,10 @@ cl::opt llvm::EnableLoopVectorization( "vectorize-loops", cl::init(true), cl::Hidden, cl::desc("Run the Loop vectorization passes")); -static cl::opt ForceSafeDivisor( - "force-widen-divrem-via-safe-divisor", cl::Hidden, - cl::desc( - "Override cost based safe divisor widening for div/rem instructions")); +static cl::opt + ForceMaskedDivRem("force-widen-divrem-via-masked-intrinsic", cl::Hidden, + cl::desc("Override cost based masked intrinsic widening " + "for div/rem instructions")); static cl::opt EnableEarlyExitVectorization( "enable-early-exit-vectorization", cl::init(true), cl::Hidden, @@ -1065,10 +1065,10 @@ class LoopVectorizationCostModel { /// lowering should be used for div/rem. This incorporates an override /// option so it is not simply a cost comparison. bool isDivRemScalarWithPredication(InstructionCost ScalarCost, - InstructionCost SafeDivisorCost) const { - switch (ForceSafeDivisor) { + InstructionCost MaskedCost) const { + switch (ForceMaskedDivRem) { case cl::BOU_UNSET: - return ScalarCost < SafeDivisorCost; + return ScalarCost < MaskedCost; case cl::BOU_TRUE: return false; case cl::BOU_FALSE: @@ -1114,7 +1114,7 @@ class LoopVectorizationCostModel { /// Return the costs for our two available strategies for lowering a /// div/rem operation which requires speculating at least one lane. /// First result is for scalarization (will be invalid for scalable - /// vectors); second is for the safe-divisor strategy. + /// vectors); second is for the masked intrinsic strategy. std::pair getDivRemSpeculationCost(Instruction *I, ElementCount VF); @@ -2379,11 +2379,11 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, case Instruction::SDiv: case Instruction::SRem: case Instruction::URem: { - // We have the option to use the safe-divisor idiom to avoid predication. 
- // The cost based decision here will always select safe-divisor for - // scalable vectors as scalarization isn't legal. - const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF); - return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost); + // We have the option to use the llvm.masked.udiv intrinsics to avoid + // predication. The cost based decision here will always select the masked + // intrinsics for scalable vectors as scalarization isn't legal. + const auto [ScalarCost, MaskedCost] = getDivRemSpeculationCost(I, VF); + return isDivRemScalarWithPredication(ScalarCost, MaskedCost); } } } @@ -2465,6 +2465,21 @@ uint64_t LoopVectorizationCostModel::getPredBlockCostDivisor( return std::round((double)HeaderFreq / BBFreq); } +static Intrinsic::ID getMaskedDivRemIntrinsic(unsigned Opcode) { + switch (Opcode) { + case Instruction::UDiv: + return Intrinsic::masked_udiv; + case Instruction::SDiv: + return Intrinsic::masked_sdiv; + case Instruction::URem: + return Intrinsic::masked_urem; + case Instruction::SRem: + return Intrinsic::masked_srem; + default: + llvm_unreachable("Unexpected opcode"); + } +} + std::pair LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, ElementCount VF) { @@ -2506,22 +2521,12 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, getPredBlockCostDivisor(Config.CostKind, I->getParent()); } - InstructionCost SafeDivisorCost = 0; auto *VecTy = toVectorTy(I->getType(), VF); - // The cost of the select guard to ensure all lanes are well defined - // after we speculate above any internal control flow. 
- SafeDivisorCost += - TTI.getCmpSelInstrCost(Instruction::Select, VecTy, - toVectorTy(Type::getInt1Ty(I->getContext()), VF), - CmpInst::BAD_ICMP_PREDICATE, Config.CostKind); - - SmallVector Operands(I->operand_values()); - SafeDivisorCost += TTI.getArithmeticInstrCost( - I->getOpcode(), VecTy, Config.CostKind, - {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, - {TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None}, - Operands, I); - return {ScalarizationCost, SafeDivisorCost}; + auto *MaskTy = toVectorTy(Type::getInt1Ty(I->getContext()), VF); + IntrinsicCostAttributes ICA(getMaskedDivRemIntrinsic(I->getOpcode()), VecTy, + {VecTy, VecTy, MaskTy}); + InstructionCost MaskedCost = TTI.getIntrinsicInstrCost(ICA, Config.CostKind); + return {ScalarizationCost, MaskedCost}; } bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( @@ -5268,9 +5273,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, case Instruction::URem: case Instruction::SRem: if (VF.isVector() && isPredicatedInst(I)) { - const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF); - return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ? - ScalarCost : SafeDivisorCost; + const auto [ScalarCost, MaskedCost] = getDivRemSpeculationCost(I, VF); + return isDivRemScalarWithPredication(ScalarCost, MaskedCost) ? ScalarCost + : MaskedCost; } // We've proven all lanes safe to speculate, fall through. 
[[fallthrough]]; @@ -6528,7 +6533,7 @@ bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { Range); } -VPWidenRecipe *VPRecipeBuilder::tryToWiden(VPInstruction *VPI) { +VPRecipeWithIRFlags *VPRecipeBuilder::tryToWiden(VPInstruction *VPI) { auto *I = VPI->getUnderlyingInstr(); switch (VPI->getOpcode()) { default: @@ -6536,20 +6541,13 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(VPInstruction *VPI) { case Instruction::SDiv: case Instruction::UDiv: case Instruction::SRem: - case Instruction::URem: { - // If not provably safe, use a select to form a safe divisor before widening the - // div/rem operation itself. Otherwise fall through to general handling below. - if (CM.isPredicatedInst(I)) { - SmallVector Ops(VPI->operandsWithoutMask()); - VPValue *Mask = VPI->getMask(); - VPValue *One = Plan.getConstantInt(I->getType(), 1u); - auto *SafeRHS = - Builder.createSelect(Mask, Ops[1], One, VPI->getDebugLoc()); - Ops[1] = SafeRHS; - return new VPWidenRecipe(*I, Ops, *VPI, *VPI, VPI->getDebugLoc()); - } + case Instruction::URem: + // If not provably safe, use a masked intrinsic. + if (CM.isPredicatedInst(I)) + return new VPWidenIntrinsicRecipe( + getMaskedDivRemIntrinsic(VPI->getOpcode()), VPI->operands(), + I->getType(), {}, {}, VPI->getDebugLoc()); [[fallthrough]]; - } case Instruction::Add: case Instruction::And: case Instruction::AShr: diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index a84c77d614673..1a16bd99817d5 100644 --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -53,9 +53,9 @@ class VPRecipeBuilder { VPSingleDefRecipe *tryToWidenCall(VPInstruction *VPI, VFRange &Range); /// Check if \p VPI has an opcode that can be widened and return a - /// VPWidenRecipe if it can. The function should only be called if the + /// widened recipe if it can. 
The function should only be called if the /// cost-model indicates that widening should be performed. - VPWidenRecipe *tryToWiden(VPInstruction *VPI); + VPRecipeWithIRFlags *tryToWiden(VPInstruction *VPI); public: VPRecipeBuilder(VPlan &Plan, const TargetLibraryInfo *TLI, diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 673355ffb1c96..4acc343bdb60b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1821,6 +1821,22 @@ static void reassociateHeaderMask(VPlan &Plan) { } } +static std::optional +getUnmaskedDivRemOpcode(Intrinsic::ID ID) { + switch (ID) { + case Intrinsic::masked_udiv: + return Instruction::UDiv; + case Intrinsic::masked_sdiv: + return Instruction::SDiv; + case Intrinsic::masked_urem: + return Instruction::URem; + case Intrinsic::masked_srem: + return Instruction::SRem; + default: + return {}; + } +} + static void narrowToSingleScalarRecipes(VPlan &Plan) { if (Plan.hasScalarVFOnly()) return; @@ -1828,7 +1844,8 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) { for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly( vp_depth_first_deep(Plan.getEntry()))) { for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) { - if (!isa(&R)) + if (!isa(&R)) continue; auto *RepR = dyn_cast(&R); if (RepR && (RepR->isSingleScalar() || RepR->isPredicated())) @@ -1854,6 +1871,25 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) { continue; } + // Narrow llvm.masked.{u,s}{div,rem} intrinsics with a safe divisor. 
+ if (auto *IntrR = dyn_cast(RepOrWidenR)) { + if (!vputils::onlyFirstLaneUsed(IntrR)) + continue; + auto Opc = getUnmaskedDivRemOpcode(IntrR->getVectorIntrinsicID()); + if (!Opc) + continue; + VPBuilder Builder(IntrR); + VPValue *SafeDivisor = Builder.createSelect( + IntrR->getOperand(2), IntrR->getOperand(1), + Plan.getConstantInt(IntrR->getResultType(), 1)); + VPValue *Clone = Builder.createNaryOp( + *Opc, {IntrR->getOperand(0), SafeDivisor}, + VPIRFlags::getDefaultFlags(*Opc), IntrR->getDebugLoc()); + IntrR->replaceAllUsesWith(Clone); + IntrR->eraseFromParent(); + continue; + } + // Skip recipes that aren't single scalars. if (!vputils::isSingleScalar(RepOrWidenR)) continue; @@ -2863,6 +2899,21 @@ static inline RemoveMask_match m_RemoveMask(const Op0_t &In, return RemoveMask_match(In, Out); } +static std::optional getVPDivRemIntrinsic(Intrinsic::ID IntrID) { + switch (IntrID) { + case Intrinsic::masked_udiv: + return Intrinsic::vp_udiv; + case Intrinsic::masked_sdiv: + return Intrinsic::vp_sdiv; + case Intrinsic::masked_urem: + return Intrinsic::vp_urem; + case Intrinsic::masked_srem: + return Intrinsic::vp_srem; + default: + return std::nullopt; + } +} + /// Try to optimize a \p CurRecipe masked by \p HeaderMask to a corresponding /// EVL-based recipe without the header mask. Returns nullptr if no EVL-based /// recipe could be created. @@ -2976,6 +3027,15 @@ static VPRecipeBase *optimizeMaskToEVL(VPValue *HeaderMask, Intrinsic::vp_merge, {RHS, Plan->getTrue(), LHS, &EVL}, TypeInfo.inferScalarType(LHS), {}, {}, DL); + if (auto *IntrR = dyn_cast(&CurRecipe)) + if (auto VPID = getVPDivRemIntrinsic(IntrR->getVectorIntrinsicID())) + if (match(IntrR->getOperand(2), m_RemoveMask(HeaderMask, Mask))) + return new VPWidenIntrinsicRecipe(*VPID, + {IntrR->getOperand(0), + IntrR->getOperand(1), + Mask ? 
Mask : Plan->getTrue(), &EVL}, + IntrR->getResultType(), {}, {}, DL); + return nullptr; } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll index 690a61e3e05c2..463732ac407a9 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll @@ -1163,54 +1163,155 @@ define void @pred_udiv_select_cost(ptr %A, ptr %B, ptr %C, i64 %n, i8 %y) #1 { ; DEFAULT-NEXT: [[C1:%.*]] = ptrtoaddr ptr [[C]] to i64 ; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 ; DEFAULT-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; DEFAULT-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 2 -; DEFAULT-NEXT: [[TMP3:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP2]], i64 8) +; DEFAULT-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP1]], 1 +; DEFAULT-NEXT: [[TMP15:%.*]] = shl nuw i64 [[TMP1]], 4 ; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP3]] +; DEFAULT-NEXT: [[TMP29:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP33:%.*]] = shl nuw i64 [[TMP29]], 1 ; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] ; DEFAULT: [[VECTOR_MEMCHECK]]: ; DEFAULT-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; DEFAULT-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4 +; DEFAULT-NEXT: [[TMP37:%.*]] = mul i64 [[TMP5]], 4 ; DEFAULT-NEXT: [[TMP6:%.*]] = sub i64 [[C1]], [[A2]] -; DEFAULT-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP5]] +; DEFAULT-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP6]], [[TMP37]] +; DEFAULT-NEXT: [[TMP41:%.*]] = mul i64 [[TMP5]], 4 ; DEFAULT-NEXT: [[TMP7:%.*]] = sub i64 [[C1]], [[B3]] -; DEFAULT-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP7]], [[TMP5]] +; DEFAULT-NEXT: [[DIFF_CHECK4:%.*]] = icmp ult i64 [[TMP7]], [[TMP41]] ; DEFAULT-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[DIFF_CHECK]], [[DIFF_CHECK4]] ; DEFAULT-NEXT: br i1 
[[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] ; DEFAULT: [[VECTOR_PH]]: +; DEFAULT-NEXT: [[MIN_ITERS_CHECK5:%.*]] = icmp ult i64 [[TMP0]], [[TMP15]] +; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK5]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH1:.*]] +; DEFAULT: [[VECTOR_PH1]]: ; DEFAULT-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; DEFAULT-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 2 -; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP9]] +; DEFAULT-NEXT: [[TMP45:%.*]] = shl nuw i64 [[TMP9]], 2 +; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP45]] ; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] ; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i8 [[Y]], i64 0 ; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]] ; DEFAULT: [[VECTOR_BODY]]: -; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; DEFAULT-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; DEFAULT-NEXT: [[TMP49:%.*]] = shl nuw nsw i64 [[TMP9]], 1 +; DEFAULT-NEXT: [[TMP53:%.*]] = mul nuw nsw i64 [[TMP9]], 3 +; DEFAULT-NEXT: [[TMP57:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP9]] +; DEFAULT-NEXT: [[TMP61:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP49]] +; DEFAULT-NEXT: [[TMP65:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP53]] ; DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP10]], align 1 +; DEFAULT-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP57]], align 1 +; DEFAULT-NEXT: [[WIDE_LOAD7:%.*]] = load , ptr [[TMP61]], align 1 +; DEFAULT-NEXT: [[WIDE_LOAD8:%.*]] = load , ptr [[TMP65]], align 1 ; DEFAULT-NEXT: [[TMP11:%.*]] = uitofp [[WIDE_LOAD]] to +; DEFAULT-NEXT: [[TMP69:%.*]] = uitofp [[WIDE_LOAD6]] to +; DEFAULT-NEXT: [[TMP73:%.*]] = uitofp 
[[WIDE_LOAD7]] to +; DEFAULT-NEXT: [[TMP91:%.*]] = uitofp [[WIDE_LOAD8]] to ; DEFAULT-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] +; DEFAULT-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[TMP12]], i64 [[TMP9]] +; DEFAULT-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[TMP12]], i64 [[TMP49]] +; DEFAULT-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[TMP12]], i64 [[TMP53]] ; DEFAULT-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP12]], align 1 -; DEFAULT-NEXT: [[TMP13:%.*]] = icmp eq [[WIDE_LOAD5]], zeroinitializer +; DEFAULT-NEXT: [[WIDE_LOAD10:%.*]] = load , ptr [[TMP26]], align 1 +; DEFAULT-NEXT: [[WIDE_LOAD11:%.*]] = load , ptr [[TMP27]], align 1 +; DEFAULT-NEXT: [[WIDE_LOAD12:%.*]] = load , ptr [[TMP28]], align 1 +; DEFAULT-NEXT: [[TMP13:%.*]] = icmp ne [[WIDE_LOAD5]], zeroinitializer +; DEFAULT-NEXT: [[TMP30:%.*]] = icmp ne [[WIDE_LOAD10]], zeroinitializer +; DEFAULT-NEXT: [[TMP31:%.*]] = icmp ne [[WIDE_LOAD11]], zeroinitializer +; DEFAULT-NEXT: [[TMP32:%.*]] = icmp ne [[WIDE_LOAD12]], zeroinitializer ; DEFAULT-NEXT: [[TMP14:%.*]] = xor [[WIDE_LOAD]], splat (i8 1) -; DEFAULT-NEXT: [[TMP15:%.*]] = select [[TMP13]], splat (i8 1), [[BROADCAST_SPLAT]] -; DEFAULT-NEXT: [[TMP16:%.*]] = udiv [[TMP14]], [[TMP15]] +; DEFAULT-NEXT: [[TMP34:%.*]] = xor [[WIDE_LOAD6]], splat (i8 1) +; DEFAULT-NEXT: [[TMP35:%.*]] = xor [[WIDE_LOAD7]], splat (i8 1) +; DEFAULT-NEXT: [[TMP36:%.*]] = xor [[WIDE_LOAD8]], splat (i8 1) +; DEFAULT-NEXT: [[TMP16:%.*]] = call @llvm.masked.udiv.nxv4i8( [[TMP14]], [[BROADCAST_SPLAT]], [[TMP13]]) +; DEFAULT-NEXT: [[TMP38:%.*]] = call @llvm.masked.udiv.nxv4i8( [[TMP34]], [[BROADCAST_SPLAT]], [[TMP30]]) +; DEFAULT-NEXT: [[TMP39:%.*]] = call @llvm.masked.udiv.nxv4i8( [[TMP35]], [[BROADCAST_SPLAT]], [[TMP31]]) +; DEFAULT-NEXT: [[TMP40:%.*]] = call @llvm.masked.udiv.nxv4i8( [[TMP36]], [[BROADCAST_SPLAT]], [[TMP32]]) ; DEFAULT-NEXT: [[TMP17:%.*]] = icmp ugt [[TMP16]], splat (i8 1) +; DEFAULT-NEXT: [[TMP42:%.*]] = icmp ugt [[TMP38]], splat (i8 1) +; 
DEFAULT-NEXT: [[TMP43:%.*]] = icmp ugt [[TMP39]], splat (i8 1) +; DEFAULT-NEXT: [[TMP44:%.*]] = icmp ugt [[TMP40]], splat (i8 1) ; DEFAULT-NEXT: [[TMP18:%.*]] = select [[TMP17]], zeroinitializer, splat (i32 255) -; DEFAULT-NEXT: [[PREDPHI:%.*]] = select [[TMP13]], zeroinitializer, [[TMP18]] +; DEFAULT-NEXT: [[TMP46:%.*]] = select [[TMP42]], zeroinitializer, splat (i32 255) +; DEFAULT-NEXT: [[TMP47:%.*]] = select [[TMP43]], zeroinitializer, splat (i32 255) +; DEFAULT-NEXT: [[TMP48:%.*]] = select [[TMP44]], zeroinitializer, splat (i32 255) +; DEFAULT-NEXT: [[PREDPHI:%.*]] = select [[TMP13]], [[TMP18]], zeroinitializer +; DEFAULT-NEXT: [[PREDPHI13:%.*]] = select [[TMP30]], [[TMP46]], zeroinitializer +; DEFAULT-NEXT: [[PREDPHI14:%.*]] = select [[TMP31]], [[TMP47]], zeroinitializer +; DEFAULT-NEXT: [[PREDPHI15:%.*]] = select [[TMP32]], [[TMP48]], zeroinitializer ; DEFAULT-NEXT: [[TMP19:%.*]] = zext [[WIDE_LOAD]] to +; DEFAULT-NEXT: [[TMP50:%.*]] = zext [[WIDE_LOAD6]] to +; DEFAULT-NEXT: [[TMP51:%.*]] = zext [[WIDE_LOAD7]] to +; DEFAULT-NEXT: [[TMP52:%.*]] = zext [[WIDE_LOAD8]] to ; DEFAULT-NEXT: [[TMP20:%.*]] = sub [[PREDPHI]], [[TMP19]] +; DEFAULT-NEXT: [[TMP54:%.*]] = sub [[PREDPHI13]], [[TMP50]] +; DEFAULT-NEXT: [[TMP55:%.*]] = sub [[PREDPHI14]], [[TMP51]] +; DEFAULT-NEXT: [[TMP56:%.*]] = sub [[PREDPHI15]], [[TMP52]] ; DEFAULT-NEXT: [[TMP21:%.*]] = sitofp [[TMP20]] to +; DEFAULT-NEXT: [[TMP58:%.*]] = sitofp [[TMP54]] to +; DEFAULT-NEXT: [[TMP59:%.*]] = sitofp [[TMP55]] to +; DEFAULT-NEXT: [[TMP60:%.*]] = sitofp [[TMP56]] to ; DEFAULT-NEXT: [[TMP22:%.*]] = call @llvm.fmuladd.nxv4f32( [[TMP21]], splat (float 3.000000e+00), [[TMP11]]) +; DEFAULT-NEXT: [[TMP62:%.*]] = call @llvm.fmuladd.nxv4f32( [[TMP58]], splat (float 3.000000e+00), [[TMP69]]) +; DEFAULT-NEXT: [[TMP63:%.*]] = call @llvm.fmuladd.nxv4f32( [[TMP59]], splat (float 3.000000e+00), [[TMP73]]) +; DEFAULT-NEXT: [[TMP64:%.*]] = call @llvm.fmuladd.nxv4f32( [[TMP60]], splat (float 3.000000e+00), [[TMP91]]) ; 
DEFAULT-NEXT: [[TMP23:%.*]] = fptoui [[TMP22]] to +; DEFAULT-NEXT: [[TMP66:%.*]] = fptoui [[TMP62]] to +; DEFAULT-NEXT: [[TMP67:%.*]] = fptoui [[TMP63]] to +; DEFAULT-NEXT: [[TMP68:%.*]] = fptoui [[TMP64]] to ; DEFAULT-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[C]], i64 [[INDEX]] +; DEFAULT-NEXT: [[TMP70:%.*]] = getelementptr i8, ptr [[TMP24]], i64 [[TMP9]] +; DEFAULT-NEXT: [[TMP71:%.*]] = getelementptr i8, ptr [[TMP24]], i64 [[TMP49]] +; DEFAULT-NEXT: [[TMP72:%.*]] = getelementptr i8, ptr [[TMP24]], i64 [[TMP53]] ; DEFAULT-NEXT: store [[TMP23]], ptr [[TMP24]], align 1 -; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; DEFAULT-NEXT: store [[TMP66]], ptr [[TMP70]], align 1 +; DEFAULT-NEXT: store [[TMP67]], ptr [[TMP71]], align 1 +; DEFAULT-NEXT: store [[TMP68]], ptr [[TMP72]], align 1 +; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP45]] ; DEFAULT-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; DEFAULT-NEXT: br i1 [[TMP25]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]] ; DEFAULT: [[MIDDLE_BLOCK]]: ; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] -; DEFAULT-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; DEFAULT-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] +; DEFAULT: [[VEC_EPILOG_ITER_CHECK]]: +; DEFAULT-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], [[TMP33]] +; DEFAULT-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF30:![0-9]+]] +; DEFAULT: [[VEC_EPILOG_PH]]: +; DEFAULT-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_PH]] ] +; DEFAULT-NEXT: [[TMP74:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP75:%.*]] = shl nuw i64 [[TMP74]], 1 +; DEFAULT-NEXT: [[N_MOD_VF16:%.*]] = urem i64 [[TMP0]], [[TMP75]] +; DEFAULT-NEXT: [[N_VEC17:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF16]] +; DEFAULT-NEXT: 
[[BROADCAST_SPLATINSERT18:%.*]] = insertelement poison, i8 [[Y]], i64 0 +; DEFAULT-NEXT: [[BROADCAST_SPLAT19:%.*]] = shufflevector [[BROADCAST_SPLATINSERT18]], poison, zeroinitializer +; DEFAULT-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] +; DEFAULT: [[VEC_EPILOG_VECTOR_BODY]]: +; DEFAULT-NEXT: [[INDEX20:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT24:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; DEFAULT-NEXT: [[TMP76:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX20]] +; DEFAULT-NEXT: [[WIDE_LOAD21:%.*]] = load , ptr [[TMP76]], align 1 +; DEFAULT-NEXT: [[TMP77:%.*]] = uitofp [[WIDE_LOAD21]] to +; DEFAULT-NEXT: [[TMP78:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX20]] +; DEFAULT-NEXT: [[WIDE_LOAD22:%.*]] = load , ptr [[TMP78]], align 1 +; DEFAULT-NEXT: [[TMP79:%.*]] = icmp ne [[WIDE_LOAD22]], zeroinitializer +; DEFAULT-NEXT: [[TMP80:%.*]] = xor [[WIDE_LOAD21]], splat (i8 1) +; DEFAULT-NEXT: [[TMP81:%.*]] = call @llvm.masked.udiv.nxv2i8( [[TMP80]], [[BROADCAST_SPLAT19]], [[TMP79]]) +; DEFAULT-NEXT: [[TMP82:%.*]] = icmp ugt [[TMP81]], splat (i8 1) +; DEFAULT-NEXT: [[TMP83:%.*]] = select [[TMP82]], zeroinitializer, splat (i32 255) +; DEFAULT-NEXT: [[PREDPHI23:%.*]] = select [[TMP79]], [[TMP83]], zeroinitializer +; DEFAULT-NEXT: [[TMP84:%.*]] = zext [[WIDE_LOAD21]] to +; DEFAULT-NEXT: [[TMP85:%.*]] = sub [[PREDPHI23]], [[TMP84]] +; DEFAULT-NEXT: [[TMP86:%.*]] = sitofp [[TMP85]] to +; DEFAULT-NEXT: [[TMP87:%.*]] = call @llvm.fmuladd.nxv2f32( [[TMP86]], splat (float 3.000000e+00), [[TMP77]]) +; DEFAULT-NEXT: [[TMP88:%.*]] = fptoui [[TMP87]] to +; DEFAULT-NEXT: [[TMP89:%.*]] = getelementptr i8, ptr [[C]], i64 [[INDEX20]] +; DEFAULT-NEXT: store [[TMP88]], ptr [[TMP89]], align 1 +; DEFAULT-NEXT: [[INDEX_NEXT24]] = add nuw i64 [[INDEX20]], [[TMP75]] +; DEFAULT-NEXT: [[TMP90:%.*]] = icmp eq i64 [[INDEX_NEXT24]], [[N_VEC17]] +; DEFAULT-NEXT: br i1 [[TMP90]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop 
[[LOOP31:![0-9]+]] +; DEFAULT: [[VEC_EPILOG_MIDDLE_BLOCK]]: +; DEFAULT-NEXT: [[CMP_N25:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC17]] +; DEFAULT-NEXT: br i1 [[CMP_N25]], [[EXIT]], label %[[SCALAR_PH]] ; DEFAULT: [[SCALAR_PH]]: ; ; PRED-LABEL: define void @pred_udiv_select_cost( @@ -1223,7 +1324,7 @@ define void @pred_udiv_select_cost(ptr %A, ptr %B, ptr %C, i64 %n, i8 %y) #1 { ; PRED-NEXT: br label %[[VECTOR_MEMCHECK:.*]] ; PRED: [[VECTOR_MEMCHECK]]: ; PRED-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; PRED-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 16 +; PRED-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4 ; PRED-NEXT: [[TMP3:%.*]] = sub i64 [[C1]], [[A2]] ; PRED-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]] ; PRED-NEXT: [[TMP4:%.*]] = sub i64 [[C1]], [[B3]] @@ -1232,37 +1333,36 @@ define void @pred_udiv_select_cost(ptr %A, ptr %B, ptr %C, i64 %n, i8 %y) #1 { ; PRED-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; PRED: [[VECTOR_PH]]: ; PRED-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; PRED-NEXT: [[TMP8:%.*]] = shl nuw i64 [[TMP7]], 4 -; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[TMP0]]) -; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i8 [[Y]], i64 0 -; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; PRED-NEXT: [[TMP8:%.*]] = shl nuw i64 [[TMP7]], 2 +; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP0]]) +; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i8 [[Y]], i64 0 +; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; PRED-NEXT: br label %[[VECTOR_BODY:.*]] ; PRED: [[VECTOR_BODY]]: ; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], 
%[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] ; PRED-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP12]], [[ACTIVE_LANE_MASK]], poison) -; PRED-NEXT: [[TMP13:%.*]] = uitofp [[WIDE_MASKED_LOAD]] to +; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr align 1 [[TMP12]], [[ACTIVE_LANE_MASK]], poison) +; PRED-NEXT: [[TMP9:%.*]] = uitofp [[WIDE_MASKED_LOAD]] to ; PRED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; PRED-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP14]], [[ACTIVE_LANE_MASK]], poison) -; PRED-NEXT: [[TMP15:%.*]] = icmp ne [[WIDE_MASKED_LOAD5]], zeroinitializer -; PRED-NEXT: [[TMP16:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP15]], zeroinitializer -; PRED-NEXT: [[TMP17:%.*]] = xor [[WIDE_MASKED_LOAD]], splat (i8 1) -; PRED-NEXT: [[TMP18:%.*]] = select [[TMP16]], [[BROADCAST_SPLAT]], splat (i8 1) -; PRED-NEXT: [[TMP19:%.*]] = udiv [[TMP17]], [[TMP18]] -; PRED-NEXT: [[TMP20:%.*]] = icmp ugt [[TMP19]], splat (i8 1) -; PRED-NEXT: [[TMP21:%.*]] = select [[TMP20]], zeroinitializer, splat (i32 255) -; PRED-NEXT: [[PREDPHI:%.*]] = select [[TMP15]], [[TMP21]], zeroinitializer -; PRED-NEXT: [[TMP22:%.*]] = zext [[WIDE_MASKED_LOAD]] to -; PRED-NEXT: [[TMP23:%.*]] = sub [[PREDPHI]], [[TMP22]] -; PRED-NEXT: [[TMP24:%.*]] = sitofp [[TMP23]] to -; PRED-NEXT: [[TMP25:%.*]] = call @llvm.fmuladd.nxv16f32( [[TMP24]], splat (float 3.000000e+00), [[TMP13]]) -; PRED-NEXT: [[TMP26:%.*]] = fptoui [[TMP25]] to +; PRED-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr align 1 [[TMP14]], [[ACTIVE_LANE_MASK]], poison) +; PRED-NEXT: [[TMP10:%.*]] = icmp ne [[WIDE_MASKED_LOAD5]], zeroinitializer +; PRED-NEXT: 
[[TMP11:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP10]], zeroinitializer +; PRED-NEXT: [[TMP21:%.*]] = xor [[WIDE_MASKED_LOAD]], splat (i8 1) +; PRED-NEXT: [[TMP13:%.*]] = call @llvm.masked.udiv.nxv4i8( [[TMP21]], [[BROADCAST_SPLAT]], [[TMP11]]) +; PRED-NEXT: [[TMP22:%.*]] = icmp ugt [[TMP13]], splat (i8 1) +; PRED-NEXT: [[TMP15:%.*]] = select [[TMP22]], zeroinitializer, splat (i32 255) +; PRED-NEXT: [[PREDPHI:%.*]] = select [[TMP10]], [[TMP15]], zeroinitializer +; PRED-NEXT: [[TMP16:%.*]] = zext [[WIDE_MASKED_LOAD]] to +; PRED-NEXT: [[TMP17:%.*]] = sub [[PREDPHI]], [[TMP16]] +; PRED-NEXT: [[TMP18:%.*]] = sitofp [[TMP17]] to +; PRED-NEXT: [[TMP19:%.*]] = call @llvm.fmuladd.nxv4f32( [[TMP18]], splat (float 3.000000e+00), [[TMP9]]) +; PRED-NEXT: [[TMP20:%.*]] = fptoui [[TMP19]] to ; PRED-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[C]], i64 [[INDEX]] -; PRED-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP26]], ptr align 1 [[TMP27]], [[ACTIVE_LANE_MASK]]) +; PRED-NEXT: call void @llvm.masked.store.nxv4i8.p0( [[TMP20]], ptr align 1 [[TMP27]], [[ACTIVE_LANE_MASK]]) ; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP8]] -; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 [[TMP0]]) -; PRED-NEXT: [[TMP28:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[TMP0]]) +; PRED-NEXT: [[TMP28:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 ; PRED-NEXT: [[TMP29:%.*]] = xor i1 [[TMP28]], true ; PRED-NEXT: br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; PRED: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll index 9fc9f03461b69..5e771da20198c 100644 --- 
a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-scalar-assignment.ll @@ -1164,8 +1164,7 @@ define i32 @simple_csa_int_divide(ptr noalias %a, ptr noalias %b, i32 %default_v ; SVE-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[INDEX]] ; SVE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP5]], align 4 ; SVE-NEXT: [[TMP6:%.*]] = icmp sgt [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; SVE-NEXT: [[TMP7:%.*]] = select [[TMP6]], [[WIDE_LOAD]], splat (i32 1) -; SVE-NEXT: [[TMP8:%.*]] = sdiv splat (i32 42), [[TMP7]] +; SVE-NEXT: [[TMP8:%.*]] = call @llvm.masked.sdiv.nxv4i32( splat (i32 42), [[WIDE_LOAD]], [[TMP6]]) ; SVE-NEXT: [[TMP9:%.*]] = freeze [[TMP6]] ; SVE-NEXT: [[TMP10:%.*]] = call i1 @llvm.vector.reduce.or.nxv4i1( [[TMP9]]) ; SVE-NEXT: [[TMP11]] = select i1 [[TMP10]], [[TMP6]], [[TMP4]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/invalid-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/invalid-costs.ll index de5a24666626c..dfa4680ca9786 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/invalid-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/invalid-costs.ll @@ -11,33 +11,77 @@ define void @replicate_sdiv_conditional(ptr noalias %a, ptr noalias %b, ptr noal ; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 4 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 64, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 64, [[TMP3]] +; CHECK-NEXT: [[TMP11:%.*]] = shl nuw i64 
[[TMP3]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 64, [[TMP11]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 64, [[N_MOD_VF]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP19:%.*]] = shl nuw nsw i64 [[TMP3]], 1 +; CHECK-NEXT: [[TMP23:%.*]] = mul nuw nsw i64 [[TMP3]], 3 +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 [[TMP19]] +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 [[TMP23]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 4 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP27]], align 4 +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP31]], align 4 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP35]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = icmp slt [[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[TMP39:%.*]] = icmp slt [[WIDE_LOAD1]], zeroinitializer +; CHECK-NEXT: [[TMP43:%.*]] = icmp slt [[WIDE_LOAD2]], zeroinitializer +; CHECK-NEXT: [[TMP47:%.*]] = icmp slt [[WIDE_LOAD3]], zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[TMP6]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i32, ptr [[TMP6]], i64 [[TMP19]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[TMP6]], i64 [[TMP23]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP6]], [[TMP5]], poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP16]], [[TMP39]], poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP17]], [[TMP43]], poison) +; CHECK-NEXT: 
[[WIDE_MASKED_LOAD6:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP18]], [[TMP47]], poison) ; CHECK-NEXT: [[TMP7:%.*]] = sext [[WIDE_MASKED_LOAD]] to +; CHECK-NEXT: [[TMP20:%.*]] = sext [[WIDE_MASKED_LOAD4]] to +; CHECK-NEXT: [[TMP21:%.*]] = sext [[WIDE_MASKED_LOAD5]] to +; CHECK-NEXT: [[TMP22:%.*]] = sext [[WIDE_MASKED_LOAD6]] to ; CHECK-NEXT: [[TMP8:%.*]] = ashr [[WIDE_MASKED_LOAD]], splat (i32 1) +; CHECK-NEXT: [[TMP24:%.*]] = ashr [[WIDE_MASKED_LOAD4]], splat (i32 1) +; CHECK-NEXT: [[TMP25:%.*]] = ashr [[WIDE_MASKED_LOAD5]], splat (i32 1) +; CHECK-NEXT: [[TMP26:%.*]] = ashr [[WIDE_MASKED_LOAD6]], splat (i32 1) ; CHECK-NEXT: [[TMP9:%.*]] = add [[TMP8]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP28:%.*]] = add [[TMP24]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP29:%.*]] = add [[TMP25]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP30:%.*]] = add [[TMP26]], [[WIDE_LOAD3]] ; CHECK-NEXT: [[TMP10:%.*]] = sext [[TMP9]] to -; CHECK-NEXT: [[TMP11:%.*]] = select [[TMP5]], [[TMP7]], splat (i64 1) -; CHECK-NEXT: [[TMP12:%.*]] = sdiv [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[TMP32:%.*]] = sext [[TMP28]] to +; CHECK-NEXT: [[TMP33:%.*]] = sext [[TMP29]] to +; CHECK-NEXT: [[TMP34:%.*]] = sext [[TMP30]] to +; CHECK-NEXT: [[TMP12:%.*]] = call @llvm.masked.sdiv.nxv4i64( [[TMP10]], [[TMP7]], [[TMP5]]) +; CHECK-NEXT: [[TMP36:%.*]] = call @llvm.masked.sdiv.nxv4i64( [[TMP32]], [[TMP20]], [[TMP39]]) +; CHECK-NEXT: [[TMP37:%.*]] = call @llvm.masked.sdiv.nxv4i64( [[TMP33]], [[TMP21]], [[TMP43]]) +; CHECK-NEXT: [[TMP38:%.*]] = call @llvm.masked.sdiv.nxv4i64( [[TMP34]], [[TMP22]], [[TMP47]]) ; CHECK-NEXT: [[TMP13:%.*]] = trunc [[TMP12]] to +; CHECK-NEXT: [[TMP40:%.*]] = trunc [[TMP36]] to +; CHECK-NEXT: [[TMP41:%.*]] = trunc [[TMP37]] to +; CHECK-NEXT: [[TMP42:%.*]] = trunc [[TMP38]] to ; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP5]], [[TMP13]], [[WIDE_LOAD]] +; CHECK-NEXT: [[PREDPHI7:%.*]] = select [[TMP39]], [[TMP40]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[PREDPHI8:%.*]] = select [[TMP43]], [[TMP41]], 
[[WIDE_LOAD2]] +; CHECK-NEXT: [[PREDPHI9:%.*]] = select [[TMP47]], [[TMP42]], [[WIDE_LOAD3]] ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i64 [[TMP19]] +; CHECK-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i64 [[TMP23]] ; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP14]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; CHECK-NEXT: store [[PREDPHI7]], ptr [[TMP44]], align 4 +; CHECK-NEXT: store [[PREDPHI8]], ptr [[TMP45]], align 4 +; CHECK-NEXT: store [[PREDPHI9]], ptr [[TMP46]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP15]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll index 6808eddd5a1c6..8095e40000e9e 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/predicated-costs.ll @@ -236,6 +236,8 @@ define void @srem_sdiv_without_tail_folding(i32 %d.0, i32 %d.1, ptr %dst, i32 %e ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[END]], [[N_MOD_VF]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[D_0]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[D_1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] 
; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE12:.*]] ] @@ -244,44 +246,10 @@ define void @srem_sdiv_without_tail_folding(i32 %d.0, i32 %d.1, ptr %dst, i32 %e ; CHECK-NEXT: [[TMP1:%.*]] = srem <4 x i32> [[TMP0]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP1]], splat (i32 1) ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[TMP2]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i64 0 -; CHECK-NEXT: br i1 [[TMP4]], label %[[PRED_SDIV_IF:.*]], label %[[PRED_SDIV_CONTINUE:.*]] -; CHECK: [[PRED_SDIV_IF]]: -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i32> [[TMP0]], i64 0 -; CHECK-NEXT: [[TMP6:%.*]] = sdiv i32 [[TMP5]], [[D_1]] -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP6]], i64 0 -; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE]] -; CHECK: [[PRED_SDIV_CONTINUE]]: -; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP7]], %[[PRED_SDIV_IF]] ] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP3]], i64 1 -; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_SDIV_IF1:.*]], label %[[PRED_SDIV_CONTINUE2:.*]] -; CHECK: [[PRED_SDIV_IF1]]: -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i32> [[TMP0]], i64 1 -; CHECK-NEXT: [[TMP11:%.*]] = sdiv i32 [[TMP10]], [[D_1]] -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP11]], i64 1 -; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE2]] -; CHECK: [[PRED_SDIV_CONTINUE2]]: -; CHECK-NEXT: [[TMP13:%.*]] = phi <4 x i32> [ [[TMP8]], %[[PRED_SDIV_CONTINUE]] ], [ [[TMP12]], %[[PRED_SDIV_IF1]] ] -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP3]], i64 2 -; CHECK-NEXT: br i1 [[TMP14]], label %[[PRED_SDIV_IF3:.*]], label %[[PRED_SDIV_CONTINUE4:.*]] -; CHECK: [[PRED_SDIV_IF3]]: -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <4 x i32> [[TMP0]], i64 2 -; CHECK-NEXT: [[TMP16:%.*]] = sdiv i32 [[TMP15]], [[D_1]] -; 
CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP16]], i64 2 -; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE4]] -; CHECK: [[PRED_SDIV_CONTINUE4]]: -; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP13]], %[[PRED_SDIV_CONTINUE2]] ], [ [[TMP17]], %[[PRED_SDIV_IF3]] ] -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[TMP3]], i64 3 -; CHECK-NEXT: br i1 [[TMP19]], label %[[PRED_SDIV_IF5:.*]], label %[[PRED_SDIV_CONTINUE6:.*]] -; CHECK: [[PRED_SDIV_IF5]]: -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i32> [[TMP0]], i64 3 -; CHECK-NEXT: [[TMP21:%.*]] = sdiv i32 [[TMP20]], [[D_1]] -; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP21]], i64 3 -; CHECK-NEXT: br label %[[PRED_SDIV_CONTINUE6]] -; CHECK: [[PRED_SDIV_CONTINUE6]]: -; CHECK-NEXT: [[TMP23:%.*]] = phi <4 x i32> [ [[TMP18]], %[[PRED_SDIV_CONTINUE4]] ], [ [[TMP22]], %[[PRED_SDIV_IF5]] ] +; CHECK-NEXT: [[TMP23:%.*]] = call <4 x i32> @llvm.masked.sdiv.v4i32(<4 x i32> [[TMP0]], <4 x i32> [[BROADCAST_SPLAT2]], <4 x i1> [[TMP3]]) ; CHECK-NEXT: [[TMP24:%.*]] = add <4 x i32> [[TMP23]], splat (i32 1) ; CHECK-NEXT: [[TMP25:%.*]] = sext <4 x i32> [[TMP24]] to <4 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP3]], i64 0 ; CHECK-NEXT: br i1 [[TMP4]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; CHECK: [[PRED_STORE_IF]]: ; CHECK-NEXT: [[TMP27:%.*]] = extractelement <4 x i64> [[TMP25]], i64 0 @@ -289,6 +257,7 @@ define void @srem_sdiv_without_tail_folding(i32 %d.0, i32 %d.1, ptr %dst, i32 %e ; CHECK-NEXT: store i32 [[INDEX]], ptr [[TMP28]], align 4 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] ; CHECK: [[PRED_STORE_CONTINUE]]: +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i1> [[TMP3]], i64 1 ; CHECK-NEXT: br i1 [[TMP9]], label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]] ; CHECK: [[PRED_STORE_IF7]]: ; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i64> [[TMP25]], i64 1 @@ -297,6 +266,7 @@ define void 
@srem_sdiv_without_tail_folding(i32 %d.0, i32 %d.1, ptr %dst, i32 %e ; CHECK-NEXT: store i32 [[TMP33]], ptr [[TMP32]], align 4 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE8]] ; CHECK: [[PRED_STORE_CONTINUE8]]: +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i1> [[TMP3]], i64 2 ; CHECK-NEXT: br i1 [[TMP14]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] ; CHECK: [[PRED_STORE_IF9]]: ; CHECK-NEXT: [[TMP35:%.*]] = extractelement <4 x i64> [[TMP25]], i64 2 @@ -305,6 +275,7 @@ define void @srem_sdiv_without_tail_folding(i32 %d.0, i32 %d.1, ptr %dst, i32 %e ; CHECK-NEXT: store i32 [[TMP37]], ptr [[TMP36]], align 4 ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE10]] ; CHECK: [[PRED_STORE_CONTINUE10]]: +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <4 x i1> [[TMP3]], i64 3 ; CHECK-NEXT: br i1 [[TMP19]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12]] ; CHECK: [[PRED_STORE_IF11]]: ; CHECK-NEXT: [[TMP39:%.*]] = extractelement <4 x i64> [[TMP25]], i64 3 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-predicate-instruction.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-predicate-instruction.ll index 9ac66b26a50dc..13af29855be23 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-predicate-instruction.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-predicate-instruction.ll @@ -13,7 +13,7 @@ target triple = "aarch64-unknown-linux-gnu" define void @predication_in_loop(ptr %a, ptr %b, ptr %cond) #0 { ; CHECK-LABEL: @predication_in_loop -; CHECK: sdiv +; CHECK: call @llvm.masked.sdiv ; entry: br label %for.body @@ -58,7 +58,7 @@ for.inc: define void @unpredicated_loop_predication_through_tailfolding(ptr %a, ptr %b) #0 { ; CHECK-LABEL: @unpredicated_loop_predication_through_tailfolding -; CHECK-NOT: sdiv +; CHECK-NOT: call @llvm.masked.sdiv entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-predicated-costs.ll 
b/llvm/test/Transforms/LoopVectorize/AArch64/sve-predicated-costs.ll index cd78aee4c7491..f6f6c3502da99 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-predicated-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-predicated-costs.ll @@ -1,13 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 ; RUN: opt -p loop-vectorize -mtriple=aarch64 -mattr=+sve -S %s | FileCheck %s -; The innermost block then.1 has a 25% chance of being executed according to +; The innermost block then.1 has a 12.5% chance of being executed according to ; BranchProbabilityInfo, but if we vectorize it then we will unconditionally ; execute it. Avoid this unprofitable vectorization by taking the nested ; probability into account in the cost model. -define void @nested(ptr noalias %p0, ptr noalias %p1, i1 %c0, i1 %c1) { +define void @nested(ptr noalias %p0, ptr noalias %p1, i1 %c0, i1 %c1, i1 %c2) { ; CHECK-LABEL: define void @nested( -; CHECK-SAME: ptr noalias [[P0:%.*]], ptr noalias [[P1:%.*]], i1 [[C0:%.*]], i1 [[C1:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-SAME: ptr noalias [[P0:%.*]], ptr noalias [[P1:%.*]], i1 [[C0:%.*]], i1 [[C1:%.*]], i1 [[C2:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: @@ -16,6 +16,8 @@ define void @nested(ptr noalias %p0, ptr noalias %p1, i1 %c0, i1 %c1) { ; CHECK: [[THEN_0]]: ; CHECK-NEXT: br i1 [[C1]], label %[[THEN_1:.*]], label %[[LATCH]] ; CHECK: [[THEN_1]]: +; CHECK-NEXT: br i1 [[C2]], label %[[THEN_2:.*]], label %[[LATCH]] +; CHECK: [[THEN_2]]: ; CHECK-NEXT: [[GEP0:%.*]] = getelementptr i64, ptr [[P0]], i32 [[X]] ; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[GEP0]], align 8 ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i64, ptr [[P1]], i32 [[X]] @@ -41,6 +43,9 @@ then.0: br i1 %c1, label %then.1, label %latch then.1: + br i1 %c2, label %then.2, label %latch + +then.2: %gep0 = getelementptr i64, ptr %p0, i32 %iv %x 
= load i64, ptr %gep0 %gep1 = getelementptr i64, ptr %p1, i32 %iv @@ -61,9 +66,9 @@ exit: ; This is the same CFG as @nested above, but we have provided branch weights ; which tell BranchProbabilityInfo that then.1 will always be taken. In this ; case, we should vectorize because it is profitable. -define void @always_taken(ptr noalias %p0, ptr noalias %p1, i1 %c0, i1 %c1) { +define void @always_taken(ptr noalias %p0, ptr noalias %p1, i1 %c0, i1 %c1, i1 %c2) { ; CHECK-LABEL: define void @always_taken( -; CHECK-SAME: ptr noalias [[P0:%.*]], ptr noalias [[P1:%.*]], i1 [[C0:%.*]], i1 [[C1:%.*]]) #[[ATTR0]] { +; CHECK-SAME: ptr noalias [[P0:%.*]], ptr noalias [[P1:%.*]], i1 [[C0:%.*]], i1 [[C1:%.*]], i1 [[C2:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 2 @@ -75,29 +80,30 @@ define void @always_taken(ptr noalias %p0, ptr noalias %p1, i1 %c0, i1 %c1) { ; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i32 [[TMP3]], 1 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP5]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement poison, i1 [[C2]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector [[BROADCAST_SPLATINSERT2]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i1 [[C1]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i1 [[C0]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = select [[BROADCAST_SPLAT2]], [[BROADCAST_SPLAT]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = select [[TMP6]], [[BROADCAST_SPLAT1]], zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi 
i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[P0]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP7:%.*]] = zext i32 [[TMP3]] to i64 ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i64, ptr [[TMP10]], i64 [[TMP7]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr align 8 [[TMP10]], [[TMP6]], poison) -; CHECK-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr align 8 [[TMP20]], [[TMP6]], poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr align 8 [[TMP10]], [[TMP8]], poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr align 8 [[TMP20]], [[TMP8]], poison) ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[P1]], i32 [[INDEX]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[TMP9]], i64 [[TMP7]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr align 8 [[TMP9]], [[TMP6]], poison) -; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr align 8 [[TMP12]], [[TMP6]], poison) -; CHECK-NEXT: [[TMP21:%.*]] = select [[TMP6]], [[WIDE_MASKED_LOAD4]], splat (i64 1) -; CHECK-NEXT: [[TMP14:%.*]] = select [[TMP6]], [[WIDE_MASKED_LOAD5]], splat (i64 1) -; CHECK-NEXT: [[TMP15:%.*]] = udiv [[WIDE_MASKED_LOAD]], [[TMP21]] -; CHECK-NEXT: [[TMP22:%.*]] = udiv [[WIDE_MASKED_LOAD3]], [[TMP14]] -; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP15]], ptr align 8 [[TMP9]], [[TMP6]]) -; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP22]], ptr align 8 [[TMP12]], [[TMP6]]) +; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr align 8 [[TMP9]], [[TMP8]], poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD7:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr align 8 [[TMP12]], [[TMP8]], poison) +; CHECK-NEXT: [[TMP14:%.*]] = call @llvm.masked.udiv.nxv2i64( [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD6]], [[TMP8]]) +; 
CHECK-NEXT: [[TMP13:%.*]] = call @llvm.masked.udiv.nxv2i64( [[WIDE_MASKED_LOAD5]], [[WIDE_MASKED_LOAD7]], [[TMP8]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP14]], ptr align 8 [[TMP9]], [[TMP8]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP13]], ptr align 8 [[TMP12]], [[TMP8]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]] ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -113,6 +119,8 @@ define void @always_taken(ptr noalias %p0, ptr noalias %p1, i1 %c0, i1 %c1) { ; CHECK: [[THEN_0]]: ; CHECK-NEXT: br i1 [[C1]], label %[[THEN_1:.*]], label %[[LATCH]], !prof [[PROF3]] ; CHECK: [[THEN_1]]: +; CHECK-NEXT: br i1 [[C2]], label %[[THEN_2:.*]], label %[[LATCH]], !prof [[PROF3]] +; CHECK: [[THEN_2]]: ; CHECK-NEXT: [[GEP0:%.*]] = getelementptr i64, ptr [[P0]], i32 [[IV1]] ; CHECK-NEXT: [[X:%.*]] = load i64, ptr [[GEP0]], align 8 ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i64, ptr [[P1]], i32 [[IV1]] @@ -138,6 +146,9 @@ then.0: br i1 %c1, label %then.1, label %latch, !prof !4 then.1: + br i1 %c2, label %then.2, label %latch, !prof !4 + +then.2: %gep0 = getelementptr i64, ptr %p0, i32 %iv %x = load i64, ptr %gep0 %gep1 = getelementptr i64, ptr %p1, i32 %iv diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll index 59e8a9b924d4e..ce934599d3d1e 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll @@ -462,8 +462,7 @@ define void @simple_idiv(ptr noalias %dst, ptr noalias %src, i64 %n) #0 { ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 [[INDEX1]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP11]], [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: 
[[WIDE_MASKED_LOAD2:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP12]], [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD2]], splat (i32 1) -; CHECK-NEXT: [[TMP16:%.*]] = udiv [[WIDE_MASKED_LOAD]], [[TMP15]] +; CHECK-NEXT: [[TMP16:%.*]] = call @llvm.masked.udiv.nxv4i32( [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD2]], [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP16]], ptr align 4 [[TMP12]], [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT3]] = add i64 [[INDEX1]], [[TMP6]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT3]], i64 [[UMAX]]) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll index 1d6bd6b92c135..16fdc51da55b8 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll @@ -85,8 +85,7 @@ define void @vector_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) { ; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP7]], splat (i1 true), i32 [[TMP10]]) -; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.vp.merge.nxv2i64( splat (i1 true), [[BROADCAST_SPLAT]], splat (i64 1), i32 [[TMP10]]) -; CHECK-NEXT: [[TMP9:%.*]] = sdiv [[WIDE_LOAD]], [[TMP2]] +; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.vp.sdiv.nxv2i64( [[WIDE_LOAD]], [[BROADCAST_SPLAT]], splat (i1 true), i32 [[TMP10]]) ; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[TMP9]], ptr align 8 [[TMP7]], splat (i1 true), i32 [[TMP10]]) ; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP10]] to i64 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP8]], [[INDEX]] @@ -214,8 +213,7 @@ define void @vector_srem(ptr noalias nocapture %a, i64 %v, 
i64 %n) { ; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP7]], splat (i1 true), i32 [[TMP10]]) -; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.vp.merge.nxv2i64( splat (i1 true), [[BROADCAST_SPLAT]], splat (i64 1), i32 [[TMP10]]) -; CHECK-NEXT: [[TMP9:%.*]] = srem [[WIDE_LOAD]], [[TMP2]] +; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.vp.srem.nxv2i64( [[WIDE_LOAD]], [[BROADCAST_SPLAT]], splat (i1 true), i32 [[TMP10]]) ; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[TMP9]], ptr align 8 [[TMP7]], splat (i1 true), i32 [[TMP10]]) ; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP10]] to i64 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP8]], [[INDEX]] @@ -280,8 +278,7 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) { ; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP8]], splat (i1 true), i32 [[TMP12]]) -; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.vp.merge.nxv2i64( [[TMP6]], [[BROADCAST_SPLAT]], splat (i64 1), i32 [[TMP12]]) -; CHECK-NEXT: [[TMP11:%.*]] = udiv [[WIDE_LOAD]], [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = call @llvm.vp.udiv.nxv2i64( [[WIDE_LOAD]], [[BROADCAST_SPLAT]], [[TMP6]], i32 [[TMP12]]) ; CHECK-NEXT: [[TMP9:%.*]] = extractelement [[TMP6]], i64 0 ; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP9]], [[TMP11]], [[WIDE_LOAD]] ; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[PREDPHI]], ptr align 8 [[TMP8]], splat (i1 true), i32 [[TMP12]]) @@ -302,14 +299,12 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) { ; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> 
poison, i64 [[V:%.*]], i64 0 ; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; FIXED-NEXT: [[TMP0:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer -; FIXED-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0 -; FIXED-NEXT: [[TMP5:%.*]] = select i1 [[TMP1]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> splat (i64 1) ; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] ; FIXED: vector.body: ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] ; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 -; FIXED-NEXT: [[TMP8:%.*]] = udiv <4 x i64> [[WIDE_LOAD1]], [[TMP5]] +; FIXED-NEXT: [[TMP8:%.*]] = call <4 x i64> @llvm.masked.udiv.v4i64(<4 x i64> [[WIDE_LOAD1]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i1> [[TMP0]]) ; FIXED-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0 ; FIXED-NEXT: [[PREDPHI2:%.*]] = select i1 [[TMP6]], <4 x i64> [[TMP8]], <4 x i64> [[WIDE_LOAD1]] ; FIXED-NEXT: store <4 x i64> [[PREDPHI2]], ptr [[TMP2]], align 8 @@ -359,8 +354,7 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) { ; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP8]], splat (i1 true), i32 [[TMP12]]) -; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.vp.merge.nxv2i64( [[TMP6]], [[BROADCAST_SPLAT]], splat (i64 1), i32 [[TMP12]]) -; CHECK-NEXT: [[TMP11:%.*]] = sdiv [[WIDE_LOAD]], [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = call @llvm.vp.sdiv.nxv2i64( [[WIDE_LOAD]], [[BROADCAST_SPLAT]], [[TMP6]], i32 [[TMP12]]) ; CHECK-NEXT: [[TMP9:%.*]] = extractelement [[TMP6]], i64 0 ; CHECK-NEXT: [[PREDPHI:%.*]] = 
select i1 [[TMP9]], [[TMP11]], [[WIDE_LOAD]] ; CHECK-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[PREDPHI]], ptr align 8 [[TMP8]], splat (i1 true), i32 [[TMP12]]) @@ -381,14 +375,12 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) { ; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V:%.*]], i64 0 ; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer ; FIXED-NEXT: [[TMP0:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer -; FIXED-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0 -; FIXED-NEXT: [[TMP5:%.*]] = select i1 [[TMP1]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i64> splat (i64 1) ; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] ; FIXED: vector.body: ; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] ; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 -; FIXED-NEXT: [[TMP8:%.*]] = sdiv <4 x i64> [[WIDE_LOAD1]], [[TMP5]] +; FIXED-NEXT: [[TMP8:%.*]] = call <4 x i64> @llvm.masked.sdiv.v4i64(<4 x i64> [[WIDE_LOAD1]], <4 x i64> [[BROADCAST_SPLAT]], <4 x i1> [[TMP0]]) ; FIXED-NEXT: [[TMP6:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0 ; FIXED-NEXT: [[PREDPHI2:%.*]] = select i1 [[TMP6]], <4 x i64> [[TMP8]], <4 x i64> [[WIDE_LOAD1]] ; FIXED-NEXT: store <4 x i64> [[PREDPHI2]], ptr [[TMP2]], align 8 @@ -576,8 +568,7 @@ define void @predicated_sdiv_by_minus_one(ptr noalias nocapture %a, i64 %n) { ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call @llvm.vp.load.nxv16i8.p0(ptr align 1 [[TMP7]], splat (i1 true), i32 [[TMP12]]) ; CHECK-NEXT: [[TMP9:%.*]] = icmp ne [[WIDE_LOAD]], splat (i8 -128) -; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.vp.merge.nxv16i8( [[TMP9]], splat (i8 -1), splat (i8 1), i32 
[[TMP12]]) -; CHECK-NEXT: [[TMP11:%.*]] = sdiv [[WIDE_LOAD]], [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = call @llvm.vp.sdiv.nxv16i8( [[WIDE_LOAD]], splat (i8 -1), [[TMP9]], i32 [[TMP12]]) ; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP9]], [[TMP11]], [[WIDE_LOAD]] ; CHECK-NEXT: call void @llvm.vp.store.nxv16i8.p0( [[PREDPHI]], ptr align 1 [[TMP7]], splat (i1 true), i32 [[TMP12]]) ; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP12]] to i64 @@ -600,8 +591,7 @@ define void @predicated_sdiv_by_minus_one(ptr noalias nocapture %a, i64 %n) { ; FIXED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[INDEX]] ; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <32 x i8>, ptr [[TMP1]], align 1 ; FIXED-NEXT: [[TMP5:%.*]] = icmp ne <32 x i8> [[WIDE_LOAD1]], splat (i8 -128) -; FIXED-NEXT: [[TMP7:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> splat (i8 -1), <32 x i8> splat (i8 1) -; FIXED-NEXT: [[TMP9:%.*]] = sdiv <32 x i8> [[WIDE_LOAD1]], [[TMP7]] +; FIXED-NEXT: [[TMP9:%.*]] = call <32 x i8> @llvm.masked.sdiv.v32i8(<32 x i8> [[WIDE_LOAD1]], <32 x i8> splat (i8 -1), <32 x i1> [[TMP5]]) ; FIXED-NEXT: [[PREDPHI2:%.*]] = select <32 x i1> [[TMP5]], <32 x i8> [[TMP9]], <32 x i8> [[WIDE_LOAD1]] ; FIXED-NEXT: store <32 x i8> [[PREDPHI2]], ptr [[TMP1]], align 1 ; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 @@ -662,11 +652,9 @@ define void @udiv_sdiv_with_invariant_divisors(i8 %x, i16 %y, i1 %c, ptr %p) { ; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement poison, i8 [[TMP3]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector [[BROADCAST_SPLATINSERT7]], poison, zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.vp.merge.nxv4i8( [[TMP0]], [[BROADCAST_SPLAT2]], splat (i8 1), i32 [[TMP2]]) -; CHECK-NEXT: [[TMP5:%.*]] = udiv [[VEC_IND]], [[TMP10]] +; CHECK-NEXT: [[TMP5:%.*]] = call @llvm.vp.udiv.nxv4i8( [[VEC_IND]], [[BROADCAST_SPLAT2]], [[TMP0]], i32 [[TMP2]]) ; CHECK-NEXT: [[TMP6:%.*]] = zext [[TMP5]] to 
-; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.vp.merge.nxv4i16( [[TMP0]], [[BROADCAST_SPLAT4]], splat (i16 1), i32 [[TMP2]]) -; CHECK-NEXT: [[TMP8:%.*]] = sdiv [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.vp.sdiv.nxv4i16( [[TMP6]], [[BROADCAST_SPLAT4]], [[TMP0]], i32 [[TMP2]]) ; CHECK-NEXT: [[TMP9:%.*]] = sext [[TMP8]] to ; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[C]], zeroinitializer, [[TMP9]] ; CHECK-NEXT: call void @llvm.vp.scatter.nxv4i32.nxv4p0( [[PREDPHI]], align 4 [[BROADCAST_SPLAT6]], splat (i1 true), i32 [[TMP2]]) @@ -683,29 +671,30 @@ define void @udiv_sdiv_with_invariant_divisors(i8 %x, i16 %y, i1 %c, ptr %p) { ; FIXED-NEXT: entry: ; FIXED-NEXT: br label [[VECTOR_PH:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[Y:%.*]], i64 0 -; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer +; FIXED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C:%.*]], i64 0 +; FIXED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; FIXED-NEXT: [[TMP0:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT]], splat (i1 true) ; FIXED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i8> poison, i8 [[X:%.*]], i64 0 ; FIXED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i8> [[BROADCAST_SPLATINSERT1]], <4 x i8> poison, <4 x i32> zeroinitializer -; FIXED-NEXT: [[TMP0:%.*]] = select i1 [[C:%.*]], <4 x i8> splat (i8 1), <4 x i8> [[BROADCAST_SPLAT2]] -; FIXED-NEXT: [[TMP1:%.*]] = select i1 [[C]], <4 x i16> splat (i16 1), <4 x i16> [[BROADCAST_SPLAT]] +; FIXED-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i16> poison, i16 [[Y:%.*]], i64 0 +; FIXED-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT3]], <4 x i16> poison, <4 x i32> zeroinitializer ; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] ; 
FIXED: vector.body: ; FIXED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; FIXED-NEXT: [[VEC_IND:%.*]] = phi <4 x i8> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP3:%.*]] = udiv <4 x i8> [[VEC_IND]], [[TMP0]] -; FIXED-NEXT: [[TMP4:%.*]] = zext <4 x i8> [[TMP3]] to <4 x i16> -; FIXED-NEXT: [[TMP5:%.*]] = sdiv <4 x i16> [[TMP4]], [[TMP1]] -; FIXED-NEXT: [[TMP6:%.*]] = sext <4 x i16> [[TMP5]] to <4 x i32> -; FIXED-NEXT: [[PREDPHI:%.*]] = select i1 [[C]], <4 x i32> zeroinitializer, <4 x i32> [[TMP6]] +; FIXED-NEXT: [[TMP1:%.*]] = call <4 x i8> @llvm.masked.udiv.v4i8(<4 x i8> [[VEC_IND]], <4 x i8> [[BROADCAST_SPLAT2]], <4 x i1> [[TMP0]]) +; FIXED-NEXT: [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i16> +; FIXED-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.masked.sdiv.v4i16(<4 x i16> [[TMP2]], <4 x i16> [[BROADCAST_SPLAT4]], <4 x i1> [[TMP0]]) +; FIXED-NEXT: [[TMP4:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32> +; FIXED-NEXT: [[PREDPHI:%.*]] = select i1 [[C]], <4 x i32> zeroinitializer, <4 x i32> [[TMP4]] ; FIXED-NEXT: [[TMP7:%.*]] = extractelement <4 x i32> [[PREDPHI]], i64 3 ; FIXED-NEXT: store i32 [[TMP7]], ptr [[P:%.*]], align 4 ; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; FIXED-NEXT: [[VEC_IND_NEXT]] = add <4 x i8> [[VEC_IND]], splat (i8 4) -; FIXED-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 12 -; FIXED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; FIXED-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], 12 +; FIXED-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; FIXED: middle.block: -; FIXED-NEXT: br label [[EXIT:%.*]] +; FIXED-NEXT: br label [[LOOP_LATCH:%.*]] ; FIXED: exit: ; FIXED-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/pr154103.ll b/llvm/test/Transforms/LoopVectorize/RISCV/pr154103.ll index c35a3d7b9269f..5ca87262ec84f 
100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/pr154103.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/pr154103.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5 ; RUN: opt -p loop-vectorize -mtriple riscv64 -mattr=+v < %s -S | FileCheck %s -; Make sure we don't duplicate the safe divisor cost in the VPlan cost model. +; Make sure we don't duplicate the llvm.masked.sdiv cost in the VPlan cost model. define void @pr154103(ptr noalias %a, ptr noalias %b, ptr noalias %c, ptr noalias %d) { ; CHECK-LABEL: define void @pr154103( diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-div.ll b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-div.ll index 9974ac9797129..88f4c6e329b6b 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-div.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/tail-folding-div.ll @@ -22,8 +22,7 @@ define void @test_sdiv(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP7]], splat (i1 true), i32 [[TMP5]]) ; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[B]], i64 [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[VP_OP_LOAD1:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP9]], splat (i1 true), i32 [[TMP5]]) -; IF-EVL-NEXT: [[TMP11:%.*]] = call @llvm.vp.merge.nxv2i64( splat (i1 true), [[VP_OP_LOAD1]], splat (i64 1), i32 [[TMP5]]) -; IF-EVL-NEXT: [[VP_OP:%.*]] = sdiv [[VP_OP_LOAD]], [[TMP11]] +; IF-EVL-NEXT: [[VP_OP:%.*]] = call @llvm.vp.sdiv.nxv2i64( [[VP_OP_LOAD]], [[VP_OP_LOAD1]], splat (i1 true), i32 [[TMP5]]) ; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[C]], i64 [[EVL_BASED_IV]] ; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[VP_OP]], ptr align 8 [[TMP12]], splat (i1 true), i32 [[TMP5]]) ; IF-EVL-NEXT: [[TMP14:%.*]] = zext i32 [[TMP5]] to i64 @@ -117,8 +116,7 @@ define void @test_sdiv_divisor_invariant_nonconst(ptr noalias 
%a, i64 %b, ptr no ; IF-EVL-NEXT: [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) ; IF-EVL-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[A]], i64 [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP1]], splat (i1 true), i32 [[TMP0]]) -; IF-EVL-NEXT: [[TMP2:%.*]] = call @llvm.vp.merge.nxv2i64( splat (i1 true), [[BROADCAST_SPLAT]], splat (i64 1), i32 [[TMP0]]) -; IF-EVL-NEXT: [[TMP3:%.*]] = sdiv [[VP_OP_LOAD]], [[TMP2]] +; IF-EVL-NEXT: [[TMP3:%.*]] = call @llvm.vp.sdiv.nxv2i64( [[VP_OP_LOAD]], [[BROADCAST_SPLAT]], splat (i1 true), i32 [[TMP0]]) ; IF-EVL-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[C]], i64 [[EVL_BASED_IV]] ; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[TMP3]], ptr align 8 [[TMP4]], splat (i1 true), i32 [[TMP0]]) ; IF-EVL-NEXT: [[TMP5:%.*]] = zext i32 [[TMP0]] to i64 @@ -210,8 +208,7 @@ define void @test_sdiv_both_invariant_nonconst(ptr noalias %a, i64 %b, i64 %b2, ; IF-EVL-NEXT: [[TMP1:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) ; IF-EVL-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[A]], i64 [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP2]], splat (i1 true), i32 [[TMP1]]) -; IF-EVL-NEXT: [[TMP7:%.*]] = call @llvm.vp.merge.nxv2i64( splat (i1 true), [[BROADCAST_SPLAT2]], splat (i64 1), i32 [[TMP1]]) -; IF-EVL-NEXT: [[TMP8:%.*]] = sdiv [[BROADCAST_SPLAT]], [[TMP7]] +; IF-EVL-NEXT: [[TMP8:%.*]] = call @llvm.vp.sdiv.nxv2i64( [[BROADCAST_SPLAT]], [[BROADCAST_SPLAT2]], splat (i1 true), i32 [[TMP1]]) ; IF-EVL-NEXT: [[TMP3:%.*]] = add [[VP_OP_LOAD]], [[TMP8]] ; IF-EVL-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[C]], i64 [[EVL_BASED_IV]] ; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[TMP3]], ptr align 8 [[TMP4]], splat (i1 true), i32 [[TMP1]]) @@ -303,8 +300,7 @@ define void @test_sdiv_divisor_invariant_minusone(ptr noalias %a, ptr noalias %c ; 
IF-EVL-NEXT: [[TMP0:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[AVL]], i32 2, i1 true) ; IF-EVL-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[A]], i64 [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP1]], splat (i1 true), i32 [[TMP0]]) -; IF-EVL-NEXT: [[TMP2:%.*]] = call @llvm.vp.merge.nxv2i64( splat (i1 true), splat (i64 -1), splat (i64 1), i32 [[TMP0]]) -; IF-EVL-NEXT: [[TMP3:%.*]] = sdiv [[VP_OP_LOAD]], [[TMP2]] +; IF-EVL-NEXT: [[TMP3:%.*]] = call @llvm.vp.sdiv.nxv2i64( [[VP_OP_LOAD]], splat (i64 -1), splat (i1 true), i32 [[TMP0]]) ; IF-EVL-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[C]], i64 [[EVL_BASED_IV]] ; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[TMP3]], ptr align 8 [[TMP4]], splat (i1 true), i32 [[TMP0]]) ; IF-EVL-NEXT: [[TMP5:%.*]] = zext i32 [[TMP0]] to i64 @@ -478,8 +474,7 @@ define void @test_udiv(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP7]], splat (i1 true), i32 [[TMP5]]) ; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[B]], i64 [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[VP_OP_LOAD1:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP9]], splat (i1 true), i32 [[TMP5]]) -; IF-EVL-NEXT: [[TMP11:%.*]] = call @llvm.vp.merge.nxv2i64( splat (i1 true), [[VP_OP_LOAD1]], splat (i64 1), i32 [[TMP5]]) -; IF-EVL-NEXT: [[VP_OP:%.*]] = udiv [[VP_OP_LOAD]], [[TMP11]] +; IF-EVL-NEXT: [[VP_OP:%.*]] = call @llvm.vp.udiv.nxv2i64( [[VP_OP_LOAD]], [[VP_OP_LOAD1]], splat (i1 true), i32 [[TMP5]]) ; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[C]], i64 [[EVL_BASED_IV]] ; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[VP_OP]], ptr align 8 [[TMP12]], splat (i1 true), i32 [[TMP5]]) ; IF-EVL-NEXT: [[TMP14:%.*]] = zext i32 [[TMP5]] to i64 @@ -573,8 +568,7 @@ define void @test_srem(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call 
@llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP7]], splat (i1 true), i32 [[TMP5]]) ; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[B]], i64 [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[VP_OP_LOAD1:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP9]], splat (i1 true), i32 [[TMP5]]) -; IF-EVL-NEXT: [[TMP11:%.*]] = call @llvm.vp.merge.nxv2i64( splat (i1 true), [[VP_OP_LOAD1]], splat (i64 1), i32 [[TMP5]]) -; IF-EVL-NEXT: [[VP_OP:%.*]] = srem [[VP_OP_LOAD]], [[TMP11]] +; IF-EVL-NEXT: [[VP_OP:%.*]] = call @llvm.vp.srem.nxv2i64( [[VP_OP_LOAD]], [[VP_OP_LOAD1]], splat (i1 true), i32 [[TMP5]]) ; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[C]], i64 [[EVL_BASED_IV]] ; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[VP_OP]], ptr align 8 [[TMP12]], splat (i1 true), i32 [[TMP5]]) ; IF-EVL-NEXT: [[TMP14:%.*]] = zext i32 [[TMP5]] to i64 @@ -668,8 +662,7 @@ define void @test_urem(ptr noalias %a, ptr noalias %b, ptr noalias %c) { ; IF-EVL-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP7]], splat (i1 true), i32 [[TMP5]]) ; IF-EVL-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[B]], i64 [[EVL_BASED_IV]] ; IF-EVL-NEXT: [[VP_OP_LOAD1:%.*]] = call @llvm.vp.load.nxv2i64.p0(ptr align 8 [[TMP9]], splat (i1 true), i32 [[TMP5]]) -; IF-EVL-NEXT: [[TMP11:%.*]] = call @llvm.vp.merge.nxv2i64( splat (i1 true), [[VP_OP_LOAD1]], splat (i64 1), i32 [[TMP5]]) -; IF-EVL-NEXT: [[VP_OP:%.*]] = urem [[VP_OP_LOAD]], [[TMP11]] +; IF-EVL-NEXT: [[VP_OP:%.*]] = call @llvm.vp.urem.nxv2i64( [[VP_OP_LOAD]], [[VP_OP_LOAD1]], splat (i1 true), i32 [[TMP5]]) ; IF-EVL-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[C]], i64 [[EVL_BASED_IV]] ; IF-EVL-NEXT: call void @llvm.vp.store.nxv2i64.p0( [[VP_OP]], ptr align 8 [[TMP12]], splat (i1 true), i32 [[TMP5]]) ; IF-EVL-NEXT: [[TMP14:%.*]] = zext i32 [[TMP5]] to i64 diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/first-order-recurrence-sink-replicate-region.ll 
b/llvm/test/Transforms/LoopVectorize/VPlan/first-order-recurrence-sink-replicate-region.ll index c47f884236d0e..df602c3aa19a1 100644 --- a/llvm/test/Transforms/LoopVectorize/VPlan/first-order-recurrence-sink-replicate-region.ll +++ b/llvm/test/Transforms/LoopVectorize/VPlan/first-order-recurrence-sink-replicate-region.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6 -; RUN: opt < %s -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -force-widen-divrem-via-safe-divisor=0 -disable-output -vplan-print-after=printOptimizedVPlan 2>&1 | FileCheck %s +; RUN: opt < %s -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -force-widen-divrem-via-masked-intrinsic=0 -disable-output -vplan-print-after=printOptimizedVPlan 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing.ll index d9eaab8d9a000..8160844d10b82 100644 --- a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-printing.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6 -; RUN: opt -passes=loop-vectorize -vplan-print-after=printOptimizedVPlan -force-vector-interleave=1 -force-vector-width=4 -enable-interleaved-mem-accesses=true -enable-masked-interleaved-mem-accesses -force-widen-divrem-via-safe-divisor=0 -disable-output %s 2>&1 | FileCheck --strict-whitespace %s +; RUN: opt -passes=loop-vectorize -vplan-print-after=printOptimizedVPlan -force-vector-interleave=1 -force-vector-width=4 -enable-interleaved-mem-accesses=true -enable-masked-interleaved-mem-accesses -force-widen-divrem-via-masked-intrinsic=0 -disable-output %s 2>&1 | FileCheck --strict-whitespace %s target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" diff --git a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-sink-scalars-and-merge.ll b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-sink-scalars-and-merge.ll index bc1a557efb205..9cfdc992f0457 100644 --- a/llvm/test/Transforms/LoopVectorize/VPlan/vplan-sink-scalars-and-merge.ll +++ b/llvm/test/Transforms/LoopVectorize/VPlan/vplan-sink-scalars-and-merge.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 6 -; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -force-widen-divrem-via-safe-divisor=0 -vplan-print-after=printOptimizedVPlan -disable-output %s 2>&1 | FileCheck %s +; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -force-widen-divrem-via-masked-intrinsic=0 -vplan-print-after=printOptimizedVPlan -disable-output %s 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" diff --git a/llvm/test/Transforms/LoopVectorize/cse-replicate-regions.ll b/llvm/test/Transforms/LoopVectorize/cse-replicate-regions.ll index 55789f84b790a..bcaa78386f5f4 100644 --- a/llvm/test/Transforms/LoopVectorize/cse-replicate-regions.ll +++ b/llvm/test/Transforms/LoopVectorize/cse-replicate-regions.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 -; RUN: opt -p loop-vectorize -force-vector-width=2 -force-widen-divrem-via-safe-divisor=false -S %s | FileCheck %s +; RUN: opt -p loop-vectorize -force-vector-width=2 -force-widen-divrem-via-masked-intrinsic=false -S %s | FileCheck %s define void @multiple_vppredinstphi_with_same_predicate(ptr %A, i32 %d) { ; CHECK-LABEL: define void 
@multiple_vppredinstphi_with_same_predicate( diff --git a/llvm/test/Transforms/LoopVectorize/div-exact.ll b/llvm/test/Transforms/LoopVectorize/div-exact.ll new file mode 100644 index 0000000000000..98e04b7dece92 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/div-exact.ll @@ -0,0 +1,252 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 +; RUN: opt -S < %s -p loop-vectorize -force-vector-width=4 | FileCheck %s + +define void @unknown_divisor(ptr noalias %p, i32 %n, i1 %c) { +; CHECK-LABEL: define void @unknown_divisor( +; CHECK-SAME: ptr noalias [[P:%.*]], i32 [[N:%.*]], i1 [[C:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] +; CHECK: [[VECTOR_SCEVCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i32 [[TMP0]], 0 +; CHECK-NEXT: br i1 [[TMP1]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE14:.*]] ] +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 3 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[P]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P]], i32 [[TMP2]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[P]], i32 [[TMP3]] +; 
CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[P]], i32 [[TMP4]] +; CHECK-NEXT: br i1 [[C]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP7]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> poison, i32 [[TMP12]], i64 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP14:%.*]] = phi <4 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP13]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: br i1 [[C]], label %[[PRED_LOAD_IF3:.*]], label %[[PRED_LOAD_CONTINUE4:.*]] +; CHECK: [[PRED_LOAD_IF3]]: +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP8]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP16]], i64 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE4]] +; CHECK: [[PRED_LOAD_CONTINUE4]]: +; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP14]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], %[[PRED_LOAD_IF3]] ] +; CHECK-NEXT: br i1 [[C]], label %[[PRED_LOAD_IF5:.*]], label %[[PRED_LOAD_CONTINUE6:.*]] +; CHECK: [[PRED_LOAD_IF5]]: +; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP9]], align 4 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP20]], i64 2 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE6]] +; CHECK: [[PRED_LOAD_CONTINUE6]]: +; CHECK-NEXT: [[TMP22:%.*]] = phi <4 x i32> [ [[TMP18]], %[[PRED_LOAD_CONTINUE4]] ], [ [[TMP21]], %[[PRED_LOAD_IF5]] ] +; CHECK-NEXT: br i1 [[C]], label %[[PRED_LOAD_IF7:.*]], label %[[PRED_LOAD_CONTINUE8:.*]] +; CHECK: [[PRED_LOAD_IF7]]: +; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP10]], align 4 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> [[TMP22]], i32 [[TMP24]], i64 3 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE8]] +; CHECK: [[PRED_LOAD_CONTINUE8]]: +; CHECK-NEXT: [[TMP23:%.*]] = phi <4 x i32> [ [[TMP22]], %[[PRED_LOAD_CONTINUE6]] ], [ [[TMP25]], %[[PRED_LOAD_IF7]] ] +; CHECK-NEXT: [[TMP28:%.*]] = call <4 x i32> 
@llvm.masked.udiv.v4i32(<4 x i32> splat (i32 100), <4 x i32> [[TMP23]], <4 x i1> [[BROADCAST_SPLAT]]) +; CHECK-NEXT: br i1 [[C]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; CHECK: [[PRED_STORE_IF]]: +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i32> [[TMP28]], i64 0 +; CHECK-NEXT: store i32 [[TMP29]], ptr [[TMP7]], align 4 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] +; CHECK: [[PRED_STORE_CONTINUE]]: +; CHECK-NEXT: br i1 [[C]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] +; CHECK: [[PRED_STORE_IF9]]: +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i32> [[TMP28]], i64 1 +; CHECK-NEXT: store i32 [[TMP30]], ptr [[TMP8]], align 4 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE10]] +; CHECK: [[PRED_STORE_CONTINUE10]]: +; CHECK-NEXT: br i1 [[C]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]] +; CHECK: [[PRED_STORE_IF11]]: +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[TMP28]], i64 2 +; CHECK-NEXT: store i32 [[TMP31]], ptr [[TMP9]], align 4 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE12]] +; CHECK: [[PRED_STORE_CONTINUE12]]: +; CHECK-NEXT: br i1 [[C]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14]] +; CHECK: [[PRED_STORE_IF13]]: +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x i32> [[TMP28]], i64 3 +; CHECK-NEXT: store i32 [[TMP32]], ptr [[TMP10]], align 4 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE14]] +; CHECK: [[PRED_STORE_CONTINUE14]]: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP33:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP33]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, 
%[[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ] +; CHECK-NEXT: br i1 [[C]], label %[[IF:.*]], label %[[LATCH]] +; CHECK: [[IF]]: +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[GEP]], align 4 +; CHECK-NEXT: [[Y:%.*]] = udiv exact i32 100, [[X]] +; CHECK-NEXT: store i32 [[Y]], ptr [[GEP]], align 4 +; CHECK-NEXT: br label %[[LATCH]] +; CHECK: [[LATCH]]: +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [0, %entry], [%iv.next, %latch] + br i1 %c, label %if, label %latch + +if: + %gep = getelementptr i32, ptr %p, i32 %iv + %x = load i32, ptr %gep + %y = udiv exact i32 100, %x + store i32 %y, ptr %gep + br label %latch + +latch: + %iv.next = add i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} + +define void @unknown_dividend(ptr noalias %p, i32 %n, i1 %c) { +; CHECK-LABEL: define void @unknown_dividend( +; CHECK-SAME: ptr noalias [[P:%.*]], i32 [[N:%.*]], i1 [[C:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]] +; CHECK: [[VECTOR_SCEVCHECK]]: +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 +; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i32 [[TMP0]], 0 +; CHECK-NEXT: br i1 [[TMP1]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: 
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE14:.*]] ] +; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 3 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[P]], i32 [[INDEX]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P]], i32 [[TMP2]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[P]], i32 [[TMP3]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[P]], i32 [[TMP4]] +; CHECK-NEXT: br i1 [[C]], label %[[PRED_LOAD_IF:.*]], label %[[PRED_LOAD_CONTINUE:.*]] +; CHECK: [[PRED_LOAD_IF]]: +; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP7]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> poison, i32 [[TMP12]], i64 0 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE]] +; CHECK: [[PRED_LOAD_CONTINUE]]: +; CHECK-NEXT: [[TMP14:%.*]] = phi <4 x i32> [ poison, %[[VECTOR_BODY]] ], [ [[TMP13]], %[[PRED_LOAD_IF]] ] +; CHECK-NEXT: br i1 [[C]], label %[[PRED_LOAD_IF3:.*]], label %[[PRED_LOAD_CONTINUE4:.*]] +; CHECK: [[PRED_LOAD_IF3]]: +; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP8]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP14]], i32 [[TMP16]], i64 1 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE4]] +; CHECK: [[PRED_LOAD_CONTINUE4]]: +; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP14]], %[[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], %[[PRED_LOAD_IF3]] ] +; CHECK-NEXT: br i1 [[C]], label %[[PRED_LOAD_IF5:.*]], label %[[PRED_LOAD_CONTINUE6:.*]] +; CHECK: [[PRED_LOAD_IF5]]: +; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP9]], align 4 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP20]], i64 2 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE6]] +; CHECK: [[PRED_LOAD_CONTINUE6]]: +; CHECK-NEXT: [[TMP22:%.*]] = phi <4 x i32> [ [[TMP18]], %[[PRED_LOAD_CONTINUE4]] ], [ [[TMP21]], %[[PRED_LOAD_IF5]] ] +; CHECK-NEXT: br i1 [[C]], label 
%[[PRED_LOAD_IF7:.*]], label %[[PRED_LOAD_CONTINUE8:.*]] +; CHECK: [[PRED_LOAD_IF7]]: +; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[TMP10]], align 4 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x i32> [[TMP22]], i32 [[TMP24]], i64 3 +; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE8]] +; CHECK: [[PRED_LOAD_CONTINUE8]]: +; CHECK-NEXT: [[TMP23:%.*]] = phi <4 x i32> [ [[TMP22]], %[[PRED_LOAD_CONTINUE6]] ], [ [[TMP25]], %[[PRED_LOAD_IF7]] ] +; CHECK-NEXT: [[TMP27:%.*]] = udiv exact <4 x i32> [[TMP23]], splat (i32 100) +; CHECK-NEXT: br i1 [[C]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] +; CHECK: [[PRED_STORE_IF]]: +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x i32> [[TMP27]], i64 0 +; CHECK-NEXT: store i32 [[TMP28]], ptr [[TMP7]], align 4 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] +; CHECK: [[PRED_STORE_CONTINUE]]: +; CHECK-NEXT: br i1 [[C]], label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]] +; CHECK: [[PRED_STORE_IF9]]: +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <4 x i32> [[TMP27]], i64 1 +; CHECK-NEXT: store i32 [[TMP29]], ptr [[TMP8]], align 4 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE10]] +; CHECK: [[PRED_STORE_CONTINUE10]]: +; CHECK-NEXT: br i1 [[C]], label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]] +; CHECK: [[PRED_STORE_IF11]]: +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i32> [[TMP27]], i64 2 +; CHECK-NEXT: store i32 [[TMP30]], ptr [[TMP9]], align 4 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE12]] +; CHECK: [[PRED_STORE_CONTINUE12]]: +; CHECK-NEXT: br i1 [[C]], label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14]] +; CHECK: [[PRED_STORE_IF13]]: +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i32> [[TMP27]], i64 3 +; CHECK-NEXT: store i32 [[TMP31]], ptr [[TMP10]], align 4 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE14]] +; CHECK: [[PRED_STORE_CONTINUE14]]: +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i32 
[[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP32]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LATCH:.*]] ] +; CHECK-NEXT: br i1 [[C]], label %[[IF:.*]], label %[[LATCH]] +; CHECK: [[IF]]: +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[P]], i32 [[IV]] +; CHECK-NEXT: [[X:%.*]] = load i32, ptr [[GEP]], align 4 +; CHECK-NEXT: [[Y:%.*]] = udiv exact i32 [[X]], 100 +; CHECK-NEXT: store i32 [[Y]], ptr [[GEP]], align 4 +; CHECK-NEXT: br label %[[LATCH]] +; CHECK: [[LATCH]]: +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [0, %entry], [%iv.next, %latch] + br i1 %c, label %if, label %latch + +if: + %gep = getelementptr i32, ptr %p, i32 %iv + %x = load i32, ptr %gep + %y = udiv exact i32 %x, 100 + store i32 %y, ptr %gep + br label %latch + +latch: + %iv.next = add i32 %iv, 1 + %ec = icmp eq i32 %iv.next, %n + br i1 %ec, label %exit, label %loop + +exit: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/find-last-iv-sinkable-expr-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/find-last-iv-sinkable-expr-tail-folding.ll index 9ddde3c012afe..425bda65b8377 100644 --- a/llvm/test/Transforms/LoopVectorize/find-last-iv-sinkable-expr-tail-folding.ll +++ 
b/llvm/test/Transforms/LoopVectorize/find-last-iv-sinkable-expr-tail-folding.ll @@ -459,8 +459,7 @@ define i64 @findlast_sdiv_iv_as_divisor(ptr %a, i64 %n) { ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr align 8 [[TMP3]], <4 x i1> [[TMP2]], <4 x i64> poison) ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i64> [[WIDE_MASKED_LOAD]], splat (i64 42) -; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> [[VEC_IND]], <4 x i64> splat (i64 1) -; CHECK-NEXT: [[TMP6:%.*]] = sdiv <4 x i64> splat (i64 100), [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.masked.sdiv.v4i64(<4 x i64> splat (i64 100), <4 x i64> [[VEC_IND]], <4 x i1> [[TMP2]]) ; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP2]], <4 x i1> [[TMP4]], <4 x i1> zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = freeze <4 x i1> [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP8]]) @@ -720,8 +719,7 @@ define i64 @findlast_udiv_may_trap_due_to_sentinel(ptr %a, i64 %n) { ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr align 8 [[TMP3]], <4 x i1> [[TMP2]], <4 x i64> poison) ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i64> [[WIDE_MASKED_LOAD]], splat (i64 42) -; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> [[VEC_IND]], <4 x i64> splat (i64 1) -; CHECK-NEXT: [[TMP6:%.*]] = udiv <4 x i64> splat (i64 100), [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.masked.udiv.v4i64(<4 x i64> splat (i64 100), <4 x i64> [[VEC_IND]], <4 x i1> [[TMP2]]) ; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP2]], <4 x i1> [[TMP4]], <4 x i1> zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = freeze <4 x i1> [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP8]]) @@ -783,8 +781,7 @@ define 
i64 @findlast_srem_iv_as_divisor(ptr %a, i64 %n) { ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr align 8 [[TMP3]], <4 x i1> [[TMP2]], <4 x i64> poison) ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <4 x i64> [[WIDE_MASKED_LOAD]], splat (i64 42) -; CHECK-NEXT: [[TMP5:%.*]] = select <4 x i1> [[TMP2]], <4 x i64> [[VEC_IND]], <4 x i64> splat (i64 1) -; CHECK-NEXT: [[TMP6:%.*]] = srem <4 x i64> splat (i64 100), [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.masked.srem.v4i64(<4 x i64> splat (i64 100), <4 x i64> [[VEC_IND]], <4 x i1> [[TMP2]]) ; CHECK-NEXT: [[TMP7:%.*]] = select <4 x i1> [[TMP2]], <4 x i1> [[TMP4]], <4 x i1> zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = freeze <4 x i1> [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP8]]) diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll index 315cd4d2d968e..d1e7cb044f790 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 -; RUN: opt < %s -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -force-widen-divrem-via-safe-divisor=0 -S | FileCheck %s --check-prefix=UNROLL-NO-IC -; RUN: opt < %s -passes=loop-vectorize -force-vector-width=1 -force-vector-interleave=2 -force-widen-divrem-via-safe-divisor=0 -S | FileCheck %s --check-prefix=UNROLL-NO-VF -; RUN: opt < %s -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -force-widen-divrem-via-safe-divisor=0 -S | FileCheck %s --check-prefix=SINK-AFTER +; RUN: opt < %s -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=2 
-force-widen-divrem-via-masked-intrinsic=0 -S | FileCheck %s --check-prefix=UNROLL-NO-IC +; RUN: opt < %s -passes=loop-vectorize -force-vector-width=1 -force-vector-interleave=2 -force-widen-divrem-via-masked-intrinsic=0 -S | FileCheck %s --check-prefix=UNROLL-NO-VF +; RUN: opt < %s -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -force-widen-divrem-via-masked-intrinsic=0 -S | FileCheck %s --check-prefix=SINK-AFTER target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll b/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll index 0beb458e84c85..007d00f33ba55 100644 --- a/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll +++ b/llvm/test/Transforms/LoopVectorize/if-pred-non-void.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6 -; RUN: opt -S -force-vector-width=2 -force-vector-interleave=1 -passes=loop-vectorize -verify-loop-info -force-widen-divrem-via-safe-divisor=0 < %s | FileCheck %s -; RUN: opt -S -force-vector-width=1 -force-vector-interleave=2 -passes=loop-vectorize -verify-loop-info -force-widen-divrem-via-safe-divisor=0 < %s | FileCheck %s --check-prefix=UNROLL-NO-VF +; RUN: opt -S -force-vector-width=2 -force-vector-interleave=1 -passes=loop-vectorize -verify-loop-info -force-widen-divrem-via-masked-intrinsic=0 < %s | FileCheck %s +; RUN: opt -S -force-vector-width=1 -force-vector-interleave=2 -passes=loop-vectorize -verify-loop-info -force-widen-divrem-via-masked-intrinsic=0 < %s | FileCheck %s --check-prefix=UNROLL-NO-VF target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll index f9886bb3c0033..7b80a460a8bea 100644 --- a/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll +++ b/llvm/test/Transforms/LoopVectorize/if-pred-stores.ll @@ -746,18 
+746,23 @@ define void @sdiv_with_uniform_ops(i16 %0, i1 %c, ptr %dst) { ; VEC-NEXT: [[ENTRY:.*:]] ; VEC-NEXT: br label %[[VECTOR_PH:.*]] ; VEC: [[VECTOR_PH]]: +; VEC-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0 +; VEC-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer +; VEC-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x i1> poison, i1 [[C]], i64 0 +; VEC-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT1]], <2 x i1> poison, <2 x i32> zeroinitializer +; VEC-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.masked.sdiv.v2i16(<2 x i16> splat (i16 10), <2 x i16> [[BROADCAST_SPLAT]], <2 x i1> [[BROADCAST_SPLAT2]]) ; VEC-NEXT: br label %[[VECTOR_BODY:.*]] ; VEC: [[VECTOR_BODY]]: ; VEC-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE2:.*]] ] ; VEC-NEXT: br i1 [[C]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; VEC: [[PRED_STORE_IF]]: -; VEC-NEXT: [[TMP1:%.*]] = sdiv i16 10, [[TMP0]] +; VEC-NEXT: [[TMP1:%.*]] = extractelement <2 x i16> [[TMP4]], i64 0 ; VEC-NEXT: store i16 [[TMP1]], ptr [[DST]], align 1 ; VEC-NEXT: br label %[[PRED_STORE_CONTINUE]] ; VEC: [[PRED_STORE_CONTINUE]]: -; VEC-NEXT: br i1 [[C]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2]] -; VEC: [[PRED_STORE_IF1]]: -; VEC-NEXT: [[TMP2:%.*]] = sdiv i16 10, [[TMP0]] +; VEC-NEXT: br i1 [[C]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE2]] +; VEC: [[PRED_STORE_IF3]]: +; VEC-NEXT: [[TMP2:%.*]] = extractelement <2 x i16> [[TMP4]], i64 1 ; VEC-NEXT: store i16 [[TMP2]], ptr [[DST]], align 1 ; VEC-NEXT: br label %[[PRED_STORE_CONTINUE2]] ; VEC: [[PRED_STORE_CONTINUE2]]: diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll index 3b44b99b1ddeb..94d6f3a27141b 100644 --- 
a/llvm/test/Transforms/LoopVectorize/induction.ll +++ b/llvm/test/Transforms/LoopVectorize/induction.ll @@ -1,9 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -force-widen-divrem-via-safe-divisor=0 -S | FileCheck %s -; RUN: opt < %s -passes=loop-vectorize,instcombine -force-vector-interleave=1 -force-vector-width=2 -force-widen-divrem-via-safe-divisor=0 -S | FileCheck %s --check-prefix=IND -; RUN: opt < %s -passes=loop-vectorize,instcombine -force-vector-interleave=2 -force-vector-width=2 -force-widen-divrem-via-safe-divisor=0 -S | FileCheck %s --check-prefix=UNROLL -; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=2 -force-vector-width=2 -force-widen-divrem-via-safe-divisor=0 -S | FileCheck %s --check-prefix=UNROLL-NO-IC -; RUN: opt < %s -passes=loop-vectorize,instcombine -force-vector-interleave=2 -force-vector-width=4 -force-widen-divrem-via-safe-divisor=0 -enable-interleaved-mem-accesses -S | FileCheck %s --check-prefix=INTERLEAVE +; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -force-widen-divrem-via-masked-intrinsic=0 -S | FileCheck %s +; RUN: opt < %s -passes=loop-vectorize,instcombine -force-vector-interleave=1 -force-vector-width=2 -force-widen-divrem-via-masked-intrinsic=0 -S | FileCheck %s --check-prefix=IND +; RUN: opt < %s -passes=loop-vectorize,instcombine -force-vector-interleave=2 -force-vector-width=2 -force-widen-divrem-via-masked-intrinsic=0 -S | FileCheck %s --check-prefix=UNROLL +; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=2 -force-vector-width=2 -force-widen-divrem-via-masked-intrinsic=0 -S | FileCheck %s --check-prefix=UNROLL-NO-IC +; RUN: opt < %s -passes=loop-vectorize,instcombine -force-vector-interleave=2 -force-vector-width=4 -force-widen-divrem-via-masked-intrinsic=0 -enable-interleaved-mem-accesses -S | FileCheck %s 
--check-prefix=INTERLEAVE target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" diff --git a/llvm/test/Transforms/LoopVectorize/load-deref-pred-poison-ub-ops-feeding-pointer.ll b/llvm/test/Transforms/LoopVectorize/load-deref-pred-poison-ub-ops-feeding-pointer.ll index d41049562154b..0f8967f11bf24 100644 --- a/llvm/test/Transforms/LoopVectorize/load-deref-pred-poison-ub-ops-feeding-pointer.ll +++ b/llvm/test/Transforms/LoopVectorize/load-deref-pred-poison-ub-ops-feeding-pointer.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "scalar.ph:" --version 5 -; RUN: opt -p loop-vectorize -force-widen-divrem-via-safe-divisor=false -force-vector-width=2 -S %s | FileCheck %s +; RUN: opt -p loop-vectorize -force-widen-divrem-via-masked-intrinsic=false -force-vector-width=2 -S %s | FileCheck %s target datalayout="p:16:16" diff --git a/llvm/test/Transforms/LoopVectorize/pr154045-dont-fold-extractelement-livein.ll b/llvm/test/Transforms/LoopVectorize/pr154045-dont-fold-extractelement-livein.ll index 5264bb59fafe3..3b43149a7f972 100644 --- a/llvm/test/Transforms/LoopVectorize/pr154045-dont-fold-extractelement-livein.ll +++ b/llvm/test/Transforms/LoopVectorize/pr154045-dont-fold-extractelement-livein.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5 -; RUN: opt -p loop-vectorize -force-vector-width=2 -force-widen-divrem-via-safe-divisor=false -S %s | FileCheck %s +; RUN: opt -p loop-vectorize -force-vector-width=2 -force-widen-divrem-via-masked-intrinsic=false -S %s | FileCheck %s ; Make sure we don't try to fold a Instruction::ExtractElement ir<0>, ir<0>, ; since we can't materialize the live-in for the vector operand. 
diff --git a/llvm/test/Transforms/LoopVectorize/pr44488-predication.ll b/llvm/test/Transforms/LoopVectorize/pr44488-predication.ll index d4897f8a28381..a0675f4a63612 100644 --- a/llvm/test/Transforms/LoopVectorize/pr44488-predication.ll +++ b/llvm/test/Transforms/LoopVectorize/pr44488-predication.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -force-vector-width=2 -force-vector-interleave=1 -force-widen-divrem-via-safe-divisor=0 -passes=loop-vectorize -S | FileCheck %s +; RUN: opt < %s -force-vector-width=2 -force-vector-interleave=1 -force-widen-divrem-via-masked-intrinsic=0 -passes=loop-vectorize -S | FileCheck %s ; Test case for PR44488. Checks that the correct predicates are created for ; branches where true and false successors are equal. See the checks involving diff --git a/llvm/test/Transforms/LoopVectorize/predicatedinst-loop-invariant.ll b/llvm/test/Transforms/LoopVectorize/predicatedinst-loop-invariant.ll index 8da9a611b8535..4c4b61441b409 100644 --- a/llvm/test/Transforms/LoopVectorize/predicatedinst-loop-invariant.ll +++ b/llvm/test/Transforms/LoopVectorize/predicatedinst-loop-invariant.ll @@ -113,8 +113,7 @@ define void @loop_invariant_srem(ptr %p, i64 %a, i8 %b) { ; CHECK-NEXT: [[PREDPHI:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> [[TMP3]], <4 x i32> [[TMP2]] ; CHECK-NEXT: [[TMP7:%.*]] = shl <4 x i32> [[PREDPHI]], splat (i32 8) ; CHECK-NEXT: [[TMP8:%.*]] = trunc <4 x i32> [[TMP7]] to <4 x i8> -; CHECK-NEXT: [[TMP9:%.*]] = select <4 x i1> [[TMP4]], <4 x i8> [[TMP8]], <4 x i8> splat (i8 1) -; CHECK-NEXT: [[TMP11:%.*]] = srem <4 x i8> [[VEC_IND1]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x i8> @llvm.masked.srem.v4i8(<4 x i8> [[VEC_IND1]], <4 x i8> [[TMP8]], <4 x i1> [[TMP4]]) ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP4]], i64 0 ; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; CHECK: [[PRED_STORE_IF]]: diff --git 
a/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll b/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll index 7272085801b1a..2cfe6bc8a2f06 100644 --- a/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll +++ b/llvm/test/Transforms/LoopVectorize/preserve-dbg-loc-and-loop-metadata.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -passes=loop-vectorize -force-vector-width=4 -force-widen-divrem-via-safe-divisor=0 -S 2>&1 | FileCheck %s -; RUN: opt < %s -passes=debugify,loop-vectorize -force-vector-width=4 -force-widen-divrem-via-safe-divisor=0 -S | FileCheck %s -check-prefix DEBUGLOC +; RUN: opt < %s -passes=loop-vectorize -force-vector-width=4 -force-widen-divrem-via-masked-intrinsic=0 -S 2>&1 | FileCheck %s +; RUN: opt < %s -passes=debugify,loop-vectorize -force-vector-width=4 -force-widen-divrem-via-masked-intrinsic=0 -S | FileCheck %s -check-prefix DEBUGLOC target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; This test makes sure we don't duplicate the loop vectorizer's metadata diff --git a/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll index d2f6468a95a1b..f3c72a8f9e9a9 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-small-size.ll @@ -11,9 +11,10 @@ define i8 @PR34687(i1 %c, i32 %x, i32 %n, i32 %divisor) { ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 4 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i64 0 ; 
CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[C:%.*]], <4 x i32> [[BROADCAST_SPLAT2]], <4 x i32> splat (i32 1) ; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <4 x i32> poison, i32 [[X1:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT3]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -30,7 +31,7 @@ define i8 @PR34687(i1 %c, i32 %x, i32 %n, i32 %divisor) { ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP6:%.*]] = sdiv <4 x i32> [[VEC_IND]], [[TMP9]] +; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.masked.sdiv.v4i32(<4 x i32> [[VEC_IND]], <4 x i32> [[BROADCAST_SPLAT2]], <4 x i1> [[BROADCAST_SPLAT]]) ; CHECK-NEXT: [[PREDPHI1:%.*]] = select i1 [[C]], <4 x i32> [[TMP6]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[TMP3]]) ; CHECK-NEXT: [[TMP8:%.*]] = zext i8 [[TMP7]] to i32 @@ -98,8 +99,9 @@ define i8 @PR34687_no_undef(i1 %c, i32 %x, i32 %n) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP0:%.*]] = select i1 [[C:%.*]], <4 x i32> [[BROADCAST_SPLAT2]], <4 x i32> splat (i32 1) -; CHECK-NEXT: [[TMP1:%.*]] = sdiv <4 x i32> splat (i32 99), [[TMP0]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <4 x i1> poison, i1 [[C:%.*]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT2]], <4 x i1> 
poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.masked.sdiv.v4i32(<4 x i32> splat (i32 99), <4 x i32> [[BROADCAST_SPLAT2]], <4 x i1> [[BROADCAST_SPLAT3]]) ; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[C]], <4 x i32> [[TMP1]], <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: diff --git a/llvm/test/Transforms/LoopVectorize/struct-return.ll b/llvm/test/Transforms/LoopVectorize/struct-return.ll index 48168743467f9..db3bce81cc063 100644 --- a/llvm/test/Transforms/LoopVectorize/struct-return.ll +++ b/llvm/test/Transforms/LoopVectorize/struct-return.ll @@ -76,7 +76,7 @@ define void @struct_return_f64_widen(ptr noalias %in, ptr noalias writeonly %out ; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[TMP5]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[EXIT]]: @@ -138,7 +138,7 @@ define void @struct_return_f32_widen_rt_checks(ptr %in, ptr writeonly %out_a, pt ; CHECK-NEXT: store <2 x float> [[TMP6]], ptr [[TMP8]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br [[EXIT:label %.*]] ; CHECK: [[SCALAR_PH]]: @@ -188,7 +188,7 @@ define void @test_overflow_intrinsic(ptr noalias readonly %in, ptr noalias write ; CHECK-NEXT: store <2 x i8> [[TMP4]], ptr [[ARRAYIDX4]], align 4 ; CHECK-NEXT: 
[[IV_NEXT]] = add nuw i64 [[IV]], 2 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[EXIT]]: @@ -235,7 +235,7 @@ define void @struct_return_i32_three_results_widen(ptr noalias %in, ptr noalias ; CHECK-NEXT: store <2 x i32> [[TMP2]], ptr [[TMP3]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP4]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[EXIT]]: @@ -271,7 +271,7 @@ define void @scalarized_predicated_struct_return(ptr %a) { ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE2:.*]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE4:.*]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = icmp sgt <2 x i64> [[WIDE_LOAD]], zeroinitializer @@ -279,27 +279,52 @@ define void @scalarized_predicated_struct_return(ptr %a) { ; CHECK-NEXT: br i1 [[TMP2]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]] ; CHECK: [[PRED_STORE_IF]]: ; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i64 0 -; CHECK-NEXT: [[TMP4:%.*]] = tail call { i64, i64 } @bar_i64(i64 [[TMP3]]) 
#[[ATTR2:[0-9]+]] -; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { i64, i64 } [[TMP4]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = udiv i64 [[TMP5]], [[TMP3]] -; CHECK-NEXT: store i64 [[TMP7]], ptr [[TMP0]], align 8 +; CHECK-NEXT: [[TMP10:%.*]] = tail call { i64, i64 } @bar_i64(i64 [[TMP3]]) #[[ATTR3:[0-9]+]] ; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE]] ; CHECK: [[PRED_STORE_CONTINUE]]: -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x i1> [[TMP1]], i64 1 -; CHECK-NEXT: br i1 [[TMP10]], label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2]] -; CHECK: [[PRED_STORE_IF1]]: +; CHECK-NEXT: [[TMP4:%.*]] = phi { i64, i64 } [ poison, %[[VECTOR_BODY]] ], [ [[TMP10]], %[[PRED_STORE_IF]] ] +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <2 x i1> [[TMP1]], i64 1 +; CHECK-NEXT: br i1 [[TMP29]], label %[[PRED_CALL_IF1:.*]], label %[[PRED_CALL_CONTINUE2:.*]] +; CHECK: [[PRED_CALL_IF1]]: ; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i64 1 -; CHECK-NEXT: [[TMP12:%.*]] = tail call { i64, i64 } @bar_i64(i64 [[TMP11]]) #[[ATTR2]] +; CHECK-NEXT: [[TMP24:%.*]] = tail call { i64, i64 } @bar_i64(i64 [[TMP11]]) #[[ATTR3]] +; CHECK-NEXT: br label %[[PRED_CALL_CONTINUE2]] +; CHECK: [[PRED_CALL_CONTINUE2]]: +; CHECK-NEXT: [[TMP12:%.*]] = phi { i64, i64 } [ poison, %[[PRED_STORE_CONTINUE]] ], [ [[TMP24]], %[[PRED_CALL_IF1]] ] +; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { i64, i64 } [[TMP4]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> poison, i64 [[TMP5]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertvalue { <2 x i64>, <2 x i64> } poison, <2 x i64> [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { i64, i64 } [[TMP4]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP7]], 1 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP8]], i64 0 +; CHECK-NEXT: [[TMP30:%.*]] = insertvalue { <2 x i64>, <2 x i64> } [[TMP7]], <2 x i64> [[TMP14]], 1 ; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { i64, i64 } [[TMP12]], 0 -; 
CHECK-NEXT: [[TMP15:%.*]] = udiv i64 [[TMP13]], [[TMP11]] +; CHECK-NEXT: [[TMP32:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP30]], 0 +; CHECK-NEXT: [[TMP33:%.*]] = insertelement <2 x i64> [[TMP32]], i64 [[TMP13]], i64 1 +; CHECK-NEXT: [[TMP19:%.*]] = insertvalue { <2 x i64>, <2 x i64> } [[TMP30]], <2 x i64> [[TMP33]], 0 +; CHECK-NEXT: [[TMP20:%.*]] = extractvalue { i64, i64 } [[TMP12]], 1 +; CHECK-NEXT: [[TMP21:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP19]], 1 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <2 x i64> [[TMP21]], i64 [[TMP20]], i64 1 +; CHECK-NEXT: [[TMP23:%.*]] = insertvalue { <2 x i64>, <2 x i64> } [[TMP19]], <2 x i64> [[TMP22]], 1 +; CHECK-NEXT: [[TMP25:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP23]], 0 +; CHECK-NEXT: [[TMP26:%.*]] = call <2 x i64> @llvm.masked.udiv.v2i64(<2 x i64> [[TMP25]], <2 x i64> [[WIDE_LOAD]], <2 x i1> [[TMP1]]) +; CHECK-NEXT: br i1 [[TMP2]], label %[[PRED_STORE_IF2:.*]], label %[[PRED_STORE_CONTINUE1:.*]] +; CHECK: [[PRED_STORE_IF2]]: +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <2 x i64> [[TMP26]], i64 0 +; CHECK-NEXT: store i64 [[TMP28]], ptr [[TMP0]], align 8 +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE1]] +; CHECK: [[PRED_STORE_CONTINUE1]]: +; CHECK-NEXT: br i1 [[TMP29]], label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4]] +; CHECK: [[PRED_STORE_IF3]]: ; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP16]] +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP26]], i64 1 ; CHECK-NEXT: store i64 [[TMP15]], ptr [[TMP17]], align 8 -; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE2]] -; CHECK: [[PRED_STORE_CONTINUE2]]: +; CHECK-NEXT: br label %[[PRED_STORE_CONTINUE4]] +; CHECK: [[PRED_STORE_CONTINUE4]]: ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], 
!llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP18]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[EXIT]]: @@ -342,7 +367,7 @@ define void @negative_struct_of_vectors(ptr noalias %in, ptr noalias writeonly % ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[IV]] ; CHECK-NEXT: [[IN_VAL:%.*]] = load <1 x float>, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CALL:%.*]] = tail call { <1 x float>, <1 x float> } @foo(<1 x float> [[IN_VAL]]) #[[ATTR1:[0-9]+]] +; CHECK-NEXT: [[CALL:%.*]] = tail call { <1 x float>, <1 x float> } @foo(<1 x float> [[IN_VAL]]) #[[ATTR2:[0-9]+]] ; CHECK-NEXT: [[EXTRACT_A:%.*]] = extractvalue { <1 x float>, <1 x float> } [[CALL]], 0 ; CHECK-NEXT: [[EXTRACT_B:%.*]] = extractvalue { <1 x float>, <1 x float> } [[CALL]], 1 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[OUT_A]], i64 [[IV]] @@ -398,7 +423,7 @@ define void @mixed_element_type_struct_return(ptr noalias %in, ptr noalias write ; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr [[ARRAYIDX4]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], 2 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: br label %[[EXIT:.*]] ; CHECK: [[EXIT]]: @@ -439,7 +464,7 @@ define void @negative_named_struct_return(ptr noalias readonly %in, ptr noalias ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[IN]], i64 [[IV]] ; CHECK-NEXT: [[IN_VAL:%.*]] = load 
double, ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[CALL:%.*]] = tail call [[NAMED_STRUCT:%.*]] @[[BAR_NAMED:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](double [[IN_VAL]]) #[[ATTR3:[0-9]+]] +; CHECK-NEXT: [[CALL:%.*]] = tail call [[NAMED_STRUCT:%.*]] @[[BAR_NAMED:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](double [[IN_VAL]]) #[[ATTR4:[0-9]+]] ; CHECK-NEXT: [[EXTRACT_A:%.*]] = extractvalue [[NAMED_STRUCT]] [[CALL]], 0 ; CHECK-NEXT: [[EXTRACT_B:%.*]] = extractvalue [[NAMED_STRUCT]] [[CALL]], 1 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[OUT_A]], i64 [[IV]] @@ -485,7 +510,7 @@ define void @negative_nested_struct(ptr noalias %in, ptr noalias writeonly %out_ ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[IV]] ; CHECK-NEXT: [[IN_VAL:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CALL:%.*]] = tail call { { float, float } } @foo_nested_struct(float [[IN_VAL]]) #[[ATTR1]] +; CHECK-NEXT: [[CALL:%.*]] = tail call { { float, float } } @foo_nested_struct(float [[IN_VAL]]) #[[ATTR2]] ; CHECK-NEXT: [[EXTRACT_INNER:%.*]] = extractvalue { { float, float } } [[CALL]], 0 ; CHECK-NEXT: [[EXTRACT_A:%.*]] = extractvalue { float, float } [[EXTRACT_INNER]], 0 ; CHECK-NEXT: [[EXTRACT_B:%.*]] = extractvalue { float, float } [[EXTRACT_INNER]], 1 @@ -533,7 +558,7 @@ define void @negative_non_widenable_element(ptr noalias %in, ptr noalias writeon ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[IV]] ; CHECK-NEXT: [[IN_VAL:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CALL:%.*]] = tail call { float, [1 x float] } @foo_one_non_widenable_element(float [[IN_VAL]]) #[[ATTR1]] +; CHECK-NEXT: [[CALL:%.*]] = tail call { float, [1 x float] } 
@foo_one_non_widenable_element(float [[IN_VAL]]) #[[ATTR2]] ; CHECK-NEXT: [[EXTRACT_A:%.*]] = extractvalue { float, [1 x float] } [[CALL]], 0 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[OUT_A]], i64 [[IV]] ; CHECK-NEXT: store float [[EXTRACT_A]], ptr [[ARRAYIDX2]], align 4 @@ -573,7 +598,7 @@ define void @negative_struct_array_elements(ptr noalias %in, ptr noalias writeon ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[IN]], i64 [[IV]] ; CHECK-NEXT: [[IN_VAL:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CALL:%.*]] = tail call { [2 x float] } @foo_arrays(float [[IN_VAL]]) #[[ATTR1]] +; CHECK-NEXT: [[CALL:%.*]] = tail call { [2 x float] } @foo_arrays(float [[IN_VAL]]) #[[ATTR2]] ; CHECK-NEXT: [[EXTRACT_INNER:%.*]] = extractvalue { [2 x float] } [[CALL]], 0 ; CHECK-NEXT: [[EXTRACT_A:%.*]] = extractvalue [2 x float] [[EXTRACT_INNER]], 0 ; CHECK-NEXT: [[EXTRACT_B:%.*]] = extractvalue [2 x float] [[EXTRACT_INNER]], 1 @@ -665,7 +690,7 @@ define void @negative_struct_return_store_struct(ptr noalias %in, ptr noalias wr ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds { float, float }, ptr [[IN]], i64 [[IV]] ; CHECK-NEXT: [[IN_VAL:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[CALL:%.*]] = tail call { float, float } @foo(float [[IN_VAL]]) #[[ATTR1]] +; CHECK-NEXT: [[CALL:%.*]] = tail call { float, float } @foo(float [[IN_VAL]]) #[[ATTR2]] ; CHECK-NEXT: [[OUT_PTR:%.*]] = getelementptr inbounds { float, float }, ptr [[OUT]], i64 [[IV]] ; CHECK-NEXT: store { float, float } [[CALL]], ptr [[OUT_PTR]], align 8 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-div.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-div.ll index 44d5bd45d511a..8767d44723e61 
100644 --- a/llvm/test/Transforms/LoopVectorize/tail-folding-div.ll +++ b/llvm/test/Transforms/LoopVectorize/tail-folding-div.ll @@ -17,8 +17,7 @@ define void @test_sdiv_variant_divisor_induction(ptr noalias %a, ptr noalias %c) ; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <2 x i16> [[VEC_IND2]], splat (i16 1024) ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[A]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr align 4 [[TMP4]], <2 x i1> [[TMP2]], <2 x i64> poison) -; CHECK-NEXT: [[VEC_IND:%.*]] = select <2 x i1> [[TMP2]], <2 x i64> [[VEC_IND1]], <2 x i64> splat (i64 1) -; CHECK-NEXT: [[TMP13:%.*]] = sdiv <2 x i64> [[TMP12]], [[VEC_IND]] +; CHECK-NEXT: [[TMP13:%.*]] = call <2 x i64> @llvm.masked.sdiv.v2i64(<2 x i64> [[TMP12]], <2 x i64> [[VEC_IND1]], <2 x i1> [[TMP2]]) ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i64, ptr [[C]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: call void @llvm.masked.store.v2i64.p0(<2 x i64> [[TMP13]], ptr align 4 [[TMP15]], <2 x i1> [[TMP2]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 @@ -65,8 +64,7 @@ define void @test_sdiv_variant_divisor_load(ptr noalias %a, ptr noalias %b, ptr ; CHECK-NEXT: [[TMP19:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr align 4 [[TMP13]], <2 x i1> [[TMP2]], <2 x i64> poison) ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i64, ptr [[B]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP20:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr align 4 [[TMP16]], <2 x i1> [[TMP2]], <2 x i64> poison) -; CHECK-NEXT: [[TMP21:%.*]] = select <2 x i1> [[TMP2]], <2 x i64> [[TMP20]], <2 x i64> splat (i64 1) -; CHECK-NEXT: [[TMP22:%.*]] = sdiv <2 x i64> [[TMP19]], [[TMP21]] +; CHECK-NEXT: [[TMP22:%.*]] = call <2 x i64> @llvm.masked.sdiv.v2i64(<2 x i64> [[TMP19]], <2 x i64> [[TMP20]], <2 x i1> [[TMP2]]) ; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i64, ptr [[C]], i64 [[TMP1]] ; CHECK-NEXT: call void @llvm.masked.store.v2i64.p0(<2 x i64> [[TMP22]], ptr align 4 [[TMP27]], <2 x i1> [[TMP2]]) 
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP1]], 2 @@ -114,8 +112,7 @@ define void @test_sdiv_invariant_divisor_nonconst(ptr noalias %a, i64 %b, ptr no ; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <2 x i16> [[VEC_IND]], splat (i16 1024) ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[A]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr align 4 [[TMP9]], <2 x i1> [[TMP2]], <2 x i64> poison) -; CHECK-NEXT: [[TMP13:%.*]] = select <2 x i1> [[TMP2]], <2 x i64> [[BROADCAST_SPLAT]], <2 x i64> splat (i64 1) -; CHECK-NEXT: [[TMP14:%.*]] = sdiv <2 x i64> [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP14:%.*]] = call <2 x i64> @llvm.masked.sdiv.v2i64(<2 x i64> [[TMP12]], <2 x i64> [[BROADCAST_SPLAT]], <2 x i1> [[TMP2]]) ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i64, ptr [[C]], i64 [[TMP1]] ; CHECK-NEXT: call void @llvm.masked.store.v2i64.p0(<2 x i64> [[TMP14]], ptr align 4 [[TMP19]], <2 x i1> [[TMP2]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP1]], 2 @@ -159,8 +156,7 @@ define void @test_sdiv_invariant_divisor_minusone(ptr noalias %a, ptr noalias %c ; CHECK-NEXT: [[TMP2:%.*]] = icmp ule <2 x i16> [[VEC_IND]], splat (i16 1024) ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[A]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP12:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr align 4 [[TMP9]], <2 x i1> [[TMP2]], <2 x i64> poison) -; CHECK-NEXT: [[TMP13:%.*]] = select <2 x i1> [[TMP2]], <2 x i64> splat (i64 -1), <2 x i64> splat (i64 1) -; CHECK-NEXT: [[TMP14:%.*]] = sdiv <2 x i64> [[TMP12]], [[TMP13]] +; CHECK-NEXT: [[TMP14:%.*]] = call <2 x i64> @llvm.masked.sdiv.v2i64(<2 x i64> [[TMP12]], <2 x i64> splat (i64 -1), <2 x i1> [[TMP2]]) ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i64, ptr [[C]], i64 [[TMP1]] ; CHECK-NEXT: call void @llvm.masked.store.v2i64.p0(<2 x i64> [[TMP14]], ptr align 4 [[TMP19]], <2 x i1> [[TMP2]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP1]], 2 @@ -248,8 +244,7 @@ define void 
@test_sdiv_variant_dividend_induction(i64 %a, ptr noalias %c) { ; CHECK-NEXT: [[TMP7:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <2 x i64> [ , %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <2 x i64> [[VEC_IND]], splat (i64 1024) -; CHECK-NEXT: [[TMP11:%.*]] = select <2 x i1> [[TMP0]], <2 x i64> [[BROADCAST_SPLAT]], <2 x i64> splat (i64 1) -; CHECK-NEXT: [[TMP1:%.*]] = sdiv <2 x i64> [[VEC_IND]], [[TMP11]] +; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i64> @llvm.masked.sdiv.v2i64(<2 x i64> [[VEC_IND]], <2 x i64> [[BROADCAST_SPLAT]], <2 x i1> [[TMP0]]) ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[C]], i64 [[TMP7]] ; CHECK-NEXT: call void @llvm.masked.store.v2i64.p0(<2 x i64> [[TMP1]], ptr align 4 [[TMP8]], <2 x i1> [[TMP0]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[TMP7]], 2 @@ -295,8 +290,7 @@ define void @test_sdiv_both_invariant_nonconst(ptr noalias %a, i64 %b, i64 %b2, ; CHECK-NEXT: [[TMP3:%.*]] = icmp ule <2 x i16> [[VEC_IND]], splat (i16 1024) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[A]], i64 [[TMP2]] ; CHECK-NEXT: [[TMP22:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr align 4 [[TMP10]], <2 x i1> [[TMP3]], <2 x i64> poison) -; CHECK-NEXT: [[TMP13:%.*]] = select <2 x i1> [[TMP3]], <2 x i64> [[BROADCAST_SPLAT2]], <2 x i64> splat (i64 1) -; CHECK-NEXT: [[TMP23:%.*]] = sdiv <2 x i64> [[BROADCAST_SPLAT]], [[TMP13]] +; CHECK-NEXT: [[TMP23:%.*]] = call <2 x i64> @llvm.masked.sdiv.v2i64(<2 x i64> [[BROADCAST_SPLAT]], <2 x i64> [[BROADCAST_SPLAT2]], <2 x i1> [[TMP3]]) ; CHECK-NEXT: [[TMP14:%.*]] = add <2 x i64> [[TMP22]], [[TMP23]] ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i64, ptr [[C]], i64 [[TMP2]] ; CHECK-NEXT: call void @llvm.masked.store.v2i64.p0(<2 x i64> [[TMP14]], ptr align 4 [[TMP19]], <2 x i1> [[TMP3]]) diff --git a/llvm/test/Transforms/LoopVectorize/vector-to-scalar-cast.ll 
b/llvm/test/Transforms/LoopVectorize/vector-to-scalar-cast.ll index 333461e16bb14..d98fa2e3d91ce 100644 --- a/llvm/test/Transforms/LoopVectorize/vector-to-scalar-cast.ll +++ b/llvm/test/Transforms/LoopVectorize/vector-to-scalar-cast.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; The test was crashing earlier due to a vectorization attempt; vector-to-scalar cast ; is now forbidden in the legalizer, and the test isn't a vectorization candidate now. -; RUN: opt -S -force-widen-divrem-via-safe-divisor=false -force-vector-width=4 --passes=loop-vectorize < %s | FileCheck %s +; RUN: opt -S -force-widen-divrem-via-masked-intrinsic=false -force-vector-width=4 --passes=loop-vectorize < %s | FileCheck %s define void @vector_to_scalar_cast(ptr %out) { ; CHECK-LABEL: define void @vector_to_scalar_cast( From 21a8d850b8b712755f26143d36034d2daa9f47c0 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Thu, 14 May 2026 13:05:38 +0100 Subject: [PATCH 66/95] [lldb] Member initialise hardware breakpoint structures (#197127) These are declared in NativeRegisterContextDBReg so we should zero-init them there rather than have everyone memset them later. ppc64le has its own equivalent that I've made the same change to. 
--- .../Process/FreeBSD/NativeRegisterContextFreeBSD_arm64.cpp | 3 --- .../Plugins/Process/Linux/NativeRegisterContextLinux_arm.cpp | 2 -- .../Process/Linux/NativeRegisterContextLinux_arm64.cpp | 3 --- .../Process/Linux/NativeRegisterContextLinux_loongarch64.cpp | 3 --- .../Process/Linux/NativeRegisterContextLinux_ppc64le.cpp | 1 - .../Process/Linux/NativeRegisterContextLinux_ppc64le.h | 2 +- .../Plugins/Process/Utility/NativeRegisterContextDBReg.h | 4 ++-- 7 files changed, 3 insertions(+), 15 deletions(-) diff --git a/lldb/source/Plugins/Process/FreeBSD/NativeRegisterContextFreeBSD_arm64.cpp b/lldb/source/Plugins/Process/FreeBSD/NativeRegisterContextFreeBSD_arm64.cpp index f50b28e2ebd1d..74c55224fed49 100644 --- a/lldb/source/Plugins/Process/FreeBSD/NativeRegisterContextFreeBSD_arm64.cpp +++ b/lldb/source/Plugins/Process/FreeBSD/NativeRegisterContextFreeBSD_arm64.cpp @@ -59,9 +59,6 @@ NativeRegisterContextFreeBSD_arm64::NativeRegisterContextFreeBSD_arm64( g_register_flags_detector.UpdateRegisterInfo( GetRegisterInfoInterface().GetRegisterInfo(), GetRegisterInfoInterface().GetRegisterCount()); - - ::memset(&m_hwp_regs, 0, sizeof(m_hwp_regs)); - ::memset(&m_hbp_regs, 0, sizeof(m_hbp_regs)); } RegisterInfoPOSIX_arm64 & diff --git a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm.cpp b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm.cpp index c83cea2bbf5bd..21b6bb408caad 100644 --- a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm.cpp +++ b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm.cpp @@ -78,8 +78,6 @@ NativeRegisterContextLinux_arm::NativeRegisterContextLinux_arm( ::memset(&m_fpr, 0, sizeof(m_fpr)); ::memset(&m_tls, 0, sizeof(m_tls)); ::memset(&m_gpr_arm, 0, sizeof(m_gpr_arm)); - ::memset(&m_hwp_regs, 0, sizeof(m_hwp_regs)); - ::memset(&m_hbp_regs, 0, sizeof(m_hbp_regs)); // 16 is just a maximum value, query hardware for actual watchpoint count m_max_hwp_supported = 16; diff --git 
a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.cpp b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.cpp index 52e6458658d70..cae579e533523 100644 --- a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.cpp +++ b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_arm64.cpp @@ -214,9 +214,6 @@ NativeRegisterContextLinux_arm64::NativeRegisterContextLinux_arm64( GetRegisterInfoInterface().GetRegisterInfo(), GetRegisterInfoInterface().GetRegisterCount()); - ::memset(&m_hwp_regs, 0, sizeof(m_hwp_regs)); - ::memset(&m_hbp_regs, 0, sizeof(m_hbp_regs)); - // 16 is just a maximum value, query hardware for actual watchpoint count m_max_hwp_supported = 16; m_max_hbp_supported = 16; diff --git a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_loongarch64.cpp b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_loongarch64.cpp index c4841950f1e07..ed3d43ca48020 100644 --- a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_loongarch64.cpp +++ b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_loongarch64.cpp @@ -103,9 +103,6 @@ NativeRegisterContextLinux_loongarch64::NativeRegisterContextLinux_loongarch64( ::memset(&m_lsx, 0, sizeof(m_lsx)); ::memset(&m_lasx, 0, sizeof(m_lasx)); - ::memset(&m_hwp_regs, 0, sizeof(m_hwp_regs)); - ::memset(&m_hbp_regs, 0, sizeof(m_hbp_regs)); - // Refer to: // https://loongson.github.io/LoongArch-Documentation/LoongArch-Vol1-EN.html#control-and-status-registers-related-to-watchpoints // 14 is just a maximum value, query hardware for actual watchpoint count. 
diff --git a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_ppc64le.cpp b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_ppc64le.cpp index 0b8571a8d6704..f4b539bf2b4c3 100644 --- a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_ppc64le.cpp +++ b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_ppc64le.cpp @@ -142,7 +142,6 @@ NativeRegisterContextLinux_ppc64le::NativeRegisterContextLinux_ppc64le( ::memset(&m_fpr_ppc64le, 0, sizeof(m_fpr_ppc64le)); ::memset(&m_vmx_ppc64le, 0, sizeof(m_vmx_ppc64le)); ::memset(&m_vsx_ppc64le, 0, sizeof(m_vsx_ppc64le)); - ::memset(&m_hwp_regs, 0, sizeof(m_hwp_regs)); } uint32_t NativeRegisterContextLinux_ppc64le::GetRegisterSetCount() const { diff --git a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_ppc64le.h b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_ppc64le.h index 942c65fcd241d..4ca9d43a24926 100644 --- a/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_ppc64le.h +++ b/lldb/source/Plugins/Process/Linux/NativeRegisterContextLinux_ppc64le.h @@ -122,7 +122,7 @@ class NativeRegisterContextLinux_ppc64le : public NativeRegisterContextLinux { int mode; // Defines if watchpoint is read/write/access. 
}; - std::array m_hwp_regs; + std::array m_hwp_regs{}; // 16 is just a maximum value, query hardware for actual watchpoint count uint32_t m_max_hwp_supported = 16; diff --git a/lldb/source/Plugins/Process/Utility/NativeRegisterContextDBReg.h b/lldb/source/Plugins/Process/Utility/NativeRegisterContextDBReg.h index 721c15e1ee900..60d1bfa38dd01 100644 --- a/lldb/source/Plugins/Process/Utility/NativeRegisterContextDBReg.h +++ b/lldb/source/Plugins/Process/Utility/NativeRegisterContextDBReg.h @@ -65,8 +65,8 @@ class NativeRegisterContextDBReg }; protected: - std::array m_hbp_regs; // hardware breakpoints - std::array m_hwp_regs; // hardware watchpoints + std::array m_hbp_regs{}; // hardware breakpoints + std::array m_hwp_regs{}; // hardware watchpoints uint32_t m_max_hbp_supported; uint32_t m_max_hwp_supported; From 93715644a7ed9191cc037b04d1d9be83ea8b972d Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Thu, 14 May 2026 13:05:52 +0100 Subject: [PATCH 67/95] [libc][math] Fix exp10m1f(-0) in SKIP_ACCURATE_PASS mode (#197650) exp10m1f(-0) should return -0, just like expm1f does. But if you build with the LIBC_MATH_SKIP_ACCURATE_PASS flag, it accidentally returned +0, and failed the src.math.smoke.exp10m1f_test test. The check for -0 is normally done by EXP10M1F_EXCEPTS_LO, a list of cases that are misrounded by the calculation in the branch for small input values. In SKIP_ACCURATE_PASS, that list is omitted, trading off accuracy for code size. But the check for -0 went with them. The fix is to reinsert that in a `#else` clause, if the list isn't included. 
--- libc/src/__support/math/exp10m1f.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/libc/src/__support/math/exp10m1f.h b/libc/src/__support/math/exp10m1f.h index b423f24808ed0..d0edee49eedc4 100644 --- a/libc/src/__support/math/exp10m1f.h +++ b/libc/src/__support/math/exp10m1f.h @@ -132,6 +132,15 @@ LIBC_INLINE LIBC_CONSTEXPR float exp10m1f(float x) { #ifndef LIBC_MATH_HAS_SKIP_ACCURATE_PASS if (auto r = EXP10M1F_EXCEPTS_LO.lookup(x_u); LIBC_UNLIKELY(r.has_value())) return r.value(); +#else + // Even if we're not checking for the misrounded cases in this interval, we + // must still check for -0 as input and return -0 as output, rather than +0 + // as the code below would compute. + // + // We might as well check for both zeroes at once, in fact, since it's no + // slower. + if (LIBC_UNLIKELY(x_abs == 0)) + return x; #endif // !LIBC_MATH_HAS_SKIP_ACCURATE_PASS double dx = x; From 5366692aeefd1101cb8dfde7fb65c9565b476d96 Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Thu, 14 May 2026 14:11:56 +0200 Subject: [PATCH 68/95] [libc] Remove legacy SYS_socketcall fallbacks (#197189) This patch removes legacy SYS_socketcall fallback paths from all Linux socket entry points. The individual syscall entry points were [added](https://github.com/torvalds/linux/commit/9dea5dc921b5f4045a18c63eb92e84dc274d17eb) in linux 4.3 (on x86, other architectures have had them even sooner). Our policy is to support the lowest kernel version on https://kernel.org/, which is 5.10 as of this writing. This is motivated by the problems in [testing](https://github.com/llvm/llvm-project/pull/196903#pullrequestreview-4263553670) the fallback paths -- to make sure this even builds, one needs to get a hold of very old kernel headers, or otherwise hack its build to force it to select the fallback path. New ABIs don't have the accept syscall (only accept4), so I've added an accept->accept4 fallback. Assisted by Gemini. 
--- .../OSUtil/linux/syscall_wrappers/accept.h | 16 +++++----------- .../OSUtil/linux/syscall_wrappers/accept4.h | 16 +--------------- .../OSUtil/linux/syscall_wrappers/connect.h | 15 +-------------- .../OSUtil/linux/syscall_wrappers/getsockopt.h | 13 ------------- .../OSUtil/linux/syscall_wrappers/listen.h | 13 +------------ .../OSUtil/linux/syscall_wrappers/setsockopt.h | 13 ------------- .../OSUtil/linux/syscall_wrappers/shutdown.h | 13 +------------ libc/src/sys/socket/linux/bind.cpp | 14 +------------- libc/src/sys/socket/linux/recv.cpp | 16 ++++------------ libc/src/sys/socket/linux/recvfrom.cpp | 18 ++---------------- libc/src/sys/socket/linux/recvmsg.cpp | 14 +------------- libc/src/sys/socket/linux/send.cpp | 16 ++++------------ libc/src/sys/socket/linux/sendmsg.cpp | 14 +------------- libc/src/sys/socket/linux/sendto.cpp | 18 ++---------------- libc/src/sys/socket/linux/socket.cpp | 14 +------------- libc/src/sys/socket/linux/socketpair.cpp | 14 +------------- 16 files changed, 26 insertions(+), 211 deletions(-) diff --git a/libc/src/__support/OSUtil/linux/syscall_wrappers/accept.h b/libc/src/__support/OSUtil/linux/syscall_wrappers/accept.h index e661f4df1daab..b25f0fac1f071 100644 --- a/libc/src/__support/OSUtil/linux/syscall_wrappers/accept.h +++ b/libc/src/__support/OSUtil/linux/syscall_wrappers/accept.h @@ -16,7 +16,6 @@ #include "hdr/types/socklen_t.h" #include "hdr/types/struct_sockaddr.h" -#include // For SYS_ACCEPT socketcall number. 
#include // For syscall numbers namespace LIBC_NAMESPACE_DECL { @@ -24,17 +23,12 @@ namespace linux_syscalls { LIBC_INLINE ErrorOr accept(int sockfd, struct sockaddr *addr, socklen_t *addrlen) { -#ifdef SYS_accept - int ret = - LIBC_NAMESPACE::syscall_impl(SYS_accept, sockfd, addr, addrlen); -#elif defined(SYS_socketcall) - unsigned long sockcall_args[3] = {static_cast(sockfd), - reinterpret_cast(addr), - reinterpret_cast(addrlen)}; - int ret = LIBC_NAMESPACE::syscall_impl(SYS_socketcall, SYS_ACCEPT, - sockcall_args); +#if defined(SYS_accept) + int ret = syscall_impl(SYS_accept, sockfd, addr, addrlen); +#elif defined(SYS_accept4) + int ret = syscall_impl(SYS_accept4, sockfd, addr, addrlen, 0); #else -#error "accept and socketcall syscalls unavailable for this platform." +#error "accept and accept4 syscalls unavailable for this platform." #endif if (ret < 0) diff --git a/libc/src/__support/OSUtil/linux/syscall_wrappers/accept4.h b/libc/src/__support/OSUtil/linux/syscall_wrappers/accept4.h index ebe82f189af50..69c316fb72766 100644 --- a/libc/src/__support/OSUtil/linux/syscall_wrappers/accept4.h +++ b/libc/src/__support/OSUtil/linux/syscall_wrappers/accept4.h @@ -16,7 +16,6 @@ #include "hdr/types/socklen_t.h" #include "hdr/types/struct_sockaddr.h" -#include // For SYS_ACCEPT4 socketcall number. #include // For syscall numbers namespace LIBC_NAMESPACE_DECL { @@ -24,20 +23,7 @@ namespace linux_syscalls { LIBC_INLINE ErrorOr accept4(int sockfd, struct sockaddr *addr, socklen_t *addrlen, int flags) { -#ifdef SYS_accept4 - int ret = LIBC_NAMESPACE::syscall_impl(SYS_accept4, sockfd, addr, - addrlen, flags); -#elif defined(SYS_socketcall) - unsigned long sockcall_args[4] = {static_cast(sockfd), - reinterpret_cast(addr), - reinterpret_cast(addrlen), - static_cast(flags)}; - int ret = LIBC_NAMESPACE::syscall_impl(SYS_socketcall, SYS_ACCEPT4, - sockcall_args); -#else -#error "accept4 and socketcall syscalls unavailable for this platform." 
-#endif - + int ret = syscall_impl(SYS_accept4, sockfd, addr, addrlen, flags); if (ret < 0) return Error(-static_cast(ret)); return ret; diff --git a/libc/src/__support/OSUtil/linux/syscall_wrappers/connect.h b/libc/src/__support/OSUtil/linux/syscall_wrappers/connect.h index 6974546630b00..9ddc4674e4d9f 100644 --- a/libc/src/__support/OSUtil/linux/syscall_wrappers/connect.h +++ b/libc/src/__support/OSUtil/linux/syscall_wrappers/connect.h @@ -16,7 +16,6 @@ #include "hdr/types/socklen_t.h" #include "hdr/types/struct_sockaddr.h" -#include // For SYS_SOCKET socketcall number. #include // For syscall numbers namespace LIBC_NAMESPACE_DECL { @@ -24,19 +23,7 @@ namespace linux_syscalls { LIBC_INLINE ErrorOr connect(int sockfd, const struct sockaddr *addr, socklen_t addrlen) { -#ifdef SYS_connect - int ret = - LIBC_NAMESPACE::syscall_impl(SYS_connect, sockfd, addr, addrlen); -#elif defined(SYS_socketcall) - unsigned long sockcall_args[3] = {static_cast(sockfd), - reinterpret_cast(addr), - static_cast(addrlen)}; - int ret = LIBC_NAMESPACE::syscall_impl(SYS_socketcall, SYS_CONNECT, - sockcall_args); -#else -#error "socket and socketcall syscalls unavailable for this platform." -#endif - + int ret = syscall_impl(SYS_connect, sockfd, addr, addrlen); if (ret < 0) return Error(-static_cast(ret)); return ret; diff --git a/libc/src/__support/OSUtil/linux/syscall_wrappers/getsockopt.h b/libc/src/__support/OSUtil/linux/syscall_wrappers/getsockopt.h index 623ba58ebb2ed..9a0c0570ad2b7 100644 --- a/libc/src/__support/OSUtil/linux/syscall_wrappers/getsockopt.h +++ b/libc/src/__support/OSUtil/linux/syscall_wrappers/getsockopt.h @@ -15,7 +15,6 @@ #include "src/__support/macros/config.h" #include "hdr/types/socklen_t.h" -#include // For SYS_GETSOCKOPT socketcall number. 
#include // For syscall numbers namespace LIBC_NAMESPACE_DECL { @@ -23,20 +22,8 @@ namespace linux_syscalls { LIBC_INLINE ErrorOr getsockopt(int sockfd, int level, int optname, void *optval, socklen_t *optlen) { -#ifdef SYS_getsockopt int ret = syscall_impl(SYS_getsockopt, sockfd, level, optname, optval, optlen); -#elif defined(SYS_socketcall) - unsigned long sockcall_args[5] = {static_cast(sockfd), - static_cast(level), - static_cast(optname), - reinterpret_cast(optval), - reinterpret_cast(optlen)}; - int ret = syscall_impl(SYS_socketcall, SYS_GETSOCKOPT, sockcall_args); -#else -#error "getsockopt and socketcall syscalls unavailable for this platform." -#endif - if (ret < 0) return Error(-static_cast(ret)); return ret; diff --git a/libc/src/__support/OSUtil/linux/syscall_wrappers/listen.h b/libc/src/__support/OSUtil/linux/syscall_wrappers/listen.h index 9de54ce0a9a9f..e764c6b108432 100644 --- a/libc/src/__support/OSUtil/linux/syscall_wrappers/listen.h +++ b/libc/src/__support/OSUtil/linux/syscall_wrappers/listen.h @@ -14,24 +14,13 @@ #include "src/__support/error_or.h" #include "src/__support/macros/config.h" -#include // For SYS_LISTEN socketcall number. #include // For syscall numbers namespace LIBC_NAMESPACE_DECL { namespace linux_syscalls { LIBC_INLINE ErrorOr listen(int sockfd, int backlog) { -#ifdef SYS_listen - int ret = LIBC_NAMESPACE::syscall_impl(SYS_listen, sockfd, backlog); -#elif defined(SYS_socketcall) - unsigned long sockcall_args[2] = {static_cast(sockfd), - static_cast(backlog)}; - int ret = LIBC_NAMESPACE::syscall_impl(SYS_socketcall, SYS_LISTEN, - sockcall_args); -#else -#error "listen and socketcall syscalls unavailable for this platform." 
-#endif - + int ret = syscall_impl(SYS_listen, sockfd, backlog); if (ret < 0) return Error(-static_cast(ret)); return ret; diff --git a/libc/src/__support/OSUtil/linux/syscall_wrappers/setsockopt.h b/libc/src/__support/OSUtil/linux/syscall_wrappers/setsockopt.h index d16c397bba6ac..f1cfa8cd8e562 100644 --- a/libc/src/__support/OSUtil/linux/syscall_wrappers/setsockopt.h +++ b/libc/src/__support/OSUtil/linux/syscall_wrappers/setsockopt.h @@ -15,7 +15,6 @@ #include "src/__support/macros/config.h" #include "hdr/types/socklen_t.h" -#include // For SYS_SETSOCKOPT socketcall number. #include // For syscall numbers namespace LIBC_NAMESPACE_DECL { @@ -23,20 +22,8 @@ namespace linux_syscalls { LIBC_INLINE ErrorOr setsockopt(int sockfd, int level, int optname, const void *optval, socklen_t optlen) { -#ifdef SYS_setsockopt int ret = syscall_impl(SYS_setsockopt, sockfd, level, optname, optval, optlen); -#elif defined(SYS_socketcall) - unsigned long sockcall_args[5] = {static_cast(sockfd), - static_cast(level), - static_cast(optname), - reinterpret_cast(optval), - static_cast(optlen)}; - int ret = syscall_impl(SYS_socketcall, SYS_SETSOCKOPT, sockcall_args); -#else -#error "setsockopt and socketcall syscalls unavailable for this platform." -#endif - if (ret < 0) return Error(-static_cast(ret)); return ret; diff --git a/libc/src/__support/OSUtil/linux/syscall_wrappers/shutdown.h b/libc/src/__support/OSUtil/linux/syscall_wrappers/shutdown.h index 2a9e92364f637..156905a408f1a 100644 --- a/libc/src/__support/OSUtil/linux/syscall_wrappers/shutdown.h +++ b/libc/src/__support/OSUtil/linux/syscall_wrappers/shutdown.h @@ -14,24 +14,13 @@ #include "src/__support/error_or.h" #include "src/__support/macros/config.h" -#include // For SYS_SHUTDOWN socketcall number. 
#include // For syscall numbers namespace LIBC_NAMESPACE_DECL { namespace linux_syscalls { LIBC_INLINE ErrorOr shutdown(int sockfd, int how) { -#ifdef SYS_shutdown - int ret = LIBC_NAMESPACE::syscall_impl(SYS_shutdown, sockfd, how); -#elif defined(SYS_socketcall) - unsigned long sockcall_args[2] = {static_cast(sockfd), - static_cast(how)}; - int ret = LIBC_NAMESPACE::syscall_impl(SYS_socketcall, SYS_SHUTDOWN, - sockcall_args); -#else -#error "shutdown and socketcall syscalls unavailable for this platform." -#endif - + int ret = syscall_impl(SYS_shutdown, sockfd, how); if (ret < 0) return Error(-static_cast(ret)); return ret; diff --git a/libc/src/sys/socket/linux/bind.cpp b/libc/src/sys/socket/linux/bind.cpp index 83a3d06f5380b..1b0a868f6b127 100644 --- a/libc/src/sys/socket/linux/bind.cpp +++ b/libc/src/sys/socket/linux/bind.cpp @@ -14,7 +14,6 @@ #include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include // For SYS_SOCKET socketcall number. #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { @@ -22,18 +21,7 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, bind, (int socket, const struct sockaddr *address, socklen_t address_len)) { -#ifdef SYS_bind - int ret = - LIBC_NAMESPACE::syscall_impl(SYS_bind, socket, address, address_len); -#elif defined(SYS_socketcall) - unsigned long sockcall_args[3] = {static_cast(socket), - reinterpret_cast(address), - static_cast(address_len)}; - int ret = LIBC_NAMESPACE::syscall_impl(SYS_socketcall, SYS_BIND, - sockcall_args); -#else -#error "socket and socketcall syscalls unavailable for this platform." 
-#endif + int ret = syscall_impl(SYS_bind, socket, address, address_len); if (ret < 0) { libc_errno = -ret; return -1; diff --git a/libc/src/sys/socket/linux/recv.cpp b/libc/src/sys/socket/linux/recv.cpp index baf4de1b5eb54..b7b208d454ffe 100644 --- a/libc/src/sys/socket/linux/recv.cpp +++ b/libc/src/sys/socket/linux/recv.cpp @@ -8,7 +8,6 @@ #include "src/sys/socket/recv.h" -#include // For SYS_SOCKET socketcall number. #include // For syscall numbers. #include "hdr/types/socklen_t.h" @@ -24,19 +23,12 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(ssize_t, recv, (int sockfd, void *buf, size_t len, int flags)) { #ifdef SYS_recv - ssize_t ret = - LIBC_NAMESPACE::syscall_impl(SYS_recv, sockfd, buf, len, flags); + ssize_t ret = syscall_impl(SYS_recv, sockfd, buf, len, flags); #elif defined(SYS_recvfrom) - ssize_t ret = LIBC_NAMESPACE::syscall_impl( - SYS_recvfrom, sockfd, buf, len, flags, nullptr, nullptr); -#elif defined(SYS_socketcall) - unsigned long sockcall_args[4] = { - static_cast(sockfd), reinterpret_cast(buf), - static_cast(len), static_cast(flags)}; - ssize_t ret = LIBC_NAMESPACE::syscall_impl(SYS_socketcall, SYS_RECV, - sockcall_args); + ssize_t ret = syscall_impl(SYS_recvfrom, sockfd, buf, len, flags, + nullptr, nullptr); #else -#error "socket and socketcall syscalls unavailable for this platform." +#error "recv or recvfrom syscalls unavailable for this platform." #endif if (ret < 0) { libc_errno = static_cast(-ret); diff --git a/libc/src/sys/socket/linux/recvfrom.cpp b/libc/src/sys/socket/linux/recvfrom.cpp index 3d8397b478cc4..ff4d5494fbeff 100644 --- a/libc/src/sys/socket/linux/recvfrom.cpp +++ b/libc/src/sys/socket/linux/recvfrom.cpp @@ -8,7 +8,6 @@ #include "src/sys/socket/recvfrom.h" -#include // For SYS_SOCKET socketcall number. #include // For syscall numbers. 
#include "hdr/types/socklen_t.h" @@ -34,21 +33,8 @@ LLVM_LIBC_FUNCTION(ssize_t, recvfrom, srcaddr_sz = *addrlen; (void)srcaddr_sz; // prevent "set but not used" warning -#ifdef SYS_recvfrom - ssize_t ret = LIBC_NAMESPACE::syscall_impl( - SYS_recvfrom, sockfd, buf, len, flags, src_addr, addrlen); -#elif defined(SYS_socketcall) - unsigned long sockcall_args[6] = {static_cast(sockfd), - reinterpret_cast(buf), - static_cast(len), - static_cast(flags), - reinterpret_cast(src_addr), - static_cast(addrlen)}; - ssize_t ret = LIBC_NAMESPACE::syscall_impl( - SYS_socketcall, SYS_RECVFROM, sockcall_args); -#else -#error "socket and socketcall syscalls unavailable for this platform." -#endif + ssize_t ret = syscall_impl(SYS_recvfrom, sockfd, buf, len, flags, + src_addr, addrlen); if (ret < 0) { libc_errno = static_cast(-ret); return -1; diff --git a/libc/src/sys/socket/linux/recvmsg.cpp b/libc/src/sys/socket/linux/recvmsg.cpp index bc6d072dbf9a1..e7650c508115c 100644 --- a/libc/src/sys/socket/linux/recvmsg.cpp +++ b/libc/src/sys/socket/linux/recvmsg.cpp @@ -8,7 +8,6 @@ #include "src/sys/socket/recvmsg.h" -#include // For SYS_SOCKET socketcall number. #include // For syscall numbers. #include "hdr/types/ssize_t.h" @@ -21,18 +20,7 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(ssize_t, recvmsg, (int sockfd, msghdr *msg, int flags)) { -#ifdef SYS_recvmsg - ssize_t ret = - LIBC_NAMESPACE::syscall_impl(SYS_recvmsg, sockfd, msg, flags); -#elif defined(SYS_socketcall) - unsigned long sockcall_args[3] = {static_cast(sockfd), - reinterpret_cast(msg), - static_cast(flags)}; - ssize_t ret = LIBC_NAMESPACE::syscall_impl( - SYS_socketcall, SYS_RECVMSG, sockcall_args); -#else -#error "socket and socketcall syscalls unavailable for this platform." 
-#endif + ssize_t ret = syscall_impl(SYS_recvmsg, sockfd, msg, flags); if (ret < 0) { libc_errno = static_cast(-ret); return -1; diff --git a/libc/src/sys/socket/linux/send.cpp b/libc/src/sys/socket/linux/send.cpp index 43b01e7e6e0f6..7e63e9e716433 100644 --- a/libc/src/sys/socket/linux/send.cpp +++ b/libc/src/sys/socket/linux/send.cpp @@ -8,7 +8,6 @@ #include "src/sys/socket/send.h" -#include // For SYS_SOCKET socketcall number. #include // For syscall numbers. #include "hdr/types/socklen_t.h" @@ -23,19 +22,12 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(ssize_t, send, (int sockfd, const void *buf, size_t len, int flags)) { #ifdef SYS_send - ssize_t ret = - LIBC_NAMESPACE::syscall_impl(SYS_send, sockfd, buf, len, flags); + ssize_t ret = syscall_impl(SYS_send, sockfd, buf, len, flags); #elif defined(SYS_sendto) - ssize_t ret = LIBC_NAMESPACE::syscall_impl(SYS_sendto, sockfd, buf, - len, flags, nullptr, 0); -#elif defined(SYS_socketcall) - unsigned long sockcall_args[4] = { - static_cast(sockfd), reinterpret_cast(buf), - static_cast(len), static_cast(flags)}; - ssize_t ret = LIBC_NAMESPACE::syscall_impl(SYS_socketcall, SYS_SEND, - sockcall_args); + ssize_t ret = + syscall_impl(SYS_sendto, sockfd, buf, len, flags, nullptr, 0); #else -#error "socket and socketcall syscalls unavailable for this platform." +#error "send or sendto syscalls unavailable for this platform." #endif if (ret < 0) { libc_errno = static_cast(-ret); diff --git a/libc/src/sys/socket/linux/sendmsg.cpp b/libc/src/sys/socket/linux/sendmsg.cpp index b04783ebfe7e7..b4bbd7f78d433 100644 --- a/libc/src/sys/socket/linux/sendmsg.cpp +++ b/libc/src/sys/socket/linux/sendmsg.cpp @@ -8,7 +8,6 @@ #include "src/sys/socket/sendmsg.h" -#include // For SYS_SOCKET socketcall number. #include // For syscall numbers. 
#include "hdr/types/ssize_t.h" @@ -21,18 +20,7 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(ssize_t, sendmsg, (int sockfd, const struct msghdr *msg, int flags)) { -#ifdef SYS_sendmsg - ssize_t ret = - LIBC_NAMESPACE::syscall_impl(SYS_sendmsg, sockfd, msg, flags); -#elif defined(SYS_socketcall) - unsigned long sockcall_args[3] = {static_cast(sockfd), - reinterpret_cast(msg), - static_cast(flags)}; - ssize_t ret = LIBC_NAMESPACE::syscall_impl( - SYS_socketcall, SYS_SENDMSG, sockcall_args); -#else -#error "socket and socketcall syscalls unavailable for this platform." -#endif + ssize_t ret = syscall_impl(SYS_sendmsg, sockfd, msg, flags); if (ret < 0) { libc_errno = static_cast(-ret); return -1; diff --git a/libc/src/sys/socket/linux/sendto.cpp b/libc/src/sys/socket/linux/sendto.cpp index 9dda127f872d5..48450e22e9f48 100644 --- a/libc/src/sys/socket/linux/sendto.cpp +++ b/libc/src/sys/socket/linux/sendto.cpp @@ -8,7 +8,6 @@ #include "src/sys/socket/sendto.h" -#include // For SYS_SOCKET socketcall number. #include // For syscall numbers. #include "hdr/types/socklen_t.h" @@ -23,21 +22,8 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(ssize_t, sendto, (int sockfd, const void *buf, size_t len, int flags, const struct sockaddr *dest_addr, socklen_t addrlen)) { -#ifdef SYS_sendto - ssize_t ret = LIBC_NAMESPACE::syscall_impl( - SYS_sendto, sockfd, buf, len, flags, dest_addr, addrlen); -#elif defined(SYS_socketcall) - unsigned long sockcall_args[6] = {static_cast(sockfd), - reinterpret_cast(buf), - static_cast(len), - static_cast(flags), - reinterpret_cast(dest_addr), - static_cast(addrlen)}; - ssize_t ret = LIBC_NAMESPACE::syscall_impl( - SYS_socketcall, SYS_SENDTO, sockcall_args); -#else -#error "socket and socketcall syscalls unavailable for this platform." 
-#endif + ssize_t ret = syscall_impl(SYS_sendto, sockfd, buf, len, flags, + dest_addr, addrlen); if (ret < 0) { libc_errno = static_cast(-ret); return -1; diff --git a/libc/src/sys/socket/linux/socket.cpp b/libc/src/sys/socket/linux/socket.cpp index 69eb6cfa01ced..a2da75a0d7be0 100644 --- a/libc/src/sys/socket/linux/socket.cpp +++ b/libc/src/sys/socket/linux/socket.cpp @@ -14,24 +14,12 @@ #include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include // For SYS_SOCKET socketcall number. #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, socket, (int domain, int type, int protocol)) { -#ifdef SYS_socket - int ret = - LIBC_NAMESPACE::syscall_impl(SYS_socket, domain, type, protocol); -#elif defined(SYS_socketcall) - unsigned long sockcall_args[3] = {static_cast(domain), - static_cast(type), - static_cast(protocol)}; - int ret = LIBC_NAMESPACE::syscall_impl(SYS_socketcall, SYS_SOCKET, - sockcall_args); -#else -#error "socket and socketcall syscalls unavailable for this platform." -#endif + int ret = syscall_impl(SYS_socket, domain, type, protocol); if (ret < 0) { libc_errno = -ret; return -1; diff --git a/libc/src/sys/socket/linux/socketpair.cpp b/libc/src/sys/socket/linux/socketpair.cpp index 7ea8ca46cee58..a17850a3468fa 100644 --- a/libc/src/sys/socket/linux/socketpair.cpp +++ b/libc/src/sys/socket/linux/socketpair.cpp @@ -13,25 +13,13 @@ #include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/macros/sanitizer.h" -#include // For SYS_SOCKET socketcall number. #include // For syscall numbers. 
namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, socketpair, (int domain, int type, int protocol, int sv[2])) { -#ifdef SYS_socketpair - int ret = LIBC_NAMESPACE::syscall_impl(SYS_socketpair, domain, type, - protocol, sv); -#elif defined(SYS_socketcall) - unsigned long sockcall_args[3] = { - static_cast(domain), static_cast(type), - static_cast(protocol), static_cast(sv)}; - int ret = LIBC_NAMESPACE::syscall_impl(SYS_socketcall, SYS_SOCKETPAIR, - sockcall_args); -#else -#error "socket and socketcall syscalls unavailable for this platform." -#endif + int ret = syscall_impl(SYS_socketpair, domain, type, protocol, sv); if (ret < 0) { libc_errno = -ret; return -1; From c5695b8f60a57caa46491ce78c7be9e336ad7fe9 Mon Sep 17 00:00:00 2001 From: Kiriti Ponduri <123718855+udaykiriti@users.noreply.github.com> Date: Thu, 14 May 2026 18:12:46 +0530 Subject: [PATCH 69/95] [libc] prefer *at syscalls in sys/stat wrappers (#195792) - so the changes flips the #ifdef order to prefer the *at syscalls over normal ones. - In modern architectures, *at system calls are preferred over normal system calls. - so by checking for "*at" sys calls first, we ensure better compatibility with modern systems. - then normal syscalls moved to else for support of older ones. 
Signed-off-by: udaykiriti --- libc/src/__support/OSUtil/linux/syscall_wrappers/chmod.h | 6 +++--- libc/src/__support/OSUtil/linux/syscall_wrappers/mkdir.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/libc/src/__support/OSUtil/linux/syscall_wrappers/chmod.h b/libc/src/__support/OSUtil/linux/syscall_wrappers/chmod.h index 69acf4c5cf1b3..42b0d3db98326 100644 --- a/libc/src/__support/OSUtil/linux/syscall_wrappers/chmod.h +++ b/libc/src/__support/OSUtil/linux/syscall_wrappers/chmod.h @@ -26,13 +26,13 @@ namespace LIBC_NAMESPACE_DECL { namespace linux_syscalls { LIBC_INLINE ErrorOr chmod(const char *path, mode_t mode) { -#ifdef SYS_chmod - int ret = syscall_impl(SYS_chmod, path, mode); -#elif defined(SYS_fchmodat) +#ifdef SYS_fchmodat int ret = syscall_impl(SYS_fchmodat, AT_FDCWD, path, mode, 0); #elif defined(SYS_fchmodat2) int ret = syscall_impl(SYS_fchmodat2, AT_FDCWD, path, mode, 0, AT_SYMLINK_NOFOLLOW); +#elif defined(SYS_chmod) + int ret = syscall_impl(SYS_chmod, path, mode); #else #error "chmod, fchmodat and fchmodat2 syscalls not available." 
#endif diff --git a/libc/src/__support/OSUtil/linux/syscall_wrappers/mkdir.h b/libc/src/__support/OSUtil/linux/syscall_wrappers/mkdir.h index 6d77894c36b6b..b5002914c5ec0 100644 --- a/libc/src/__support/OSUtil/linux/syscall_wrappers/mkdir.h +++ b/libc/src/__support/OSUtil/linux/syscall_wrappers/mkdir.h @@ -26,10 +26,10 @@ namespace LIBC_NAMESPACE_DECL { namespace linux_syscalls { LIBC_INLINE ErrorOr mkdir(const char *path, mode_t mode) { -#ifdef SYS_mkdir - int ret = syscall_impl(SYS_mkdir, path, mode); -#else +#ifdef SYS_mkdirat int ret = syscall_impl(SYS_mkdirat, AT_FDCWD, path, mode); +#else + int ret = syscall_impl(SYS_mkdir, path, mode); #endif if (ret < 0) return Error(-ret); From b5d577d3faef34276991fe80b5f869e3f8ef7442 Mon Sep 17 00:00:00 2001 From: Garvit Gupta Date: Thu, 14 May 2026 18:27:58 +0530 Subject: [PATCH 70/95] [RISCV] Check SP-relative offset in needsFrameBaseReg when FP offset overflows (#197368) When a frame pointer is present, `needsFrameBaseReg` previously only checked the FP-relative offset to decide if a virtual base register was needed. If the worst-case FP offset exceeded the 12-bit immediate range, a base register was always materialized, even when the SP-relative offset would fit. Since `getFrameIndexReference` can now select SP over FP when the offset fits in the compressed instruction immediate range, also check the SP-relative offset before deciding a base register is needed. This avoids unnecessary base register materialization and results in some code size savings. 
--- llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp | 8 ++++- llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll | 35 +++++++++---------- .../RISCV/local-stack-slot-allocation.ll | 5 ++- 3 files changed, 26 insertions(+), 22 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp index 6baa30cf9e6f6..30441eaee87f0 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp @@ -725,7 +725,13 @@ bool RISCVRegisterInfo::needsFrameBaseReg(MachineInstr *MI, } int64_t MaxFPOffset = Offset - CalleeSavedSize; - return !isFrameOffsetLegal(MI, RISCV::X8, MaxFPOffset); + if (isFrameOffsetLegal(MI, RISCV::X8, MaxFPOffset)) + return false; + + // If the FP-relative offset doesn't fit, fall through to check the + // SP-relative offset. getFrameIndexReference may select SP over FP when + // the SP offset fits in the compressed instruction immediate range, so a + // base register might not be needed. } // Assume 128 bytes spill slots size to estimate the maximum possible diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll b/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll index 35368efbb659a..3d2ec63096251 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll +++ b/llvm/test/CodeGen/RISCV/GlobalISel/vararg.ll @@ -1670,20 +1670,19 @@ define i32 @va_large_stack(ptr %fmt, ...) 
{ ; RV32-WITHFP-NEXT: lui a0, 24414 ; RV32-WITHFP-NEXT: addi a0, a0, -1728 ; RV32-WITHFP-NEXT: sub sp, sp, a0 -; RV32-WITHFP-NEXT: mv a0, sp ; RV32-WITHFP-NEXT: sw a1, 4(s0) ; RV32-WITHFP-NEXT: sw a2, 8(s0) ; RV32-WITHFP-NEXT: sw a3, 12(s0) ; RV32-WITHFP-NEXT: sw a4, 16(s0) -; RV32-WITHFP-NEXT: addi a1, s0, 4 -; RV32-WITHFP-NEXT: sw a1, 0(a0) -; RV32-WITHFP-NEXT: lw a1, 0(a0) +; RV32-WITHFP-NEXT: addi a0, s0, 4 +; RV32-WITHFP-NEXT: sw a0, 0(sp) +; RV32-WITHFP-NEXT: lw a0, 0(sp) ; RV32-WITHFP-NEXT: sw a5, 20(s0) ; RV32-WITHFP-NEXT: sw a6, 24(s0) ; RV32-WITHFP-NEXT: sw a7, 28(s0) -; RV32-WITHFP-NEXT: addi a2, a1, 4 -; RV32-WITHFP-NEXT: sw a2, 0(a0) -; RV32-WITHFP-NEXT: lw a0, 0(a1) +; RV32-WITHFP-NEXT: addi a1, a0, 4 +; RV32-WITHFP-NEXT: sw a1, 0(sp) +; RV32-WITHFP-NEXT: lw a0, 0(a0) ; RV32-WITHFP-NEXT: lui a1, 24414 ; RV32-WITHFP-NEXT: addi a1, a1, -1728 ; RV32-WITHFP-NEXT: add sp, sp, a1 @@ -1709,25 +1708,25 @@ define i32 @va_large_stack(ptr %fmt, ...) { ; RV64-WITHFP-NEXT: lui a0, 24414 ; RV64-WITHFP-NEXT: addi a0, a0, -1680 ; RV64-WITHFP-NEXT: sub sp, sp, a0 -; RV64-WITHFP-NEXT: mv a0, sp ; RV64-WITHFP-NEXT: sd a1, 8(s0) ; RV64-WITHFP-NEXT: sd a2, 16(s0) ; RV64-WITHFP-NEXT: sd a3, 24(s0) ; RV64-WITHFP-NEXT: sd a4, 32(s0) +; RV64-WITHFP-NEXT: mv a0, sp ; RV64-WITHFP-NEXT: addi a1, s0, 8 -; RV64-WITHFP-NEXT: sd a1, 0(a0) -; RV64-WITHFP-NEXT: lwu a1, 0(a0) -; RV64-WITHFP-NEXT: lw a2, 4(a0) +; RV64-WITHFP-NEXT: sd a1, 0(sp) +; RV64-WITHFP-NEXT: lw a0, 4(a0) +; RV64-WITHFP-NEXT: lwu a1, 0(sp) ; RV64-WITHFP-NEXT: sd a5, 40(s0) ; RV64-WITHFP-NEXT: sd a6, 48(s0) ; RV64-WITHFP-NEXT: sd a7, 56(s0) -; RV64-WITHFP-NEXT: slli a2, a2, 32 -; RV64-WITHFP-NEXT: or a1, a2, a1 -; RV64-WITHFP-NEXT: addi a2, a1, 4 -; RV64-WITHFP-NEXT: srli a3, a2, 32 -; RV64-WITHFP-NEXT: sw a2, 0(a0) -; RV64-WITHFP-NEXT: sw a3, 4(a0) -; RV64-WITHFP-NEXT: lw a0, 0(a1) +; RV64-WITHFP-NEXT: slli a0, a0, 32 +; RV64-WITHFP-NEXT: or a0, a0, a1 +; RV64-WITHFP-NEXT: addi a1, a0, 4 +; RV64-WITHFP-NEXT: srli 
a2, a1, 32 +; RV64-WITHFP-NEXT: sw a1, 0(sp) +; RV64-WITHFP-NEXT: sw a2, 4(sp) +; RV64-WITHFP-NEXT: lw a0, 0(a0) ; RV64-WITHFP-NEXT: lui a1, 24414 ; RV64-WITHFP-NEXT: addi a1, a1, -1680 ; RV64-WITHFP-NEXT: add sp, sp, a1 diff --git a/llvm/test/CodeGen/RISCV/local-stack-slot-allocation.ll b/llvm/test/CodeGen/RISCV/local-stack-slot-allocation.ll index 1ad78f4112351..9a535943d837d 100644 --- a/llvm/test/CodeGen/RISCV/local-stack-slot-allocation.ll +++ b/llvm/test/CodeGen/RISCV/local-stack-slot-allocation.ll @@ -153,9 +153,8 @@ define void @frame_pointer() "frame-pointer"="all" { ; RV64I-NEXT: addi s0, sp, 2032 ; RV64I-NEXT: .cfi_def_cfa s0, 0 ; RV64I-NEXT: addi sp, sp, -496 -; RV64I-NEXT: addi a0, sp, 556 -; RV64I-NEXT: lbu a1, 0(a0) -; RV64I-NEXT: sb a1, 0(a0) +; RV64I-NEXT: lbu a0, 556(sp) +; RV64I-NEXT: sb a0, 556(sp) ; RV64I-NEXT: addi sp, sp, 496 ; RV64I-NEXT: .cfi_def_cfa sp, 2032 ; RV64I-NEXT: ld ra, 2024(sp) # 8-byte Folded Reload From 25c89996ab220146e1095a1a0ab6af12ebf71f58 Mon Sep 17 00:00:00 2001 From: lntue Date: Thu, 14 May 2026 09:00:45 -0400 Subject: [PATCH 71/95] [libc] Fix shared math for gcc-7 or older compatibility. (#197476) - Add gcc-7 or older compatibility for cpp::is_assignable and cpp::is_constructible. 
- Apply LIBC_CONSTEXPR to FPUtil/rounding_mode.h --- libc/src/__support/CPP/CMakeLists.txt | 2 + .../__support/CPP/type_traits/is_assignable.h | 63 ++++++++++++++++++ .../CPP/type_traits/is_constructible.h | 65 +++++++++++++++++++ .../CPP/type_traits/is_copy_assignable.h | 7 +- .../CPP/type_traits/is_copy_constructible.h | 5 +- .../CPP/type_traits/is_move_assignable.h | 7 +- .../CPP/type_traits/is_move_constructible.h | 5 +- libc/src/__support/FPUtil/rounding_mode.h | 10 +-- libc/src/__support/macros/attributes.h | 18 +++++ libc/src/__support/macros/config.h | 2 + 10 files changed, 165 insertions(+), 19 deletions(-) create mode 100644 libc/src/__support/CPP/type_traits/is_assignable.h create mode 100644 libc/src/__support/CPP/type_traits/is_constructible.h diff --git a/libc/src/__support/CPP/CMakeLists.txt b/libc/src/__support/CPP/CMakeLists.txt index 03e21ac6c80ca..b602aa1f79d08 100644 --- a/libc/src/__support/CPP/CMakeLists.txt +++ b/libc/src/__support/CPP/CMakeLists.txt @@ -130,11 +130,13 @@ add_header_library( type_traits/invoke.h type_traits/is_arithmetic.h type_traits/is_array.h + type_traits/is_assignable.h type_traits/is_base_of.h type_traits/is_class.h type_traits/is_complex.h type_traits/is_const.h type_traits/is_constant_evaluated.h + type_traits/is_constructible.h type_traits/is_convertible.h type_traits/is_destructible.h type_traits/is_enum.h diff --git a/libc/src/__support/CPP/type_traits/is_assignable.h b/libc/src/__support/CPP/type_traits/is_assignable.h new file mode 100644 index 0000000000000..0be3aa500590d --- /dev/null +++ b/libc/src/__support/CPP/type_traits/is_assignable.h @@ -0,0 +1,63 @@ +//===------------------------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// \file +// This file contains a free-standing implementation of is_assignable +// type trait. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_CPP_TYPE_TRAITS_IS_ASSIGNABLE_H +#define LLVM_LIBC_SRC___SUPPORT_CPP_TYPE_TRAITS_IS_ASSIGNABLE_H + +#include "src/__support/CPP/type_traits/integral_constant.h" +#include "src/__support/CPP/utility/declval.h" +#include "src/__support/macros/attributes.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { +namespace cpp { + +namespace is_assignable_detail { + +#if LIBC_HAS_BUILTIN_IS_ASSIGNABLE + +template +struct is_assignable_impl : public bool_constant<__is_assignable(T, U)> {}; + +#else +// Fallback SFINAE implementation for GCC 7 and older toolchains + +template struct is_assignable_impl { +private: + template + LIBC_INLINE static auto test(int) + -> decltype(declval() = declval(), bool_constant()); + + template + LIBC_INLINE static auto test(...) 
-> bool_constant; + +public: + using type = decltype(test(0)); +}; + +#endif // LIBC_HAS_BUILTIN_IS_ASSIGNABLE + +} // namespace is_assignable_detail + +// is_assignable +template +struct is_assignable : public is_assignable_detail::is_assignable_impl {}; + +template +LIBC_INLINE_VAR constexpr bool is_assignable_v = is_assignable::value; + +} // namespace cpp +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_CPP_TYPE_TRAITS_IS_ASSIGNABLE_H diff --git a/libc/src/__support/CPP/type_traits/is_constructible.h b/libc/src/__support/CPP/type_traits/is_constructible.h new file mode 100644 index 0000000000000..316ba1acba33e --- /dev/null +++ b/libc/src/__support/CPP/type_traits/is_constructible.h @@ -0,0 +1,65 @@ +//===------------------------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// \file +// This file contains a free-standing implementation of is_constructible +// type trait. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_CPP_TYPE_TRAITS_IS_CONSTRUCTIBLE_H +#define LLVM_LIBC_SRC___SUPPORT_CPP_TYPE_TRAITS_IS_CONSTRUCTIBLE_H + +#include "src/__support/CPP/type_traits/integral_constant.h" +#include "src/__support/CPP/utility/declval.h" +#include "src/__support/macros/attributes.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { +namespace cpp { + +namespace is_contructible_detail { + +#if LIBC_HAS_BUILTIN_IS_CONSTRUCTIBLE + +template +struct is_constructible_impl + : public bool_constant<__is_constructible(T, Args...)> {}; + +#else +// Fallback SFINAE implementation for GCC 7 and older toolchains + +template struct is_constructible_impl { +private: + template + LIBC_INLINE static auto test(int) + -> decltype(T1(declval()...), bool_constant()); + + template + LIBC_INLINE static auto test(...) -> bool_constant; + +public: + using type = decltype(test(0)); +}; + +#endif // LIBC_HAS_BUILTIN_IS_CONSTRUCTIBLE + +} // namespace is_contructible_detail + +template +struct is_constructible + : public is_contructible_detail::is_constructible_impl {}; + +template +LIBC_INLINE_VAR constexpr bool is_constructible_v = + is_constructible::value; + +} // namespace cpp +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_CPP_TYPE_TRAITS_IS_CONSTRUCTIBLE_H diff --git a/libc/src/__support/CPP/type_traits/is_copy_assignable.h b/libc/src/__support/CPP/type_traits/is_copy_assignable.h index 9beb93d14668d..97c30c668a4e3 100644 --- a/libc/src/__support/CPP/type_traits/is_copy_assignable.h +++ b/libc/src/__support/CPP/type_traits/is_copy_assignable.h @@ -9,7 +9,7 @@ #define LLVM_LIBC_SRC___SUPPORT_CPP_TYPE_TRAITS_IS_COPY_ASSIGNABLE_H #include "src/__support/CPP/type_traits/add_lvalue_reference.h" -#include "src/__support/CPP/type_traits/integral_constant.h" +#include "src/__support/CPP/type_traits/is_assignable.h" #include 
"src/__support/macros/config.h" namespace LIBC_NAMESPACE_DECL { @@ -18,9 +18,8 @@ namespace cpp { // is copy assignable template struct is_copy_assignable - : public integral_constant< - bool, __is_assignable(cpp::add_lvalue_reference_t, - cpp::add_lvalue_reference_t)> {}; + : public cpp::is_assignable, + cpp::add_lvalue_reference_t> {}; template LIBC_INLINE_VAR constexpr bool is_copy_assignable_v = diff --git a/libc/src/__support/CPP/type_traits/is_copy_constructible.h b/libc/src/__support/CPP/type_traits/is_copy_constructible.h index d8eb9ad3507ee..c62db8f69b680 100644 --- a/libc/src/__support/CPP/type_traits/is_copy_constructible.h +++ b/libc/src/__support/CPP/type_traits/is_copy_constructible.h @@ -9,7 +9,7 @@ #define LLVM_LIBC_SRC___SUPPORT_CPP_TYPE_TRAITS_IS_COPY_CONSTRUCTIBLE_H #include "src/__support/CPP/type_traits/add_lvalue_reference.h" -#include "src/__support/CPP/type_traits/integral_constant.h" +#include "src/__support/CPP/type_traits/is_constructible.h" #include "src/__support/macros/config.h" namespace LIBC_NAMESPACE_DECL { @@ -18,8 +18,7 @@ namespace cpp { // is copy constructible template struct is_copy_constructible - : public integral_constant< - bool, __is_constructible(T, cpp::add_lvalue_reference_t)> {}; + : public cpp::is_constructible> {}; template LIBC_INLINE_VAR constexpr bool is_copy_constructible_v = diff --git a/libc/src/__support/CPP/type_traits/is_move_assignable.h b/libc/src/__support/CPP/type_traits/is_move_assignable.h index a788bd9074e32..edffe094c58a3 100644 --- a/libc/src/__support/CPP/type_traits/is_move_assignable.h +++ b/libc/src/__support/CPP/type_traits/is_move_assignable.h @@ -10,7 +10,7 @@ #include "src/__support/CPP/type_traits/add_lvalue_reference.h" #include "src/__support/CPP/type_traits/add_rvalue_reference.h" -#include "src/__support/CPP/type_traits/integral_constant.h" +#include "src/__support/CPP/type_traits/is_assignable.h" #include "src/__support/macros/config.h" namespace LIBC_NAMESPACE_DECL { @@ -19,9 +19,8 
@@ namespace cpp { // is move assignable template struct is_move_assignable - : public integral_constant, - cpp::add_rvalue_reference_t)> {}; + : public cpp::is_assignable, + cpp::add_rvalue_reference_t> {}; template LIBC_INLINE_VAR constexpr bool is_move_assignable_v = diff --git a/libc/src/__support/CPP/type_traits/is_move_constructible.h b/libc/src/__support/CPP/type_traits/is_move_constructible.h index c898960546258..37d540c27a5f6 100644 --- a/libc/src/__support/CPP/type_traits/is_move_constructible.h +++ b/libc/src/__support/CPP/type_traits/is_move_constructible.h @@ -9,7 +9,7 @@ #define LLVM_LIBC_SRC___SUPPORT_CPP_TYPE_TRAITS_IS_MOVE_CONSTRUCTIBLE_H #include "src/__support/CPP/type_traits/add_rvalue_reference.h" -#include "src/__support/CPP/type_traits/integral_constant.h" +#include "src/__support/CPP/type_traits/is_constructible.h" #include "src/__support/macros/config.h" namespace LIBC_NAMESPACE_DECL { @@ -18,8 +18,7 @@ namespace cpp { // is move constructible template struct is_move_constructible - : public integral_constant)> {}; + : public cpp::is_constructible> {}; template LIBC_INLINE_VAR constexpr bool is_move_constructible_v = diff --git a/libc/src/__support/FPUtil/rounding_mode.h b/libc/src/__support/FPUtil/rounding_mode.h index 92061ea13e203..ebeb3652f64aa 100644 --- a/libc/src/__support/FPUtil/rounding_mode.h +++ b/libc/src/__support/FPUtil/rounding_mode.h @@ -80,7 +80,7 @@ LIBC_INLINE int quick_get_round() { } // namespace generic -LIBC_INLINE constexpr bool fenv_is_round_up() { +LIBC_INLINE LIBC_CONSTEXPR bool fenv_is_round_up() { if (cpp::is_constant_evaluated()) { return false; } else { @@ -88,7 +88,7 @@ LIBC_INLINE constexpr bool fenv_is_round_up() { } } -LIBC_INLINE constexpr bool fenv_is_round_down() { +LIBC_INLINE LIBC_CONSTEXPR bool fenv_is_round_down() { if (cpp::is_constant_evaluated()) { return false; } else { @@ -96,7 +96,7 @@ LIBC_INLINE constexpr bool fenv_is_round_down() { } } -LIBC_INLINE constexpr bool fenv_is_round_to_nearest() 
{ +LIBC_INLINE LIBC_CONSTEXPR bool fenv_is_round_to_nearest() { if (cpp::is_constant_evaluated()) { return true; } else { @@ -104,7 +104,7 @@ LIBC_INLINE constexpr bool fenv_is_round_to_nearest() { } } -LIBC_INLINE constexpr bool fenv_is_round_to_zero() { +LIBC_INLINE LIBC_CONSTEXPR bool fenv_is_round_to_zero() { if (cpp::is_constant_evaluated()) { return false; } else { @@ -113,7 +113,7 @@ LIBC_INLINE constexpr bool fenv_is_round_to_zero() { } // Quick free standing get rounding mode based on the above observations. -LIBC_INLINE constexpr int quick_get_round() { +LIBC_INLINE LIBC_CONSTEXPR int quick_get_round() { if (cpp::is_constant_evaluated()) { return FE_TONEAREST; } else { diff --git a/libc/src/__support/macros/attributes.h b/libc/src/__support/macros/attributes.h index 7ec708b75e897..1cd3a309d3cf9 100644 --- a/libc/src/__support/macros/attributes.h +++ b/libc/src/__support/macros/attributes.h @@ -58,6 +58,24 @@ #define LIBC_CONSTEXPR #endif +#ifndef LIBC_HAS_BUILTIN_IS_ASSIGNABLE +#if (__has_builtin(__is_assignable) || \ + (defined(LIBC_COMPILER_IS_GCC) && (LIBC_COMPILER_GCC_VER >= 800))) +#define LIBC_HAS_BUILTIN_IS_ASSIGNABLE 1 +#else +#define LIBC_HAS_BUILTIN_IS_ASSIGNABLE 0 +#endif +#endif // LIBC_HAS_BUILTIN_IS_ASSIGNABLE + +#ifndef LIBC_HAS_BUILTIN_IS_CONSTRUCTIBLE +#if (__has_builtin(__is_constructible) || \ + (defined(LIBC_COMPILER_IS_GCC) && (LIBC_COMPILER_GCC_VER >= 800))) +#define LIBC_HAS_BUILTIN_IS_CONSTRUCTIBLE 1 +#else +#define LIBC_HAS_BUILTIN_IS_CONSTRUCTIBLE 0 +#endif +#endif // LIBC_HAS_BUILTIN_IS_CONSTRUCTIBLE + // Uses the platform specific specialization #define LIBC_THREAD_MODE_PLATFORM 0 diff --git a/libc/src/__support/macros/config.h b/libc/src/__support/macros/config.h index d9c80423f499d..38ad605473844 100644 --- a/libc/src/__support/macros/config.h +++ b/libc/src/__support/macros/config.h @@ -49,6 +49,8 @@ #define __builtin_prefetch(X, Y, Z) #define LIBC_HAS_BUILTIN_IS_CONSTANT_EVALUATED 1 +#define LIBC_HAS_BUILTIN_IS_ASSIGNABLE 1 
+#define LIBC_HAS_BUILTIN_IS_CONSTRUCTIBLE 1 #endif // LIBC_COMPILER_IS_MSVC From d7f8673b4fc6f5d50de981f490491e60cfd048f7 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Thu, 14 May 2026 09:05:51 -0400 Subject: [PATCH 72/95] [NFC][Analysis] Use `isa` for null pointer checks (#197544) Make Analysis null pointer checks use `isa` rather than generic null value checks (`isNullValue()`). --- llvm/lib/Analysis/GlobalsModRef.cpp | 5 +++-- llvm/lib/Analysis/LazyValueInfo.cpp | 4 ++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Analysis/GlobalsModRef.cpp b/llvm/lib/Analysis/GlobalsModRef.cpp index a0d75334f0f2b..a9d559ad7e27d 100644 --- a/llvm/lib/Analysis/GlobalsModRef.cpp +++ b/llvm/lib/Analysis/GlobalsModRef.cpp @@ -21,6 +21,7 @@ #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" @@ -415,9 +416,9 @@ bool GlobalsAAResult::AnalyzeIndirectGlobalMemory(GlobalVariable *GV) { // value produced by the noalias call and any casts. std::vector AllocRelatedValues; - // If the initializer is a valid pointer, bail. + // If the initializer is a non-null pointer, bail. if (Constant *C = GV->getInitializer()) - if (!C->isNullValue()) + if (!isa(C)) return false; // Walk the user list of the global. If we find anything other than a direct diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp index b70c380dd5466..e78a47b8de68d 100644 --- a/llvm/lib/Analysis/LazyValueInfo.cpp +++ b/llvm/lib/Analysis/LazyValueInfo.cpp @@ -2115,6 +2115,10 @@ Constant *LazyValueInfo::getPredicateAt(CmpInst::Predicate Pred, Value *V, // return it quickly. But this is only a fastpath, and falling // through would still be correct. 
const DataLayout &DL = CxtI->getDataLayout(); + // NOTE: This check is meant to determine whether a pointer is semantically a + // null pointer, not just whether its value equals ConstantPointerNull. If the + // semantics of ConstantPointerNull change in the future, this should be + // updated to use a semantic check (e.g. isKnownNonNull). if (V->getType()->isPointerTy() && C->isNullValue() && isKnownNonZero(V->stripPointerCastsSameRepresentation(), DL)) { Type *ResTy = CmpInst::makeCmpResultType(C->getType()); From 5f5435e7e80fca80e1ee95aec8cf6343c0fa78de Mon Sep 17 00:00:00 2001 From: Petar Avramovic Date: Thu, 14 May 2026 15:17:52 +0200 Subject: [PATCH 73/95] AMDGPU/GlobalISel: Legalize scalar extloads with large memory type (#197648) Add narrowScalar for scalar sext/zextload when the memory type is larger then 32 bits. There is no narrow scalar implementation when NarrowSize < MemSize (split load) but we don't want that anyway. Narrow scalar to MemSize creates large normal load + extension to dst. --- .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 7 +++ .../GlobalISel/legalize-sextload-global.mir | 15 ++++-- .../GlobalISel/legalize-zextload-global.mir | 13 +++-- .../CodeGen/AMDGPU/GlobalISel/zextload.ll | 52 +++++++++++++++++++ 4 files changed, 77 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 1a0e4f2eaa416..10469477e6a8d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -1750,6 +1750,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // inserting addrspacecasts. 
ExtLoads.customIf(typeIs(1, Constant32Ptr)); + ExtLoads.narrowScalarIf( + [](const LegalityQuery &Query) { + LLT MemTy = Query.MMODescrs[0].MemoryTy; + return MemTy.isAnyScalar() && MemTy.getSizeInBits() > 32 && + Query.Types[0].getSizeInBits() > MemTy.getSizeInBits(); + }, // For large MemSize, narrowscalar to MemSize (load MemSize + ext) + getScalarTypeFromMemDesc(0)); ExtLoads.clampScalar(0, S32, S32) .widenScalarToNextPow2(0) .lower(); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sextload-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sextload-global.mir index 477239aee57f8..2875eec7cb980 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sextload-global.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sextload-global.mir @@ -11,7 +11,6 @@ # ERR-NEXT: remark: :0:0: unable to legalize instruction: %1:_(<2 x s32>) = G_SEXTLOAD %0:_(p1) :: (load (<2 x s16>), addrspace 1) (in function: test_sextload_global_v2i32_from_v2s16) # ERR-NEXT: remark: :0:0: unable to legalize instruction: %1:_(<2 x s64>) = G_SEXTLOAD %0:_(p1) :: (load (<2 x s16>), addrspace 1) (in function: test_sextload_global_v2i64_from_v2s16) # ERR-NEXT: remark: :0:0: unable to legalize instruction: %1:_(<2 x s64>) = G_SEXTLOAD %0:_(p1) :: (load (<2 x s32>), addrspace 1) (in function: test_sextload_global_v2i64_from_v2s32) -# ERR-NEXT: remark: :0:0: unable to legalize instruction: %1:_(s128) = G_SEXTLOAD %0:_(p1) :: (load (s64), addrspace 1) (in function: test_sextload_global_s128_8) # ERR-NOT: remark --- @@ -492,15 +491,21 @@ body: | ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[SEXTLOAD:%[0-9]+]]:_(s128) = G_SEXTLOAD [[COPY]](p1) :: (load (s64), addrspace 1) - ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[SEXTLOAD]](s128) + ; GFX8-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p1) :: (load (s64), addrspace 1) + ; GFX8-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 + ; GFX8-NEXT: 
[[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[LOAD]], [[C]](s32) + ; GFX8-NEXT: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[LOAD]](s64), [[ASHR]](s64) + ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](s128) ; ; GFX6-LABEL: name: test_sextload_global_s128_8 ; GFX6: liveins: $vgpr0_vgpr1 ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[SEXTLOAD:%[0-9]+]]:_(s128) = G_SEXTLOAD [[COPY]](p1) :: (load (s64), addrspace 1) - ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[SEXTLOAD]](s128) + ; GFX6-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p1) :: (load (s64), addrspace 1) + ; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 63 + ; GFX6-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[LOAD]], [[C]](s32) + ; GFX6-NEXT: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[LOAD]](s64), [[ASHR]](s64) + ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](s128) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(s128) = G_SEXTLOAD %0 :: (load (s64), addrspace 1) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-zextload-global.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-zextload-global.mir index 088647eab56d4..ffe6a4786f1ee 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-zextload-global.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-zextload-global.mir @@ -11,7 +11,6 @@ # ERR-NEXT: remark: :0:0: unable to legalize instruction: %1:_(<2 x s32>) = G_ZEXTLOAD %0:_(p1) :: (load (<2 x s16>), addrspace 1) (in function: test_zextload_global_v2i32_from_4) # ERR-NEXT: remark: :0:0: unable to legalize instruction: %1:_(<2 x s64>) = G_ZEXTLOAD %0:_(p1) :: (load (<2 x s16>), addrspace 1) (in function: test_zextload_global_v2i64_from_4) # ERR-NEXT: remark: :0:0: unable to legalize instruction: %1:_(<2 x s64>) = G_ZEXTLOAD %0:_(p1) :: (load (<2 x s32>), addrspace 1) (in function: test_zextload_global_v2i64_from_8) -# ERR-NEXT: remark: :0:0: unable to legalize instruction: %1:_(s128) = G_ZEXTLOAD %0:_(p1) :: (load (s64), 
addrspace 1) (in function: test_zextload_global_s128_8) # ERR-NOT: remark --- @@ -492,15 +491,19 @@ body: | ; GFX8: liveins: $vgpr0_vgpr1 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; GFX8-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s128) = G_ZEXTLOAD [[COPY]](p1) :: (load (s64), addrspace 1) - ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[ZEXTLOAD]](s128) + ; GFX8-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p1) :: (load (s64), addrspace 1) + ; GFX8-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GFX8-NEXT: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[LOAD]](s64), [[C]](s64) + ; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](s128) ; ; GFX6-LABEL: name: test_zextload_global_s128_8 ; GFX6: liveins: $vgpr0_vgpr1 ; GFX6-NEXT: {{ $}} ; GFX6-NEXT: [[COPY:%[0-9]+]]:_(p1) = COPY $vgpr0_vgpr1 - ; GFX6-NEXT: [[ZEXTLOAD:%[0-9]+]]:_(s128) = G_ZEXTLOAD [[COPY]](p1) :: (load (s64), addrspace 1) - ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[ZEXTLOAD]](s128) + ; GFX6-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[COPY]](p1) :: (load (s64), addrspace 1) + ; GFX6-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; GFX6-NEXT: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[LOAD]](s64), [[C]](s64) + ; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[MV]](s128) %0:_(p1) = COPY $vgpr0_vgpr1 %1:_(s128) = G_ZEXTLOAD %0 :: (load (s64), addrspace 1) $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll index 4511c364b8a7e..b68d148624e74 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/zextload.ll @@ -443,3 +443,55 @@ define i128 @zextload_global_i32_to_i128(ptr addrspace(1) %ptr) { %ext = zext i32 %load to i128 ret i128 %ext } + +define i128 @zextload_global_i64_to_i128(ptr addrspace(1) %ptr) { +; GFX9-LABEL: zextload_global_i64_to_i128: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: 
global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: zextload_global_i64_to_i128: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX6-LABEL: zextload_global_i64_to_i128: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b64 s[4:5], 0 +; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 +; GFX6-NEXT: v_mov_b32_e32 v2, 0 +; GFX6-NEXT: v_mov_b32_e32 v3, 0 +; GFX6-NEXT: s_waitcnt vmcnt(0) +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-LABEL: zextload_global_i64_to_i128: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: zextload_global_i64_to_i128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: s_setpc_b64 s[30:31] + %load = load i64, ptr addrspace(1) %ptr + %ext = zext i64 %load to i128 + ret i128 %ext +} From 508e8c51cec7d0dfe527c273d83e4324321e5955 Mon Sep 17 00:00:00 2001 From: Piotr Sobczak Date: Thu, 14 May 2026 15:28:04 +0200 Subject: [PATCH 74/95] [UniformityAnalysis] Fix nodivergencesource calls (#197656) NFC #168903 introduced a subtle behavior change for calls with the nodivergencesource attribute and divergent operands. 
Calls with the nodivergencesource attribute are *not* always uniform. They just do not introduce any new divergence. If any operand is divergent, the result must still be reported as divergent. Revert to pre-#168903 behavior by allowing the standard propagation to work for target's NeverUniform while keeping Default and AlwaysUniform unchanged. --- llvm/lib/Analysis/TargetTransformInfo.cpp | 9 +++++---- .../AMDGPU/nodivergencesource.ll | 15 +++++++++++++++ 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index e1ab90a8e046c..51221a6369e91 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -294,12 +294,13 @@ bool TargetTransformInfo::hasBranchDivergence(const Function *F) const { ValueUniformity llvm::TargetTransformInfo::getValueUniformity(const Value *V) const { - // Calls with the NoDivergenceSource attribute are always uniform. + ValueUniformity VU = TTIImpl->getValueUniformity(V); if (const auto *Call = dyn_cast(V)) { - if (Call->hasFnAttr(Attribute::NoDivergenceSource)) - return ValueUniformity::AlwaysUniform; + if (VU == ValueUniformity::NeverUniform && + Call->hasFnAttr(Attribute::NoDivergenceSource)) + return ValueUniformity::Default; } - return TTIImpl->getValueUniformity(V); + return VU; } bool llvm::TargetTransformInfo::isValidAddrSpaceCast(unsigned FromAS, diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/nodivergencesource.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/nodivergencesource.ll index 9c893ac3ba76a..79bf57e0c9660 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/nodivergencesource.ll +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/nodivergencesource.ll @@ -1,5 +1,6 @@ ; RUN: opt -mtriple amdgcn-- -passes='print' -disable-output %s 2>&1 | FileCheck %s +; CHECK-LABEL: 'test' ; CHECK: DIVERGENT: %divergentval ; CHECK-NOT: DIVERGENT: %uniformval ; CHECK: %uniformval @@ 
-9,8 +10,22 @@ define void @test() { ret void } +; Test a call with "nodivergencesource" attribute and divergent argument. +; The result of the call should propagate divergence from the operand. + +; CHECK-LABEL: 'test_nodivergencesource_divergent_arg' +; CHECK: DIVERGENT: %tid +; CHECK: DIVERGENT: %call +define void @test_nodivergencesource_divergent_arg() { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %call = call i32 @nodivergencesourcefunc_arg(i32 %tid) + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() declare i32 @normalfunc() #0 declare i32 @nodivergencesourcefunc() #1 +declare i32 @nodivergencesourcefunc_arg(i32) #1 attributes #0 = { nounwind } attributes #1 = { nounwind nodivergencesource } From 4e2ad71ace933e5c5f860ad389d024dddecaab44 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Thu, 14 May 2026 09:31:33 -0400 Subject: [PATCH 75/95] [NFC] Format two AMDGPU files (#197672) - `llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp` - `llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp` --- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 24 ++++++++----- .../MCTargetDesc/AMDGPUTargetStreamer.cpp | 36 ++++++++++--------- 2 files changed, 35 insertions(+), 25 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index ad61d8d084c7b..6e8ad3f9f1a24 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -1507,14 +1507,22 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, static unsigned getRsrcReg(CallingConv::ID CallConv) { switch (CallConv) { - default: [[fallthrough]]; - case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1; - case CallingConv::AMDGPU_LS: return R_00B528_SPI_SHADER_PGM_RSRC1_LS; - case CallingConv::AMDGPU_HS: return R_00B428_SPI_SHADER_PGM_RSRC1_HS; - case CallingConv::AMDGPU_ES: return R_00B328_SPI_SHADER_PGM_RSRC1_ES; - case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS; 
- case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS; - case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS; + default: + [[fallthrough]]; + case CallingConv::AMDGPU_CS: + return R_00B848_COMPUTE_PGM_RSRC1; + case CallingConv::AMDGPU_LS: + return R_00B528_SPI_SHADER_PGM_RSRC1_LS; + case CallingConv::AMDGPU_HS: + return R_00B428_SPI_SHADER_PGM_RSRC1_HS; + case CallingConv::AMDGPU_ES: + return R_00B328_SPI_SHADER_PGM_RSRC1_ES; + case CallingConv::AMDGPU_GS: + return R_00B228_SPI_SHADER_PGM_RSRC1_GS; + case CallingConv::AMDGPU_VS: + return R_00B128_SPI_SHADER_PGM_RSRC1_VS; + case CallingConv::AMDGPU_PS: + return R_00B028_SPI_SHADER_PGM_RSRC1_PS; } } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 53c21837a11dd..d6509fdd2b21a 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -237,7 +237,7 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) { AMDGPUTargetAsmStreamer::AMDGPUTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS) - : AMDGPUTargetStreamer(S), OS(OS) { } + : AMDGPUTargetStreamer(S), OS(OS) {} // A hook for emitting stuff at the end. // We use it for emitting the accumulated PAL metadata as directives. 
@@ -276,10 +276,11 @@ void AMDGPUTargetAsmStreamer::EmitAMDKernelCodeT(AMDGPUMCKernelCodeT &Header) { void AMDGPUTargetAsmStreamer::EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) { switch (Type) { - default: llvm_unreachable("Invalid AMDGPU symbol type"); - case ELF::STT_AMDGPU_HSA_KERNEL: - OS << "\t.amdgpu_hsa_kernel " << SymbolName << '\n' ; - break; + default: + llvm_unreachable("Invalid AMDGPU symbol type"); + case ELF::STT_AMDGPU_HSA_KERNEL: + OS << "\t.amdgpu_hsa_kernel " << SymbolName << '\n'; + break; } } @@ -337,8 +338,8 @@ bool AMDGPUTargetAsmStreamer::EmitISAVersion() { return true; } -bool AMDGPUTargetAsmStreamer::EmitHSAMetadata( - msgpack::Document &HSAMetadataDoc, bool Strict) { +bool AMDGPUTargetAsmStreamer::EmitHSAMetadata(msgpack::Document &HSAMetadataDoc, + bool Strict) { HSAMD::V3::MetadataVerifier Verifier(Strict); if (!Verifier.verify(HSAMetadataDoc.getRoot())) return false; @@ -548,7 +549,8 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( case AMDGPU::AMDHSA_COV4: case AMDGPU::AMDHSA_COV5: if (getTargetID()->isXnackSupported()) - OS << "\t\t.amdhsa_reserve_xnack_mask " << getTargetID()->isXnackOnOrAny() << '\n'; + OS << "\t\t.amdhsa_reserve_xnack_mask " << getTargetID()->isXnackOnOrAny() + << '\n'; break; } @@ -815,13 +817,13 @@ void AMDGPUTargetELFStreamer::EmitNote( S.pushSection(); S.switchSection( Context.getELFSection(ElfNote::SectionName, ELF::SHT_NOTE, NoteFlags)); - S.emitInt32(NameSZ); // namesz - S.emitValue(DescSZ, 4); // descz - S.emitInt32(NoteType); // type - S.emitBytes(Name); // name - S.emitValueToAlignment(Align(4), 0, 1, 0); // padding 0 - EmitDesc(S); // desc - S.emitValueToAlignment(Align(4), 0, 1, 0); // padding 0 + S.emitInt32(NameSZ); // namesz + S.emitValue(DescSZ, 4); // descz + S.emitInt32(NoteType); // type + S.emitBytes(Name); // name + S.emitValueToAlignment(Align(4), 0, 1, 0); // padding 0 + EmitDesc(S); // desc + S.emitValueToAlignment(Align(4), 0, 1, 0); // padding 0 S.popSection(); } 
@@ -1027,8 +1029,8 @@ bool AMDGPUTargetELFStreamer::EmitISAVersion() { auto *DescBegin = Context.createTempSymbol(); auto *DescEnd = Context.createTempSymbol(); auto *DescSZ = MCBinaryExpr::createSub( - MCSymbolRefExpr::create(DescEnd, Context), - MCSymbolRefExpr::create(DescBegin, Context), Context); + MCSymbolRefExpr::create(DescEnd, Context), + MCSymbolRefExpr::create(DescBegin, Context), Context); EmitNote(ElfNote::NoteNameV2, DescSZ, ELF::NT_AMD_HSA_ISA_NAME, [&](MCELFStreamer &OS) { From f816732ca160f6a92fd59d5e16ab86ebbbb5b65d Mon Sep 17 00:00:00 2001 From: Harrison Hao <57025411+harrisonGPU@users.noreply.github.com> Date: Thu, 14 May 2026 21:31:39 +0800 Subject: [PATCH 76/95] [AMDGPU][NFC] Autogenerate checks in andorn2.ll (#197613) For this PR: https://github.com/llvm/llvm-project/pull/196325 --- llvm/test/CodeGen/AMDGPU/andorn2.ll | 499 ++++++++++++++++++++++++++-- 1 file changed, 463 insertions(+), 36 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/andorn2.ll b/llvm/test/CodeGen/AMDGPU/andorn2.ll index 1527d50e28b35..cd322b68c82f7 100644 --- a/llvm/test/CodeGen/AMDGPU/andorn2.ll +++ b/llvm/test/CodeGen/AMDGPU/andorn2.ll @@ -1,12 +1,65 @@ -; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck --check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck --check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck --check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefix=GCN %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx801 < %s | FileCheck --check-prefixes=GFX8 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck 
--check-prefixes=GFX12 %s -; GCN-LABEL: {{^}}scalar_andn2_i32_one_use -; GCN: s_andn2_b32 -define amdgpu_kernel void @scalar_andn2_i32_one_use( - ptr addrspace(1) %r0, i32 %a, i32 %b) { +define amdgpu_kernel void @scalar_andn2_i32_one_use(ptr addrspace(1) %r0, i32 %a, i32 %b) { +; GFX6-LABEL: scalar_andn2_i32_one_use: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_andn2_b32 s4, s4, s5 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: scalar_andn2_i32_one_use: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX7-NEXT: s_andn2_b32 s4, s4, s5 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: scalar_andn2_i32_one_use: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_andn2_b32 s2, s2, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: scalar_andn2_i32_one_use: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_andn2_b32 s2, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: scalar_andn2_i32_one_use: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX12-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_endpgm entry: %nb = xor i32 %b, -1 %r0.val = and i32 %a, %nb @@ -14,10 +67,73 @@ entry: ret void } -; GCN-LABEL: {{^}}scalar_andn2_i64_one_use -; GCN: s_andn2_b64 -define amdgpu_kernel void @scalar_andn2_i64_one_use( - ptr addrspace(1) %r0, i64 %a, i64 %b) { +define amdgpu_kernel void @scalar_andn2_i64_one_use(ptr addrspace(1) %r0, i64 %a, i64 %b) { +; GFX6-LABEL: scalar_andn2_i64_one_use: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_andn2_b64 s[0:1], s[2:3], s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: scalar_andn2_i64_one_use: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s0 +; GFX7-NEXT: s_mov_b32 s5, s1 +; GFX7-NEXT: s_andn2_b64 s[0:1], s[2:3], s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: scalar_andn2_i64_one_use: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_andn2_b64 s[0:1], s[2:3], s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_store_dwordx2 
v[0:1], v[2:3] +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: scalar_andn2_i64_one_use: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: scalar_andn2_i64_one_use: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[4:5] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_endpgm entry: %nb = xor i64 %b, -1 %r0.val = and i64 %a, %nb @@ -25,10 +141,61 @@ entry: ret void } -; GCN-LABEL: {{^}}scalar_orn2_i32_one_use -; GCN: s_orn2_b32 -define amdgpu_kernel void @scalar_orn2_i32_one_use( - ptr addrspace(1) %r0, i32 %a, i32 %b) { +define amdgpu_kernel void @scalar_orn2_i32_one_use(ptr addrspace(1) %r0, i32 %a, i32 %b) { +; GFX6-LABEL: scalar_orn2_i32_one_use: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX6-NEXT: s_orn2_b32 s4, s4, s5 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: scalar_orn2_i32_one_use: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3] +; GFX7-NEXT: s_orn2_b32 s4, s4, s5 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: v_mov_b32_e32 v0, 
s4 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: scalar_orn2_i32_one_use: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_orn2_b32 s2, s2, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: scalar_orn2_i32_one_use: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_orn2_b32 s2, s2, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: scalar_orn2_i32_one_use: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_or_not1_b32 s2, s2, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_endpgm entry: %nb = xor i32 %b, -1 %r0.val = or i32 %a, %nb @@ -36,10 +203,73 @@ entry: ret void } -; GCN-LABEL: {{^}}scalar_orn2_i64_one_use -; GCN: s_orn2_b64 -define amdgpu_kernel void @scalar_orn2_i64_one_use( - ptr addrspace(1) %r0, i64 %a, i64 %b) { +define amdgpu_kernel void @scalar_orn2_i64_one_use(ptr addrspace(1) %r0, i64 %a, i64 %b) { +; GFX6-LABEL: scalar_orn2_i64_one_use: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX6-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GFX6-NEXT: s_mov_b32 s7, 0xf000 +; GFX6-NEXT: s_mov_b32 s6, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mov_b32 s4, s0 +; GFX6-NEXT: s_mov_b32 s5, s1 +; GFX6-NEXT: s_orn2_b64 s[0:1], s[2:3], s[8:9] +; GFX6-NEXT: v_mov_b32_e32 v0, s0 +; GFX6-NEXT: v_mov_b32_e32 v1, s1 +; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; 
GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: scalar_orn2_i64_one_use: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; GFX7-NEXT: s_mov_b32 s7, 0xf000 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_mov_b32 s4, s0 +; GFX7-NEXT: s_mov_b32 s5, s1 +; GFX7-NEXT: s_orn2_b64 s[0:1], s[2:3], s[8:9] +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: scalar_orn2_i64_one_use: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_orn2_b64 s[0:1], s[2:3], s[6:7] +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: scalar_orn2_i64_one_use: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_orn2_b64 s[2:3], s[2:3], s[6:7] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: scalar_orn2_i64_one_use: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_or_not1_b64 s[2:3], s[2:3], s[4:5] +; GFX12-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-NEXT: s_endpgm entry: %nb = xor i64 %b, -1 %r0.val = or i64 %a, %nb @@ -47,10 +277,59 @@ entry: ret void } -; 
GCN-LABEL: {{^}}vector_andn2_i32_s_v_one_use -; GCN: v_bfi_b32 -define amdgpu_kernel void @vector_andn2_i32_s_v_one_use( - ptr addrspace(1) %r0, i32 %s) { +define amdgpu_kernel void @vector_andn2_i32_s_v_one_use(ptr addrspace(1) %r0, i32 %s) { +; GFX6-LABEL: vector_andn2_i32_s_v_one_use: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_bfi_b32 v0, v0, 0, s6 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: vector_andn2_i32_s_v_one_use: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_bfi_b32 v0, v0, 0, s6 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: vector_andn2_i32_s_v_one_use: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_bfi_b32 v2, v0, 0, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: vector_andn2_i32_s_v_one_use: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_bfi_b32 v0, v0, 0, s2 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: vector_andn2_i32_s_v_one_use: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX12-NEXT: v_bfi_b32 v0, v0, 0xfffffc00, -1 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: 
v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, s2, v0 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_endpgm entry: %v = call i32 @llvm.amdgcn.workitem.id.x() #1 %not = xor i32 %v, -1 @@ -59,10 +338,59 @@ entry: ret void } -; GCN-LABEL: {{^}}vector_andn2_i32_v_s_one_use -; GCN: v_bfi_b32 -define amdgpu_kernel void @vector_andn2_i32_v_s_one_use( - ptr addrspace(1) %r0, i32 %s) { +define amdgpu_kernel void @vector_andn2_i32_v_s_one_use(ptr addrspace(1) %r0, i32 %s) { +; GFX6-LABEL: vector_andn2_i32_v_s_one_use: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_bfi_b32 v0, s6, 0, v0 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: vector_andn2_i32_v_s_one_use: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_bfi_b32 v0, s6, 0, v0 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: vector_andn2_i32_v_s_one_use: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_bfi_b32 v2, s2, 0, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: vector_andn2_i32_v_s_one_use: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_bfi_b32 v0, s2, 0, v0 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: 
vector_andn2_i32_v_s_one_use: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_bfi_b32 v0, s2, 0, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_endpgm entry: %v = call i32 @llvm.amdgcn.workitem.id.x() #1 %not = xor i32 %s, -1 @@ -71,10 +399,60 @@ entry: ret void } -; GCN-LABEL: {{^}}vector_orn2_i32_s_v_one_use -; GCN: v_bfi_b32 -define amdgpu_kernel void @vector_orn2_i32_s_v_one_use( - ptr addrspace(1) %r0, i32 %s) { +define amdgpu_kernel void @vector_orn2_i32_s_v_one_use(ptr addrspace(1) %r0, i32 %s) { +; GFX6-LABEL: vector_orn2_i32_s_v_one_use: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_bfi_b32 v0, v0, s6, -1 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: vector_orn2_i32_s_v_one_use: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_bfi_b32 v0, v0, s6, -1 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: vector_orn2_i32_s_v_one_use: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_bfi_b32 v2, v0, s2, -1 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: vector_orn2_i32_s_v_one_use: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_bfi_b32 v0, v0, s2, -1 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: vector_orn2_i32_s_v_one_use: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX12-NEXT: v_not_b32_e32 v0, v0 +; GFX12-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX12-NEXT: v_or3_b32 v0, v0, s2, 0xfffffc00 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_endpgm entry: %v = call i32 @llvm.amdgcn.workitem.id.x() #1 %not = xor i32 %v, -1 @@ -83,10 +461,59 @@ entry: ret void } -; GCN-LABEL: {{^}}vector_orn2_i32_v_s_one_use -; GCN: v_bfi_b32 -define amdgpu_kernel void @vector_orn2_i32_v_s_one_use( - ptr addrspace(1) %r0, i32 %s) { +define amdgpu_kernel void @vector_orn2_i32_v_s_one_use(ptr addrspace(1) %r0, i32 %s) { +; GFX6-LABEL: vector_orn2_i32_v_s_one_use: +; GFX6: ; %bb.0: ; %entry +; GFX6-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: v_bfi_b32 v0, s6, v0, -1 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX7-LABEL: vector_orn2_i32_v_s_one_use: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dword s6, s[4:5], 0xb +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: s_mov_b32 s2, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_bfi_b32 v0, s6, v0, -1 +; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: vector_orn2_i32_v_s_one_use: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_bfi_b32 v2, s2, v0, -1 +; GFX8-NEXT: v_mov_b32_e32 v0, 
s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: vector_orn2_i32_v_s_one_use: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dword s2, s[4:5], 0x2c +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_bfi_b32 v0, s2, v0, -1 +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: vector_orn2_i32_v_s_one_use: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_bfi_b32 v0, s2, v0, -1 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_endpgm entry: %v = call i32 @llvm.amdgcn.workitem.id.x() #1 %not = xor i32 %s, -1 From c4054b58d0295764a083e8319eb7782402af75b3 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Thu, 14 May 2026 06:32:01 -0700 Subject: [PATCH 77/95] [VPlan] Simplify BCast with onlyScalarsUsed (#195444) --- .../Transforms/Vectorize/VPlanTransforms.cpp | 4 ++ .../X86/cost-conditional-branches.ll | 48 ++++++++----------- .../X86/pr109581-unused-blend.ll | 5 +- ...licate-recipe-with-only-first-lane-used.ll | 5 +- .../LoopVectorize/multi_early_exit.ll | 3 +- llvm/test/Transforms/LoopVectorize/pr37248.ll | 5 +- 6 files changed, 29 insertions(+), 41 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 4acc343bdb60b..d18488ccf69db 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1680,6 +1680,10 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) { return; } + if (match(Def, m_Broadcast(m_VPValue(X)))) + return Def->replaceUsesWithIf( + X, [Def](const VPUser &U, unsigned) { return 
U.usesScalars(Def); }); + if (isa(Def)) { if (Def->getNumOperands() == 1) { Def->replaceAllUsesWith(Def->getOperand(0)); diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll index 314118a308034..766f88482c1d5 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-conditional-branches.ll @@ -873,10 +873,7 @@ define i64 @test_predicated_udiv(i32 %d, i1 %c) #2 { ; CHECK: vector.main.loop.iter.check: ; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <32 x i1> poison, i1 [[C:%.*]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <32 x i1> [[BROADCAST_SPLATINSERT]], <32 x i1> poison, <32 x i32> zeroinitializer -; CHECK-NEXT: [[TMP0:%.*]] = extractelement <32 x i1> [[BROADCAST_SPLAT]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = xor i1 [[TMP0]], true +; CHECK-NEXT: [[TMP2:%.*]] = xor i1 [[TMP0:%.*]], true ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE62:%.*]] ] @@ -1144,17 +1141,14 @@ define i64 @test_predicated_udiv(i32 %d, i1 %c) #2 { ; CHECK-NEXT: br i1 [[TMP163]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP207:%.*]] = zext <32 x i32> [[TMP161]] to <32 x i64> -; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[C]], <32 x i64> zeroinitializer, <32 x i64> [[TMP207]] +; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP0]], <32 x i64> zeroinitializer, <32 x i64> [[TMP207]] ; CHECK-NEXT: [[TMP164:%.*]] = extractelement <32 x i64> [[PREDPHI]], i64 31 ; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: br i1 false, label 
[[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF13:![0-9]+]] ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 992, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT63:%.*]] = insertelement <8 x i1> poison, i1 [[C]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT64:%.*]] = shufflevector <8 x i1> [[BROADCAST_SPLATINSERT63]], <8 x i1> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP137:%.*]] = extractelement <8 x i1> [[BROADCAST_SPLAT64]], i64 0 -; CHECK-NEXT: [[TMP167:%.*]] = xor i1 [[TMP137]], true +; CHECK-NEXT: [[TMP167:%.*]] = xor i1 [[TMP0]], true ; CHECK-NEXT: [[BROADCAST_SPLATINSERT65:%.*]] = insertelement <8 x i32> poison, i32 [[BC_RESUME_VAL]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT66:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT65]], <8 x i32> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[INDUCTION:%.*]] = add <8 x i32> [[BROADCAST_SPLAT66]], @@ -1164,68 +1158,68 @@ define i64 @test_predicated_udiv(i32 %d, i1 %c) #2 { ; CHECK-NEXT: [[VEC_IND68:%.*]] = phi <8 x i32> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT87:%.*]], [[PRED_UDIV_CONTINUE84]] ] ; CHECK-NEXT: [[TMP166:%.*]] = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> [[VEC_IND68]], <8 x i32> splat (i32 1)) ; CHECK-NEXT: br i1 [[TMP167]], label [[PRED_UDIV_IF69:%.*]], label [[PRED_UDIV_CONTINUE70:%.*]] -; CHECK: pred.udiv.if69: +; CHECK: pred.udiv.if65: ; CHECK-NEXT: [[TMP168:%.*]] = extractelement <8 x i32> [[TMP166]], i64 0 ; CHECK-NEXT: [[TMP169:%.*]] = udiv i32 [[TMP168]], [[D]] ; CHECK-NEXT: [[TMP170:%.*]] = insertelement <8 x i32> poison, i32 [[TMP169]], i64 0 ; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE70]] -; CHECK: pred.udiv.continue70: +; CHECK: pred.udiv.continue66: ; CHECK-NEXT: [[TMP171:%.*]] = phi <8 x i32> [ poison, [[VEC_EPILOG_VECTOR_BODY]] ], [ [[TMP170]], [[PRED_UDIV_IF69]] ] ; CHECK-NEXT: br i1 [[TMP167]], label [[PRED_UDIV_IF71:%.*]], label [[PRED_UDIV_CONTINUE72:%.*]] -; 
CHECK: pred.udiv.if71: +; CHECK: pred.udiv.if67: ; CHECK-NEXT: [[TMP173:%.*]] = extractelement <8 x i32> [[TMP166]], i64 1 ; CHECK-NEXT: [[TMP174:%.*]] = udiv i32 [[TMP173]], [[D]] ; CHECK-NEXT: [[TMP175:%.*]] = insertelement <8 x i32> [[TMP171]], i32 [[TMP174]], i64 1 ; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE72]] -; CHECK: pred.udiv.continue72: +; CHECK: pred.udiv.continue68: ; CHECK-NEXT: [[TMP176:%.*]] = phi <8 x i32> [ [[TMP171]], [[PRED_UDIV_CONTINUE70]] ], [ [[TMP175]], [[PRED_UDIV_IF71]] ] ; CHECK-NEXT: br i1 [[TMP167]], label [[PRED_UDIV_IF73:%.*]], label [[PRED_UDIV_CONTINUE74:%.*]] -; CHECK: pred.udiv.if73: +; CHECK: pred.udiv.if69: ; CHECK-NEXT: [[TMP178:%.*]] = extractelement <8 x i32> [[TMP166]], i64 2 ; CHECK-NEXT: [[TMP179:%.*]] = udiv i32 [[TMP178]], [[D]] ; CHECK-NEXT: [[TMP180:%.*]] = insertelement <8 x i32> [[TMP176]], i32 [[TMP179]], i64 2 ; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE74]] -; CHECK: pred.udiv.continue74: +; CHECK: pred.udiv.continue70: ; CHECK-NEXT: [[TMP181:%.*]] = phi <8 x i32> [ [[TMP176]], [[PRED_UDIV_CONTINUE72]] ], [ [[TMP180]], [[PRED_UDIV_IF73]] ] ; CHECK-NEXT: br i1 [[TMP167]], label [[PRED_UDIV_IF75:%.*]], label [[PRED_UDIV_CONTINUE76:%.*]] -; CHECK: pred.udiv.if75: +; CHECK: pred.udiv.if71: ; CHECK-NEXT: [[TMP183:%.*]] = extractelement <8 x i32> [[TMP166]], i64 3 ; CHECK-NEXT: [[TMP184:%.*]] = udiv i32 [[TMP183]], [[D]] ; CHECK-NEXT: [[TMP185:%.*]] = insertelement <8 x i32> [[TMP181]], i32 [[TMP184]], i64 3 ; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE76]] -; CHECK: pred.udiv.continue76: +; CHECK: pred.udiv.continue72: ; CHECK-NEXT: [[TMP186:%.*]] = phi <8 x i32> [ [[TMP181]], [[PRED_UDIV_CONTINUE74]] ], [ [[TMP185]], [[PRED_UDIV_IF75]] ] ; CHECK-NEXT: br i1 [[TMP167]], label [[PRED_UDIV_IF77:%.*]], label [[PRED_UDIV_CONTINUE78:%.*]] -; CHECK: pred.udiv.if77: +; CHECK: pred.udiv.if73: ; CHECK-NEXT: [[TMP188:%.*]] = extractelement <8 x i32> [[TMP166]], i64 4 ; CHECK-NEXT: [[TMP189:%.*]] = udiv i32 [[TMP188]], [[D]] ; 
CHECK-NEXT: [[TMP190:%.*]] = insertelement <8 x i32> [[TMP186]], i32 [[TMP189]], i64 4 ; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE78]] -; CHECK: pred.udiv.continue78: +; CHECK: pred.udiv.continue74: ; CHECK-NEXT: [[TMP191:%.*]] = phi <8 x i32> [ [[TMP186]], [[PRED_UDIV_CONTINUE76]] ], [ [[TMP190]], [[PRED_UDIV_IF77]] ] ; CHECK-NEXT: br i1 [[TMP167]], label [[PRED_UDIV_IF79:%.*]], label [[PRED_UDIV_CONTINUE80:%.*]] -; CHECK: pred.udiv.if79: +; CHECK: pred.udiv.if75: ; CHECK-NEXT: [[TMP193:%.*]] = extractelement <8 x i32> [[TMP166]], i64 5 ; CHECK-NEXT: [[TMP194:%.*]] = udiv i32 [[TMP193]], [[D]] ; CHECK-NEXT: [[TMP195:%.*]] = insertelement <8 x i32> [[TMP191]], i32 [[TMP194]], i64 5 ; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE80]] -; CHECK: pred.udiv.continue80: +; CHECK: pred.udiv.continue76: ; CHECK-NEXT: [[TMP196:%.*]] = phi <8 x i32> [ [[TMP191]], [[PRED_UDIV_CONTINUE78]] ], [ [[TMP195]], [[PRED_UDIV_IF79]] ] ; CHECK-NEXT: br i1 [[TMP167]], label [[PRED_UDIV_IF81:%.*]], label [[PRED_UDIV_CONTINUE82:%.*]] -; CHECK: pred.udiv.if81: +; CHECK: pred.udiv.if77: ; CHECK-NEXT: [[TMP198:%.*]] = extractelement <8 x i32> [[TMP166]], i64 6 ; CHECK-NEXT: [[TMP199:%.*]] = udiv i32 [[TMP198]], [[D]] ; CHECK-NEXT: [[TMP200:%.*]] = insertelement <8 x i32> [[TMP196]], i32 [[TMP199]], i64 6 ; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE82]] -; CHECK: pred.udiv.continue82: +; CHECK: pred.udiv.continue78: ; CHECK-NEXT: [[TMP201:%.*]] = phi <8 x i32> [ [[TMP196]], [[PRED_UDIV_CONTINUE80]] ], [ [[TMP200]], [[PRED_UDIV_IF81]] ] ; CHECK-NEXT: br i1 [[TMP167]], label [[PRED_UDIV_IF83:%.*]], label [[PRED_UDIV_CONTINUE84]] -; CHECK: pred.udiv.if83: +; CHECK: pred.udiv.if79: ; CHECK-NEXT: [[TMP203:%.*]] = extractelement <8 x i32> [[TMP166]], i64 7 ; CHECK-NEXT: [[TMP204:%.*]] = udiv i32 [[TMP203]], [[D]] ; CHECK-NEXT: [[TMP205:%.*]] = insertelement <8 x i32> [[TMP201]], i32 [[TMP204]], i64 7 ; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE84]] -; CHECK: pred.udiv.continue84: +; CHECK: 
pred.udiv.continue80: ; CHECK-NEXT: [[TMP206:%.*]] = phi <8 x i32> [ [[TMP201]], [[PRED_UDIV_CONTINUE82]] ], [ [[TMP205]], [[PRED_UDIV_IF83]] ] ; CHECK-NEXT: [[INDEX_NEXT86]] = add nuw i32 [[INDEX67]], 8 ; CHECK-NEXT: [[VEC_IND_NEXT87]] = add <8 x i32> [[VEC_IND68]], splat (i32 8) @@ -1233,7 +1227,7 @@ define i64 @test_predicated_udiv(i32 %d, i1 %c) #2 { ; CHECK-NEXT: br i1 [[TMP208]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[TMP210:%.*]] = zext <8 x i32> [[TMP206]] to <8 x i64> -; CHECK-NEXT: [[PREDPHI85:%.*]] = select i1 [[C]], <8 x i64> zeroinitializer, <8 x i64> [[TMP210]] +; CHECK-NEXT: [[PREDPHI85:%.*]] = select i1 [[TMP0]], <8 x i64> zeroinitializer, <8 x i64> [[TMP210]] ; CHECK-NEXT: [[TMP209:%.*]] = extractelement <8 x i64> [[PREDPHI85]], i64 7 ; CHECK-NEXT: br i1 false, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: @@ -1241,7 +1235,7 @@ define i64 @test_predicated_udiv(i32 %d, i1 %c) #2 { ; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] ; CHECK: loop.header: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL88]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] -; CHECK-NEXT: br i1 [[C]], label [[LOOP_LATCH]], label [[THEN:%.*]] +; CHECK-NEXT: br i1 [[TMP0]], label [[LOOP_LATCH]], label [[THEN:%.*]] ; CHECK: then: ; CHECK-NEXT: [[CALL:%.*]] = tail call i32 @llvm.usub.sat.i32(i32 [[IV]], i32 1) ; CHECK-NEXT: [[UDIV:%.*]] = udiv i32 [[CALL]], [[D]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr109581-unused-blend.ll b/llvm/test/Transforms/LoopVectorize/X86/pr109581-unused-blend.ll index a5692ae9fef0d..0323a6771949d 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr109581-unused-blend.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr109581-unused-blend.ll @@ -12,12 +12,9 @@ define i32 @unused_blend_after_unrolling(ptr %p, i32 %a, i1 %c.1, i16 %x, i16 %y ; CHECK-NEXT: [[C:%.*]] = icmp eq 
i32 [[A]], 0 ; CHECK-NEXT: br label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C_1]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT16:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT17:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT16]], <4 x i1> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i1> [[BROADCAST_SPLAT]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = xor i1 [[TMP0]], true +; CHECK-NEXT: [[TMP2:%.*]] = xor i1 [[C_1]], true ; CHECK-NEXT: [[TMP22:%.*]] = xor <4 x i1> [[BROADCAST_SPLAT17]], splat (i1 true) ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: diff --git a/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll b/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll index adb5453d0404a..485dc0e16952a 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/replicate-recipe-with-only-first-lane-used.ll @@ -71,10 +71,7 @@ define void @replicate_udiv_with_only_first_lane_used2(i32 %x, ptr %dst, i64 %d) ; CHECK-NEXT: [[C:%.*]] = icmp eq i32 [[X]], 10 ; CHECK-NEXT: br label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i1> poison, i1 [[C]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i1> [[BROADCAST_SPLATINSERT]], <4 x i1> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x i1> [[BROADCAST_SPLAT]], i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = xor i1 [[TMP0]], true +; CHECK-NEXT: [[TMP1:%.*]] = xor i1 [[C]], true ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: 
[[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_UDIV_CONTINUE14:.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/multi_early_exit.ll b/llvm/test/Transforms/LoopVectorize/multi_early_exit.ll index 46faf8168bd38..e452ccd7f05e5 100644 --- a/llvm/test/Transforms/LoopVectorize/multi_early_exit.ll +++ b/llvm/test/Transforms/LoopVectorize/multi_early_exit.ll @@ -151,8 +151,7 @@ define i64 @early_exit_with_live_in_condition(i1 %cond) { ; CHECK: middle.block: ; CHECK-NEXT: br label [[LOOP_END:%.*]] ; CHECK: vector.early.exit.check: -; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x i1> [[BROADCAST_SPLAT]], i64 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[VECTOR_EARLY_EXIT_0:%.*]], label [[VECTOR_EARLY_EXIT_1:%.*]] +; CHECK-NEXT: br i1 [[COND]], label [[VECTOR_EARLY_EXIT_0:%.*]], label [[VECTOR_EARLY_EXIT_1:%.*]] ; CHECK: vector.early.exit.1: ; CHECK-NEXT: br label [[LOOP_END]] ; CHECK: vector.early.exit.0: diff --git a/llvm/test/Transforms/LoopVectorize/pr37248.ll b/llvm/test/Transforms/LoopVectorize/pr37248.ll index 03e3d1f222399..c2e26c5d66877 100644 --- a/llvm/test/Transforms/LoopVectorize/pr37248.ll +++ b/llvm/test/Transforms/LoopVectorize/pr37248.ll @@ -34,11 +34,8 @@ define void @f1(ptr noalias %b, i1 %c, i32 %start) { ; CHECK: [[VECTOR_PH]]: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP1]], 2 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP1]], [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i1> poison, i1 [[C]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i1> [[BROADCAST_SPLATINSERT]], <2 x i1> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = sub i32 [[START]], [[N_VEC]] -; CHECK-NEXT: [[TMP11:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i64 0 -; CHECK-NEXT: [[TMP12:%.*]] = xor i1 [[TMP11]], true +; CHECK-NEXT: [[TMP12:%.*]] = xor i1 [[C]], true ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, 
%[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE3:.*]] ] From ca3a21085580454b5687e1f93ad786116ce2ba11 Mon Sep 17 00:00:00 2001 From: David Green Date: Thu, 14 May 2026 14:39:29 +0100 Subject: [PATCH 78/95] [AArch64][GlobalISel] Add always legal action builders. (#197238) This defined some always legal actions, removing our dependency on the Legacy ruleset in aarch64. --- .../AArch64/GISel/AArch64LegalizerInfo.cpp | 5 +++++ .../GlobalISel/legalizer-info-validation.mir | 17 +++++++++-------- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 4c7abbfb871af..0db0c937e7c8a 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -1509,6 +1509,11 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) getActionDefinitionsBuilder({G_SCMP, G_UCMP}).lower(); + getActionDefinitionsBuilder({G_INTRINSIC, G_INTRINSIC_W_SIDE_EFFECTS}) + .alwaysLegal(); + getActionDefinitionsBuilder(G_FENCE).alwaysLegal(); + getActionDefinitionsBuilder(G_INVOKE_REGION_START).alwaysLegal(); + getLegacyLegalizerInfo().computeTables(); verify(*ST.getInstrInfo()); } diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir index 70dbeb7d49f65..bd7681542a0fc 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir @@ -319,8 +319,8 @@ # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined # DEBUG-NEXT: G_FENCE (opcode {{[0-9]+}}): 0 type indices -# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined -# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. 
type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_PREFETCH (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected @@ -331,14 +331,15 @@ # DEBUG-NEXT: .. the first uncovered type index: 1, OK # DEBUG-NEXT: .. the first uncovered imm index: 0, OK # DEBUG-NEXT: G_INVOKE_REGION_START (opcode {{[0-9]+}}): 0 type indices, 0 imm indices -# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined -# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_INTRINSIC (opcode {{[0-9]+}}): 0 type indices, 0 imm indices -# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined -# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_INTRINSIC_W_SIDE_EFFECTS (opcode {{[0-9]+}}): 0 type indices, 0 imm indices -# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined -# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} +# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_INTRINSIC_CONVERGENT (opcode {{[0-9]+}}): 0 type indices, 0 imm indices # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined # DEBUG-NEXT: .. 
imm index coverage check SKIPPED: no rules defined From 7c7ed927171c82a1ea789c8264477bf0b54da69b Mon Sep 17 00:00:00 2001 From: Matheus Izvekov Date: Thu, 14 May 2026 10:47:19 -0300 Subject: [PATCH 79/95] [clang] CTAD: fix transformation of template template parameters (#197611) This fixes the CTAD template parameter transforms so they produce template template parameters which have correct depth for their own template parameters. This also stops calling SubstDecl directly on the non-type template parameters, so that a template parameter with correct position is produced directly, instead of manually fixing that up later. This helps #197598 by making it possible to add assertions that the positions are always valid. --- clang/lib/Sema/SemaTemplateDeductionGuide.cpp | 174 +++++++++++++++--- clang/test/AST/ast-dump-ctad-alias.cpp | 2 +- clang/test/SemaTemplate/deduction-guide.cpp | 4 +- 3 files changed, 154 insertions(+), 26 deletions(-) diff --git a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp index c6502105b6acd..fa740d5581e5f 100644 --- a/clang/lib/Sema/SemaTemplateDeductionGuide.cpp +++ b/clang/lib/Sema/SemaTemplateDeductionGuide.cpp @@ -311,10 +311,11 @@ buildDeductionGuide(Sema &SemaRef, TemplateDecl *OriginalTemplate, } // Transform a given template type parameter `TTP`. -TemplateTypeParmDecl *transformTemplateTypeParam( - Sema &SemaRef, DeclContext *DC, TemplateTypeParmDecl *TTP, - MultiLevelTemplateArgumentList &Args, unsigned NewDepth, unsigned NewIndex, - bool EvaluateConstraint) { +TemplateTypeParmDecl * +transformTemplateParam(Sema &SemaRef, DeclContext *DC, + TemplateTypeParmDecl *TTP, + MultiLevelTemplateArgumentList &Args, unsigned NewDepth, + unsigned NewIndex, bool EvaluateConstraint) { // TemplateTypeParmDecl's index cannot be changed after creation, so // substitute it directly. 
auto *NewTTP = TemplateTypeParmDecl::Create( @@ -335,20 +336,131 @@ TemplateTypeParmDecl *transformTemplateTypeParam( SemaRef.CurrentInstantiationScope->InstantiatedLocal(TTP, NewTTP); return NewTTP; } -// Similar to above, but for non-type template or template template parameters. -template -NonTypeTemplateOrTemplateTemplateParmDecl * + +NonTypeTemplateParmDecl * transformTemplateParam(Sema &SemaRef, DeclContext *DC, - NonTypeTemplateOrTemplateTemplateParmDecl *OldParam, - MultiLevelTemplateArgumentList &Args, unsigned NewIndex, - unsigned NewDepth) { - // Ask the template instantiator to do the heavy lifting for us, then adjust - // the index of the parameter once it's done. - auto *NewParam = cast( - SemaRef.SubstDecl(OldParam, DC, Args)); - NewParam->setPosition(NewIndex); - NewParam->setDepth(NewDepth); - return NewParam; + NonTypeTemplateParmDecl *TTP, unsigned NewDepth, + unsigned NewIndex, + MultiLevelTemplateArgumentList &Args) { + NonTypeTemplateParmDecl *NewTTP; + if (TTP->isExpandedParameterPack()) { + SmallVector ExpandedTypeSourceInfos( + TTP->getNumExpansionTypes()); + SmallVector ExpandedTypes(TTP->getNumExpansionTypes()); + for (unsigned I = 0, N = TTP->getNumExpansionTypes(); I != N; ++I) { + TypeSourceInfo *NewTSI = + SemaRef.SubstType(TTP->getExpansionTypeSourceInfo(I), Args, + TTP->getLocation(), TTP->getDeclName()); + assert(NewTSI); + + QualType NewT = + SemaRef.CheckNonTypeTemplateParameterType(NewTSI, TTP->getLocation()); + assert(!NewT.isNull()); + + ExpandedTypeSourceInfos[I] = NewTSI; + ExpandedTypes[I] = NewT; + } + NewTTP = NonTypeTemplateParmDecl::Create( + SemaRef.Context, DC, TTP->getBeginLoc(), TTP->getLocation(), NewDepth, + NewIndex, TTP->getIdentifier(), TTP->getType(), + TTP->getTypeSourceInfo(), ExpandedTypes, ExpandedTypeSourceInfos); + } else { + TypeSourceInfo *NewTSI = SemaRef.SubstType( + TTP->getTypeSourceInfo(), Args, TTP->getLocation(), TTP->getDeclName()); + assert(NewTSI); + + QualType NewT = + 
SemaRef.CheckNonTypeTemplateParameterType(NewTSI, TTP->getLocation()); + assert(!NewT.isNull()); + + NewTTP = NonTypeTemplateParmDecl::Create( + SemaRef.Context, DC, TTP->getBeginLoc(), TTP->getLocation(), NewDepth, + NewIndex, TTP->getIdentifier(), NewT, TTP->isParameterPack(), NewTSI); + } + + if (TypeSourceInfo *TSI = TTP->getTypeSourceInfo(); + AutoTypeLoc AutoLoc = TSI->getTypeLoc().getContainedAutoTypeLoc()) { + if (AutoLoc.isConstrained()) { + SourceLocation EllipsisLoc; + if (TTP->isExpandedParameterPack()) + EllipsisLoc = + TSI->getTypeLoc().getAs().getEllipsisLoc(); + else if (auto *Constraint = dyn_cast_if_present( + TTP->getPlaceholderTypeConstraint())) + EllipsisLoc = Constraint->getEllipsisLoc(); + // Note: We attach the non-instantiated constraint here, so that it can be + // instantiated relative to the top level, like all our other + // constraints. + if (SemaRef.AttachTypeConstraint(AutoLoc, /*NewConstrainedParm=*/NewTTP, + /*OrigConstrainedParm=*/TTP, + EllipsisLoc)) + llvm_unreachable("unexpected failure attaching type constraint"); + } + } + + NewTTP->setAccess(AS_public); + NewTTP->setImplicit(TTP->isImplicit()); + + if (TTP->hasDefaultArgument()) { + TemplateArgumentLoc InstantiatedDefaultArg; + if (!SemaRef.SubstTemplateArgument( + TTP->getDefaultArgument(), Args, InstantiatedDefaultArg, + TTP->getDefaultArgumentLoc(), TTP->getDeclName())) + NewTTP->setDefaultArgument(SemaRef.Context, InstantiatedDefaultArg); + } + + SemaRef.CurrentInstantiationScope->InstantiatedLocal(TTP, NewTTP); + return NewTTP; +} + +TemplateParameterList * +transformTemplateParameters(Sema &SemaRef, DeclContext *DC, + TemplateParameterList *TPL, + MultiLevelTemplateArgumentList &Args, + unsigned NewDepth, bool EvaluateConstraint); + +TemplateTemplateParmDecl * +transformTemplateParam(Sema &SemaRef, DeclContext *DC, + TemplateTemplateParmDecl *TTP, unsigned NewDepth, + unsigned NewIndex, MultiLevelTemplateArgumentList &Args, + bool EvaluateConstraint) { + 
TemplateTemplateParmDecl *NewTTP; + if (TTP->isExpandedParameterPack()) { + SmallVector ExpandedTPLs( + TTP->getNumExpansionTemplateParameters()); + for (unsigned I = 0, N = TTP->getNumExpansionTemplateParameters(); I != N; + ++I) + ExpandedTPLs[I] = transformTemplateParameters( + SemaRef, DC, TTP->getExpansionTemplateParameters(I), Args, + NewDepth + 1, EvaluateConstraint); + NewTTP = TemplateTemplateParmDecl::Create( + SemaRef.Context, DC, TTP->getLocation(), NewDepth, NewIndex, + TTP->getIdentifier(), TTP->templateParameterKind(), + TTP->wasDeclaredWithTypename(), TTP->getTemplateParameters(), + ExpandedTPLs); + } else { + TemplateParameterList *NewTPL = + transformTemplateParameters(SemaRef, DC, TTP->getTemplateParameters(), + Args, NewDepth + 1, EvaluateConstraint); + NewTTP = TemplateTemplateParmDecl::Create( + SemaRef.Context, DC, TTP->getLocation(), NewDepth, NewIndex, + TTP->isParameterPack(), TTP->getIdentifier(), + TTP->templateParameterKind(), TTP->wasDeclaredWithTypename(), NewTPL); + } + + NewTTP->setAccess(AS_public); + NewTTP->setImplicit(TTP->isImplicit()); + + if (TTP->hasDefaultArgument()) { + TemplateArgumentLoc InstantiatedDefaultArg; + if (!SemaRef.SubstTemplateArgument( + TTP->getDefaultArgument(), Args, InstantiatedDefaultArg, + TTP->getDefaultArgumentLoc(), TTP->getDeclName())) + NewTTP->setDefaultArgument(SemaRef.Context, InstantiatedDefaultArg); + } + + SemaRef.CurrentInstantiationScope->InstantiatedLocal(TTP, NewTTP); + return NewTTP; } NamedDecl *transformTemplateParameter(Sema &SemaRef, DeclContext *DC, @@ -357,16 +469,32 @@ NamedDecl *transformTemplateParameter(Sema &SemaRef, DeclContext *DC, unsigned NewIndex, unsigned NewDepth, bool EvaluateConstraint = true) { if (auto *TTP = dyn_cast(TemplateParam)) - return transformTemplateTypeParam( - SemaRef, DC, TTP, Args, NewDepth, NewIndex, - /*EvaluateConstraint=*/EvaluateConstraint); - if (auto *TTP = dyn_cast(TemplateParam)) - return transformTemplateParam(SemaRef, DC, TTP, Args, 
NewIndex, NewDepth); + return transformTemplateParam(SemaRef, DC, TTP, Args, NewDepth, NewIndex, + EvaluateConstraint); if (auto *NTTP = dyn_cast(TemplateParam)) - return transformTemplateParam(SemaRef, DC, NTTP, Args, NewIndex, NewDepth); + return transformTemplateParam(SemaRef, DC, NTTP, NewDepth, NewIndex, Args); + if (auto *TTP = dyn_cast(TemplateParam)) + return transformTemplateParam(SemaRef, DC, TTP, NewDepth, NewIndex, Args, + EvaluateConstraint); llvm_unreachable("Unhandled template parameter types"); } +TemplateParameterList * +transformTemplateParameters(Sema &SemaRef, DeclContext *DC, + TemplateParameterList *TPL, + MultiLevelTemplateArgumentList &Args, + unsigned NewDepth, bool EvaluateConstraint) { + SmallVector Params(TPL->size()); + for (unsigned I = 0, E = TPL->size(); I < E; ++I) { + Params[I] = transformTemplateParameter(SemaRef, DC, TPL->getParam(I), Args, + /*NewIndex=*/I, NewDepth, + EvaluateConstraint); + } + return TemplateParameterList::Create( + SemaRef.Context, TPL->getTemplateLoc(), TPL->getLAngleLoc(), Params, + TPL->getRAngleLoc(), TPL->getRequiresClause()); +} + /// Transform to convert portions of a constructor declaration into the /// corresponding deduction guide, per C++1z [over.match.class.deduct]p1. 
struct ConvertConstructorToDeductionGuideTransform { diff --git a/clang/test/AST/ast-dump-ctad-alias.cpp b/clang/test/AST/ast-dump-ctad-alias.cpp index ea4b12da8ef78..4ca84dc2d1828 100644 --- a/clang/test/AST/ast-dump-ctad-alias.cpp +++ b/clang/test/AST/ast-dump-ctad-alias.cpp @@ -151,7 +151,7 @@ ATemplatedClass2 test2(list); // // CHECK: FunctionTemplateDecl {{.*}} // CHECK-NEXT: |-TemplateTemplateParmDecl {{.*}} depth 0 index 0 T2 -// CHECK-NEXT: | `-TemplateTypeParmDecl {{.*}} typename depth 0 index 0 +// CHECK-NEXT: | `-TemplateTypeParmDecl {{.*}} typename depth 1 index 0 // CHECK-NEXT: |-TypeTraitExpr {{.*}} 'bool' __is_deducible } // namespace GH90209 diff --git a/clang/test/SemaTemplate/deduction-guide.cpp b/clang/test/SemaTemplate/deduction-guide.cpp index 9350a9a7c0c81..3715a496faa63 100644 --- a/clang/test/SemaTemplate/deduction-guide.cpp +++ b/clang/test/SemaTemplate/deduction-guide.cpp @@ -821,11 +821,11 @@ BB b{}; // CHECK-LABEL: Dumping GH133132::: // CHECK-NEXT: FunctionTemplateDecl {{.+}} implicit // CHECK-NEXT: |-TemplateTemplateParmDecl {{.+}} depth 0 index 0 _X -// CHECK-NEXT: | |-TemplateTypeParmDecl {{.+}} class depth 0 index 0 +// CHECK-NEXT: | |-TemplateTypeParmDecl {{.+}} class depth 1 index 0 // CHECK-NEXT: | `-TemplateArgument {{.+}} template 'A':'GH133132::A' qualified // CHECK-NEXT: | `-ClassTemplateDecl {{.+}} A // CHECK-NEXT: |-TemplateTemplateParmDecl {{.+}} depth 0 index 1 _Y -// CHECK-NEXT: | |-TemplateTypeParmDecl {{.+}} class depth 0 index 0 +// CHECK-NEXT: | |-TemplateTypeParmDecl {{.+}} class depth 1 index 0 // CHECK-NEXT: | `-TemplateArgument {{.+}} template '_X':'template-parameter-0-0' // CHECK-NEXT: | `-TemplateTemplateParmDecl {{.+}} depth 0 index 0 _X // CHECK-NEXT: |-TypeTraitExpr {{.+}} 'bool' __is_deducible From e42de9d83b56bb4a1f0ec55e55c95fe5190d85b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20Benics?= Date: Thu, 14 May 2026 14:55:33 +0100 Subject: [PATCH 80/95] [docs] Add the Clang Static Analysis WG to 
sync-ups (#197679) See https://discourse.llvm.org/t/rfc-forming-a-static-analysis-working-group-in-the-clang-ecosystem/90719/17 --- llvm/docs/GettingInvolved.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/llvm/docs/GettingInvolved.rst b/llvm/docs/GettingInvolved.rst index bb54d5be8dc7a..9d6efc7673edf 100644 --- a/llvm/docs/GettingInvolved.rst +++ b/llvm/docs/GettingInvolved.rst @@ -231,6 +231,11 @@ what to add to your calendar invite. - Every 2nd Tuesday of the month - `gcal `__ - `Meeting details/agenda `__ + * - Clang Static Analysis Working Group + - Every 2 weeks on Tuesdays + - `ics `__ + `gcal `__ + - `Meeting notes `__ For event owners, our Discord bot also supports sending automated announcements From 16b2ef32bef0e0f32ca89eca338d20b63df335d1 Mon Sep 17 00:00:00 2001 From: LU-JOHN Date: Thu, 14 May 2026 09:04:59 -0500 Subject: [PATCH 81/95] [CodeGen] Debug insns must not affect liveness analysis (#193104) Register references in debug instructions can affect LiveRegUnits analysis. Skip over debug instructions. Tests in this PR would fail due to calls to LiveRegUnits::stepBackward in RegisterScavenging, DeadMachineInstructionElim, and AArch64InstrInfo.cpp getOutlinableRanges(). Other call-sites to stepBackward may also pass debug instructions to LiveRegUnits::stepBackward, but LIT testing did not fail when -debugify-and-strip-all-safe was enabled by default. 
--------- Signed-off-by: John Lu --- llvm/include/llvm/CodeGen/MachineOutliner.h | 3 ++- .../CodeGen/DeadMachineInstructionElim.cpp | 2 ++ llvm/lib/CodeGen/LiveRegUnits.cpp | 3 +++ llvm/lib/CodeGen/RegisterScavenging.cpp | 3 ++- llvm/lib/CodeGen/RemoveLoadsIntoFakeUses.cpp | 3 ++- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 6 +++-- .../lib/Target/SystemZ/SystemZShortenInst.cpp | 4 +-- llvm/lib/Target/X86/X86FixupBWInsts.cpp | 3 ++- .../AArch64/machine-outliner-calls.mir | 1 + ...debug-independence-dead-mi-elimination.mir | 25 +++++++++++++++++++ .../eliminate-frame-index-v-add-co-u32.mir | 1 + llvm/test/CodeGen/ARM/flag-crash.ll | 1 + llvm/test/CodeGen/Thumb2/ldr-str-imm12.ll | 1 + llvm/test/CodeGen/WebAssembly/simd-arith.ll | 1 + 14 files changed, 49 insertions(+), 8 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/debug-independence-dead-mi-elimination.mir diff --git a/llvm/include/llvm/CodeGen/MachineOutliner.h b/llvm/include/llvm/CodeGen/MachineOutliner.h index 66cab3d652104..2f38017f10274 100644 --- a/llvm/include/llvm/CodeGen/MachineOutliner.h +++ b/llvm/include/llvm/CodeGen/MachineOutliner.h @@ -89,7 +89,8 @@ struct Candidate { // outlining candidate. for (auto &MI : make_range(MBB->rbegin(), (MachineBasicBlock::reverse_iterator)begin())) - FromEndOfBlockToStartOfSeq.stepBackward(MI); + if (!MI.isDebugInstr()) + FromEndOfBlockToStartOfSeq.stepBackward(MI); } /// Populate InSeq with liveness information. diff --git a/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp b/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp index 2c58b014d2399..67f924816e607 100644 --- a/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp +++ b/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp @@ -105,6 +105,8 @@ bool DeadMachineInstructionElimImpl::eliminateDeadMI( // Now scan the instructions and delete dead ones, tracking physreg // liveness as we go. 
for (MachineInstr &MI : make_early_inc_range(reverse(*MBB))) { + if (MI.isDebugInstr()) + continue; // If the instruction is dead, delete it! if (MI.isDead(*MRI, &LivePhysRegs)) { if (MI.isPHI()) { diff --git a/llvm/lib/CodeGen/LiveRegUnits.cpp b/llvm/lib/CodeGen/LiveRegUnits.cpp index 348ccd85f4c45..6ff4ef1c95e41 100644 --- a/llvm/lib/CodeGen/LiveRegUnits.cpp +++ b/llvm/lib/CodeGen/LiveRegUnits.cpp @@ -42,6 +42,9 @@ void LiveRegUnits::addRegsInMask(const uint32_t *RegMask) { } void LiveRegUnits::stepBackward(const MachineInstr &MI) { + assert(!MI.isDebugInstr() && + "Debug instructions must not affect liveness calculation"); + // Remove defined registers and regmask kills from the set. for (const MachineOperand &MOP : MI.operands()) { if (MOP.isReg()) { diff --git a/llvm/lib/CodeGen/RegisterScavenging.cpp b/llvm/lib/CodeGen/RegisterScavenging.cpp index ff279bebd906b..1b6a0880eac9f 100644 --- a/llvm/lib/CodeGen/RegisterScavenging.cpp +++ b/llvm/lib/CodeGen/RegisterScavenging.cpp @@ -80,7 +80,8 @@ void RegScavenger::enterBasicBlockEnd(MachineBasicBlock &MBB) { void RegScavenger::backward() { const MachineInstr &MI = *--MBBI; - LiveUnits.stepBackward(MI); + if (!MI.isDebugInstr()) + LiveUnits.stepBackward(MI); // Expire scavenge spill frameindex uses. 
for (ScavengedInfo &I : Scavenged) { diff --git a/llvm/lib/CodeGen/RemoveLoadsIntoFakeUses.cpp b/llvm/lib/CodeGen/RemoveLoadsIntoFakeUses.cpp index 98d531c9982a0..8888bf792b279 100644 --- a/llvm/lib/CodeGen/RemoveLoadsIntoFakeUses.cpp +++ b/llvm/lib/CodeGen/RemoveLoadsIntoFakeUses.cpp @@ -187,7 +187,8 @@ bool RemoveLoadsIntoFakeUses::run(MachineFunction &MF) { RegFakeUses.erase(&FakeUse); } } - LivePhysRegs.stepBackward(MI); + if (!MI.isDebugInstr()) + LivePhysRegs.stepBackward(MI); } } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 46bb9649b12d7..31c83cdb08c5c 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -10640,7 +10640,8 @@ AArch64InstrInfo::getOutlinableRanges(MachineBasicBlock &MBB, // SKIP: auto FirstPossibleEndPt = MBB.instr_rbegin(); for (; FirstPossibleEndPt != MBB.instr_rend(); ++FirstPossibleEndPt) { - LRU.stepBackward(*FirstPossibleEndPt); + if (!FirstPossibleEndPt->isDebugInstr()) + LRU.stepBackward(*FirstPossibleEndPt); // Update flags that impact how we outline across the entire block, // regardless of safety. UpdateWholeMBBFlags(*FirstPossibleEndPt); @@ -10656,7 +10657,8 @@ AArch64InstrInfo::getOutlinableRanges(MachineBasicBlock &MBB, // are dead (if there is any such point). Begin partitioning the MBB into // ranges. 
for (auto &MI : make_range(FirstPossibleEndPt, MBB.instr_rend())) { - LRU.stepBackward(MI); + if (!MI.isDebugInstr()) + LRU.stepBackward(MI); UpdateWholeMBBFlags(MI); if (!AreAllUnsafeRegsDead()) { SaveRangeIfNonEmpty(); diff --git a/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp b/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp index 96a41487c87e3..991b2e45f1297 100644 --- a/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp +++ b/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp @@ -373,8 +373,8 @@ bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) { break; } } - - LiveRegs.stepBackward(MI); + if (!MI.isDebugInstr()) + LiveRegs.stepBackward(MI); } return Changed; diff --git a/llvm/lib/Target/X86/X86FixupBWInsts.cpp b/llvm/lib/Target/X86/X86FixupBWInsts.cpp index ffe3510af61ac..18819efd35e27 100644 --- a/llvm/lib/Target/X86/X86FixupBWInsts.cpp +++ b/llvm/lib/Target/X86/X86FixupBWInsts.cpp @@ -459,7 +459,8 @@ void X86FixupBWInstImpl::processBasicBlock(MachineFunction &MF, MIReplacements.push_back(std::make_pair(&MI, NewMI)); // We're done with this instruction, update liveness for the next one. 
- LiveUnits.stepBackward(MI); + if (!MI.isDebugInstr()) + LiveUnits.stepBackward(MI); } while (!MIReplacements.empty()) { diff --git a/llvm/test/CodeGen/AArch64/machine-outliner-calls.mir b/llvm/test/CodeGen/AArch64/machine-outliner-calls.mir index 700a5b228122f..ea995b002e976 100644 --- a/llvm/test/CodeGen/AArch64/machine-outliner-calls.mir +++ b/llvm/test/CodeGen/AArch64/machine-outliner-calls.mir @@ -1,4 +1,5 @@ # RUN: llc -mtriple=aarch64--- -run-pass=prologepilog -run-pass=machine-outliner -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -mtriple=aarch64--- -run-pass=prologepilog -run-pass=machine-outliner -verify-machineinstrs %s -o - -debugify-and-strip-all-safe | FileCheck %s # RUN: llc -mtriple=aarch64-pc-windows-msvc -run-pass=prologepilog -run-pass=machine-outliner -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=WINDOWS --- | define void @baz() #0 { diff --git a/llvm/test/CodeGen/AMDGPU/debug-independence-dead-mi-elimination.mir b/llvm/test/CodeGen/AMDGPU/debug-independence-dead-mi-elimination.mir new file mode 100644 index 0000000000000..a35eb170ea937 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/debug-independence-dead-mi-elimination.mir @@ -0,0 +1,25 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=dead-mi-elimination -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=dead-mi-elimination -o - %s -debugify-and-strip-all-safe | FileCheck %s + +# Ensure that references in debug instructions to register results in dead +# instructions do not prevent DCE. + +--- +name: func +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; GCN-LABEL: name: func + ; GCN: liveins: $vgpr0 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: SI_RETURN implicit $vgpr0 + ; CHECK-LABEL: name: func + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: SI_RETURN implicit $vgpr0 + $sgpr0 = S_MOV_B32 0 + SI_RETURN implicit $vgpr0 +... 
+ diff --git a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir index 79486d56c55ca..c7dcfe1e5754a 100644 --- a/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir +++ b/llvm/test/CodeGen/AMDGPU/eliminate-frame-index-v-add-co-u32.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=MUBUFW64,GFX7 %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs -run-pass=prologepilog -debugify-and-strip-all-safe %s -o - | FileCheck -check-prefixes=MUBUFW64,GFX7 %s # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=MUBUFW64,GFX8 %s # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=MUBUFW64,GFX900 %s # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs -run-pass=prologepilog %s -o - | FileCheck -check-prefixes=MUBUFW64,GFX90A %s diff --git a/llvm/test/CodeGen/ARM/flag-crash.ll b/llvm/test/CodeGen/ARM/flag-crash.ll index 83a6b2470c51a..056f097fd17f8 100644 --- a/llvm/test/CodeGen/ARM/flag-crash.ll +++ b/llvm/test/CodeGen/ARM/flag-crash.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -O3 -mtriple=thumbv7-apple-darwin10 -mcpu=cortex-a8 -relocation-model=pic +; RUN: llc < %s -O3 -mtriple=thumbv7-apple-darwin10 -mcpu=cortex-a8 -relocation-model=pic -debugify-and-strip-all-safe ; PR7484 %struct.gs_matrix = type { float, i32, float, i32, float, i32, float, i32, float, i32, float, i32 } diff --git a/llvm/test/CodeGen/Thumb2/ldr-str-imm12.ll b/llvm/test/CodeGen/Thumb2/ldr-str-imm12.ll index 1d177b0a4ebbf..ce039cb1d5452 100644 --- a/llvm/test/CodeGen/Thumb2/ldr-str-imm12.ll +++ b/llvm/test/CodeGen/Thumb2/ldr-str-imm12.ll @@ -1,5 +1,6 
@@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=thumbv7-apple-darwin -arm-atomic-cfg-tidy=0 -mcpu=cortex-a8 -relocation-model=pic -frame-pointer=all | FileCheck %s +; RUN: llc < %s -mtriple=thumbv7-apple-darwin -arm-atomic-cfg-tidy=0 -mcpu=cortex-a8 -relocation-model=pic -frame-pointer=all -debugify-and-strip-all-safe | FileCheck %s ; rdar://7352504 %0 = type { i16, i8, i8 } diff --git a/llvm/test/CodeGen/WebAssembly/simd-arith.ll b/llvm/test/CodeGen/WebAssembly/simd-arith.ll index cb06ee84ec99c..5fcbd2bee5861 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-arith.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-arith.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s --check-prefix=SIMD128 +; RUN: llc < %s -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 -debugify-and-strip-all-safe | FileCheck %s --check-prefix=SIMD128 ; RUN: llc < %s -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 -fast-isel | FileCheck %s --check-prefix=SIMD128-FAST ; RUN: llc < %s -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers | FileCheck %s --check-prefix=NO-SIMD128 ; RUN: llc < %s -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -fast-isel | FileCheck %s --check-prefix=NO-SIMD128-FAST From d2de1d2664073d5d32580e52760589f1a2d15fc5 Mon Sep 17 00:00:00 2001 From: Maryam Moghadas Date: Thu, 14 May 2026 10:06:41 -0400 Subject: [PATCH 82/95] [PowerPC] Update base crypto builtins and intrinsics (#197017) Update the base crypto builtins and LLVM intrinsics to drop the mma_ 
prefix. Also fix the builtin definitions for dmsha2hash, dmsha3hash, and dmxxshapad to use the correct immediate constraints. --- clang/include/clang/Basic/BuiltinsPPC.def | 30 ++++----- clang/lib/CodeGen/TargetBuiltins/PPC.cpp | 2 +- clang/test/CodeGen/PowerPC/builtins-ppc-dmf.c | 66 +++++++++---------- .../CodeGen/PowerPC/ppc-dmf-mma-builtin-err.c | 12 ++-- .../Sema/PowerPC/ppc-dmf-mma-builtin-err.c | 6 +- clang/test/Sema/builtins-ppc-crypto.c | 15 +++++ llvm/include/llvm/IR/IntrinsicsPowerPC.td | 6 +- llvm/lib/Target/PowerPC/PPCInstrFutureMMA.td | 6 +- llvm/test/CodeGen/PowerPC/dmrp-spill.ll | 6 +- llvm/test/CodeGen/PowerPC/mmaplus-crypto.ll | 34 +++++----- 10 files changed, 99 insertions(+), 84 deletions(-) diff --git a/clang/include/clang/Basic/BuiltinsPPC.def b/clang/include/clang/Basic/BuiltinsPPC.def index 0cb48c6d4fe93..7970163c15f72 100644 --- a/clang/include/clang/Basic/BuiltinsPPC.def +++ b/clang/include/clang/Basic/BuiltinsPPC.def @@ -1131,35 +1131,35 @@ UNALIASED_CUSTOM_BUILTIN(disassemble_dmr, "vv*W1024*", false, UNALIASED_CUSTOM_BUILTIN(build_dmr, "vW1024*VVVVVVVV", false, "mma,isa-future-instructions") -UNALIASED_CUSTOM_BUILTIN(mma_dmsha2hash, "vW1024*W1024*Ii", true, +UNALIASED_CUSTOM_BUILTIN(dmsha2hash, "vW1024*W1024*i1", true, "mma,isa-future-instructions") -UNALIASED_CUSTOM_BUILTIN(mma_dmsha3hash, "vW2048*Ii", true, +UNALIASED_CUSTOM_BUILTIN(dmsha3hash, "vW2048*i31", true, "mma,isa-future-instructions") -UNALIASED_CUSTOM_BUILTIN(mma_dmxxshapad, "vW1024*VIiIiIi", true, +UNALIASED_CUSTOM_BUILTIN(dmxxshapad, "vW1024*Vi3i1i3", true, "mma,isa-future-instructions") -CUSTOM_BUILTIN(dmsha256hash, mma_dmsha2hash, "vW1024*W1024*", true, +CUSTOM_BUILTIN(dmsha256hash, dmsha2hash, "vW1024*W1024*", true, "mma,isa-future-instructions") -CUSTOM_BUILTIN(dmsha512hash, mma_dmsha2hash, "vW1024*W1024*", true, +CUSTOM_BUILTIN(dmsha512hash, dmsha2hash, "vW1024*W1024*", true, "mma,isa-future-instructions") -CUSTOM_BUILTIN(dmsha3dw, mma_dmsha3hash, "vW2048*", true, 
+CUSTOM_BUILTIN(dmsha3dw, dmsha3hash, "vW2048*", true, "mma,isa-future-instructions") -CUSTOM_BUILTIN(dmcryshash, mma_dmsha3hash, "vW2048*", true, +CUSTOM_BUILTIN(dmcryshash, dmsha3hash, "vW2048*", true, "mma,isa-future-instructions") -CUSTOM_BUILTIN(dmxxsha3512pad, mma_dmxxshapad, "vW1024*Vi1", true, +CUSTOM_BUILTIN(dmxxsha3512pad, dmxxshapad, "vW1024*Vi1", true, "mma,isa-future-instructions") -CUSTOM_BUILTIN(dmxxsha3384pad, mma_dmxxshapad, "vW1024*Vi1", true, +CUSTOM_BUILTIN(dmxxsha3384pad, dmxxshapad, "vW1024*Vi1", true, "mma,isa-future-instructions") -CUSTOM_BUILTIN(dmxxsha3256pad, mma_dmxxshapad, "vW1024*Vi1", true, +CUSTOM_BUILTIN(dmxxsha3256pad, dmxxshapad, "vW1024*Vi1", true, "mma,isa-future-instructions") -CUSTOM_BUILTIN(dmxxsha3224pad, mma_dmxxshapad, "vW1024*Vi1", true, +CUSTOM_BUILTIN(dmxxsha3224pad, dmxxshapad, "vW1024*Vi1", true, "mma,isa-future-instructions") -CUSTOM_BUILTIN(dmxxshake256pad, mma_dmxxshapad, "vW1024*Vi1", true, +CUSTOM_BUILTIN(dmxxshake256pad, dmxxshapad, "vW1024*Vi1", true, "mma,isa-future-instructions") -CUSTOM_BUILTIN(dmxxshake128pad, mma_dmxxshapad, "vW1024*Vi1", true, +CUSTOM_BUILTIN(dmxxshake128pad, dmxxshapad, "vW1024*Vi1", true, "mma,isa-future-instructions") -CUSTOM_BUILTIN(dmxxsha384512pad, mma_dmxxshapad, "vW1024*V", true, +CUSTOM_BUILTIN(dmxxsha384512pad, dmxxshapad, "vW1024*V", true, "mma,isa-future-instructions") -CUSTOM_BUILTIN(dmxxsha224256pad, mma_dmxxshapad, "vW1024*V", true, +CUSTOM_BUILTIN(dmxxsha224256pad, dmxxshapad, "vW1024*V", true, "mma,isa-future-instructions") // MMA builtins with positive/negative multiply/accumulate. 
diff --git a/clang/lib/CodeGen/TargetBuiltins/PPC.cpp b/clang/lib/CodeGen/TargetBuiltins/PPC.cpp index ee932eb8bb366..843d3e9dcf06f 100644 --- a/clang/lib/CodeGen/TargetBuiltins/PPC.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/PPC.cpp @@ -1236,7 +1236,7 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID, switch (BuiltinID) { case PPC::BI__builtin_dmmr: case PPC::BI__builtin_dmxor: - case PPC::BI__builtin_mma_dmsha2hash: { + case PPC::BI__builtin_dmsha2hash: { Address Addr = EmitPointerWithAlignment(E->getArg(1)); Ops[1] = Builder.CreateLoad(Addr); break; diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc-dmf.c b/clang/test/CodeGen/PowerPC/builtins-ppc-dmf.c index 9f408eb6dbb78..e54de4c5906f6 100644 --- a/clang/test/CodeGen/PowerPC/builtins-ppc-dmf.c +++ b/clang/test/CodeGen/PowerPC/builtins-ppc-dmf.c @@ -214,7 +214,7 @@ void test_dmf_basic2(char *p1, char *res1, char *res2, // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP1]], align 128, !tbaa [[__DMR1024_TBAA7]] // CHECK-NEXT: [[TMP1:%.*]] = load <1024 x i1>, ptr [[VDMRP2]], align 128, !tbaa [[__DMR1024_TBAA7]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmsha2hash(<1024 x i1> [[TMP0]], <1024 x i1> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.dmsha2hash(<1024 x i1> [[TMP0]], <1024 x i1> [[TMP1]], i32 1) // CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA7]] // CHECK-NEXT: ret void // @@ -223,14 +223,14 @@ void test_dmf_basic2(char *p1, char *res1, char *res2, // AIX-NEXT: [[ENTRY:.*:]] // AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP1]], align 128, !tbaa [[__DMR1024_TBAA7]] // AIX-NEXT: [[TMP1:%.*]] = load <1024 x i1>, ptr [[VDMRP2]], align 128, !tbaa [[__DMR1024_TBAA7]] -// AIX-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmsha2hash(<1024 x i1> [[TMP0]], <1024 x i1> [[TMP1]], i32 1) +// AIX-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> 
@llvm.ppc.dmsha2hash(<1024 x i1> [[TMP0]], <1024 x i1> [[TMP1]], i32 1) // AIX-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA7]] // AIX-NEXT: ret void // void test_dmsha2hash(unsigned char *vdmrp1, unsigned char *vdmrp2, unsigned char *resp) { __dmr1024 vdmr1 = *((__dmr1024 *)vdmrp1); __dmr1024 vdmr2 = *((__dmr1024 *)vdmrp2); - __builtin_mma_dmsha2hash(&vdmr1, &vdmr2, 1); + __builtin_dmsha2hash(&vdmr1, &vdmr2, 1); *((__dmr1024 *)resp) = vdmr1; } @@ -238,7 +238,7 @@ void test_dmsha2hash(unsigned char *vdmrp1, unsigned char *vdmrp2, unsigned char // CHECK-SAME: ptr noundef readonly captures(none) [[VDMRPP:%.*]], ptr noundef writeonly captures(none) initializes((0, 256)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load <2048 x i1>, ptr [[VDMRPP]], align 256, !tbaa [[__DMR2048_TBAA10:![0-9]+]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <2048 x i1> @llvm.ppc.mma.dmsha3hash(<2048 x i1> [[TMP0]], i32 4) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2048 x i1> @llvm.ppc.dmsha3hash(<2048 x i1> [[TMP0]], i32 4) // CHECK-NEXT: store <2048 x i1> [[TMP1]], ptr [[RESP]], align 256, !tbaa [[__DMR2048_TBAA10]] // CHECK-NEXT: ret void // @@ -246,13 +246,13 @@ void test_dmsha2hash(unsigned char *vdmrp1, unsigned char *vdmrp2, unsigned char // AIX-SAME: ptr noundef readonly captures(none) [[VDMRPP:%.*]], ptr noundef writeonly captures(none) initializes((0, 256)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // AIX-NEXT: [[ENTRY:.*:]] // AIX-NEXT: [[TMP0:%.*]] = load <2048 x i1>, ptr [[VDMRPP]], align 256, !tbaa [[__DMR2048_TBAA10:![0-9]+]] -// AIX-NEXT: [[TMP1:%.*]] = tail call <2048 x i1> @llvm.ppc.mma.dmsha3hash(<2048 x i1> [[TMP0]], i32 4) +// AIX-NEXT: [[TMP1:%.*]] = tail call <2048 x i1> @llvm.ppc.dmsha3hash(<2048 x i1> [[TMP0]], i32 4) // AIX-NEXT: store <2048 x i1> [[TMP1]], ptr [[RESP]], align 256, !tbaa [[__DMR2048_TBAA10]] // AIX-NEXT: ret void // void test_dmsha3hash(unsigned char 
*vdmrpp, unsigned char *resp) { __dmr2048 vdmrp = *((__dmr2048 *)vdmrpp); - __builtin_mma_dmsha3hash(&vdmrp, 4); + __builtin_dmsha3hash(&vdmrp, 4); *((__dmr2048 *)resp) = vdmrp; } @@ -260,7 +260,7 @@ void test_dmsha3hash(unsigned char *vdmrpp, unsigned char *resp) { // CHECK-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA7]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxxshapad(<1024 x i1> [[TMP0]], <16 x i8> [[VC]], i32 2, i32 1, i32 5) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.dmxxshapad(<1024 x i1> [[TMP0]], <16 x i8> [[VC]], i32 2, i32 1, i32 3) // CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA7]] // CHECK-NEXT: ret void // @@ -268,13 +268,13 @@ void test_dmsha3hash(unsigned char *vdmrpp, unsigned char *resp) { // AIX-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // AIX-NEXT: [[ENTRY:.*:]] // AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA7]] -// AIX-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxxshapad(<1024 x i1> [[TMP0]], <16 x i8> [[VC]], i32 2, i32 1, i32 5) +// AIX-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.dmxxshapad(<1024 x i1> [[TMP0]], <16 x i8> [[VC]], i32 2, i32 1, i32 3) // AIX-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA7]] // AIX-NEXT: ret void // void test_dmxxshapad(unsigned char *vdmrp, vector unsigned char vc, unsigned char *resp) { __dmr1024 vdmr = *((__dmr1024 *)vdmrp); - __builtin_mma_dmxxshapad(&vdmr, vc, 2, 1, 5); + 
__builtin_dmxxshapad(&vdmr, vc, 2, 1, 3); *((__dmr1024 *)resp) = vdmr; } @@ -283,7 +283,7 @@ void test_dmxxshapad(unsigned char *vdmrp, vector unsigned char vc, unsigned cha // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP1]], align 128, !tbaa [[__DMR1024_TBAA7]] // CHECK-NEXT: [[TMP1:%.*]] = load <1024 x i1>, ptr [[VDMRP2]], align 128, !tbaa [[__DMR1024_TBAA7]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmsha2hash(<1024 x i1> [[TMP0]], <1024 x i1> [[TMP1]], i32 0) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.dmsha2hash(<1024 x i1> [[TMP0]], <1024 x i1> [[TMP1]], i32 0) // CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA7]] // CHECK-NEXT: ret void // @@ -292,7 +292,7 @@ void test_dmxxshapad(unsigned char *vdmrp, vector unsigned char vc, unsigned cha // AIX-NEXT: [[ENTRY:.*:]] // AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP1]], align 128, !tbaa [[__DMR1024_TBAA7]] // AIX-NEXT: [[TMP1:%.*]] = load <1024 x i1>, ptr [[VDMRP2]], align 128, !tbaa [[__DMR1024_TBAA7]] -// AIX-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmsha2hash(<1024 x i1> [[TMP0]], <1024 x i1> [[TMP1]], i32 0) +// AIX-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.dmsha2hash(<1024 x i1> [[TMP0]], <1024 x i1> [[TMP1]], i32 0) // AIX-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA7]] // AIX-NEXT: ret void // @@ -308,7 +308,7 @@ void test_dmsha256hash(unsigned char *vdmrp1, unsigned char *vdmrp2, unsigned ch // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP1]], align 128, !tbaa [[__DMR1024_TBAA7]] // CHECK-NEXT: [[TMP1:%.*]] = load <1024 x i1>, ptr [[VDMRP2]], align 128, !tbaa [[__DMR1024_TBAA7]] -// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmsha2hash(<1024 x i1> [[TMP0]], <1024 x i1> [[TMP1]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> 
@llvm.ppc.dmsha2hash(<1024 x i1> [[TMP0]], <1024 x i1> [[TMP1]], i32 1) // CHECK-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA7]] // CHECK-NEXT: ret void // @@ -317,7 +317,7 @@ void test_dmsha256hash(unsigned char *vdmrp1, unsigned char *vdmrp2, unsigned ch // AIX-NEXT: [[ENTRY:.*:]] // AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP1]], align 128, !tbaa [[__DMR1024_TBAA7]] // AIX-NEXT: [[TMP1:%.*]] = load <1024 x i1>, ptr [[VDMRP2]], align 128, !tbaa [[__DMR1024_TBAA7]] -// AIX-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmsha2hash(<1024 x i1> [[TMP0]], <1024 x i1> [[TMP1]], i32 1) +// AIX-NEXT: [[TMP2:%.*]] = tail call <1024 x i1> @llvm.ppc.dmsha2hash(<1024 x i1> [[TMP0]], <1024 x i1> [[TMP1]], i32 1) // AIX-NEXT: store <1024 x i1> [[TMP2]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA7]] // AIX-NEXT: ret void // @@ -332,7 +332,7 @@ void test_dmsha512hash(unsigned char *vdmrp1, unsigned char *vdmrp2, unsigned ch // CHECK-SAME: ptr noundef readonly captures(none) [[VDMRPP:%.*]], ptr noundef writeonly captures(none) initializes((0, 256)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load <2048 x i1>, ptr [[VDMRPP]], align 256, !tbaa [[__DMR2048_TBAA10]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <2048 x i1> @llvm.ppc.mma.dmsha3hash(<2048 x i1> [[TMP0]], i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2048 x i1> @llvm.ppc.dmsha3hash(<2048 x i1> [[TMP0]], i32 0) // CHECK-NEXT: store <2048 x i1> [[TMP1]], ptr [[RESP]], align 256, !tbaa [[__DMR2048_TBAA10]] // CHECK-NEXT: ret void // @@ -340,7 +340,7 @@ void test_dmsha512hash(unsigned char *vdmrp1, unsigned char *vdmrp2, unsigned ch // AIX-SAME: ptr noundef readonly captures(none) [[VDMRPP:%.*]], ptr noundef writeonly captures(none) initializes((0, 256)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // AIX-NEXT: [[ENTRY:.*:]] // AIX-NEXT: [[TMP0:%.*]] = load <2048 x i1>, ptr [[VDMRPP]], align 256, !tbaa 
[[__DMR2048_TBAA10]] -// AIX-NEXT: [[TMP1:%.*]] = tail call <2048 x i1> @llvm.ppc.mma.dmsha3hash(<2048 x i1> [[TMP0]], i32 0) +// AIX-NEXT: [[TMP1:%.*]] = tail call <2048 x i1> @llvm.ppc.dmsha3hash(<2048 x i1> [[TMP0]], i32 0) // AIX-NEXT: store <2048 x i1> [[TMP1]], ptr [[RESP]], align 256, !tbaa [[__DMR2048_TBAA10]] // AIX-NEXT: ret void // @@ -354,7 +354,7 @@ void test_dmsha3dw(unsigned char *vdmrpp, unsigned char *resp) { // CHECK-SAME: ptr noundef readonly captures(none) [[VDMRPP:%.*]], ptr noundef writeonly captures(none) initializes((0, 256)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load <2048 x i1>, ptr [[VDMRPP]], align 256, !tbaa [[__DMR2048_TBAA10]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <2048 x i1> @llvm.ppc.mma.dmsha3hash(<2048 x i1> [[TMP0]], i32 12) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <2048 x i1> @llvm.ppc.dmsha3hash(<2048 x i1> [[TMP0]], i32 12) // CHECK-NEXT: store <2048 x i1> [[TMP1]], ptr [[RESP]], align 256, !tbaa [[__DMR2048_TBAA10]] // CHECK-NEXT: ret void // @@ -362,7 +362,7 @@ void test_dmsha3dw(unsigned char *vdmrpp, unsigned char *resp) { // AIX-SAME: ptr noundef readonly captures(none) [[VDMRPP:%.*]], ptr noundef writeonly captures(none) initializes((0, 256)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // AIX-NEXT: [[ENTRY:.*:]] // AIX-NEXT: [[TMP0:%.*]] = load <2048 x i1>, ptr [[VDMRPP]], align 256, !tbaa [[__DMR2048_TBAA10]] -// AIX-NEXT: [[TMP1:%.*]] = tail call <2048 x i1> @llvm.ppc.mma.dmsha3hash(<2048 x i1> [[TMP0]], i32 12) +// AIX-NEXT: [[TMP1:%.*]] = tail call <2048 x i1> @llvm.ppc.dmsha3hash(<2048 x i1> [[TMP0]], i32 12) // AIX-NEXT: store <2048 x i1> [[TMP1]], ptr [[RESP]], align 256, !tbaa [[__DMR2048_TBAA10]] // AIX-NEXT: ret void // @@ -376,7 +376,7 @@ void test_dmcryshash(unsigned char *vdmrpp, unsigned char *resp) { // CHECK-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) 
initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA7]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxxshapad(<1024 x i1> [[TMP0]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.dmxxshapad(<1024 x i1> [[TMP0]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) // CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA7]] // CHECK-NEXT: ret void // @@ -384,7 +384,7 @@ void test_dmcryshash(unsigned char *vdmrpp, unsigned char *resp) { // AIX-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // AIX-NEXT: [[ENTRY:.*:]] // AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA7]] -// AIX-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxxshapad(<1024 x i1> [[TMP0]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) +// AIX-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.dmxxshapad(<1024 x i1> [[TMP0]], <16 x i8> [[VC]], i32 0, i32 0, i32 0) // AIX-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA7]] // AIX-NEXT: ret void // @@ -398,7 +398,7 @@ void test_dmxxsha3512pad(unsigned char *vdmrp, vector unsigned char vc, unsigned // CHECK-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA7]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxxshapad(<1024 x i1> [[TMP0]], <16 x i8> [[VC]], i32 0, i32 1, i32 1) +// CHECK-NEXT: [[TMP1:%.*]] = tail 
call <1024 x i1> @llvm.ppc.dmxxshapad(<1024 x i1> [[TMP0]], <16 x i8> [[VC]], i32 0, i32 1, i32 1) // CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA7]] // CHECK-NEXT: ret void // @@ -406,7 +406,7 @@ void test_dmxxsha3512pad(unsigned char *vdmrp, vector unsigned char vc, unsigned // AIX-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // AIX-NEXT: [[ENTRY:.*:]] // AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA7]] -// AIX-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxxshapad(<1024 x i1> [[TMP0]], <16 x i8> [[VC]], i32 0, i32 1, i32 1) +// AIX-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.dmxxshapad(<1024 x i1> [[TMP0]], <16 x i8> [[VC]], i32 0, i32 1, i32 1) // AIX-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA7]] // AIX-NEXT: ret void // @@ -420,7 +420,7 @@ void test_dmxxsha3384pad(unsigned char *vdmrp, vector unsigned char vc, unsigned // CHECK-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA7]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxxshapad(<1024 x i1> [[TMP0]], <16 x i8> [[VC]], i32 0, i32 0, i32 2) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.dmxxshapad(<1024 x i1> [[TMP0]], <16 x i8> [[VC]], i32 0, i32 0, i32 2) // CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA7]] // CHECK-NEXT: ret void // @@ -428,7 +428,7 @@ void test_dmxxsha3384pad(unsigned char *vdmrp, vector unsigned char vc, unsigned // AIX-SAME: ptr noundef readonly 
captures(none) [[VDMRP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // AIX-NEXT: [[ENTRY:.*:]] // AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA7]] -// AIX-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxxshapad(<1024 x i1> [[TMP0]], <16 x i8> [[VC]], i32 0, i32 0, i32 2) +// AIX-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.dmxxshapad(<1024 x i1> [[TMP0]], <16 x i8> [[VC]], i32 0, i32 0, i32 2) // AIX-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA7]] // AIX-NEXT: ret void // @@ -442,7 +442,7 @@ void test_dmxxsha3256pad(unsigned char *vdmrp, vector unsigned char vc, unsigned // CHECK-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA7]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxxshapad(<1024 x i1> [[TMP0]], <16 x i8> [[VC]], i32 0, i32 1, i32 3) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.dmxxshapad(<1024 x i1> [[TMP0]], <16 x i8> [[VC]], i32 0, i32 1, i32 3) // CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA7]] // CHECK-NEXT: ret void // @@ -450,7 +450,7 @@ void test_dmxxsha3256pad(unsigned char *vdmrp, vector unsigned char vc, unsigned // AIX-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // AIX-NEXT: [[ENTRY:.*:]] // AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA7]] -// AIX-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> 
@llvm.ppc.mma.dmxxshapad(<1024 x i1> [[TMP0]], <16 x i8> [[VC]], i32 0, i32 1, i32 3) +// AIX-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.dmxxshapad(<1024 x i1> [[TMP0]], <16 x i8> [[VC]], i32 0, i32 1, i32 3) // AIX-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA7]] // AIX-NEXT: ret void // @@ -464,7 +464,7 @@ void test_dmxxsha3224pad(unsigned char *vdmrp, vector unsigned char vc, unsigned // CHECK-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA7]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxxshapad(<1024 x i1> [[TMP0]], <16 x i8> [[VC]], i32 1, i32 0, i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.dmxxshapad(<1024 x i1> [[TMP0]], <16 x i8> [[VC]], i32 1, i32 0, i32 0) // CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA7]] // CHECK-NEXT: ret void // @@ -472,7 +472,7 @@ void test_dmxxsha3224pad(unsigned char *vdmrp, vector unsigned char vc, unsigned // AIX-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // AIX-NEXT: [[ENTRY:.*:]] // AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA7]] -// AIX-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxxshapad(<1024 x i1> [[TMP0]], <16 x i8> [[VC]], i32 1, i32 0, i32 0) +// AIX-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.dmxxshapad(<1024 x i1> [[TMP0]], <16 x i8> [[VC]], i32 1, i32 0, i32 0) // AIX-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA7]] // AIX-NEXT: ret void // @@ -486,7 +486,7 @@ void 
test_dmxxshake256pad(unsigned char *vdmrp, vector unsigned char vc, unsigne // CHECK-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA7]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxxshapad(<1024 x i1> [[TMP0]], <16 x i8> [[VC]], i32 1, i32 1, i32 1) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.dmxxshapad(<1024 x i1> [[TMP0]], <16 x i8> [[VC]], i32 1, i32 1, i32 1) // CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA7]] // CHECK-NEXT: ret void // @@ -494,7 +494,7 @@ void test_dmxxshake256pad(unsigned char *vdmrp, vector unsigned char vc, unsigne // AIX-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // AIX-NEXT: [[ENTRY:.*:]] // AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA7]] -// AIX-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxxshapad(<1024 x i1> [[TMP0]], <16 x i8> [[VC]], i32 1, i32 1, i32 1) +// AIX-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.dmxxshapad(<1024 x i1> [[TMP0]], <16 x i8> [[VC]], i32 1, i32 1, i32 1) // AIX-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA7]] // AIX-NEXT: ret void // @@ -508,7 +508,7 @@ void test_dmxxshake128pad(unsigned char *vdmrp, vector unsigned char vc, unsigne // CHECK-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr 
[[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA7]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxxshapad(<1024 x i1> [[TMP0]], <16 x i8> [[VC]], i32 2, i32 0, i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.dmxxshapad(<1024 x i1> [[TMP0]], <16 x i8> [[VC]], i32 2, i32 0, i32 0) // CHECK-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA7]] // CHECK-NEXT: ret void // @@ -516,7 +516,7 @@ void test_dmxxshake128pad(unsigned char *vdmrp, vector unsigned char vc, unsigne // AIX-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // AIX-NEXT: [[ENTRY:.*:]] // AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA7]] -// AIX-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxxshapad(<1024 x i1> [[TMP0]], <16 x i8> [[VC]], i32 2, i32 0, i32 0) +// AIX-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.dmxxshapad(<1024 x i1> [[TMP0]], <16 x i8> [[VC]], i32 2, i32 0, i32 0) // AIX-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA7]] // AIX-NEXT: ret void // @@ -530,7 +530,7 @@ void test_dmxxsha384512pad(unsigned char *vdmrp, vector unsigned char vc, unsign // CHECK-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA7]] -// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxxshapad(<1024 x i1> [[TMP0]], <16 x i8> [[VC]], i32 3, i32 0, i32 0) +// CHECK-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.dmxxshapad(<1024 x i1> [[TMP0]], <16 x i8> [[VC]], i32 3, i32 0, i32 0) // CHECK-NEXT: store <1024 x i1> 
[[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA7]] // CHECK-NEXT: ret void // @@ -538,7 +538,7 @@ void test_dmxxsha384512pad(unsigned char *vdmrp, vector unsigned char vc, unsign // AIX-SAME: ptr noundef readonly captures(none) [[VDMRP:%.*]], <16 x i8> noundef [[VC:%.*]], ptr noundef writeonly captures(none) initializes((0, 128)) [[RESP:%.*]]) local_unnamed_addr #[[ATTR0]] { // AIX-NEXT: [[ENTRY:.*:]] // AIX-NEXT: [[TMP0:%.*]] = load <1024 x i1>, ptr [[VDMRP]], align 128, !tbaa [[__DMR1024_TBAA7]] -// AIX-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.mma.dmxxshapad(<1024 x i1> [[TMP0]], <16 x i8> [[VC]], i32 3, i32 0, i32 0) +// AIX-NEXT: [[TMP1:%.*]] = tail call <1024 x i1> @llvm.ppc.dmxxshapad(<1024 x i1> [[TMP0]], <16 x i8> [[VC]], i32 3, i32 0, i32 0) // AIX-NEXT: store <1024 x i1> [[TMP1]], ptr [[RESP]], align 128, !tbaa [[__DMR1024_TBAA7]] // AIX-NEXT: ret void // diff --git a/clang/test/CodeGen/PowerPC/ppc-dmf-mma-builtin-err.c b/clang/test/CodeGen/PowerPC/ppc-dmf-mma-builtin-err.c index 8ff2f250b1f26..9fff7d86f989c 100644 --- a/clang/test/CodeGen/PowerPC/ppc-dmf-mma-builtin-err.c +++ b/clang/test/CodeGen/PowerPC/ppc-dmf-mma-builtin-err.c @@ -25,9 +25,9 @@ void test_mma(unsigned char *vdmrpp, unsigned char *vdmrp, unsigned char *vpp, v __builtin_dmxor(&vdmr, (__dmr1024*)vpp); __builtin_build_dmr(&vdmr, vc, vc, vc, vc, vc, vc, vc, vc); __builtin_disassemble_dmr(vdmrp, &vdmr); - __builtin_mma_dmsha2hash(&vdmr, &vdmr, 0); - __builtin_mma_dmsha3hash(&vdmrpair, 0); - __builtin_mma_dmxxshapad(&vdmr, vc, 0, 0, 0); + __builtin_dmsha2hash(&vdmr, &vdmr, 0); + __builtin_dmsha3hash(&vdmrpair, 0); + __builtin_dmxxshapad(&vdmr, vc, 0, 0, 0); __builtin_dmsha256hash(&vdmr, &vdmr); __builtin_dmsha512hash(&vdmr, &vdmr); __builtin_dmsha3dw(&vdmrpair); @@ -52,9 +52,9 @@ void test_mma(unsigned char *vdmrpp, unsigned char *vdmrp, unsigned char *vpp, v // ISA_FUTURE: error: '__builtin_dmxor' needs target feature mma,isa-future-instructions // ISA_FUTURE: error: 
'__builtin_build_dmr' needs target feature mma,isa-future-instructions // ISA_FUTURE: error: '__builtin_disassemble_dmr' needs target feature mma,isa-future-instructions -// CHECK: error: '__builtin_mma_dmsha2hash' needs target feature mma,isa-future-instructions -// CHECK: error: '__builtin_mma_dmsha3hash' needs target feature mma,isa-future-instructions -// CHECK: error: '__builtin_mma_dmxxshapad' needs target feature mma,isa-future-instructions +// CHECK: error: '__builtin_dmsha2hash' needs target feature mma,isa-future-instructions +// CHECK: error: '__builtin_dmsha3hash' needs target feature mma,isa-future-instructions +// CHECK: error: '__builtin_dmxxshapad' needs target feature mma,isa-future-instructions // CHECK: error: '__builtin_dmsha256hash' needs target feature mma,isa-future-instructions // CHECK: error: '__builtin_dmsha512hash' needs target feature mma,isa-future-instructions // CHECK: error: '__builtin_dmsha3dw' needs target feature mma,isa-future-instructions diff --git a/clang/test/Sema/PowerPC/ppc-dmf-mma-builtin-err.c b/clang/test/Sema/PowerPC/ppc-dmf-mma-builtin-err.c index 893de251bb077..f81cabde2c62d 100644 --- a/clang/test/Sema/PowerPC/ppc-dmf-mma-builtin-err.c +++ b/clang/test/Sema/PowerPC/ppc-dmf-mma-builtin-err.c @@ -18,9 +18,9 @@ void test_mma(unsigned char *vdmrpp, unsigned char *vdmrp, unsigned char *vpp, v __builtin_dmxor(&vdmr, (__dmr1024*)vpp); // expected-error {{'__builtin_dmxor' needs target feature mma,isa-future-instructions}} __builtin_build_dmr(&vdmr, vc, vc, vc, vc, vc, vc, vc, vc); // expected-error {{'__builtin_build_dmr' needs target feature mma,isa-future-instructions}} __builtin_disassemble_dmr(vdmrp, &vdmr); // expected-error {{'__builtin_disassemble_dmr' needs target feature mma,isa-future-instructions}} - __builtin_mma_dmsha2hash(&vdmr, &vdmr, 0); // expected-error {{'__builtin_mma_dmsha2hash' needs target feature mma,isa-future-instructions}} - __builtin_mma_dmsha3hash(&vdmrpair, 0); // expected-error 
{{'__builtin_mma_dmsha3hash' needs target feature mma,isa-future-instructions}} - __builtin_mma_dmxxshapad(&vdmr, vc, 0, 0, 0); // expected-error {{'__builtin_mma_dmxxshapad' needs target feature mma,isa-future-instructions}} + __builtin_dmsha2hash(&vdmr, &vdmr, 0); // expected-error {{'__builtin_dmsha2hash' needs target feature mma,isa-future-instructions}} + __builtin_dmsha3hash(&vdmrpair, 0); // expected-error {{'__builtin_dmsha3hash' needs target feature mma,isa-future-instructions}} + __builtin_dmxxshapad(&vdmr, vc, 0, 0, 0); // expected-error {{'__builtin_dmxxshapad' needs target feature mma,isa-future-instructions}} // DMF VSX Vector bfloat16 GER 2x builtins. diff --git a/clang/test/Sema/builtins-ppc-crypto.c b/clang/test/Sema/builtins-ppc-crypto.c index 050e95aec971f..d7a42269097d7 100644 --- a/clang/test/Sema/builtins-ppc-crypto.c +++ b/clang/test/Sema/builtins-ppc-crypto.c @@ -10,6 +10,21 @@ void test_crypto(unsigned char *vdmrpp, unsigned char *vdmrp, unsigned char *vpp __vector_pair vp = *((__vector_pair *)vpp); int ia; + __builtin_dmsha2hash(&vdmr, &vdmr, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + __builtin_dmsha2hash(&vdmr, &vdmr, -1); // expected-error {{argument value -1 is outside the valid range [0, 1]}} + __builtin_dmsha2hash(&vdmr, &vdmr, ia); // expected-error {{argument to '__builtin_dmsha2hash' must be a constant integer}} + + __builtin_dmsha3hash(&vdmrpair, 32); // expected-error {{argument value 32 is outside the valid range [0, 31]}} + __builtin_dmsha3hash(&vdmrpair, -2); // expected-error {{argument value -2 is outside the valid range [0, 31]}} + __builtin_dmsha3hash(&vdmrpair, ia); // expected-error {{argument to '__builtin_dmsha3hash' must be a constant integer}} + + __builtin_dmxxshapad(&vdmr, vc, 4, 0, 3); // expected-error {{argument value 4 is outside the valid range [0, 3]}} + __builtin_dmxxshapad(&vdmr, vc, 3, 2, 3); // expected-error {{argument value 2 is outside the valid range [0, 1]}} + 
__builtin_dmxxshapad(&vdmr, vc, 3, 1, -1); // expected-error {{argument value -1 is outside the valid range [0, 3]}} + __builtin_dmxxshapad(&vdmr, vc, ia, 1, 1); // expected-error {{argument to '__builtin_dmxxshapad' must be a constant integer}} + __builtin_dmxxshapad(&vdmr, vc, 0, ia, 1); // expected-error {{argument to '__builtin_dmxxshapad' must be a constant integer}} + __builtin_dmxxshapad(&vdmr, vc, 0, 1, ia); // expected-error {{argument to '__builtin_dmxxshapad' must be a constant integer}} + __builtin_dmxxsha3512pad(&vdmr, vc, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} __builtin_dmxxsha3512pad(&vdmr, vc, ia); // expected-error {{argument to '__builtin_dmxxsha3512pad' must be a constant integer}} diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td index b955a9f081094..565c3e217f82e 100644 --- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td +++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td @@ -1915,16 +1915,16 @@ let TargetPrefix = "ppc" in { defm int_ppc_mma_pmdmxvf16gerx2 : PowerPC_MMA_DMR_Intrinsic<[llvm_v256i1_ty, llvm_v16i8_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>; - def int_ppc_mma_dmsha2hash : + def int_ppc_dmsha2hash : DefaultAttrsIntrinsic<[llvm_v1024i1_ty], [llvm_v1024i1_ty, llvm_v1024i1_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_ppc_mma_dmsha3hash : + def int_ppc_dmsha3hash : DefaultAttrsIntrinsic<[llvm_v2048i1_ty], [llvm_v2048i1_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; - def int_ppc_mma_dmxxshapad : + def int_ppc_dmxxshapad : DefaultAttrsIntrinsic<[llvm_v1024i1_ty], [llvm_v1024i1_ty, llvm_v16i8_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, diff --git a/llvm/lib/Target/PowerPC/PPCInstrFutureMMA.td b/llvm/lib/Target/PowerPC/PPCInstrFutureMMA.td index 3a4bca420c071..17e082fe551fc 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrFutureMMA.td +++ b/llvm/lib/Target/PowerPC/PPCInstrFutureMMA.td @@ -511,14 +511,14 @@ let Predicates = [MMA, IsISAFuture] in 
{ : XForm_AT3_T1_AB3< 31, 14, 177, (outs dmr:$AT), (ins dmr:$ATi, dmr:$AB, u1imm:$T), "dmsha2hash $AT, $AB, $T", - [(set v1024i1:$AT, (int_ppc_mma_dmsha2hash v1024i1:$ATi, + [(set v1024i1:$AT, (int_ppc_dmsha2hash v1024i1:$ATi, v1024i1:$AB, u1imm_timm:$T))]>, RegConstraint<"$ATi = $AT">; def DMSHA3HASH : XForm_ATp2_SR5<31, 15, 177, (outs dmrp:$ATp), (ins dmrp:$ATpi, u5imm:$SR), "dmsha3hash $ATp, $SR", [(set v2048i1:$ATp, - (int_ppc_mma_dmsha3hash v2048i1:$ATpi, + (int_ppc_dmsha3hash v2048i1:$ATpi, u5imm_timm:$SR))]>, RegConstraint<"$ATpi = $ATp">; def DMXXSHAPAD @@ -593,7 +593,7 @@ let Predicates = [MMA, IsISAFuture] in { (DMXVF16GERX2NN $ATi, $XAp, RCCp.BToVSRC)>; // Cryptography Intrinsic - def : Pat<(v1024i1 (int_ppc_mma_dmxxshapad v1024i1:$ATi, v16i8:$XB, + def : Pat<(v1024i1 (int_ppc_dmxxshapad v1024i1:$ATi, v16i8:$XB, u2imm_timm:$ID, u1imm_timm:$E, u2imm_timm:$BL)), (DMXXSHAPAD $ATi, RCCp.BToVSRC, $ID, $E, $BL)>; } diff --git a/llvm/test/CodeGen/PowerPC/dmrp-spill.ll b/llvm/test/CodeGen/PowerPC/dmrp-spill.ll index 7a26c49b89df5..88afec18f7b1d 100644 --- a/llvm/test/CodeGen/PowerPC/dmrp-spill.ll +++ b/llvm/test/CodeGen/PowerPC/dmrp-spill.ll @@ -10,7 +10,7 @@ ; RUN: -ppc-vsr-nums-as-vr -mcpu=future < %s | FileCheck %s --check-prefix=AIX32 declare void @dummy_func() -declare <2048 x i1> @llvm.ppc.mma.dmsha3hash(<2048 x i1>, i32) +declare <2048 x i1> @llvm.ppc.dmsha3hash(<2048 x i1>, i32) define dso_local void @test_dmsha3hash(ptr %vopp, ptr %resp) nounwind { ; CHECK-LABEL: test_dmsha3hash: @@ -205,9 +205,9 @@ define dso_local void @test_dmsha3hash(ptr %vopp, ptr %resp) nounwind { ; AIX32-NEXT: blr entry: %0 = load <2048 x i1>, ptr %vopp, align 64 - %2 = tail call <2048 x i1> @llvm.ppc.mma.dmsha3hash(<2048 x i1> %0, i32 5) + %2 = tail call <2048 x i1> @llvm.ppc.dmsha3hash(<2048 x i1> %0, i32 5) tail call void @dummy_func() - %3 = tail call <2048 x i1> @llvm.ppc.mma.dmsha3hash(<2048 x i1> %0, i32 5) + %3 = tail call <2048 x i1> @llvm.ppc.dmsha3hash(<2048 x i1> %0, 
i32 5) store <2048 x i1> %2, ptr %resp, align 64 ret void } diff --git a/llvm/test/CodeGen/PowerPC/mmaplus-crypto.ll b/llvm/test/CodeGen/PowerPC/mmaplus-crypto.ll index ceecdcb136fb8..e30dcffd10a24 100644 --- a/llvm/test/CodeGen/PowerPC/mmaplus-crypto.ll +++ b/llvm/test/CodeGen/PowerPC/mmaplus-crypto.ll @@ -6,7 +6,7 @@ ; RUN: -mcpu=future -ppc-asm-full-reg-names \ ; RUN: -ppc-vsr-nums-as-vr < %s | FileCheck %s --check-prefix=CHECK-BE -declare <1024 x i1> @llvm.ppc.mma.dmsha2hash(<1024 x i1>, <1024 x i1>, i32) +declare <1024 x i1> @llvm.ppc.dmsha2hash(<1024 x i1>, <1024 x i1>, i32) define dso_local void @test_dmsha2hash(ptr %vop, ptr %vinp, ptr %resp) { ; CHECK-LABEL: test_dmsha2hash: @@ -57,12 +57,12 @@ define dso_local void @test_dmsha2hash(ptr %vop, ptr %vinp, ptr %resp) { entry: %0 = load <1024 x i1>, ptr %vop, align 64 %1 = load <1024 x i1>, ptr %vinp, align 64 - %3 = tail call <1024 x i1> @llvm.ppc.mma.dmsha2hash(<1024 x i1> %0, <1024 x i1> %1, i32 0) + %3 = tail call <1024 x i1> @llvm.ppc.dmsha2hash(<1024 x i1> %0, <1024 x i1> %1, i32 0) store <1024 x i1> %3, ptr %resp, align 64 ret void } -declare <2048 x i1> @llvm.ppc.mma.dmsha3hash(<2048 x i1>, i32) +declare <2048 x i1> @llvm.ppc.dmsha3hash(<2048 x i1>, i32) define dso_local void @test_dmsha3hash(ptr %vopp, ptr %resp) { ; CHECK-LABEL: test_dmsha3hash: @@ -124,12 +124,12 @@ define dso_local void @test_dmsha3hash(ptr %vopp, ptr %resp) { ; CHECK-BE-NEXT: blr entry: %0 = load <2048 x i1>, ptr %vopp, align 64 - %2 = tail call <2048 x i1> @llvm.ppc.mma.dmsha3hash(<2048 x i1> %0, i32 5) + %2 = tail call <2048 x i1> @llvm.ppc.dmsha3hash(<2048 x i1> %0, i32 5) store <2048 x i1> %2, ptr %resp, align 64 ret void } -declare <1024 x i1> @llvm.ppc.mma.dmxxshapad(<1024 x i1>, <16 x i8>, i32, i32, i32) +declare <1024 x i1> @llvm.ppc.dmxxshapad(<1024 x i1>, <16 x i8>, i32, i32, i32) define dso_local void @test_dmxxshapad(ptr %vopp, ptr %vcp, ptr %resp) { ; CHECK-LABEL: test_dmxxshapad: @@ -170,7 +170,7 @@ define dso_local 
void @test_dmxxshapad(ptr %vopp, ptr %vcp, ptr %resp) { entry: %0 = load <1024 x i1>, ptr %vopp, align 64 %1 = load <16 x i8>, ptr %vcp, align 64 - %2 = tail call <1024 x i1> @llvm.ppc.mma.dmxxshapad(<1024 x i1> %0, <16 x i8> %1, i32 2, i32 1, i32 3) + %2 = tail call <1024 x i1> @llvm.ppc.dmxxshapad(<1024 x i1> %0, <16 x i8> %1, i32 2, i32 1, i32 3) store <1024 x i1> %2, ptr %resp, align 64 ret void } @@ -224,7 +224,7 @@ define dso_local void @test_dmsha512hash(ptr %vop, ptr %vinp, ptr %resp) { entry: %0 = load <1024 x i1>, ptr %vop, align 64 %1 = load <1024 x i1>, ptr %vinp, align 64 - %3 = tail call <1024 x i1> @llvm.ppc.mma.dmsha2hash(<1024 x i1> %0, <1024 x i1> %1, i32 1) + %3 = tail call <1024 x i1> @llvm.ppc.dmsha2hash(<1024 x i1> %0, <1024 x i1> %1, i32 1) store <1024 x i1> %3, ptr %resp, align 64 ret void } @@ -289,7 +289,7 @@ define dso_local void @test_dmsha3dw(ptr %vopp, ptr %resp) { ; CHECK-BE-NEXT: blr entry: %0 = load <2048 x i1>, ptr %vopp, align 64 - %2 = tail call <2048 x i1> @llvm.ppc.mma.dmsha3hash(<2048 x i1> %0, i32 0) + %2 = tail call <2048 x i1> @llvm.ppc.dmsha3hash(<2048 x i1> %0, i32 0) store <2048 x i1> %2, ptr %resp, align 64 ret void } @@ -354,7 +354,7 @@ define dso_local void @test_dmcryshash(ptr %vopp, ptr %resp) { ; CHECK-BE-NEXT: blr entry: %0 = load <2048 x i1>, ptr %vopp, align 64 - %2 = tail call <2048 x i1> @llvm.ppc.mma.dmsha3hash(<2048 x i1> %0, i32 12) + %2 = tail call <2048 x i1> @llvm.ppc.dmsha3hash(<2048 x i1> %0, i32 12) store <2048 x i1> %2, ptr %resp, align 64 ret void } @@ -398,7 +398,7 @@ define dso_local void @test_dmxxsha3512pad(ptr %vopp, ptr %vcp, ptr %resp) { entry: %0 = load <1024 x i1>, ptr %vopp, align 64 %1 = load <16 x i8>, ptr %vcp, align 64 - %2 = tail call <1024 x i1> @llvm.ppc.mma.dmxxshapad(<1024 x i1> %0, <16 x i8> %1, i32 0, i32 1, i32 0) + %2 = tail call <1024 x i1> @llvm.ppc.dmxxshapad(<1024 x i1> %0, <16 x i8> %1, i32 0, i32 1, i32 0) store <1024 x i1> %2, ptr %resp, align 64 ret void } @@ -442,7 
+442,7 @@ define dso_local void @test_dmxxsha3384pad(ptr %vopp, ptr %vcp, ptr %resp) { entry: %0 = load <1024 x i1>, ptr %vopp, align 64 %1 = load <16 x i8>, ptr %vcp, align 64 - %2 = tail call <1024 x i1> @llvm.ppc.mma.dmxxshapad(<1024 x i1> %0, <16 x i8> %1, i32 0, i32 1, i32 1) + %2 = tail call <1024 x i1> @llvm.ppc.dmxxshapad(<1024 x i1> %0, <16 x i8> %1, i32 0, i32 1, i32 1) store <1024 x i1> %2, ptr %resp, align 64 ret void } @@ -486,7 +486,7 @@ define dso_local void @test_dmxxsha3256pad(ptr %vopp, ptr %vcp, ptr %resp) { entry: %0 = load <1024 x i1>, ptr %vopp, align 64 %1 = load <16 x i8>, ptr %vcp, align 64 - %2 = tail call <1024 x i1> @llvm.ppc.mma.dmxxshapad(<1024 x i1> %0, <16 x i8> %1, i32 0, i32 1, i32 2) + %2 = tail call <1024 x i1> @llvm.ppc.dmxxshapad(<1024 x i1> %0, <16 x i8> %1, i32 0, i32 1, i32 2) store <1024 x i1> %2, ptr %resp, align 64 ret void } @@ -530,7 +530,7 @@ define dso_local void @test_dmxxsha3224pad(ptr %vopp, ptr %vcp, ptr %resp) { entry: %0 = load <1024 x i1>, ptr %vopp, align 64 %1 = load <16 x i8>, ptr %vcp, align 64 - %2 = tail call <1024 x i1> @llvm.ppc.mma.dmxxshapad(<1024 x i1> %0, <16 x i8> %1, i32 0, i32 1, i32 3) + %2 = tail call <1024 x i1> @llvm.ppc.dmxxshapad(<1024 x i1> %0, <16 x i8> %1, i32 0, i32 1, i32 3) store <1024 x i1> %2, ptr %resp, align 64 ret void } @@ -574,7 +574,7 @@ define dso_local void @test_dmxxshake256pad(ptr %vopp, ptr %vcp, ptr %resp) { entry: %0 = load <1024 x i1>, ptr %vopp, align 64 %1 = load <16 x i8>, ptr %vcp, align 64 - %2 = tail call <1024 x i1> @llvm.ppc.mma.dmxxshapad(<1024 x i1> %0, <16 x i8> %1, i32 1, i32 1, i32 0) + %2 = tail call <1024 x i1> @llvm.ppc.dmxxshapad(<1024 x i1> %0, <16 x i8> %1, i32 1, i32 1, i32 0) store <1024 x i1> %2, ptr %resp, align 64 ret void } @@ -618,7 +618,7 @@ define dso_local void @test_dmxxshake128pad(ptr %vopp, ptr %vcp, ptr %resp) { entry: %0 = load <1024 x i1>, ptr %vopp, align 64 %1 = load <16 x i8>, ptr %vcp, align 64 - %2 = tail call <1024 x i1> 
@llvm.ppc.mma.dmxxshapad(<1024 x i1> %0, <16 x i8> %1, i32 1, i32 1, i32 1) + %2 = tail call <1024 x i1> @llvm.ppc.dmxxshapad(<1024 x i1> %0, <16 x i8> %1, i32 1, i32 1, i32 1) store <1024 x i1> %2, ptr %resp, align 64 ret void } @@ -662,7 +662,7 @@ define dso_local void @test_dmxxsha384512pad(ptr %vopp, ptr %vcp, ptr %resp) { entry: %0 = load <1024 x i1>, ptr %vopp, align 64 %1 = load <16 x i8>, ptr %vcp, align 64 - %2 = tail call <1024 x i1> @llvm.ppc.mma.dmxxshapad(<1024 x i1> %0, <16 x i8> %1, i32 2, i32 0, i32 0) + %2 = tail call <1024 x i1> @llvm.ppc.dmxxshapad(<1024 x i1> %0, <16 x i8> %1, i32 2, i32 0, i32 0) store <1024 x i1> %2, ptr %resp, align 64 ret void } @@ -706,7 +706,7 @@ define dso_local void @test_dmxxsha224256pad(ptr %vopp, ptr %vcp, ptr %resp) { entry: %0 = load <1024 x i1>, ptr %vopp, align 64 %1 = load <16 x i8>, ptr %vcp, align 64 - %2 = tail call <1024 x i1> @llvm.ppc.mma.dmxxshapad(<1024 x i1> %0, <16 x i8> %1, i32 3, i32 0, i32 0) + %2 = tail call <1024 x i1> @llvm.ppc.dmxxshapad(<1024 x i1> %0, <16 x i8> %1, i32 3, i32 0, i32 0) store <1024 x i1> %2, ptr %resp, align 64 ret void } From 290d0f673424b573a594e99f9fd5a9777619f585 Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Thu, 14 May 2026 15:08:37 +0100 Subject: [PATCH 83/95] [LV][NFC] Remove instcombine from RUN lines in AArch64 tests (#197448) This PR continues other work I've been doing trying to remove unnecessary extra passes from the RUN lines in order to make it easier to map the expected vectoriser output to the CHECK lines. As a result it has exposed some potential optimisations that we may be able to perform in VPlan. Here is a summary of the changes I've noticed: 1. instcombine likes to canonicalise GEPs into certain forms. I'm not sure if there is value in VPlan trying to guess what the canonical form should be. 2. In tests like sve-cond-inv-loads.ll, etc. the pattern sub(urem) is often replaced with and(sub). 
This is potentially something the vectoriser could improve although I don't know if it would change the cost model. 3. There is poor codegen in gather_nxv4i32_ind64_stride2 in the file sve-gather-scatter.ll, which is due to VPScalarIVStepsRecipe::execute. I have a PR that attempts to clean up this IR: #197169. 4. Simple missing fold in sve-inductions.ll for icmp(and(x,1), 0) -> trunc(x) to i1 5. Missing nsw flag - see sve-interleaved-accesses.ll. I think this might be due to the range of vscale. 6. Missing fold in sve-interleaved-masked-accesses.ll for select(icmp(slt, x, y), y, x) -> smax 7. Missing folds for reverse transformations of uniform operations, e.g. see sve-vector-reverse.ll for things like reverse(fadd(reverse(x))) 8. Removal of xor when used by the exit condition - see sve-vfabi.ll. There isn't much we can do about this because VPlan requires successors to be in a certain order, therefore the non-zero cost xor instruction has to be present. 9. See PR #196562 for fixes to some poor code in uniform-args-call-variants.ll 10. instcombine tends to narrow reduction PHI nodes generated by VPlan when there are extends involved. See reduction_i8 in reduction-small-size.ll. In this case perhaps the original scalar loop is simply not in a canonical form to start with? 
--- .../AArch64/reduction-small-size.ll | 115 +++--- .../AArch64/sve-cond-inv-loads.ll | 36 +- .../AArch64/sve-extract-last-veclane.ll | 11 +- .../AArch64/sve-gather-scatter.ll | 69 ++-- .../LoopVectorize/AArch64/sve-inductions.ll | 11 +- .../AArch64/sve-interleaved-accesses.ll | 358 +++++++++--------- .../sve-interleaved-masked-accesses.ll | 284 +++++++------- .../AArch64/sve-large-strides.ll | 8 +- .../AArch64/sve-masked-loadstore.ll | 6 +- .../sve-tail-folding-overflow-checks.ll | 20 +- .../AArch64/sve-vector-reverse.ll | 73 ++-- .../LoopVectorize/AArch64/sve-vfabi.ll | 28 +- .../LoopVectorize/AArch64/sve-widen-phi.ll | 84 ++-- .../AArch64/sve2-histcnt-epilogue.ll | 16 +- .../LoopVectorize/AArch64/sve2-histcnt.ll | 167 ++++---- .../AArch64/uniform-args-call-variants.ll | 123 +++--- .../AArch64/vector-reverse-mask4.ll | 35 +- 17 files changed, 756 insertions(+), 688 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll index 00d73a7ab6825..5eccda37f725a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/reduction-small-size.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "scalar.ph:" --version 6 -; RUN: opt < %s -passes=loop-vectorize,instcombine -force-vector-interleave=1 -S | FileCheck %s +; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=1 -S | FileCheck %s target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-gnu" @@ -19,59 +19,69 @@ define i8 @reduction_i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i3 ; CHECK-NEXT: [[CMP_12:%.*]] = icmp sgt i32 [[N]], 0 ; CHECK-NEXT: br i1 [[CMP_12]], label %[[ITER_CHECK:.*]], [[FOR_COND_CLEANUP:label %.*]] ; CHECK: [[ITER_CHECK]]: -; CHECK-NEXT: [[TMP0:%.*]] = zext nneg i32 [[N]] to i64 -; 
CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] ; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: -; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[N]], 16 +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 16 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[TMP0]], 12 -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483632 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i8> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = add <16 x i8> [[VEC_PHI]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP4]] = add <16 x i8> [[TMP3]], [[WIDE_LOAD2]] +; CHECK-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = and <16 x i32> [[VEC_PHI]], splat (i32 255) +; CHECK-NEXT: [[TMP21:%.*]] = add <16 x i32> [[TMP3]], [[TMP12]] +; CHECK-NEXT: [[TMP23:%.*]] = add 
<16 x i32> [[TMP21]], [[TMP8]] +; CHECK-NEXT: [[TMP4:%.*]] = trunc <16 x i32> [[TMP23]] to <16 x i8> +; CHECK-NEXT: [[TMP9]] = zext <16 x i8> [[TMP4]] to <16 x i32> ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP6:%.*]] = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> [[TMP4]]) ; CHECK-NEXT: [[TMP7:%.*]] = zext i8 [[TMP6]] to i32 -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; CHECK: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_VEC4:%.*]] = and i64 [[TMP0]], 2147483644 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> , i32 [[BC_MERGE_RDX]], i64 0 -; CHECK-NEXT: [[TMP9:%.*]] = trunc nuw <4 x i32> [[TMP8]] to <4 x i8> +; CHECK-NEXT: [[N_MOD_VF3:%.*]] = urem i64 [[TMP0]], 4 +; CHECK-NEXT: [[N_VEC4:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF3]] +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0 ; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] ; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX5:%.*]] = phi i64 [ 
[[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i8> [ [[TMP9]], %[[VEC_EPILOG_PH]] ], [ [[TMP13:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ [[TMP24]], %[[VEC_EPILOG_PH]] ], [ [[TMP22:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX5]] ; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP10]], align 1 +; CHECK-NEXT: [[TMP25:%.*]] = zext <4 x i8> [[WIDE_LOAD7]] to <4 x i32> ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX5]] ; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1 -; CHECK-NEXT: [[TMP12:%.*]] = add <4 x i8> [[VEC_PHI6]], [[WIDE_LOAD7]] -; CHECK-NEXT: [[TMP13]] = add <4 x i8> [[TMP12]], [[WIDE_LOAD8]] +; CHECK-NEXT: [[TMP17:%.*]] = zext <4 x i8> [[WIDE_LOAD8]] to <4 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = and <4 x i32> [[VEC_PHI6]], splat (i32 255) +; CHECK-NEXT: [[TMP19:%.*]] = add <4 x i32> [[TMP25]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = add <4 x i32> [[TMP19]], [[TMP17]] +; CHECK-NEXT: [[TMP13:%.*]] = trunc <4 x i32> [[TMP20]] to <4 x i8> +; CHECK-NEXT: [[TMP22]] = zext <4 x i8> [[TMP13]] to <4 x i32> ; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX5]], 4 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC4]] ; CHECK-NEXT: br i1 [[TMP14]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP15:%.*]] = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> [[TMP13]]) ; CHECK-NEXT: [[TMP16:%.*]] = zext i8 [[TMP15]] to i32 -; CHECK-NEXT: [[CMP_N10:%.*]] = icmp eq i64 [[N_VEC4]], [[TMP0]] +; CHECK-NEXT: [[CMP_N10:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC4]] ; CHECK-NEXT: br i1 [[CMP_N10]], [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE]], label %[[VEC_EPILOG_SCALAR_PH]] ; CHECK: 
[[VEC_EPILOG_SCALAR_PH]]: ; @@ -124,28 +134,34 @@ define i16 @reduction_i16_1(ptr nocapture readonly %a, ptr nocapture readonly %b ; CHECK-NEXT: [[CMP_16:%.*]] = icmp sgt i32 [[N]], 0 ; CHECK-NEXT: br i1 [[CMP_16]], label %[[FOR_BODY_PREHEADER:.*]], [[FOR_COND_CLEANUP:label %.*]] ; CHECK: [[FOR_BODY_PREHEADER]]: -; CHECK-NEXT: [[TMP0:%.*]] = zext nneg i32 [[N]] to i64 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 8 +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483640 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i16> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [2 x i8], ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP9:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8], ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x i16>, ptr [[TMP2]], align 2 -; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i16> [[VEC_PHI]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP4]] = add <8 x i16> [[TMP3]], [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP8:%.*]] = zext <8 x i16> 
[[WIDE_LOAD1]] to <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i32> [[VEC_PHI]], splat (i32 65535) +; CHECK-NEXT: [[TMP11:%.*]] = add <8 x i32> [[TMP3]], [[TMP10]] +; CHECK-NEXT: [[TMP12:%.*]] = add <8 x i32> [[TMP11]], [[TMP8]] +; CHECK-NEXT: [[TMP4:%.*]] = trunc <8 x i32> [[TMP12]] to <8 x i16> +; CHECK-NEXT: [[TMP9]] = zext <8 x i16> [[TMP4]] to <8 x i32> ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP6:%.*]] = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> [[TMP4]]) ; CHECK-NEXT: [[TMP7:%.*]] = zext i16 [[TMP6]] to i32 -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:label %.*]], label %[[SCALAR_PH]] ; CHECK: [[SCALAR_PH]]: ; @@ -198,64 +214,69 @@ define i16 @reduction_i16_2(ptr nocapture readonly %a, ptr nocapture readonly %b ; CHECK-NEXT: [[CMP_14:%.*]] = icmp sgt i32 [[N]], 0 ; CHECK-NEXT: br i1 [[CMP_14]], label %[[ITER_CHECK:.*]], [[FOR_COND_CLEANUP:label %.*]] ; CHECK: [[ITER_CHECK]]: -; CHECK-NEXT: [[TMP0:%.*]] = zext nneg i32 [[N]] to i64 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] ; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: -; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[N]], 16 +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 16 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] ; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[TMP0]], 12 
-; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], 2147483632 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i16> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP6:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 -; CHECK-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i16> +; CHECK-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 -; CHECK-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = add <16 x i16> [[VEC_PHI]], [[TMP2]] -; CHECK-NEXT: [[TMP6]] = add <16 x i16> [[TMP5]], [[TMP4]] +; CHECK-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = and <16 x i32> [[VEC_PHI]], splat (i32 65535) +; CHECK-NEXT: [[TMP15:%.*]] = add <16 x i32> [[TMP2]], [[TMP5]] +; CHECK-NEXT: [[TMP23:%.*]] = add <16 x i32> [[TMP15]], [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = trunc <16 x i32> [[TMP23]] to <16 x i16> +; CHECK-NEXT: [[TMP10]] = zext <16 x i16> [[TMP6]] to <16 x i32> ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[MIDDLE_BLOCK]]: ; CHECK-NEXT: [[TMP8:%.*]] = call i16 @llvm.vector.reduce.add.v16i16(<16 x 
i16> [[TMP6]]) ; CHECK-NEXT: [[TMP9:%.*]] = zext i16 [[TMP8]] to i32 -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[TMP0]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] ; CHECK: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 4 ; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3]] ; CHECK: [[VEC_EPILOG_PH]]: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP9]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_VEC4:%.*]] = and i64 [[TMP0]], 2147483644 -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> , i32 [[BC_MERGE_RDX]], i64 0 +; CHECK-NEXT: [[N_MOD_VF3:%.*]] = urem i64 [[TMP0]], 4 +; CHECK-NEXT: [[N_VEC4:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF3]] +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0 ; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] ; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ [[TMP10]], %[[VEC_EPILOG_PH]] ], [ [[TMP17:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP16:%.*]] = phi <4 x i32> [ [[TMP24]], %[[VEC_EPILOG_PH]] ], [ [[TMP22:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX5]] ; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i8>, ptr [[TMP11]], align 1 ; CHECK-NEXT: [[TMP12:%.*]] = zext <4 x i8> [[WIDE_LOAD7]] to <4 x 
i32> ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX5]] ; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i8>, ptr [[TMP13]], align 1 ; CHECK-NEXT: [[TMP14:%.*]] = zext <4 x i8> [[WIDE_LOAD8]] to <4 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[VEC_PHI6]], [[TMP12]] -; CHECK-NEXT: [[TMP16:%.*]] = add nsw <4 x i32> [[TMP15]], [[TMP14]] -; CHECK-NEXT: [[TMP17]] = and <4 x i32> [[TMP16]], splat (i32 65535) +; CHECK-NEXT: [[TMP17:%.*]] = and <4 x i32> [[TMP16]], splat (i32 65535) +; CHECK-NEXT: [[TMP25:%.*]] = add <4 x i32> [[TMP12]], [[TMP17]] +; CHECK-NEXT: [[TMP26:%.*]] = add <4 x i32> [[TMP25]], [[TMP14]] +; CHECK-NEXT: [[TMP19:%.*]] = trunc <4 x i32> [[TMP26]] to <4 x i16> +; CHECK-NEXT: [[TMP22]] = zext <4 x i16> [[TMP19]] to <4 x i32> ; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX5]], 4 ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC4]] ; CHECK-NEXT: br i1 [[TMP18]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP19:%.*]] = trunc <4 x i32> [[TMP16]] to <4 x i16> ; CHECK-NEXT: [[TMP20:%.*]] = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> [[TMP19]]) ; CHECK-NEXT: [[TMP21:%.*]] = zext i16 [[TMP20]] to i32 -; CHECK-NEXT: [[CMP_N10:%.*]] = icmp eq i64 [[N_VEC4]], [[TMP0]] +; CHECK-NEXT: [[CMP_N10:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC4]] ; CHECK-NEXT: br i1 [[CMP_N10]], [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE]], label %[[VEC_EPILOG_SCALAR_PH]] ; CHECK: [[VEC_EPILOG_SCALAR_PH]]: ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll index 81f932d135f37..a4e5dc8f26bfd 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py 
UTC_ARGS: --filter-out-after "^scalar.ph:" -; RUN: opt -passes=loop-vectorize,instcombine -mtriple aarch64-linux-gnu -mattr=+sve \ +; RUN: opt -passes=loop-vectorize -mtriple aarch64-linux-gnu -mattr=+sve \ ; RUN: -tail-folding-policy=dont-fold-tail -S %s -o - | FileCheck %s define void @cond_inv_load_i32i32i16(ptr noalias nocapture %a, ptr noalias nocapture readonly %cond, ptr noalias nocapture readonly %inv, i64 %n) #0 { @@ -11,20 +11,20 @@ define void @cond_inv_load_i32i32i16(ptr noalias nocapture %a, ptr noalias nocap ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 -; CHECK-NEXT: [[DOTNOT:%.*]] = sub nsw i64 0, [[TMP4]] -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNOT]] +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP4]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[INV:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [4 x i8], ptr [[COND:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[COND:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP5]], align 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp ne [[WIDE_LOAD]], zeroinitializer ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i16.nxv4p0( align 2 [[BROADCAST_SPLAT]], [[TMP6]], poison) ; CHECK-NEXT: [[TMP7:%.*]] = sext [[WIDE_MASKED_GATHER]] to -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr [4 x i8], ptr [[A:%.*]], i64 [[INDEX]] +; 
CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP7]], ptr align 4 [[TMP8]], [[TMP6]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -69,19 +69,19 @@ define void @cond_inv_load_f64f64f64(ptr noalias nocapture %a, ptr noalias nocap ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 -; CHECK-NEXT: [[DOTNOT:%.*]] = sub nsw i64 0, [[TMP4]] -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNOT]] +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP4]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[INV:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [8 x i8], ptr [[COND:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[COND:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP5]], align 8 ; CHECK-NEXT: [[TMP6:%.*]] = fcmp ogt [[WIDE_LOAD]], splat (double 4.000000e-01) ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4f64.nxv4p0( align 8 [[BROADCAST_SPLAT]], [[TMP6]], poison) -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr [8 x i8], ptr [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr double, ptr [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: call void @llvm.masked.store.nxv4f64.p0( [[WIDE_MASKED_GATHER]], ptr align 8 
[[TMP7]], [[TMP6]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -125,23 +125,23 @@ define void @invariant_load_cond(ptr noalias nocapture %a, ptr nocapture readonl ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 -; CHECK-NEXT: [[DOTNOT:%.*]] = sub nsw i64 0, [[TMP4]] -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNOT]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[B:%.*]], i64 168 +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP4]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 42 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[TMP5]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [4 x i8], ptr [[COND:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[COND:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4 ; CHECK-NEXT: [[TMP7:%.*]] = icmp ne [[WIDE_LOAD]], zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr [4 x i8], ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP8]], [[TMP7]], poison) ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( align 4 [[BROADCAST_SPLAT]], [[TMP7]], poison) 
; CHECK-NEXT: [[TMP9:%.*]] = add nsw [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_LOAD]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr [4 x i8], ptr [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP9]], ptr align 4 [[TMP10]], [[TMP7]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-extract-last-veclane.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-extract-last-veclane.ll index 68107d13f9358..834e915d4685e 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-extract-last-veclane.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-extract-last-veclane.ll @@ -1,4 +1,4 @@ -; RUN: opt -passes=loop-vectorize,instcombine -mtriple aarch64-linux-gnu -S < %s | FileCheck %s +; RUN: opt -passes=loop-vectorize -mtriple aarch64-linux-gnu -S < %s | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-unknown-linux-gnu" @@ -9,8 +9,8 @@ define void @inv_store_last_lane(ptr noalias nocapture %a, ptr noalias nocapture ; CHECK: store %[[VEC_VAL:.*]], ptr ; CHECK: middle.block: ; CHECK: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: %[[VSCALE2:.*]] = shl nuw i32 %[[VSCALE]], 2 -; CHECK-NEXT: %[[LAST_LANE:.*]] = add i32 %[[VSCALE2]], -1 +; CHECK-NEXT: %[[VSCALE2:.*]] = mul nuw i32 %[[VSCALE]], 4 +; CHECK-NEXT: %[[LAST_LANE:.*]] = sub i32 %[[VSCALE2]], 1 ; CHECK-NEXT: %{{.*}} = extractelement %[[VEC_VAL]], i32 %[[LAST_LANE]] entry: @@ -39,10 +39,9 @@ define float @ret_last_lane(ptr noalias nocapture %a, ptr noalias nocapture read ; CHECK: store %[[VEC_VAL:.*]], ptr ; CHECK: middle.block: ; CHECK: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: %[[VSCALE2:.*]] = shl nuw i32 %[[VSCALE]], 2 -; CHECK-NEXT: %[[LAST_LANE:.*]] = add i32 
%[[VSCALE2]], -1 +; CHECK-NEXT: %[[VSCALE2:.*]] = mul nuw i32 %[[VSCALE]], 4 +; CHECK-NEXT: %[[LAST_LANE:.*]] = sub i32 %[[VSCALE2]], 1 ; CHECK-NEXT: %{{.*}} = extractelement %[[VEC_VAL]], i32 %[[LAST_LANE]] - entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll index 06380db718e61..981427ece49db 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^scalar.ph:" -; RUN: opt -passes=loop-vectorize,instcombine -mtriple aarch64-linux-gnu -mattr=+sve \ +; RUN: opt -passes=loop-vectorize -mtriple aarch64-linux-gnu -mattr=+sve \ ; RUN: -tail-folding-policy=dont-fold-tail -S %s -force-target-instruction-cost=1 -o - | FileCheck %s define void @gather_nxv4i32_ind64(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i64 %n) #0 { @@ -11,17 +11,17 @@ define void @gather_nxv4i32_ind64(ptr noalias nocapture readonly %a, ptr noalias ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 -; CHECK-NEXT: [[DOTNOT:%.*]] = sub nsw i64 0, [[TMP4]] -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNOT]] +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP4]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [8 x i8], ptr [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: 
[[TMP5:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP5]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [4 x i8], ptr [[A:%.*]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], [[WIDE_LOAD]] ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0( align 4 [[TMP6]], splat (i1 true), poison) -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [4 x i8], ptr [[C:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[C:%.*]], i64 [[INDEX]] ; CHECK-NEXT: store [[WIDE_MASKED_GATHER]], ptr [[TMP7]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -62,18 +62,18 @@ define void @scatter_nxv4i32_ind32(ptr noalias nocapture %a, ptr noalias nocaptu ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 -; CHECK-NEXT: [[DOTNOT:%.*]] = sub nsw i64 0, [[TMP4]] -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNOT]] +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP4]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [4 x i8], ptr [[C:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[C:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP5]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [4 x i8], ptr [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr 
[[B:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP6]], align 4 ; CHECK-NEXT: [[TMP7:%.*]] = sext [[WIDE_LOAD1]] to -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [4 x i8], ptr [[A:%.*]], [[TMP7]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], [[TMP7]] ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4f32.nxv4p0( [[WIDE_LOAD]], align 4 [[TMP8]], splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -112,15 +112,15 @@ define void @scatter_inv_nxv4i32(ptr noalias nocapture %inv, ptr noalias nocaptu ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 -; CHECK-NEXT: [[DOTNOT:%.*]] = sub nsw i64 0, [[TMP4]] -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNOT]] +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP4]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[INV:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [4 x i8], ptr [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP5]], align 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp ne [[WIDE_LOAD]], zeroinitializer ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( splat (i32 3), align 4 [[BROADCAST_SPLAT]], [[TMP6]]) @@ -164,15 +164,15 @@ define void 
@gather_inv_nxv4i32(ptr noalias nocapture %a, ptr noalias nocapture ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 -; CHECK-NEXT: [[DOTNOT:%.*]] = sub nsw i64 0, [[TMP4]] -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNOT]] +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP4]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[INV:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr [4 x i8], ptr [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP5]], align 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp sgt [[WIDE_LOAD]], splat (i32 3) ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( align 4 [[BROADCAST_SPLAT]], [[TMP6]], poison) @@ -214,40 +214,41 @@ define void @gather_nxv4i32_ind64_stride2(ptr noalias nocapture %a, ptr noalias ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[N:%.*]], [[TMP1]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[N:%.*]], [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VECTOR_PH:%.*]], label [[SCALAR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2 -; CHECK-NEXT: [[TMP7:%.*]] 
= shl nuw nsw i64 [[TMP2]], 3 -; CHECK-NEXT: [[DOTNEG:%.*]] = add nsw i64 [[TMP7]], -1 -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]] +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP2]], 2 +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw i64 [[TMP4]], 1 +; CHECK-NEXT: [[N_VEC:%.*]] = urem i64 [[N]], [[TMP7]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[N_VEC]], 0 ; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[TMP7]], i64 [[N_VEC]] ; CHECK-NEXT: [[N_VEC1:%.*]] = sub i64 [[N]], [[TMP6]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP4]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP9]], 1 ; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], [[TMP3]] -; CHECK-NEXT: [[DOTIDX1:%.*]] = shl i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 [[DOTIDX1]] -; CHECK-NEXT: [[DOTIDX3:%.*]] = shl i64 [[TMP8]], 3 -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[DOTIDX3]] +; CHECK-NEXT: [[TMP13:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = shl i64 [[TMP8]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP11]] ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP10]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8f32( [[WIDE_VEC]]) ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 ; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load , ptr [[TMP15]], align 4 ; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8f32( [[WIDE_VEC1]]) ; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 -; CHECK-NEXT: [[TMP12:%.*]] = 
getelementptr inbounds [4 x i8], ptr [[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [4 x i8], ptr [[TMP12]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i64 [[TMP4]] ; CHECK-NEXT: store [[WIDE_MASKED_GATHER]], ptr [[TMP12]], align 4 ; CHECK-NEXT: store [[WIDE_MASKED_GATHER2]], ptr [[TMP14]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]] ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC1]] ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[SCALAR_PH]] +; CHECK-NEXT: br label [[VECTOR_PH]] ; CHECK: scalar.ph: ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll index 9572719560dc0..03a5b9897bc82 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^scalar.ph:" -; RUN: opt -passes=loop-vectorize,instcombine -force-target-instruction-cost=1 \ +; RUN: opt -passes=loop-vectorize -force-target-instruction-cost=1 \ ; RUN: -tail-folding-policy=dont-fold-tail < %s -S | FileCheck %s target triple = "aarch64-linux-gnu" @@ -30,17 +30,18 @@ define void @cond_ind64(ptr noalias nocapture %a, ptr noalias nocapture readonly ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP9:%.*]] = trunc [[VEC_IND]] to -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr [4 x i8], ptr [[B:%.*]], i64 
[[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = and [[VEC_IND]], splat (i64 1) +; CHECK-NEXT: [[TMP9:%.*]] = icmp ne [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[B:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP10]], [[TMP9]], poison) -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr [4 x i8], ptr [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[WIDE_MASKED_LOAD]], ptr align 4 [[TMP11]], [[TMP9]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll index d90f524113cbb..e9c4fd5c826b7 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^scalar.ph:" -; RUN: opt -mtriple=aarch64-none-linux-gnu -S -passes=loop-vectorize,instcombine -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true -mattr=+sve -scalable-vectorization=on -runtime-memory-check-threshold=24 < %s | FileCheck %s +; RUN: opt -mtriple=aarch64-none-linux-gnu -S -passes=loop-vectorize -force-vector-width=4 
-force-vector-interleave=1 -enable-interleaved-mem-accesses=true -mattr=+sve -scalable-vectorization=on -runtime-memory-check-threshold=24 < %s | FileCheck %s target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" @@ -27,7 +27,7 @@ define void @test_array_load2_store2(i32 %C, i32 %D) #1 { ; CHECK-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[C:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[D:%.*]], i64 0 @@ -36,14 +36,14 @@ define void @test_array_load2_store2(i32 %C, i32 %D) #1 { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [4 x i8], ptr @AB, i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [1024 x i32], ptr @AB, i64 0, i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP2]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) ; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 ; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 ; CHECK-NEXT: [[TMP6:%.*]] = add nsw [[TMP3]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP7:%.*]] = mul nsw [[TMP4]], [[BROADCAST_SPLAT2]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [4 x i8], ptr @CD, i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP6]], [[TMP7]]) ; CHECK-NEXT: store 
[[INTERLEAVED_VEC]], ptr [[TMP8]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] @@ -100,31 +100,31 @@ define void @test_array_load2_i16_store2(i32 %C, i32 %D) #1 { ; CHECK-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[C:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[D:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.stepvector.nxv4i64() -; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw [[TMP2]], splat (i64 1) -; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP0]], 3 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw nsw [[TMP2]], splat (i64 2) +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP1]], 1 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i64 [[TMP4]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x i8], ptr @AB_i16, [[VEC_IND]] +; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i16], ptr @AB_i16, i64 0, [[VEC_IND]] ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i16.nxv4p0( align 2 [[TMP6]], splat (i1 true), poison) ; CHECK-NEXT: [[TMP7:%.*]] = or disjoint 
[[VEC_IND]], splat (i64 1) -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8], ptr @AB_i16, [[TMP7]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i16], ptr @AB_i16, i64 0, [[TMP7]] ; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv4i16.nxv4p0( align 2 [[TMP8]], splat (i1 true), poison) ; CHECK-NEXT: [[TMP9:%.*]] = sext [[WIDE_MASKED_GATHER]] to -; CHECK-NEXT: [[TMP10:%.*]] = add nsw [[BROADCAST_SPLAT]], [[TMP9]] -; CHECK-NEXT: [[DOTIDX:%.*]] = shl i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr @CD, i64 [[DOTIDX]] +; CHECK-NEXT: [[TMP10:%.*]] = add nsw [[TMP9]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[TMP5]] ; CHECK-NEXT: [[TMP11:%.*]] = sext [[WIDE_MASKED_GATHER1]] to -; CHECK-NEXT: [[TMP12:%.*]] = mul nsw [[BROADCAST_SPLAT3]], [[TMP11]] +; CHECK-NEXT: [[TMP12:%.*]] = mul nsw [[TMP11]], [[BROADCAST_SPLAT3]] ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP10]], [[TMP12]]) ; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP14]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] @@ -184,22 +184,22 @@ define void @test_array_load2_store2_i16(i32 noundef %C, i32 noundef %D) #1 { ; CHECK-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[C:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[D:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.stepvector.nxv4i64() -; CHECK-NEXT: 
[[TMP3:%.*]] = shl nuw nsw [[TMP2]], splat (i64 1) -; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP0]], 3 +; CHECK-NEXT: [[TMP3:%.*]] = mul nuw nsw [[TMP2]], splat (i64 2) +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP1]], 1 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i64 [[TMP4]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[DOTIDX:%.*]] = shl i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr @AB, i64 [[DOTIDX]] +; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], ptr @AB, i64 0, i64 [[TMP5]] ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP6]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) ; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 @@ -207,11 +207,11 @@ define void @test_array_load2_store2_i16(i32 noundef %C, i32 noundef %D) #1 { ; CHECK-NEXT: [[TMP9:%.*]] = or disjoint [[VEC_IND]], splat (i64 1) ; CHECK-NEXT: [[TMP10:%.*]] = add nsw [[TMP7]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP11:%.*]] = trunc [[TMP10]] to -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2 x i8], ptr @CD_i16, [[VEC_IND]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1024 x i16], ptr @CD_i16, i64 0, [[VEC_IND]] ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i16.nxv4p0( [[TMP11]], align 2 [[TMP12]], splat (i1 true)) ; CHECK-NEXT: [[TMP13:%.*]] = mul nsw [[TMP8]], [[BROADCAST_SPLAT2]] ; CHECK-NEXT: [[TMP14:%.*]] = trunc [[TMP13]] to -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2 x i8], ptr @CD_i16, [[TMP9]] +; 
CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x i16], ptr @CD_i16, i64 0, [[TMP9]] ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i16.nxv4p0( [[TMP14]], align 2 [[TMP15]], splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[BROADCAST_SPLAT4]] @@ -258,7 +258,7 @@ define i32 @test_struct_load6(ptr %S) #1 { ; CHECK-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 ; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.stepvector.nxv4i64() ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP1]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer @@ -267,22 +267,17 @@ define i32 @test_struct_load6(ptr %S) #1 { ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP2]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [24 x i8], ptr [[S:%.*]], [[VEC_IND]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_ST6:%.*]], ptr [[S:%.*]], [[VEC_IND]], i32 0 ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( align 4 [[TMP5]], splat (i1 true), poison) -; CHECK-NEXT: [[DOTSPLIT:%.*]] = getelementptr inbounds [24 x i8], ptr [[S]], [[VEC_IND]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, [[DOTSPLIT]], i64 4 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_ST6]], ptr [[S]], [[VEC_IND]], i32 1 ; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( align 4 [[TMP6]], splat (i1 true), poison) -; CHECK-NEXT: 
[[DOTSPLIT6:%.*]] = getelementptr inbounds [24 x i8], ptr [[S]], [[VEC_IND]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, [[DOTSPLIT6]], i64 8 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_ST6]], ptr [[S]], [[VEC_IND]], i32 2 ; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( align 4 [[TMP7]], splat (i1 true), poison) -; CHECK-NEXT: [[DOTSPLIT7:%.*]] = getelementptr inbounds [24 x i8], ptr [[S]], [[VEC_IND]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, [[DOTSPLIT7]], i64 12 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_ST6]], ptr [[S]], [[VEC_IND]], i32 3 ; CHECK-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( align 4 [[TMP8]], splat (i1 true), poison) -; CHECK-NEXT: [[DOTSPLIT8:%.*]] = getelementptr inbounds [24 x i8], ptr [[S]], [[VEC_IND]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, [[DOTSPLIT8]], i64 16 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_ST6]], ptr [[S]], [[VEC_IND]], i32 4 ; CHECK-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( align 4 [[TMP9]], splat (i1 true), poison) -; CHECK-NEXT: [[DOTSPLIT9:%.*]] = getelementptr inbounds [24 x i8], ptr [[S]], [[VEC_IND]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, [[DOTSPLIT9]], i64 20 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_ST6]], ptr [[S]], [[VEC_IND]], i32 5 ; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( align 4 [[TMP10]], splat (i1 true), poison) ; CHECK-NEXT: [[TMP11:%.*]] = add [[WIDE_MASKED_GATHER]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP11]], [[WIDE_MASKED_GATHER2]] @@ -295,6 +290,7 @@ define i32 @test_struct_load6(ptr %S) #1 { ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: 
[[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP16]]) ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH:%.*]] ; CHECK: scalar.ph: ; @@ -358,11 +354,11 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 ; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.stepvector.nxv4i32() ; CHECK-NEXT: [[INDUCTION:%.*]] = sub splat (i32 1023), [[TMP2]] -; CHECK-NEXT: [[TMP3:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32 -; CHECK-NEXT: [[DOTNEG:%.*]] = sub nsw i32 0, [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: [[DOTNEG:%.*]] = sub i32 0, [[TMP4]] ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[DOTNEG]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -370,10 +366,10 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [8 x i8], ptr [[A:%.*]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP0]], 3 -; CHECK-NEXT: [[TMP8:%.*]] = sub nsw i64 2, [[TMP6]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP5]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = sub nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP8]], -2 +; CHECK-NEXT: 
[[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i64 [[TMP14]] ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP9]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) ; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 @@ -382,8 +378,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: [[REVERSE1:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP11]]) ; CHECK-NEXT: [[TMP12:%.*]] = add nsw [[REVERSE]], [[VEC_IND]] ; CHECK-NEXT: [[TMP13:%.*]] = sub nsw [[REVERSE1]], [[VEC_IND]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [8 x i8], ptr [[B:%.*]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP14]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 [[TMP14]] ; CHECK-NEXT: [[REVERSE2:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP12]]) ; CHECK-NEXT: [[REVERSE3:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP13]]) ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[REVERSE2]], [[REVERSE3]]) @@ -437,20 +433,23 @@ define void @even_load_static_tc(ptr noalias nocapture readonly %A, ptr noalias ; CHECK-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 512, [[TMP1]] -; CHECK-NEXT: [[IND_END:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 512, [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[TMP1]], i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 512, [[TMP3]] +; CHECK-NEXT: [[TMP7:%.*]] = shl 
i64 [[N_VEC]], 1 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[DOTIDX:%.*]] = shl i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[DOTIDX]] +; CHECK-NEXT: [[TMP10:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]] ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP4]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) ; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 ; CHECK-NEXT: [[TMP6:%.*]] = shl nsw [[TMP5]], splat (i32 1) -; CHECK-NEXT: [[TMP7:%.*]] = and i64 [[INDEX]], 9223372036854775804 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [4 x i8], ptr [[B:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = lshr exact i64 [[TMP10]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP11]] ; CHECK-NEXT: store [[TMP6]], ptr [[TMP8]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -495,16 +494,15 @@ define void @even_load_dynamic_tc(ptr noalias nocapture readonly %A, ptr noalias ; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 2) ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[UMAX]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = add nuw i64 [[TMP1]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2 -; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT_NOT:%.*]] = icmp samesign ult i64 [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT_NOT:%.*]] = icmp ule i64 [[TMP4]], [[TMP3]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_NOT_NOT]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: 
[[TMP4:%.*]] = add nuw i64 [[TMP1]], 1 ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 2 -; CHECK-NEXT: [[TMP7:%.*]] = add nsw i64 [[TMP6]], -1 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[TMP4]], [[TMP7]] +; CHECK-NEXT: [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], [[TMP6]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP6]], i64 [[N_MOD_VF]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP4]], [[TMP9]] @@ -512,14 +510,14 @@ define void @even_load_dynamic_tc(ptr noalias nocapture readonly %A, ptr noalias ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[DOTIDX:%.*]] = shl i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[DOTIDX]] +; CHECK-NEXT: [[TMP10:%.*]] = shl i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]] ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP12]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) ; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 ; CHECK-NEXT: [[TMP14:%.*]] = shl nsw [[TMP13]], splat (i32 1) -; CHECK-NEXT: [[TMP15:%.*]] = and i64 [[INDEX]], 9223372036854775804 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [4 x i8], ptr [[B:%.*]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP15:%.*]] = lshr exact i64 [[TMP10]], 1 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP15]] ; CHECK-NEXT: store [[TMP14]], ptr [[TMP16]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -572,7 +570,7 @@ define void @load_gap_reverse(ptr noalias nocapture readonly 
%P1, ptr noalias no ; CHECK-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[X:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.stepvector.nxv4i64() @@ -585,9 +583,8 @@ define void @load_gap_reverse(ptr noalias nocapture readonly %P1, ptr noalias no ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = add nsw [[BROADCAST_SPLAT1]], [[VEC_IND]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [16 x i8], ptr [[P1:%.*]], [[VEC_IND]] -; CHECK-NEXT: [[DOTSPLIT:%.*]] = getelementptr inbounds [16 x i8], ptr [[P2:%.*]], [[VEC_IND]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, [[DOTSPLIT]], i64 8 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[PAIR:%.*]], ptr [[P1:%.*]], [[VEC_IND]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P2:%.*]], [[VEC_IND]], i32 1 ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i64.nxv4p0( align 8 [[TMP6]], splat (i1 true), poison) ; CHECK-NEXT: [[TMP7:%.*]] = sub nsw [[WIDE_MASKED_GATHER]], [[VEC_IND]] ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i64.nxv4p0( [[TMP4]], align 8 [[TMP5]], splat (i1 true)) @@ -636,19 +633,20 @@ define void @mixed_load2_store2(ptr noalias nocapture readonly %A, ptr noalias n ; CHECK-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 ; 
CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [4 x i8], ptr [[A:%.*]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP2]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) ; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 ; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 ; CHECK-NEXT: [[TMP6:%.*]] = mul nsw [[TMP4]], [[TMP3]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [4 x i8], ptr [[B:%.*]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load , ptr [[TMP2]], align 4 +; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC1]]) ; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 ; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 1 ; CHECK-NEXT: [[TMP9:%.*]] = add nsw [[TMP8]], [[TMP7]] @@ -722,13 +720,13 @@ define void @int_float_struct(ptr nocapture readonly %p) #0 { ; CHECK-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] ; 
CHECK-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [8 x i8], ptr [[P:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_INTFLOAT:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 0 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP2]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) ; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 @@ -740,6 +738,8 @@ define void @int_float_struct(ptr nocapture readonly %p) #0 { ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: middle.block: +; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, [[TMP7]]) +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP6]]) ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH:%.*]] ; CHECK: scalar.ph: ; @@ -787,19 +787,18 @@ for.body: define void @PR27626_0(ptr %p, i32 %z, i64 %n) #1 { ; CHECK-LABEL: @PR27626_0( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT:%.*]] = icmp sgt i64 [[N:%.*]], [[TMP1]] +; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT:%.*]] = icmp ule i64 [[SMAX]], [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_NOT]], label [[VECTOR_PH:%.*]], label [[SCALAR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2 -; CHECK-NEXT: [[TMP4:%.*]] = add nsw i64 [[TMP3]], -1 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[SMAX]], 
[[TMP4]] +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[SMAX]], [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[TMP3]], i64 [[N_MOD_VF]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP6]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[SMAX]], [[TMP6]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[Z:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.stepvector.nxv4i64() @@ -807,12 +806,11 @@ define void @PR27626_0(ptr %p, i32 %z, i64 %n) #1 { ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [8 x i8], ptr [[P:%.*]], [[VEC_IND]] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP9]], [[SCALAR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], [[VEC_IND]], i32 0 ; CHECK-NEXT: [[TMP14:%.*]] = extractelement [[TMP12]], i64 0 -; CHECK-NEXT: [[DOTSPLIT:%.*]] = getelementptr inbounds [8 x i8], ptr [[P]], [[VEC_IND]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, [[DOTSPLIT]], i64 4 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], [[VEC_IND]], i32 1 ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[BROADCAST_SPLAT]], align 4 [[TMP12]], splat (i1 true)) ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP14]], align 4 ; 
CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) @@ -823,7 +821,7 @@ define void @PR27626_0(ptr %p, i32 %z, i64 %n) #1 { ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[SCALAR_PH]] +; CHECK-NEXT: br label [[VECTOR_PH]] ; CHECK: scalar.ph: ; entry: @@ -858,30 +856,28 @@ for.end: define i32 @PR27626_1(ptr %p, i64 %n) #1 { ; CHECK-LABEL: @PR27626_1( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT:%.*]] = icmp sgt i64 [[N:%.*]], [[TMP1]] +; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT:%.*]] = icmp ule i64 [[SMAX]], [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_NOT]], label [[VECTOR_PH:%.*]], label [[SCALAR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2 -; CHECK-NEXT: [[TMP4:%.*]] = add nsw i64 [[TMP3]], -1 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[SMAX]], [[TMP4]] +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[SMAX]], [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[TMP3]], i64 [[N_MOD_VF]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP6]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[SMAX]], [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.stepvector.nxv4i64() ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP3]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label 
[[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [8 x i8], ptr [[P:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[DOTSPLIT:%.*]] = getelementptr inbounds [8 x i8], ptr [[P]], [[VEC_IND]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, [[DOTSPLIT]], i64 4 +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP9]], [[SCALAR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[SCALAR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], [[VEC_IND]], i32 1 ; CHECK-NEXT: [[TMP15:%.*]] = extractelement [[TMP13]], i64 0 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP12]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) @@ -897,7 +893,7 @@ define i32 @PR27626_1(ptr %p, i64 %n) #1 { ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP17]]) -; CHECK-NEXT: br label [[SCALAR_PH]] +; CHECK-NEXT: br label [[VECTOR_PH]] ; CHECK: scalar.ph: ; entry: @@ -934,19 +930,18 @@ for.end: define void @PR27626_2(ptr %p, i64 %n, i32 %z) #1 { ; CHECK-LABEL: @PR27626_2( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) ; CHECK-NEXT: [[TMP0:%.*]] = call i64 
@llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT:%.*]] = icmp sgt i64 [[N:%.*]], [[TMP1]] +; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT:%.*]] = icmp ule i64 [[SMAX]], [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_NOT]], label [[VECTOR_PH:%.*]], label [[SCALAR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2 -; CHECK-NEXT: [[TMP4:%.*]] = add nsw i64 [[TMP3]], -1 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[SMAX]], [[TMP4]] +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[SMAX]], [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[TMP3]], i64 [[N_MOD_VF]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP6]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[SMAX]], [[TMP6]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[Z:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.stepvector.nxv4i64() @@ -954,12 +949,12 @@ define void @PR27626_2(ptr %p, i64 %n, i32 %z) #1 { ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [8 x i8], ptr [[P:%.*]], [[VEC_IND]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 -8 -; CHECK-NEXT: [[DOTSPLIT:%.*]] = getelementptr inbounds [8 x i8], ptr [[P]], [[VEC_IND]] -; CHECK-NEXT: 
[[TMP14:%.*]] = getelementptr inbounds i8, [[DOTSPLIT]], i64 4 +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP9]], [[SCALAR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = add nuw nsw i64 [[INDEX]], -1 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], [[VEC_IND]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP7]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], [[VEC_IND]], i32 1 ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[BROADCAST_SPLAT]], align 4 [[TMP12]], splat (i1 true)) ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP13]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) @@ -970,7 +965,7 @@ define void @PR27626_2(ptr %p, i64 %n, i32 %z) #1 { ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br label [[SCALAR_PH]] +; CHECK-NEXT: br label [[VECTOR_PH]] ; CHECK: scalar.ph: ; entry: @@ -1006,33 +1001,30 @@ for.end: define i32 @PR27626_3(ptr %p, i64 %n, i32 %z) #1 { ; CHECK-LABEL: @PR27626_3( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 -; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT:%.*]] = icmp sgt i64 [[N:%.*]], [[TMP1]] +; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT:%.*]] = icmp ule i64 [[SMAX]], [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_NOT]], label [[VECTOR_PH:%.*]], label [[SCALAR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 1) ; CHECK-NEXT: [[TMP2:%.*]] = call i64 
@llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2 -; CHECK-NEXT: [[TMP4:%.*]] = add nsw i64 [[TMP3]], -1 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = and i64 [[SMAX]], [[TMP4]] +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[SMAX]], [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[TMP3]], i64 [[N_MOD_VF]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP6]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[SMAX]], [[TMP6]] ; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.stepvector.nxv4i64() ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP3]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[SCALAR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP9]], [[SCALAR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[SCALAR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP12:%.*]] = add nuw nsw [[VEC_IND]], splat (i64 1) -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [8 x i8], ptr [[P:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[DOTSPLIT:%.*]] = getelementptr inbounds [8 x i8], ptr [[P]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw i8, ptr [[DOTSPLIT]], i64 4 -; CHECK-NEXT: [[DOTSPLIT3:%.*]] = getelementptr inbounds [8 x i8], ptr [[P]], [[TMP12]] -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, [[DOTSPLIT3]], 
i64 4 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[INDEX]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], [[TMP12]], i32 1 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP13]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) ; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 @@ -1047,7 +1039,7 @@ define i32 @PR27626_3(ptr %p, i64 %n, i32 %z) #1 { ; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP18]]) -; CHECK-NEXT: br label [[SCALAR_PH]] +; CHECK-NEXT: br label [[VECTOR_PH]] ; CHECK: scalar.ph: ; entry: @@ -1094,14 +1086,14 @@ define void @PR27626_4(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 { ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp samesign ult i64 [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2 -; CHECK-NEXT: [[DOTNOT:%.*]] = sub nsw i64 0, [[TMP7]] -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], [[DOTNOT]] -; CHECK-NEXT: [[IND_END:%.*]] = shl nuw i64 [[N_VEC]], 1 +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw i64 [[TMP6]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP7]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[N_VEC]], 1 ; CHECK-NEXT: 
[[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[X:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[Y:%.*]], i64 0 @@ -1109,15 +1101,15 @@ define void @PR27626_4(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 { ; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i32 [[Z:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.stepvector.nxv4i64() -; CHECK-NEXT: [[TMP9:%.*]] = shl nuw nsw [[TMP10]], splat (i64 1) -; CHECK-NEXT: [[TMP11:%.*]] = shl nuw nsw i64 [[TMP6]], 3 +; CHECK-NEXT: [[TMP9:%.*]] = mul nuw nsw [[TMP10]], splat (i64 2) +; CHECK-NEXT: [[TMP11:%.*]] = shl nuw nsw i64 [[TMP7]], 1 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, i64 [[TMP11]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector [[BROADCAST_SPLATINSERT5]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8], ptr [[A:%.*]], [[VEC_IND]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], [[VEC_IND]] ; CHECK-NEXT: [[P:%.*]] = extractelement [[TMP13]], i64 0 ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[BROADCAST_SPLAT]], align 4 [[TMP13]], splat (i1 true)) ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[BROADCAST_SPLAT2]], [[BROADCAST_SPLAT4]]) @@ -1170,15 +1162,15 @@ define void @PR27626_5(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 { ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 ; CHECK-NEXT: [[TMP3:%.*]] = 
call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp samesign ult i64 [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2 -; CHECK-NEXT: [[DOTNOT:%.*]] = sub nsw i64 0, [[TMP8]] -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], [[DOTNOT]] -; CHECK-NEXT: [[TMP11:%.*]] = shl nuw i64 [[N_VEC]], 1 -; CHECK-NEXT: [[IND_END:%.*]] = or disjoint i64 [[TMP11]], 3 +; CHECK-NEXT: [[TMP8:%.*]] = shl nuw i64 [[TMP7]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP8]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP9:%.*]] = shl i64 [[N_VEC]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = add i64 3, [[TMP9]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[X:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[Y:%.*]], i64 0 @@ -1186,20 +1178,20 @@ define void @PR27626_5(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 { ; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i32 [[Z:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.stepvector.nxv4i64() -; CHECK-NEXT: [[TMP19:%.*]] = shl nuw nsw [[TMP10]], splat (i64 1) -; CHECK-NEXT: [[INDUCTION:%.*]] = add nuw nsw [[TMP19]], splat (i64 3) -; CHECK-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP7]], 3 +; CHECK-NEXT: [[TMP14:%.*]] = mul nuw nsw [[TMP10]], splat (i64 2) +; CHECK-NEXT: [[INDUCTION:%.*]] = add nuw nsw splat (i64 3), [[TMP14]] +; CHECK-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 
[[TMP8]], 1 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, i64 [[TMP12]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector [[BROADCAST_SPLATINSERT5]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP13:%.*]] = add nsw [[VEC_IND]], splat (i64 -1) -; CHECK-NEXT: [[TMP14:%.*]] = add nsw [[VEC_IND]], splat (i64 -3) -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8], ptr [[A:%.*]], [[VEC_IND]] -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], [[TMP13]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], [[TMP14]] +; CHECK-NEXT: [[TMP19:%.*]] = sub [[VEC_IND]], splat (i64 1) +; CHECK-NEXT: [[TMP13:%.*]] = sub [[TMP19]], splat (i64 2) +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], [[VEC_IND]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], [[TMP19]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], [[TMP13]] ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[BROADCAST_SPLAT]], align 4 [[TMP16]], splat (i1 true)) ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[BROADCAST_SPLAT2]], align 4 [[TMP17]], splat (i1 true)) ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[BROADCAST_SPLAT4]], align 4 [[TMP15]], splat (i1 true)) @@ -1253,50 +1245,50 @@ define void @PR34743(ptr %a, ptr %b, i64 %n) #1 { ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[TMP3]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[N]], 1 -; CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], -4 +; CHECK-NEXT: [[TMP4:%.*]] 
= lshr i64 [[N]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP4]], 2 +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP8]], 4 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP5]] -; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[TMP6]], i64 4 ; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr nuw i8, ptr [[A]], i64 2 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP5]] -; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[TMP7]], i64 6 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP8]], 6 +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP7]] ; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[B]], [[SCEVGEP2]] -; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP1]], [[SCEVGEP]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP1]], [[TMP6]] ; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP10:%.*]] = shl nuw nsw i64 [[TMP9]], 2 -; CHECK-NEXT: [[DOTNOT:%.*]] = sub nsw i64 0, [[TMP10]] -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP1]], [[DOTNOT]] +; CHECK-NEXT: [[TMP10:%.*]] = shl nuw i64 [[TMP9]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], [[TMP10]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] ; CHECK-NEXT: [[IND_END:%.*]] = shl i64 [[N_VEC]], 1 ; CHECK-NEXT: [[TMP14:%.*]] = call @llvm.stepvector.nxv4i64() -; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw [[TMP14]], splat (i64 1) -; CHECK-NEXT: [[TMP13:%.*]] = shl nuw nsw i64 [[TMP9]], 3 +; CHECK-NEXT: [[TMP15:%.*]] = mul nuw nsw [[TMP14]], splat (i64 2) +; CHECK-NEXT: [[TMP13:%.*]] = shl nuw nsw i64 [[TMP10]], 1 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP13]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[TMP33:%.*]] = call 
i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i32 [[TMP33]], 2 -; CHECK-NEXT: [[TMP34:%.*]] = add nsw i32 [[TMP16]], -1 +; CHECK-NEXT: [[TMP16:%.*]] = mul nuw i32 [[TMP33]], 4 +; CHECK-NEXT: [[TMP34:%.*]] = sub i32 [[TMP16]], 1 ; CHECK-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement poison, i16 [[DOTPRE]], i32 [[TMP34]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[WIDE_MASKED_GATHER4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP15]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP18:%.*]] = or disjoint [[VEC_IND]], splat (i64 1) +; CHECK-NEXT: [[TMP17:%.*]] = add nuw nsw [[VEC_IND]], splat (i64 1) ; CHECK-NEXT: [[TMP19:%.*]] = add nuw nsw [[VEC_IND]], splat (i64 2) -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [2 x i8], ptr [[A]], [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i16, ptr [[A]], [[TMP17]] ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i16.nxv4p0( align 4 [[TMP20]], splat (i1 true), poison), !alias.scope [[META34:![0-9]+]] ; CHECK-NEXT: [[TMP21:%.*]] = sext [[WIDE_MASKED_GATHER]] to -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds [2 x i8], ptr [[A]], [[TMP19]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[A]], [[TMP19]] ; CHECK-NEXT: [[WIDE_MASKED_GATHER4]] = call @llvm.masked.gather.nxv4i16.nxv4p0( align 4 [[TMP22]], splat (i1 true), poison), !alias.scope [[META34]] ; CHECK-NEXT: [[TMP23:%.*]] = call @llvm.vector.splice.right.nxv4i16( [[VECTOR_RECUR]], [[WIDE_MASKED_GATHER4]], i32 1) ; CHECK-NEXT: [[TMP24:%.*]] = sext [[TMP23]] to ; CHECK-NEXT: [[TMP25:%.*]] = sext [[WIDE_MASKED_GATHER4]] to ; CHECK-NEXT: [[TMP26:%.*]] = mul nsw [[TMP24]], [[TMP21]] ; CHECK-NEXT: [[TMP27:%.*]] = mul nsw [[TMP26]], 
[[TMP25]] -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds [4 x i8], ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: store [[TMP27]], ptr [[TMP28]], align 4, !alias.scope [[META37:![0-9]+]], !noalias [[META34]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[BROADCAST_SPLAT]] @@ -1304,8 +1296,8 @@ define void @PR34743(ptr %a, ptr %b, i64 %n) #1 { ; CHECK-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP30:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP31:%.*]] = shl nuw nsw i32 [[TMP30]], 2 -; CHECK-NEXT: [[TMP32:%.*]] = add nsw i32 [[TMP31]], -1 +; CHECK-NEXT: [[TMP31:%.*]] = mul nuw i32 [[TMP30]], 4 +; CHECK-NEXT: [[TMP32:%.*]] = sub i32 [[TMP31]], 1 ; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement [[WIDE_MASKED_GATHER4]], i32 [[TMP32]] ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]] @@ -1356,13 +1348,14 @@ define void @interleave_deinterleave_factor3(ptr writeonly noalias %dst, ptr rea ; CHECK-LABEL: @interleave_deinterleave_factor3( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP0]], 256 +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: 
[[TMP6:%.*]] = call @llvm.stepvector.nxv4i64() ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP3]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer @@ -1370,12 +1363,12 @@ define void @interleave_deinterleave_factor3(ptr writeonly noalias %dst, ptr rea ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [12 x i8], ptr [[A:%.*]], [[VEC_IND]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds [[STRUCT_XYZ:%.*]], ptr [[A:%.*]], [[VEC_IND]] ; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( align 4 [[TMP19]], splat (i1 true), poison) -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [12 x i8], ptr [[B:%.*]], [[VEC_IND]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT_XYZ]], ptr [[B:%.*]], [[VEC_IND]] ; CHECK-NEXT: [[TMP11:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( align 4 [[TMP20]], splat (i1 true), poison) ; CHECK-NEXT: [[TMP14:%.*]] = add nsw [[TMP11]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [12 x i8], ptr [[DST:%.*]], [[VEC_IND]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_XYZ]], ptr [[DST:%.*]], [[VEC_IND]] ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP14]], align 4 [[TMP10]], splat (i1 true)) ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds nuw i8, [[TMP19]], i64 4 ; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( align 4 [[TMP21]], splat (i1 true), poison) @@ -1396,7 +1389,7 @@ define void @interleave_deinterleave_factor3(ptr writeonly noalias %dst, ptr rea ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]] ; 
CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; @@ -1448,24 +1441,25 @@ define void @interleave_deinterleave(ptr writeonly noalias %dst, ptr readonly %a ; CHECK-LABEL: @interleave_deinterleave( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP0]], 256 +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [16 x i8], ptr [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP6]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , , , } @llvm.vector.deinterleave4.nxv16i32( [[WIDE_VEC]]) ; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 0 ; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 1 ; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 2 ; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 3 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [16 x i8], ptr [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP13:%.*]] = 
getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_VEC8:%.*]] = load , ptr [[TMP13]], align 4 ; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { , , , } @llvm.vector.deinterleave4.nxv16i32( [[WIDE_VEC8]]) ; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , , , } [[STRIDED_VEC2]], 0 @@ -1473,7 +1467,7 @@ define void @interleave_deinterleave(ptr writeonly noalias %dst, ptr readonly %a ; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { , , , } [[STRIDED_VEC2]], 2 ; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { , , , } [[STRIDED_VEC2]], 3 ; CHECK-NEXT: [[TMP20:%.*]] = add nsw [[TMP16]], [[TMP9]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [16 x i8], ptr [[DST:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[TMP22:%.*]] = sub nsw [[TMP10]], [[TMP17]] ; CHECK-NEXT: [[TMP23:%.*]] = shl [[TMP11]], [[TMP18]] ; CHECK-NEXT: [[TMP24:%.*]] = ashr [[TMP12]], [[TMP19]] @@ -1483,7 +1477,7 @@ define void @interleave_deinterleave(ptr writeonly noalias %dst, ptr readonly %a ; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; @@ -1547,11 +1541,11 @@ define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A, ; CHECK-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 ; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.stepvector.nxv4i32() ; CHECK-NEXT: [[INDUCTION:%.*]] = sub splat (i32 1023), [[TMP2]] -; CHECK-NEXT: 
[[TMP3:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32 -; CHECK-NEXT: [[TMP4:%.*]] = sub nsw i32 0, [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = sub i32 0, [[TMP5]] ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP4]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -1559,10 +1553,10 @@ define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A, ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [16 x i8], ptr [[A:%.*]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP0]], 4 -; CHECK-NEXT: [[TMP9:%.*]] = sub nsw i64 4, [[TMP6]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP5]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = sub nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], -4 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i64 [[TMP9]] ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP10]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , , , } @llvm.vector.deinterleave4.nxv16i32( [[WIDE_VEC]]) ; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 0 @@ -1577,8 +1571,8 @@ define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A, ; CHECK-NEXT: [[TMP18:%.*]] = sub nsw [[REVERSE3]], [[VEC_IND]] ; CHECK-NEXT: [[TMP19:%.*]] = mul nsw [[REVERSE4]], [[VEC_IND]] ; CHECK-NEXT: [[TMP20:%.*]] = shl nuw nsw [[REVERSE5]], [[VEC_IND]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds 
[16 x i8], ptr [[B:%.*]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds [4 x i8], ptr [[TMP21]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0 +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP9]] ; CHECK-NEXT: [[REVERSE6:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP17]]) ; CHECK-NEXT: [[REVERSE7:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP18]]) ; CHECK-NEXT: [[REVERSE8:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP19]]) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll index 56ddeb5c4019e..1362f87d64c5f 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^scalar.ph:" --version 2 -; RUN: opt -mtriple=aarch64-none-linux-gnu -S -passes=loop-vectorize,instcombine -mattr=+sve -force-vector-width=16 -force-vector-interleave=1 -scalable-vectorization=on -tail-folding-policy=dont-fold-tail %s 2>&1 | FileCheck %s -check-prefix=SCALAR_TAIL_FOLDING -; RUN: opt -mtriple=aarch64-none-linux-gnu -S -passes=loop-vectorize,instcombine -mattr=+sve -force-vector-width=16 -force-vector-interleave=1 -scalable-vectorization=on -tail-folding-policy=must-fold-tail %s 2>&1 | FileCheck %s -check-prefix=PREDICATED_TAIL_FOLDING +; RUN: opt -mtriple=aarch64-none-linux-gnu -S -passes=loop-vectorize -mattr=+sve -force-vector-width=16 -force-vector-interleave=1 -scalable-vectorization=on -tail-folding-policy=dont-fold-tail %s 2>&1 | FileCheck %s -check-prefix=SCALAR_TAIL_FOLDING +; RUN: opt -mtriple=aarch64-none-linux-gnu -S -passes=loop-vectorize -mattr=+sve -force-vector-width=16 
-force-vector-interleave=1 -scalable-vectorization=on -tail-folding-policy=must-fold-tail %s 2>&1 | FileCheck %s -check-prefix=PREDICATED_TAIL_FOLDING target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" @@ -27,53 +27,53 @@ define void @masked_strided1(ptr noalias nocapture readonly %p, ptr noalias noca ; SCALAR_TAIL_FOLDING-NEXT: entry: ; SCALAR_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 ; SCALAR_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() -; SCALAR_TAIL_FOLDING-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP0]], 64 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 1024, [[TMP1]] ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALAR_TAIL_FOLDING: vector.ph: -; SCALAR_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = call i32 @llvm.vscale.i32() -; SCALAR_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = shl nuw i32 [[TMP1]], 4 -; SCALAR_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP2]] -; SCALAR_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]] +; SCALAR_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub i32 1024, [[N_MOD_VF]] ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; SCALAR_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = call @llvm.stepvector.nxv16i32() -; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP2]], i64 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call @llvm.stepvector.nxv16i32() +; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 
[[TMP3]], i64 0 ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; SCALAR_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALAR_TAIL_FOLDING: vector.body: ; SCALAR_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[INDEX]], 1 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP6]] -; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP4]], [[TMP4]]) +; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = shl i32 [[INDEX]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[P]], i32 [[TMP6]] +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP5]], [[TMP5]]) ; SCALAR_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv32i8.p0(ptr align 1 [[TMP7]], [[INTERLEAVED_MASK]], poison) ; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[WIDE_MASKED_VEC]]) ; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 ; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = call @llvm.smax.nxv16i8( [[TMP8]], [[TMP9]]) -; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = sext i32 [[TMP5]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = getelementptr 
i8, ptr [[Q]], i64 [[TMP11]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = sub zeroinitializer, [[TMP10]] -; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP10]], [[TMP13]]) -; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK3:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP4]], [[TMP4]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = icmp slt [[TMP8]], [[TMP9]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = select [[TMP10]], [[TMP9]], [[TMP8]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[Q]], i32 [[TMP6]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = sub zeroinitializer, [[TMP11]] +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP11]], [[TMP13]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK3:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP5]], [[TMP5]]) ; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0( [[INTERLEAVED_VEC]], ptr align 1 [[TMP12]], [[INTERLEAVED_MASK3]]) -; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP2]] +; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]] ; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[BROADCAST_SPLAT2]] ; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; SCALAR_TAIL_FOLDING: middle.block: -; SCALAR_TAIL_FOLDING-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0 +; SCALAR_TAIL_FOLDING-NEXT: [[CMP_N:%.*]] = icmp eq i32 1024, [[N_VEC]] ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; SCALAR_TAIL_FOLDING: scalar.ph: ; ; PREDICATED_TAIL_FOLDING-LABEL: define void @masked_strided1 ; PREDICATED_TAIL_FOLDING-SAME: (ptr noalias readonly captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) 
#[[ATTR0:[0-9]+]] { ; PREDICATED_TAIL_FOLDING-NEXT: entry: +; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 ; PREDICATED_TAIL_FOLDING-NEXT: br label [[VECTOR_PH:%.*]] ; PREDICATED_TAIL_FOLDING: vector.ph: -; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 4 ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024) @@ -90,25 +90,25 @@ define void @masked_strided1(ptr noalias nocapture readonly %p, ptr noalias noca ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP3]], zeroinitializer ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[INDEX]], 1 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP6]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[P]], i32 [[TMP5]] ; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP4]], [[TMP4]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv32i8.p0(ptr align 1 [[TMP7]], [[INTERLEAVED_MASK]], poison) +; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv32i8.p0(ptr align 1 [[TMP6]], [[INTERLEAVED_MASK]], poison) ; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[WIDE_MASKED_VEC]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = call @llvm.smax.nxv16i8( [[TMP8]], [[TMP9]]) -; 
PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = sext i32 [[TMP5]] to i64 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP11]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = sub zeroinitializer, [[TMP10]] -; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP10]], [[TMP13]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = icmp slt [[TMP7]], [[TMP8]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = select [[TMP9]], [[TMP8]], [[TMP7]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[Q]], i32 [[TMP5]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = sub zeroinitializer, [[TMP10]] +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP10]], [[TMP12]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK3:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP4]], [[TMP4]]) -; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0( [[INTERLEAVED_VEC]], ptr align 1 [[TMP12]], [[INTERLEAVED_MASK3]]) +; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0( [[INTERLEAVED_VEC]], ptr align 1 [[TMP11]], [[INTERLEAVED_MASK3]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]] ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX_NEXT]], i32 1024) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = xor i1 [[TMP13]], true ; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[BROADCAST_SPLAT2]] -; PREDICATED_TAIL_FOLDING-NEXT: br 
i1 [[TMP14]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]] +; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; PREDICATED_TAIL_FOLDING: middle.block: ; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_END:%.*]] ; PREDICATED_TAIL_FOLDING: for.end: @@ -167,46 +167,45 @@ define void @masked_strided2(ptr noalias nocapture readnone %p, ptr noalias noca ; SCALAR_TAIL_FOLDING-NEXT: entry: ; SCALAR_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 ; SCALAR_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() -; SCALAR_TAIL_FOLDING-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP0]], 64 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 1024, [[TMP1]] ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALAR_TAIL_FOLDING: vector.ph: -; SCALAR_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = call i32 @llvm.vscale.i32() -; SCALAR_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = shl nuw i32 [[TMP1]], 4 -; SCALAR_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP2]] -; SCALAR_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]] +; SCALAR_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub i32 1024, [[N_MOD_VF]] ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; SCALAR_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = call @llvm.stepvector.nxv16i32() -; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP2]], i64 0 +; 
SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call @llvm.stepvector.nxv16i32() +; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP3]], i64 0 ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; SCALAR_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALAR_TAIL_FOLDING: vector.body: ; SCALAR_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = shl nuw nsw [[VEC_IND]], splat (i32 1) -; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = zext nneg [[TMP4]] to -; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP5]] +; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl nuw nsw [[VEC_IND]], splat (i32 1) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP5]] ; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( splat (i8 1), align 1 [[TMP6]], splat (i1 true)) ; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = or disjoint [[TMP4]], splat (i32 1) -; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = zext nneg [[TMP8]] to -; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP9]] -; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( splat (i8 2), align 1 [[TMP10]], [[TMP7]]) -; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP2]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = or disjoint [[TMP5]], splat (i32 1) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP8]] +; 
SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( splat (i8 2), align 1 [[TMP9]], [[TMP7]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]] ; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[BROADCAST_SPLAT2]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; SCALAR_TAIL_FOLDING: middle.block: -; SCALAR_TAIL_FOLDING-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0 +; SCALAR_TAIL_FOLDING-NEXT: [[CMP_N:%.*]] = icmp eq i32 1024, [[N_VEC]] ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; SCALAR_TAIL_FOLDING: scalar.ph: ; ; PREDICATED_TAIL_FOLDING-LABEL: define void @masked_strided2 ; PREDICATED_TAIL_FOLDING-SAME: (ptr noalias readnone captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) #[[ATTR0]] { ; PREDICATED_TAIL_FOLDING-NEXT: entry: +; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 ; PREDICATED_TAIL_FOLDING-NEXT: br label [[VECTOR_PH:%.*]] ; PREDICATED_TAIL_FOLDING: vector.ph: -; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 4 ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024) @@ -221,20 +220,19 @@ define void @masked_strided2(ptr noalias nocapture readnone %p, ptr noalias noca ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ 
[[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP2]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl nuw nsw [[VEC_IND]], splat (i32 1) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = zext nneg [[TMP3]] to -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP4]] -; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( splat (i8 1), align 1 [[TMP5]], [[ACTIVE_LANE_MASK]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP6]], zeroinitializer -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = or disjoint [[TMP3]], splat (i32 1) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = zext nneg [[TMP8]] to -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP9]] -; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( splat (i8 2), align 1 [[TMP10]], [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP3]] +; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( splat (i8 1), align 1 [[TMP4]], [[ACTIVE_LANE_MASK]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP5]], zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = or disjoint [[TMP3]], splat (i32 1) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP7]] +; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( splat (i8 2), align 1 [[TMP8]], [[TMP6]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]] ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call 
@llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX_NEXT]], i32 1024) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = xor i1 [[TMP9]], true ; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[BROADCAST_SPLAT2]] -; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP11]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP3:![0-9]+]] +; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; PREDICATED_TAIL_FOLDING: middle.block: ; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_END:%.*]] ; PREDICATED_TAIL_FOLDING: for.end: @@ -289,50 +287,49 @@ define void @masked_strided3(ptr noalias nocapture readnone %p, ptr noalias noca ; SCALAR_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD1]] to i32 ; SCALAR_TAIL_FOLDING-NEXT: [[CONV3:%.*]] = zext i8 [[GUARD2]] to i32 ; SCALAR_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() -; SCALAR_TAIL_FOLDING-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP0]], 64 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 1024, [[TMP1]] ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALAR_TAIL_FOLDING: vector.ph: -; SCALAR_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = call i32 @llvm.vscale.i32() -; SCALAR_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = shl nuw i32 [[TMP1]], 4 -; SCALAR_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP2]] -; SCALAR_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 
1024, [[TMP3]] +; SCALAR_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub i32 1024, [[N_MOD_VF]] ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[CONV3]], i64 0 ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer -; SCALAR_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = call @llvm.stepvector.nxv16i32() -; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i32 [[TMP2]], i64 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call @llvm.stepvector.nxv16i32() +; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i32 [[TMP3]], i64 0 ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer ; SCALAR_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALAR_TAIL_FOLDING: vector.body: ; SCALAR_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = shl nuw nsw [[VEC_IND]], splat (i32 1) -; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = zext nneg [[TMP4]] to -; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP6]] -; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( splat (i8 1), align 1 [[TMP7]], [[TMP5]]) +; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl nuw nsw [[VEC_IND]], splat (i32 1) +; 
SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP5]] +; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( splat (i8 1), align 1 [[TMP7]], [[TMP6]]) ; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT2]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = or disjoint [[TMP4]], splat (i32 1) -; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = zext nneg [[TMP9]] to -; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP10]] -; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( splat (i8 2), align 1 [[TMP11]], [[TMP8]]) -; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP2]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = or disjoint [[TMP5]], splat (i32 1) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP9]] +; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( splat (i8 2), align 1 [[TMP10]], [[TMP8]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]] ; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[BROADCAST_SPLAT4]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; SCALAR_TAIL_FOLDING: middle.block: -; SCALAR_TAIL_FOLDING-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0 +; SCALAR_TAIL_FOLDING-NEXT: [[CMP_N:%.*]] = icmp eq i32 1024, [[N_VEC]] ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; SCALAR_TAIL_FOLDING: 
scalar.ph: ; ; PREDICATED_TAIL_FOLDING-LABEL: define void @masked_strided3 ; PREDICATED_TAIL_FOLDING-SAME: (ptr noalias readnone captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i8 zeroext [[GUARD1:%.*]], i8 zeroext [[GUARD2:%.*]]) #[[ATTR0]] { ; PREDICATED_TAIL_FOLDING-NEXT: entry: +; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD1]] to i32 +; PREDICATED_TAIL_FOLDING-NEXT: [[CONV3:%.*]] = zext i8 [[GUARD2]] to i32 ; PREDICATED_TAIL_FOLDING-NEXT: br label [[VECTOR_PH:%.*]] ; PREDICATED_TAIL_FOLDING: vector.ph: -; PREDICATED_TAIL_FOLDING-NEXT: [[CONV3:%.*]] = zext i8 [[GUARD2]] to i32 -; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD1]] to i32 ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 4 ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024) @@ -351,20 +348,19 @@ define void @masked_strided3(ptr noalias nocapture readnone %p, ptr noalias noca ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl nuw nsw [[VEC_IND]], splat (i32 1) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP4]], zeroinitializer -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = zext nneg [[TMP3]] to -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP6]] -; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( splat (i8 1), align 1 [[TMP7]], [[TMP5]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT2]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP8]], zeroinitializer -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = or disjoint [[TMP3]], splat (i32 1) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = zext nneg [[TMP10]] to -; 
PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP11]] -; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( splat (i8 2), align 1 [[TMP12]], [[TMP9]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP3]] +; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( splat (i8 1), align 1 [[TMP6]], [[TMP5]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT2]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP7]], zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = or disjoint [[TMP3]], splat (i32 1) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP9]] +; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( splat (i8 2), align 1 [[TMP10]], [[TMP8]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]] ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX_NEXT]], i32 1024) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = xor i1 [[TMP11]], true ; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[BROADCAST_SPLAT4]] -; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP13]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP4:![0-9]+]] +; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; PREDICATED_TAIL_FOLDING: middle.block: ; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_END:%.*]] ; PREDICATED_TAIL_FOLDING: for.end: @@ -431,57 +427,58 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr 
noali ; SCALAR_TAIL_FOLDING-NEXT: entry: ; SCALAR_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 ; SCALAR_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() -; SCALAR_TAIL_FOLDING-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP0]], 64 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 1024, [[TMP1]] ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; SCALAR_TAIL_FOLDING: vector.ph: -; SCALAR_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = call i32 @llvm.vscale.i32() -; SCALAR_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = shl nuw i32 [[TMP1]], 4 -; SCALAR_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP2]] -; SCALAR_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]] +; SCALAR_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub i32 1024, [[N_MOD_VF]] ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; SCALAR_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = call @llvm.stepvector.nxv16i32() -; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP2]], i64 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call @llvm.stepvector.nxv16i32() +; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[TMP3]], i64 0 ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer ; SCALAR_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALAR_TAIL_FOLDING: vector.body: ; SCALAR_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ 
[[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[INDEX]], 2 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP6]] -; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave4.nxv64i1( [[TMP4]], [[TMP4]], [[TMP4]], [[TMP4]]) +; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = shl i32 [[INDEX]], 2 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[P]], i32 [[TMP6]] +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave4.nxv64i1( [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]]) ; SCALAR_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv64i8.p0(ptr align 1 [[TMP7]], [[INTERLEAVED_MASK]], poison) ; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { , , , } @llvm.vector.deinterleave4.nxv64i8( [[WIDE_MASKED_VEC]]) ; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 0 ; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 1 ; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 2 ; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 3 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = call @llvm.smax.nxv16i8( [[TMP8]], [[TMP9]]) -; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = sub zeroinitializer, [[TMP12]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = call @llvm.smax.nxv16i8( [[TMP10]], [[TMP11]]) -; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] 
= sub zeroinitializer, [[TMP14]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = sext i32 [[TMP5]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP16]] -; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave4.nxv64i8( [[TMP12]], [[TMP13]], [[TMP14]], [[TMP15]]) -; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK3:%.*]] = call @llvm.vector.interleave4.nxv64i1( [[TMP4]], [[TMP4]], [[TMP4]], [[TMP4]]) -; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv64i8.p0( [[INTERLEAVED_VEC]], ptr align 1 [[TMP17]], [[INTERLEAVED_MASK3]]) -; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP2]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = icmp slt [[TMP8]], [[TMP9]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = select [[TMP12]], [[TMP9]], [[TMP8]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = sub zeroinitializer, [[TMP13]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = icmp slt [[TMP10]], [[TMP11]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = select [[TMP15]], [[TMP11]], [[TMP10]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = sub zeroinitializer, [[TMP16]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[Q]], i32 [[TMP6]] +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave4.nxv64i8( [[TMP13]], [[TMP14]], [[TMP16]], [[TMP17]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK3:%.*]] = call @llvm.vector.interleave4.nxv64i1( [[TMP5]], [[TMP5]], [[TMP5]], [[TMP5]]) +; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv64i8.p0( [[INTERLEAVED_VEC]], ptr align 1 [[TMP18]], [[INTERLEAVED_MASK3]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP3]] ; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[BROADCAST_SPLAT2]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], 
!llvm.loop [[LOOP8:![0-9]+]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; SCALAR_TAIL_FOLDING: middle.block: -; SCALAR_TAIL_FOLDING-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0 +; SCALAR_TAIL_FOLDING-NEXT: [[CMP_N:%.*]] = icmp eq i32 1024, [[N_VEC]] ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; SCALAR_TAIL_FOLDING: scalar.ph: ; ; PREDICATED_TAIL_FOLDING-LABEL: define void @masked_strided_factor4 ; PREDICATED_TAIL_FOLDING-SAME: (ptr noalias readonly captures(none) [[P:%.*]], ptr noalias captures(none) [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) #[[ATTR0]] { ; PREDICATED_TAIL_FOLDING-NEXT: entry: +; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 ; PREDICATED_TAIL_FOLDING-NEXT: br label [[VECTOR_PH:%.*]] ; PREDICATED_TAIL_FOLDING: vector.ph: -; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl nuw i32 [[TMP0]], 4 ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024) @@ -498,29 +495,30 @@ define void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noali ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP3]], zeroinitializer ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[INDEX]], 2 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP6]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[P]], i32 [[TMP5]] ; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = 
call @llvm.vector.interleave4.nxv64i1( [[TMP4]], [[TMP4]], [[TMP4]], [[TMP4]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv64i8.p0(ptr align 1 [[TMP7]], [[INTERLEAVED_MASK]], poison) +; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv64i8.p0(ptr align 1 [[TMP6]], [[INTERLEAVED_MASK]], poison) ; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { , , , } @llvm.vector.deinterleave4.nxv64i8( [[WIDE_MASKED_VEC]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 0 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 1 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 2 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 3 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = call @llvm.smax.nxv16i8( [[TMP8]], [[TMP9]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 1 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 2 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 3 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = icmp slt [[TMP7]], [[TMP8]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = select [[TMP11]], [[TMP8]], [[TMP7]] ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = sub zeroinitializer, [[TMP12]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = call @llvm.smax.nxv16i8( [[TMP10]], [[TMP11]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = sub zeroinitializer, [[TMP14]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = sext i32 [[TMP5]] to i64 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP16]] -; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave4.nxv64i8( 
[[TMP12]], [[TMP13]], [[TMP14]], [[TMP15]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = icmp slt [[TMP9]], [[TMP10]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = select [[TMP14]], [[TMP10]], [[TMP9]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = sub zeroinitializer, [[TMP15]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[Q]], i32 [[TMP5]] +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave4.nxv64i8( [[TMP12]], [[TMP13]], [[TMP15]], [[TMP16]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK3:%.*]] = call @llvm.vector.interleave4.nxv64i1( [[TMP4]], [[TMP4]], [[TMP4]], [[TMP4]]) ; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv64i8.p0( [[INTERLEAVED_VEC]], ptr align 1 [[TMP17]], [[INTERLEAVED_MASK3]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]] ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX_NEXT]], i32 1024) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = xor i1 [[TMP18]], true ; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add nuw nsw [[VEC_IND]], [[BROADCAST_SPLAT2]] -; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP18]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP5:![0-9]+]] +; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; PREDICATED_TAIL_FOLDING: middle.block: ; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_END:%.*]] ; PREDICATED_TAIL_FOLDING: for.end: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-large-strides.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-large-strides.ll index 19f01c827ffb1..a7c164fa7de16 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-large-strides.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-large-strides.ll @@ 
-1,4 +1,4 @@ -; RUN: opt -mtriple aarch64-linux-gnu -mattr=+sve -passes=loop-vectorize,instcombine -S \ +; RUN: opt -mtriple aarch64-linux-gnu -mattr=+sve -passes=loop-vectorize -S \ ; RUN: -tail-folding-policy=dont-fold-tail <%s | FileCheck %s define void @stride7_i32(ptr noalias nocapture %dst, i64 %n) #0 { @@ -6,7 +6,7 @@ define void @stride7_i32(ptr noalias nocapture %dst, i64 %n) #0 { ; CHECK: vector.body ; CHECK: %[[VEC_IND:.*]] = phi [ %{{.*}}, %vector.ph ], [ %{{.*}}, %vector.body ] ; CHECK-NEXT: %[[PTR_INDICES:.*]] = mul nuw nsw %[[VEC_IND]], splat (i64 7) -; CHECK-NEXT: %[[PTRS:.*]] = getelementptr inbounds [4 x i8], ptr %dst, %[[PTR_INDICES]] +; CHECK-NEXT: %[[PTRS:.*]] = getelementptr inbounds i32, ptr %dst, %[[PTR_INDICES]] ; CHECK-NEXT: %[[GLOAD:.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( align 4 %[[PTRS]] ; CHECK-NEXT: %[[VALS:.*]] = add nsw %[[GLOAD]], ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( %[[VALS]], align 4 %[[PTRS]] @@ -33,7 +33,7 @@ define void @stride7_f64(ptr noalias nocapture %dst, i64 %n) #0 { ; CHECK: vector.body ; CHECK: %[[VEC_IND:.*]] = phi [ %{{.*}}, %vector.ph ], [ %{{.*}}, %vector.body ] ; CHECK-NEXT: %[[PTR_INDICES:.*]] = mul nuw nsw %[[VEC_IND]], splat (i64 7) -; CHECK-NEXT: %[[PTRS:.*]] = getelementptr inbounds [8 x i8], ptr %dst, %[[PTR_INDICES]] +; CHECK-NEXT: %[[PTRS:.*]] = getelementptr inbounds double, ptr %dst, %[[PTR_INDICES]] ; CHECK-NEXT: %[[GLOAD:.*]] = call @llvm.masked.gather.nxv2f64.nxv2p0( align 8 %[[PTRS]], ; CHECK-NEXT: %[[VALS:.*]] = fadd %[[GLOAD]], ; CHECK-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0( %[[VALS]], align 8 %[[PTRS]], @@ -60,7 +60,7 @@ define void @cond_stride7_f64(ptr noalias nocapture %dst, ptr noalias nocapture ; CHECK-LABEL: @cond_stride7_f64( ; CHECK: vector.body ; CHECK: %[[MASK:.*]] = icmp ne -; CHECK: %[[PTRS:.*]] = getelementptr inbounds [8 x i8], ptr %dst, %{{.*}} +; CHECK: %[[PTRS:.*]] = getelementptr inbounds double, ptr %dst, %{{.*}} ; CHECK-NEXT: 
%[[GLOAD:.*]] = call @llvm.masked.gather.nxv2f64.nxv2p0( align 8 %[[PTRS]], %[[MASK]] ; CHECK-NEXT: %[[VALS:.*]] = fadd %[[GLOAD]], ; CHECK-NEXT: call void @llvm.masked.scatter.nxv2f64.nxv2p0( %[[VALS]], align 8 %[[PTRS]], %[[MASK]]) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-masked-loadstore.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-masked-loadstore.ll index 22a4aa05187ec..826617da6f903 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-masked-loadstore.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-masked-loadstore.ll @@ -1,4 +1,4 @@ -; RUN: opt -passes=loop-vectorize,instcombine -mtriple aarch64-linux-gnu -mattr=+sve \ +; RUN: opt -passes=loop-vectorize -mtriple aarch64-linux-gnu -mattr=+sve \ ; RUN: -tail-folding-policy=dont-fold-tail -S %s -o - | FileCheck %s define void @mloadstore_f32(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, i64 %n) { @@ -6,7 +6,7 @@ define void @mloadstore_f32(ptr noalias nocapture %a, ptr noalias nocapture read ; CHECK: vector.body: ; CHECK: %[[LOAD1:.*]] = load , ptr ; CHECK-NEXT: %[[MASK:.*]] = fcmp ogt %[[LOAD1]], -; CHECK-NEXT: %[[GEPA:.*]] = getelementptr [4 x i8], ptr %a, +; CHECK-NEXT: %[[GEPA:.*]] = getelementptr float, ptr %a, ; CHECK-NEXT: %[[LOAD2:.*]] = call @llvm.masked.load.nxv4f32.p0(ptr align 4 %[[GEPA]], %[[MASK]] ; CHECK-NEXT: %[[FADD:.*]] = fadd %[[LOAD1]], %[[LOAD2]] ; CHECK-NEXT: call void @llvm.masked.store.nxv4f32.p0( %[[FADD]], ptr align 4 %[[GEPA]], %[[MASK]]) @@ -41,7 +41,7 @@ define void @mloadstore_i32(ptr noalias nocapture %a, ptr noalias nocapture read ; CHECK: vector.body: ; CHECK: %[[LOAD1:.*]] = load , ptr ; CHECK-NEXT: %[[MASK:.*]] = icmp ne %[[LOAD1]], -; CHECK-NEXT: %[[GEPA:.*]] = getelementptr [4 x i8], ptr %a, +; CHECK-NEXT: %[[GEPA:.*]] = getelementptr i32, ptr %a, ; CHECK-NEXT: %[[LOAD2:.*]] = call @llvm.masked.load.nxv4i32.p0(ptr align 4 %[[GEPA]], %[[MASK]] ; CHECK-NEXT: %[[FADD:.*]] = add %[[LOAD1]], %[[LOAD2]] ; CHECK-NEXT: call 
void @llvm.masked.store.nxv4i32.p0( %[[FADD]], ptr align 4 %[[GEPA]], %[[MASK]]) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll index b9462608f109f..9c206bdd7e24f 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-overflow-checks.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^scalar.ph:" -; RUN: opt -passes='loop-vectorize,instcombine' -sve-tail-folding-insn-threshold=0 -sve-tail-folding=all -S < %s | FileCheck %s +; RUN: opt -passes='loop-vectorize' -sve-tail-folding-insn-threshold=0 -sve-tail-folding=all -S < %s | FileCheck %s target triple = "aarch64" @@ -16,21 +16,22 @@ define void @cannot_overflow_i32_induction_var(ptr noalias %dst, ptr readonly %s ; CHECK-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 2 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [4 x i8], ptr [[SRC:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP0]], [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP1:%.*]] = add nsw 
[[WIDE_MASKED_LOAD]], splat (i32 42) -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [4 x i8], ptr [[DST:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[INDEX]] ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP1]], ptr align 4 [[TMP2]], [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]]) ; CHECK-NEXT: [[TMP5:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP6:%.*]] = xor i1 [[TMP5]], true +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ; CHECK: for.cond.cleanup.loopexit: @@ -73,21 +74,22 @@ define void @can_overflow_i64_induction_var(ptr noalias %dst, ptr readonly %src, ; CHECK-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [4 x i8], ptr [[SRC:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr align 
4 [[TMP3]], [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP4:%.*]] = add nsw [[WIDE_MASKED_LOAD]], splat (i32 42) -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [4 x i8], ptr [[DST:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[INDEX]] ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP4]], ptr align 4 [[TMP5]], [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) ; CHECK-NEXT: [[TMP8:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 -; CHECK-NEXT: br i1 [[TMP8]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: [[TMP6:%.*]] = xor i1 [[TMP8]], true +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ; CHECK: for.cond.cleanup.loopexit: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll index 783149e642275..7c30ccc30de6e 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll @@ -5,7 +5,7 @@ ; for (int i = N-1; i >= 0; --i) ; a[i] = b[i] + 1.0; -; RUN: opt -passes=loop-vectorize,instcombine -mtriple aarch64-linux-gnu -S \ +; RUN: opt -passes=loop-vectorize -mtriple aarch64-linux-gnu -S \ ; RUN: -tail-folding-policy=dont-fold-tail < %s | FileCheck %s define void @vector_reverse_f64(i64 %N, ptr noalias %a, ptr noalias %b) #0{ @@ -21,33 +21,39 @@ define void @vector_reverse_f64(i64 %N, ptr noalias %a, ptr noalias %b) #0{ ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 3 -; CHECK-NEXT: [[TMP6:%.*]] = shl nuw 
i64 [[TMP4]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 1 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP6]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP10:%.*]] = sub i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP7:%.*]] = xor i64 [[INDEX]], -1 -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[N]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [8 x i8], ptr [[B:%.*]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = sub i64 1, [[TMP5]] -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [8 x i8], ptr [[TMP9]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP11:%.*]] = sub i64 [[N]], [[INDEX]] +; CHECK-NEXT: [[TMP7:%.*]] = add nsw i64 [[TMP11]], -1 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = sub nuw nsw i64 [[TMP5]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = sub i64 0, [[TMP9]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, ptr [[TMP8]], i64 [[TMP12]] ; CHECK-NEXT: [[TMP22:%.*]] = sub i64 [[TMP12]], [[TMP5]] -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [8 x i8], ptr [[TMP9]], i64 [[TMP22]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, ptr [[TMP8]], i64 [[TMP22]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP14]], align 8 ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP15]], align 8 -; CHECK-NEXT: [[TMP16:%.*]] = fadd [[WIDE_LOAD]], splat (double 1.000000e+00) -; CHECK-NEXT: [[TMP17:%.*]] = fadd [[WIDE_LOAD1]], splat (double 1.000000e+00) -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [8 x i8], ptr [[A:%.*]], i64 [[TMP8]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [8 x i8], ptr [[TMP18]], i64 [[TMP12]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [8 x i8], ptr 
[[TMP18]], i64 [[TMP22]] +; CHECK-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv8f64( [[WIDE_LOAD]]) +; CHECK-NEXT: [[REVERSE2:%.*]] = call @llvm.vector.reverse.nxv8f64( [[WIDE_LOAD1]]) +; CHECK-NEXT: [[TMP18:%.*]] = fadd [[REVERSE]], splat (double 1.000000e+00) +; CHECK-NEXT: [[TMP19:%.*]] = fadd [[REVERSE2]], splat (double 1.000000e+00) +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, ptr [[TMP21]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds double, ptr [[TMP21]], i64 [[TMP22]] +; CHECK-NEXT: [[TMP16:%.*]] = call @llvm.vector.reverse.nxv8f64( [[TMP18]]) +; CHECK-NEXT: [[TMP17:%.*]] = call @llvm.vector.reverse.nxv8f64( [[TMP19]]) ; CHECK-NEXT: store [[TMP16]], ptr [[TMP20]], align 8 ; CHECK-NEXT: store [[TMP17]], ptr [[TMP24]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] ; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; @@ -85,40 +91,47 @@ define void @vector_reverse_i64(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 7 +; CHECK-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP2]], 8 +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP5]], 16 ; CHECK-NEXT: [[TMP4:%.*]] = sub i64 [[B1]], [[A2]] ; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP4]], [[TMP3]] ; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; 
CHECK: vector.ph: ; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP8:%.*]] = shl nuw i64 [[TMP7]], 3 -; CHECK-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP7]], 4 +; CHECK-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP8]], 1 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP9]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[TMP14:%.*]] = sub i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP10:%.*]] = xor i64 [[INDEX]], -1 -; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[N]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [8 x i8], ptr [[B]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP15:%.*]] = sub i64 1, [[TMP8]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [8 x i8], ptr [[TMP12]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP10:%.*]] = sub i64 [[N]], [[INDEX]] +; CHECK-NEXT: [[TMP11:%.*]] = add nsw i64 [[TMP10]], -1 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = sub nuw nsw i64 [[TMP8]], 1 +; CHECK-NEXT: [[TMP15:%.*]] = sub i64 0, [[TMP13]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i64 [[TMP15]] ; CHECK-NEXT: [[TMP25:%.*]] = sub i64 [[TMP15]], [[TMP8]] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [8 x i8], ptr [[TMP12]], i64 [[TMP25]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i64 [[TMP25]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP17]], align 8 ; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP18]], align 8 -; CHECK-NEXT: [[TMP19:%.*]] = add [[WIDE_LOAD]], splat (i64 1) -; CHECK-NEXT: [[TMP20:%.*]] = add [[WIDE_LOAD3]], splat (i64 1) -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [8 x i8], ptr [[A]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP23:%.*]] = 
getelementptr inbounds [8 x i8], ptr [[TMP21]], i64 [[TMP15]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds [8 x i8], ptr [[TMP21]], i64 [[TMP25]] +; CHECK-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv8i64( [[WIDE_LOAD]]) +; CHECK-NEXT: [[REVERSE4:%.*]] = call @llvm.vector.reverse.nxv8i64( [[WIDE_LOAD3]]) +; CHECK-NEXT: [[TMP21:%.*]] = add [[REVERSE]], splat (i64 1) +; CHECK-NEXT: [[TMP22:%.*]] = add [[REVERSE4]], splat (i64 1) +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP11]] +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i64, ptr [[TMP24]], i64 [[TMP15]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[TMP24]], i64 [[TMP25]] +; CHECK-NEXT: [[TMP19:%.*]] = call @llvm.vector.reverse.nxv8i64( [[TMP21]]) +; CHECK-NEXT: [[TMP20:%.*]] = call @llvm.vector.reverse.nxv8i64( [[TMP22]]) ; CHECK-NEXT: store [[TMP19]], ptr [[TMP23]], align 8 ; CHECK-NEXT: store [[TMP20]], ptr [[TMP27]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] ; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vfabi.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vfabi.ll index fd139fb01400f..2dcd125fe6c7a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vfabi.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vfabi.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 -; RUN: opt < %s -passes=loop-vectorize,instcombine -force-vector-interleave=1 -tail-folding-policy=must-fold-tail -S | 
FileCheck %s +; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=1 -tail-folding-policy=must-fold-tail -S | FileCheck %s target triple = "aarch64-unknown-linux-gnu" @@ -10,22 +10,24 @@ define void @test_big_little_params(ptr readonly %a, ptr readonly %b, ptr noalia ; CHECK-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 1025) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ splat (i1 true), [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [4 x i8], ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[A]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP2]], [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr align 1 [[TMP3]], [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP4:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]], [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [4 x i8], ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]] ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[TMP4]], ptr align 4 [[TMP5]], [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; 
CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 1025) ; CHECK-NEXT: [[TMP6:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 -; CHECK-NEXT: br i1 [[TMP6]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = xor i1 [[TMP6]], true +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: exit: @@ -58,22 +60,24 @@ define void @test_little_big_params(ptr readonly %a, ptr readonly %b, ptr noalia ; CHECK-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1 +; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ splat (i1 true), [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [4 x i8], ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[A]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2f32.p0(ptr align 4 [[TMP2]], [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr [8 x i8], ptr [[B]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr double, ptr [[B]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr align 8 [[TMP3]], [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: 
[[TMP4:%.*]] = call @bar_vector( [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD1]], [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [8 x i8], ptr [[C]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[C]], i64 [[INDEX]] ; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP4]], ptr align 8 [[TMP5]], [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025) ; CHECK-NEXT: [[TMP6:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 -; CHECK-NEXT: br i1 [[TMP6]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = xor i1 [[TMP6]], true +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]] ; CHECK: for.cond.cleanup: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll index a7bfe25ee56de..0c53f0ed96066 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -mtriple aarch64-linux-gnu -mattr=+sve -passes=loop-vectorize,instcombine -S \ +; RUN: opt -mtriple aarch64-linux-gnu -mattr=+sve -passes=loop-vectorize -S \ ; RUN: -tail-folding-policy=dont-fold-tail < %s | FileCheck %s ; Ensure that we can vectorize loops such as: @@ -23,21 +23,22 @@ define void @widen_ptr_phi_unrolled(ptr noalias nocapture %a, ptr noalias nocapt ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 
[[TMP4]], 2 -; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3 -; CHECK-NEXT: [[DOTNOT:%.*]] = sub nsw i64 0, [[TMP5]] -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNOT]] +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP4]], 2 +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP5]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP26:%.*]] = shl i64 [[N_VEC]], 3 ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[TMP26]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP4]], 5 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[C]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[TMP3]], 0 +; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP16]], 8 +; CHECK-NEXT: [[TMP27:%.*]] = add i64 [[OFFSET_IDX]], [[TMP21]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[C]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[TMP7]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[NEXT_GEP]], align 4 +; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[C]], i64 [[TMP27]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) ; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 ; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 @@ -47,14 +48,14 @@ define void @widen_ptr_phi_unrolled(ptr noalias nocapture %a, ptr noalias nocapt ; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 1 ; CHECK-NEXT: [[TMP13:%.*]] = add nsw [[TMP9]], splat (i32 1) ; CHECK-NEXT: [[TMP14:%.*]] = add nsw [[TMP11]], splat (i32 1) -; CHECK-NEXT: [[TMP15:%.*]] = 
getelementptr inbounds [4 x i8], ptr [[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [4 x i8], ptr [[TMP15]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 [[TMP3]] ; CHECK-NEXT: store [[TMP13]], ptr [[TMP15]], align 4 ; CHECK-NEXT: store [[TMP14]], ptr [[TMP17]], align 4 ; CHECK-NEXT: [[TMP18:%.*]] = add nsw [[TMP10]], splat (i32 1) ; CHECK-NEXT: [[TMP19:%.*]] = add nsw [[TMP12]], splat (i32 1) -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8], ptr [[B:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds nuw [4 x i8], ptr [[TMP20]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP20]], i64 [[TMP3]] ; CHECK-NEXT: store [[TMP18]], ptr [[TMP20]], align 4 ; CHECK-NEXT: store [[TMP19]], ptr [[TMP22]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] @@ -70,15 +71,15 @@ define void @widen_ptr_phi_unrolled(ptr noalias nocapture %a, ptr noalias nocapt ; CHECK: for.body: ; CHECK-NEXT: [[PTR_014:%.*]] = phi ptr [ [[INCDEC_PTR1:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[I_013:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR_014]], i64 4 +; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[PTR_014]], i64 1 ; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[PTR_014]], align 4 -; CHECK-NEXT: [[INCDEC_PTR1]] = getelementptr inbounds nuw i8, ptr [[PTR_014]], i64 8 +; CHECK-NEXT: [[INCDEC_PTR1]] = getelementptr inbounds i32, ptr [[PTR_014]], i64 2 ; CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP24]], 1 -; CHECK-NEXT: 
[[ARRAYIDX:%.*]] = getelementptr inbounds [4 x i8], ptr [[A]], i64 [[I_013]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I_013]] ; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[ADD2:%.*]] = add nsw i32 [[TMP25]], 1 -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [4 x i8], ptr [[B]], i64 [[I_013]] +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[I_013]] ; CHECK-NEXT: store i32 [[ADD2]], ptr [[ARRAYIDX3]], align 4 ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_013]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] @@ -131,10 +132,10 @@ define void @widen_2ptrs_phi_unrolled(ptr noalias nocapture %dst, ptr noalias no ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP5]], 2 -; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 3 -; CHECK-NEXT: [[DOTNOT:%.*]] = sub nsw i64 0, [[TMP6]] -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNOT]] +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP5]], 2 +; CHECK-NEXT: [[TMP6:%.*]] = shl nuw i64 [[TMP4]], 1 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP6]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[N_VEC]], 2 ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[SRC:%.*]], i64 [[TMP3]] ; CHECK-NEXT: [[IND_END2:%.*]] = getelementptr i8, ptr [[DST:%.*]], i64 [[TMP3]] @@ -144,12 +145,12 @@ define void @widen_2ptrs_phi_unrolled(ptr noalias nocapture %dst, ptr noalias no ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 2 ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr [4 x i8], ptr [[NEXT_GEP]], i64 [[TMP7]] +; CHECK-NEXT: 
[[TMP8:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i64 [[TMP4]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[NEXT_GEP]], align 4 ; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP8]], align 4 ; CHECK-NEXT: [[TMP9:%.*]] = shl nsw [[WIDE_LOAD]], splat (i32 1) ; CHECK-NEXT: [[TMP10:%.*]] = shl nsw [[WIDE_LOAD6]], splat (i32 1) -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr [4 x i8], ptr [[NEXT_GEP5]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[NEXT_GEP5]], i64 [[TMP4]] ; CHECK-NEXT: store [[TMP9]], ptr [[NEXT_GEP5]], align 4 ; CHECK-NEXT: store [[TMP10]], ptr [[TMP12]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] @@ -170,8 +171,8 @@ define void @widen_2ptrs_phi_unrolled(ptr noalias nocapture %dst, ptr noalias no ; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[S_010]], align 4 ; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[TMP14]], 1 ; CHECK-NEXT: store i32 [[MUL]], ptr [[D_09]], align 4 -; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds nuw i8, ptr [[D_09]], i64 4 -; CHECK-NEXT: [[INCDEC_PTR1]] = getelementptr inbounds nuw i8, ptr [[S_010]], i64 4 +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i32, ptr [[D_09]], i64 1 +; CHECK-NEXT: [[INCDEC_PTR1]] = getelementptr inbounds i32, ptr [[S_010]], i64 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_011]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]] ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -212,13 +213,13 @@ define i32 @pointer_iv_mixed(ptr noalias %a, ptr noalias %b, i64 %n) #0 { ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp samesign ult i64 [[SMAX]], [[TMP1]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label 
[[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = sub nsw i64 0, [[TMP6]] -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[SMAX]], [[DOTNOT]] +; CHECK-NEXT: [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 1 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[SMAX]], [[TMP6]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[SMAX]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[N_VEC]], 2 ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[N_VEC]], 3 @@ -238,7 +239,7 @@ define i32 @pointer_iv_mixed(ptr noalias %a, ptr noalias %b, i64 %n) #0 { ; CHECK-NEXT: [[TMP12]] = add [[WIDE_LOAD]], [[VEC_PHI]] ; CHECK-NEXT: store [[VECTOR_GEP]], ptr [[NEXT_GEP]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] -; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP5]], 3 +; CHECK-NEXT: [[TMP15:%.*]] = shl i64 [[TMP6]], 2 ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP15]] ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] @@ -260,8 +261,8 @@ define i32 @pointer_iv_mixed(ptr noalias %a, ptr noalias %b, i64 %n) #0 { ; CHECK-NEXT: [[VAR1:%.*]] = load i32, ptr [[P]], align 8 ; CHECK-NEXT: [[VAR2]] = add i32 [[VAR1]], [[VAR0]] ; CHECK-NEXT: store ptr [[P]], ptr [[Q]], align 8 -; CHECK-NEXT: [[VAR3]] = getelementptr inbounds nuw i8, ptr [[P]], i64 4 -; CHECK-NEXT: [[VAR4]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 8 +; CHECK-NEXT: [[VAR3]] = getelementptr inbounds i32, ptr [[P]], i32 1 +; CHECK-NEXT: [[VAR4]] = getelementptr inbounds ptr, ptr [[Q]], i32 1 ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]] ; CHECK-NEXT: br i1 [[COND]], label 
[[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP8:![0-9]+]] @@ -297,11 +298,12 @@ define void @phi_used_in_vector_compare_and_scalar_indvar_update_and_store(ptr % ; CHECK-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 2048 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[PTR:%.*]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[PTR]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.stepvector.nxv2i64() ; CHECK-NEXT: [[TMP5:%.*]] = shl [[TMP4]], splat (i64 1) ; CHECK-NEXT: [[VECTOR_GEP:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], [[TMP5]] @@ -309,7 +311,7 @@ define void @phi_used_in_vector_compare_and_scalar_indvar_update_and_store(ptr % ; CHECK-NEXT: [[TMP6:%.*]] = icmp ne [[VECTOR_GEP]], splat (ptr null) ; CHECK-NEXT: call void @llvm.masked.store.nxv2i16.p0( zeroinitializer, ptr align 2 [[TMP7]], [[TMP6]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] -; CHECK-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = shl i64 [[TMP1]], 1 ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] @@ -318,12 +320,20 @@ define void @phi_used_in_vector_compare_and_scalar_indvar_update_and_store(ptr % ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[IF_END:%.*]] ; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[IF_END_SINK_SPLIT:%.*]], 
label [[IF_END1:%.*]] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[INC:%.*]], [[IF_END1:%.*]] ], [ 1024, [[SCALAR_PH]] ] +; CHECK-NEXT: [[IV_PTR:%.*]] = phi ptr [ [[INCDEC_IV_PTR:%.*]], [[IF_END1]] ], [ [[TMP2]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[CMP_I:%.*]] = icmp ne ptr [[IV_PTR]], null +; CHECK-NEXT: br i1 [[CMP_I]], label [[IF_END_SINK_SPLIT:%.*]], label [[IF_END1]] ; CHECK: if.end.sink.split: +; CHECK-NEXT: store i16 0, ptr [[IV_PTR]], align 2 ; CHECK-NEXT: br label [[IF_END1]] ; CHECK: if.end: -; CHECK-NEXT: br i1 poison, label [[IF_END]], label [[FOR_END]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: [[INCDEC_IV_PTR]] = getelementptr inbounds i16, ptr [[IV_PTR]], i64 1 +; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp ult i64 [[INC]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[IF_END]], label [[FOR_END]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: for.end: +; CHECK-NEXT: [[IV_PTR_1_LCSSA:%.*]] = phi ptr [ [[INCDEC_IV_PTR]], [[IF_END1]] ], [ [[TMP2]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-epilogue.ll index 5eae4aecb28c6..bf276123d58db 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-epilogue.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt-epilogue.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^for.body:" --version 3 -; RUN: opt < %s -mattr=+sve2 -passes=loop-vectorize,instcombine -enable-histogram-loop-vectorization -sve-gather-overhead=2 \ +; RUN: opt < %s -mattr=+sve2 -passes=loop-vectorize -enable-histogram-loop-vectorization -sve-gather-overhead=2 \ ; RUN: -sve-scatter-overhead=2 -epilogue-vectorization-minimum-VF=4 -force-vector-interleave=1 -S 2>&1 | FileCheck %s target triple = "aarch64-unknown-linux-gnu" @@ -10,12 +10,12 @@ define void 
@simple_histogram(ptr noalias %buckets, ptr readonly %indices, i64 % ; CHECK-NEXT: iter.check: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP0]], 2 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 1 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.main.loop.iter.check: -; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP0]], 2 ; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], [[TMP3]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH1:%.*]] ; CHECK: vector.ph: @@ -26,16 +26,16 @@ define void @simple_histogram(ptr noalias %buckets, ptr readonly %indices, i64 % ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [4 x i8], ptr [[INDICES]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP8]], align 4 ; CHECK-NEXT: [[TMP14:%.*]] = zext [[WIDE_LOAD1]] to -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8], ptr [[BUCKETS]], [[TMP14]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], [[TMP14]] ; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv4p0.i32( [[TMP15]], i32 1, splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]] ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: 
[[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], [[TMP5]] @@ -49,16 +49,16 @@ define void @simple_histogram(ptr noalias %buckets, ptr readonly %indices, i64 % ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: ; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds [4 x i8], ptr [[INDICES]], i64 [[INDEX4]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[INDEX4]] ; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP18]], align 4 ; CHECK-NEXT: [[TMP19:%.*]] = zext [[WIDE_LOAD5]] to -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [4 x i8], ptr [[BUCKETS]], [[TMP19]] +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], [[TMP19]] ; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv2p0.i32( [[TMP20]], i32 1, splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX4]], [[TMP25]] ; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[TMP21]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[CMP_N7:%.*]] = icmp eq i64 [[N_MOD_VF2]], 0 +; CHECK-NEXT: [[CMP_N7:%.*]] = icmp eq i64 [[N]], [[N_VEC3]] ; CHECK-NEXT: br i1 [[CMP_N7]], label [[FOR_EXIT]], label [[SCALAR_PH]] ; CHECK: vec.epilog.scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll index 611143e287c22..5a7a332e13950 100644 --- 
a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter-out-after "^scalar.ph:" --version 3 -; RUN: opt < %s -passes=loop-vectorize,instcombine -enable-histogram-loop-vectorization -sve-gather-overhead=2 -sve-scatter-overhead=2 -debug-only=loop-vectorize -S 2>&1 | FileCheck %s +; RUN: opt < %s -passes=loop-vectorize -enable-histogram-loop-vectorization -sve-gather-overhead=2 -sve-scatter-overhead=2 -debug-only=loop-vectorize -S 2>&1 | FileCheck %s ; REQUIRES: asserts target triple = "aarch64-unknown-linux-gnu" @@ -37,16 +37,16 @@ define void @simple_histogram(ptr noalias %buckets, ptr readonly %indices, i64 % ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2 -; CHECK-NEXT: [[DOTNOT:%.*]] = sub nsw i64 0, [[TMP5]] -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNOT]] +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP5]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [4 x i8], ptr [[INDICES]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 ; CHECK-NEXT: [[TMP9:%.*]] = zext [[WIDE_LOAD]] to -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [4 x i8], ptr [[BUCKETS]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], [[TMP9]] ; CHECK-NEXT: call void 
@llvm.experimental.vector.histogram.add.nxv4p0.i32( [[TMP10]], i32 1, splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -86,17 +86,20 @@ define void @simple_histogram_inc_param(ptr noalias %buckets, ptr readonly %indi ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2 -; CHECK-NEXT: [[DOTNOT:%.*]] = sub nsw i64 0, [[TMP5]] -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNOT]] +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP5]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[INCVAL]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [4 x i8], ptr [[INDICES]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 ; CHECK-NEXT: [[TMP9:%.*]] = zext [[WIDE_LOAD]] to -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [4 x i8], ptr [[BUCKETS]], [[TMP9]] -; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv4p0.i32( [[TMP10]], i32 [[INCVAL]], splat (i1 true)) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], [[TMP9]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement [[BROADCAST_SPLAT]], i64 0 +; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv4p0.i32( [[TMP6]], i32 [[TMP7]], splat (i1 true)) ; CHECK-NEXT: 
[[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -135,16 +138,16 @@ define void @simple_histogram_sub(ptr noalias %buckets, ptr readonly %indices, i ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2 -; CHECK-NEXT: [[DOTNOT:%.*]] = sub nsw i64 0, [[TMP5]] -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNOT]] +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP5]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [4 x i8], ptr [[INDICES]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 ; CHECK-NEXT: [[TMP9:%.*]] = sext [[WIDE_LOAD]] to -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [4 x i8], ptr [[BUCKETS]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], [[TMP9]] ; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv4p0.i32( [[TMP10]], i32 -1, splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -184,17 +187,17 @@ define void @conditional_histogram(ptr noalias %buckets, ptr readonly %indices, ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; 
CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2 -; CHECK-NEXT: [[DOTNOT:%.*]] = sub nsw i64 0, [[TMP5]] -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNOT]] +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP5]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [4 x i8], ptr [[INDICES]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 ; CHECK-NEXT: [[TMP9:%.*]] = zext [[WIDE_LOAD]] to -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [4 x i8], ptr [[BUCKETS]], [[TMP9]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [4 x i8], ptr [[CONDS]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[CONDS]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP12]], align 4 ; CHECK-NEXT: [[TMP13:%.*]] = icmp sgt [[WIDE_LOAD1]], splat (i32 5100) ; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv4p0.i32( [[TMP10]], i32 1, [[TMP13]]) @@ -241,24 +244,23 @@ define void @histogram_8bit(ptr noalias %buckets, ptr readonly %indices, i64 %N) ; CHECK-NEXT: iter.check: ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[TMP5]], 3 +; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 4 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP9]] ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP2]], 3 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: 
vector.main.loop.iter.check: -; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 4 ; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], [[TMP6]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4 -; CHECK-NEXT: [[DOTNOT:%.*]] = add nsw i64 [[TMP4]], -1 -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNOT]] +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = urem i64 [[N]], [[TMP4]] ; CHECK-NEXT: [[N_VEC1:%.*]] = sub i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x i8], ptr [[INDICES]], i64 [[IV]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[TMP8:%.*]] = zext [[WIDE_LOAD]] to ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[BUCKETS]], [[TMP8]] @@ -267,7 +269,7 @@ define void @histogram_8bit(ptr noalias %buckets, ptr readonly %indices, i64 %N) ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC1]] ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N1:%.*]] = icmp eq i64 [[N_VEC]], 0 +; CHECK-NEXT: [[CMP_N1:%.*]] = icmp eq i64 [[N]], [[N_VEC1]] ; CHECK-NEXT: br i1 [[CMP_N1]], label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; CHECK: vec.epilog.iter.check: ; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC]], [[TMP7]] @@ -275,13 +277,13 @@ define void @histogram_8bit(ptr noalias %buckets, ptr readonly %indices, i64 %N) ; CHECK: vec.epilog.ph: ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] 
= phi i64 [ [[N_VEC1]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ENTRY]] ] ; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 3 -; CHECK-NEXT: [[DOTNOT1:%.*]] = sub nsw i64 0, [[TMP12]] -; CHECK-NEXT: [[N_VEC3:%.*]] = and i64 [[N]], [[DOTNOT1]] +; CHECK-NEXT: [[TMP12:%.*]] = shl nuw i64 [[TMP11]], 3 +; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i64 [[N]], [[TMP12]] +; CHECK-NEXT: [[N_VEC3:%.*]] = sub i64 [[N]], [[N_MOD_VF2]] ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: ; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [4 x i8], ptr [[INDICES]], i64 [[INDEX4]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[INDEX4]] ; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP13]], align 4 ; CHECK-NEXT: [[TMP14:%.*]] = zext [[WIDE_LOAD5]] to ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[BUCKETS]], [[TMP14]] @@ -297,10 +299,10 @@ define void @histogram_8bit(ptr noalias %buckets, ptr readonly %indices, i64 %N) ; CHECK-NEXT: br label [[FOR_BODY1:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ] -; CHECK-NEXT: [[GEP_INDICES:%.*]] = getelementptr inbounds [4 x i8], ptr [[INDICES]], i64 [[IV1]] +; CHECK-NEXT: [[GEP_INDICES:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV1]] ; CHECK-NEXT: [[L_IDX:%.*]] = load i32, ptr [[GEP_INDICES]], align 4 ; CHECK-NEXT: [[IDXPROM1:%.*]] = zext i32 [[L_IDX]] to i64 -; CHECK-NEXT: [[GEP_BUCKET:%.*]] = getelementptr inbounds nuw i8, ptr [[BUCKETS]], i64 [[IDXPROM1]] +; CHECK-NEXT: [[GEP_BUCKET:%.*]] = getelementptr inbounds i8, ptr [[BUCKETS]], i64 [[IDXPROM1]] ; CHECK-NEXT: [[L_BUCKET:%.*]] = load i8, ptr [[GEP_BUCKET]], align 4 ; CHECK-NEXT: [[INC:%.*]] = add nsw 
i8 [[L_BUCKET]], 1 ; CHECK-NEXT: store i8 [[INC]], ptr [[GEP_BUCKET]], align 4 @@ -338,10 +340,10 @@ define void @histogram_float(ptr noalias %buckets, ptr readonly %indices, i64 %N ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4 x i8], ptr [[INDICES]], i64 [[IV]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV]] ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[IDXPROM1:%.*]] = zext i32 [[TMP0]] to i64 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4 x i8], ptr [[BUCKETS]], i64 [[IDXPROM1]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[BUCKETS]], i64 [[IDXPROM1]] ; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[INC:%.*]] = fadd fast float [[TMP1]], 1.000000e+00 ; CHECK-NEXT: store float [[INC]], ptr [[ARRAYIDX2]], align 4 @@ -379,12 +381,12 @@ define void @histogram_varying_increment(ptr noalias %buckets, ptr readonly %ind ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4 x i8], ptr [[INDICES]], i64 [[IV]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV]] ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[IDXPROM1:%.*]] = zext i32 [[TMP0]] to i64 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4 x i8], ptr [[BUCKETS]], i64 [[IDXPROM1]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], i64 [[IDXPROM1]] ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[INCIDX:%.*]] = getelementptr inbounds nuw [4 x i8], ptr [[INCVALS]], i64 [[IV]] +; CHECK-NEXT: 
[[INCIDX:%.*]] = getelementptr inbounds i32, ptr [[INCVALS]], i64 [[IV]] ; CHECK-NEXT: [[INCVAL:%.*]] = load i32, ptr [[INCIDX]], align 4 ; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP1]], [[INCVAL]] ; CHECK-NEXT: store i32 [[INC]], ptr [[ARRAYIDX2]], align 4 @@ -427,21 +429,21 @@ define void @simple_histogram_user_interleave(ptr noalias %buckets, ptr readonly ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3 -; CHECK-NEXT: [[DOTNOT:%.*]] = sub nsw i64 0, [[TMP5]] -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNOT]] +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP4]], 2 +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP5]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [4 x i8], ptr [[INDICES]], i64 [[INDEX]] -; CHECK-NEXT: [[DOTIDX:%.*]] = shl nuw nsw i64 [[TMP4]], 4 -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP8]], i64 [[DOTIDX]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i64 [[TMP3]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP8]], align 4 ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP17]], align 4 ; CHECK-NEXT: [[TMP9:%.*]] = zext [[WIDE_LOAD]] to ; CHECK-NEXT: [[TMP19:%.*]] = zext [[WIDE_LOAD1]] to -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [4 x i8], ptr [[BUCKETS]], [[TMP9]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [4 x i8], ptr [[BUCKETS]], [[TMP19]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], 
[[TMP9]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], [[TMP19]] ; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv4p0.i32( [[TMP10]], i32 1, splat (i1 true)) ; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv4p0.i32( [[TMP21]], i32 1, splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] @@ -486,16 +488,16 @@ define void @histogram_array_3op_gep(i64 noundef %N) #0 { ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 -; CHECK-NEXT: [[DOTNOT:%.*]] = sub nsw i64 0, [[TMP4]] -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNOT]] +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP4]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [4 x i8], ptr @idx_array, i64 [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [1048576 x i32], ptr @idx_array, i64 0, i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP5]], align 4 ; CHECK-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [4 x i8], ptr @data_array, [[TMP14]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [1048576 x i32], ptr @data_array, i64 0, [[TMP14]] ; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv4p0.i32( [[TMP11]], i32 1, splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -539,17 +541,16 @@ define void @histogram_array_4op_gep_nonzero_const_idx(i64 noundef %N, ptr reado ; 
CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 -; CHECK-NEXT: [[DOTNOT:%.*]] = sub nsw i64 0, [[TMP4]] -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNOT]] +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP4]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [4 x i8], ptr [[INDICES]], i64 [[IV]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP5]], align 4 ; CHECK-NEXT: [[TMP6:%.*]] = sext [[WIDE_LOAD]] to -; CHECK-NEXT: [[DOTSPLIT:%.*]] = getelementptr inbounds nuw i8, ptr [[DATA_STRUCT]], i64 8388608 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [4 x i8], ptr [[DOTSPLIT]], [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[SOMESTRUCT:%.*]], ptr [[DATA_STRUCT]], i32 1, i32 0, [[TMP6]] ; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv4p0.i32( [[TMP7]], i32 1, splat (i1 true)) ; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], [[TMP4]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]] @@ -587,21 +588,22 @@ define void @simple_histogram_tailfold(ptr noalias %buckets, ptr readonly %indic ; CHECK-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 2 +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw i64 [[TMP4]], 2 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: 
[[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [4 x i8], ptr [[INDICES]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr align 4 [[TMP8]], [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP9:%.*]] = zext [[WIDE_LOAD]] to -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [4 x i8], ptr [[BUCKETS]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], [[TMP9]] ; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv4p0.i32( [[TMP10]], i32 1, [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP5]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) ; CHECK-NEXT: [[TMP11:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 -; CHECK-NEXT: br i1 [[TMP11]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-NEXT: [[TMP6:%.*]] = xor i1 [[TMP11]], true +; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_EXIT:%.*]] ; CHECK: for.exit: @@ -633,38 +635,39 @@ define void @simple_histogram_rtdepcheck(ptr noalias %buckets, ptr %array, ptr % ; CHECK-LABEL: define void @simple_histogram_rtdepcheck( ; CHECK-SAME: ptr noalias [[BUCKETS:%.*]], ptr [[ARRAY:%.*]], ptr [[INDICES:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: +; CHECK-NEXT: [[INDICES2:%.*]] = ptrtoaddr ptr [[INDICES]] to i64 +; CHECK-NEXT: [[ARRAY1:%.*]] = ptrtoaddr ptr [[ARRAY]] to i64 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl 
nuw nsw i64 [[TMP0]], 2 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP1]], i64 8) ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP2]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: -; CHECK-NEXT: [[ARRAY1:%.*]] = ptrtoaddr ptr [[ARRAY]] to i64 -; CHECK-NEXT: [[INDICES2:%.*]] = ptrtoaddr ptr [[INDICES]] to i64 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP3]], 4 +; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP6]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = sub i64 [[ARRAY1]], [[INDICES2]] ; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP5]], [[TMP4]] ; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2 -; CHECK-NEXT: [[DOTNOT:%.*]] = sub nsw i64 0, [[TMP8]] -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNOT]] +; CHECK-NEXT: [[TMP8:%.*]] = shl nuw i64 [[TMP7]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP8]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.stepvector.nxv4i32() -; CHECK-NEXT: [[TMP11:%.*]] = trunc nuw nsw i64 [[TMP8]] to i32 +; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[TMP8]] to i32 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP11]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [4 x i8], ptr [[INDICES]], i64 [[INDEX]] +; 
CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 4 ; CHECK-NEXT: [[TMP13:%.*]] = zext [[WIDE_LOAD]] to -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [4 x i8], ptr [[BUCKETS]], [[TMP13]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], [[TMP13]] ; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv4p0.i32( [[TMP14]], i32 1, splat (i1 true)) -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [4 x i8], ptr [[ARRAY]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[ARRAY]], i64 [[INDEX]] ; CHECK-NEXT: store [[VEC_IND]], ptr [[TMP15]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] @@ -707,10 +710,10 @@ define void @simple_histogram_unsafe_alias(ptr %buckets, ptr %indices, i64 %N) # ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [4 x i8], ptr [[INDICES]], i64 [[IV]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV]] ; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[IDXPROM1:%.*]] = zext i32 [[TMP12]] to i64 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4 x i8], ptr [[BUCKETS]], i64 [[IDXPROM1]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[BUCKETS]], i64 [[IDXPROM1]] ; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP13]], 1 ; CHECK-NEXT: store i32 [[INC]], ptr [[ARRAYIDX2]], align 4 @@ -750,15 +753,15 @@ define void @simple_histogram_64b(ptr noalias %buckets, ptr readonly %indices, i ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; 
CHECK: vector.ph: ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 1 -; CHECK-NEXT: [[DOTNOT:%.*]] = sub nsw i64 0, [[TMP4]] -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNOT]] +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 1 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP4]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [8 x i8], ptr [[INDICES]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[INDICES]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP5]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [8 x i8], ptr [[BUCKETS]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[BUCKETS]], [[WIDE_LOAD]] ; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv2p0.i64( [[TMP6]], i64 1, splat (i1 true)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -796,10 +799,10 @@ define void @histogram_generates_vectors_crash(ptr %data_array, ptr noalias %ind ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[GEP_INDICES:%.*]] = getelementptr [4194304 x i8], ptr [[INDICES]], i64 [[IV]] +; CHECK-NEXT: [[GEP_INDICES:%.*]] = getelementptr [1048576 x i32], ptr [[INDICES]], i64 [[IV]] ; CHECK-NEXT: [[L_IDX:%.*]] = load i32, ptr [[GEP_INDICES]], align 4 ; CHECK-NEXT: [[IDXPROM5:%.*]] = sext i32 [[L_IDX]] to i64 -; CHECK-NEXT: [[GEP_BUCKET:%.*]] = getelementptr [4194304 x i8], ptr [[DATA_ARRAY]], i64 [[IDXPROM5]] +; CHECK-NEXT: [[GEP_BUCKET:%.*]] = getelementptr [1048576 x i32], ptr 
[[DATA_ARRAY]], i64 [[IDXPROM5]] ; CHECK-NEXT: [[L_BUCKET:%.*]] = load i32, ptr [[GEP_BUCKET]], align 4 ; CHECK-NEXT: [[INC:%.*]] = add i32 [[L_BUCKET]], 1 ; CHECK-NEXT: store i32 [[INC]], ptr [[GEP_BUCKET]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll b/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll index 89c5553d3bc03..19e5e8e8cd88f 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 -; RUN: opt < %s -passes=loop-vectorize,instcombine -force-vector-interleave=1 -tail-folding-policy=must-fold-tail -S | FileCheck %s -; RUN: opt < %s -passes=loop-vectorize,instcombine -force-vector-interleave=2 -tail-folding-policy=must-fold-tail -S | FileCheck %s --check-prefix=INTERLEAVE +; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=1 -tail-folding-policy=must-fold-tail -S | FileCheck %s +; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=2 -tail-folding-policy=must-fold-tail -S | FileCheck %s --check-prefix=INTERLEAVE target triple = "aarch64-unknown-linux-gnu" @@ -15,19 +15,23 @@ define void @test_uniform(ptr noalias %dst, ptr readonly %src, i64 %uniform , i6 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]]) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[UNIFORM]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: 
[[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [8 x i8], ptr [[SRC]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr align 8 [[TMP2]], [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP3:%.*]] = call @foo_uniform( [[WIDE_MASKED_LOAD]], i64 [[UNIFORM]], [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [8 x i8], ptr [[DST]], i64 [[INDEX]] -; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP3]], ptr align 8 [[TMP4]], [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP3:%.*]] = extractelement [[BROADCAST_SPLAT]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = call @foo_uniform( [[WIDE_MASKED_LOAD]], i64 [[TMP3]], [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP4]], ptr align 8 [[TMP5]], [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) -; CHECK-NEXT: [[TMP5:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = xor i1 [[TMP6]], true +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]] ; CHECK: for.cond.cleanup: @@ -40,30 +44,34 @@ define void @test_uniform(ptr noalias %dst, ptr readonly %src, i64 %uniform , i6 ; INTERLEAVE: vector.ph: ; INTERLEAVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; 
INTERLEAVE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1 -; INTERLEAVE-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP0]], 2 +; INTERLEAVE-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 1 ; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]]) ; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP1]], i64 [[N]]) +; INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[UNIFORM]], i64 0 +; INTERLEAVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] ; INTERLEAVE: vector.body: ; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY1]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT4:%.*]], [[VECTOR_BODY]] ] -; INTERLEAVE-NEXT: [[TMP3:%.*]] = getelementptr [8 x i8], ptr [[SRC]], i64 [[INDEX]] -; INTERLEAVE-NEXT: [[TMP4:%.*]] = getelementptr [8 x i8], ptr [[TMP3]], i64 [[TMP1]] +; INTERLEAVE-NEXT: [[TMP3:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]] +; INTERLEAVE-NEXT: [[TMP4:%.*]] = getelementptr double, ptr [[TMP3]], i64 [[TMP1]] ; INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr align 8 [[TMP3]], [[ACTIVE_LANE_MASK]], poison) ; INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr align 8 [[TMP4]], [[ACTIVE_LANE_MASK2]], poison) -; INTERLEAVE-NEXT: [[TMP5:%.*]] = call @foo_uniform( [[WIDE_MASKED_LOAD]], i64 [[UNIFORM]], [[ACTIVE_LANE_MASK]]) -; INTERLEAVE-NEXT: [[TMP6:%.*]] = call @foo_uniform( [[WIDE_MASKED_LOAD3]], i64 [[UNIFORM]], [[ACTIVE_LANE_MASK2]]) -; INTERLEAVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds [8 x 
i8], ptr [[DST]], i64 [[INDEX]] -; INTERLEAVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds [8 x i8], ptr [[TMP7]], i64 [[TMP1]] -; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP5]], ptr align 8 [[TMP7]], [[ACTIVE_LANE_MASK]]) -; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP6]], ptr align 8 [[TMP8]], [[ACTIVE_LANE_MASK2]]) +; INTERLEAVE-NEXT: [[TMP5:%.*]] = extractelement [[BROADCAST_SPLAT]], i64 0 +; INTERLEAVE-NEXT: [[TMP6:%.*]] = call @foo_uniform( [[WIDE_MASKED_LOAD]], i64 [[TMP5]], [[ACTIVE_LANE_MASK]]) +; INTERLEAVE-NEXT: [[TMP7:%.*]] = call @foo_uniform( [[WIDE_MASKED_LOAD3]], i64 [[TMP5]], [[ACTIVE_LANE_MASK2]]) +; INTERLEAVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]] +; INTERLEAVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds double, ptr [[TMP8]], i64 [[TMP1]] +; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP6]], ptr align 8 [[TMP8]], [[ACTIVE_LANE_MASK]]) +; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP7]], ptr align 8 [[TMP9]], [[ACTIVE_LANE_MASK2]]) ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]] -; INTERLEAVE-NEXT: [[TMP9:%.*]] = add i64 [[INDEX_NEXT]], [[TMP1]] +; INTERLEAVE-NEXT: [[TMP10:%.*]] = add i64 [[INDEX_NEXT]], [[TMP1]] ; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) -; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT4]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP9]], i64 [[N]]) -; INTERLEAVE-NEXT: [[TMP10:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 -; INTERLEAVE-NEXT: br i1 [[TMP10]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]] +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT4]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP10]], i64 [[N]]) +; INTERLEAVE-NEXT: [[TMP11:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; INTERLEAVE-NEXT: [[TMP12:%.*]] = xor i1 [[TMP11]], true +; INTERLEAVE-NEXT: 
br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; INTERLEAVE: middle.block: ; INTERLEAVE-NEXT: br label [[FOR_COND_CLEANUP:%.*]] ; INTERLEAVE: for.cond.cleanup: @@ -96,19 +104,23 @@ define void @test_uniform_smaller_scalar(ptr noalias %dst, ptr readonly %src, i3 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1 ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]]) +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[UNIFORM]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [8 x i8], ptr [[SRC]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]] ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr align 8 [[TMP2]], [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP3:%.*]] = call @bar_uniform( [[WIDE_MASKED_LOAD]], i32 [[UNIFORM]], [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [8 x i8], ptr [[DST]], i64 [[INDEX]] -; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP3]], ptr align 8 [[TMP4]], [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP3:%.*]] = extractelement [[BROADCAST_SPLAT]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = call @bar_uniform( [[WIDE_MASKED_LOAD]], i32 [[TMP3]], [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP4]], ptr align 8 [[TMP5]], 
[[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) -; CHECK-NEXT: [[TMP5:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = xor i1 [[TMP6]], true +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[FOR_COND_CLEANUP:%.*]] ; CHECK: for.cond.cleanup: @@ -121,30 +133,34 @@ define void @test_uniform_smaller_scalar(ptr noalias %dst, ptr readonly %src, i3 ; INTERLEAVE: vector.ph: ; INTERLEAVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; INTERLEAVE-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1 -; INTERLEAVE-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP0]], 2 +; INTERLEAVE-NEXT: [[TMP2:%.*]] = shl nuw i64 [[TMP1]], 1 ; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]]) ; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP1]], i64 [[N]]) +; INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[UNIFORM]], i64 0 +; INTERLEAVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] ; INTERLEAVE: vector.body: ; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY1]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT4:%.*]], 
[[VECTOR_BODY]] ] -; INTERLEAVE-NEXT: [[TMP3:%.*]] = getelementptr [8 x i8], ptr [[SRC]], i64 [[INDEX]] -; INTERLEAVE-NEXT: [[TMP4:%.*]] = getelementptr [8 x i8], ptr [[TMP3]], i64 [[TMP1]] +; INTERLEAVE-NEXT: [[TMP3:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]] +; INTERLEAVE-NEXT: [[TMP4:%.*]] = getelementptr double, ptr [[TMP3]], i64 [[TMP1]] ; INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr align 8 [[TMP3]], [[ACTIVE_LANE_MASK]], poison) ; INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr align 8 [[TMP4]], [[ACTIVE_LANE_MASK2]], poison) -; INTERLEAVE-NEXT: [[TMP5:%.*]] = call @bar_uniform( [[WIDE_MASKED_LOAD]], i32 [[UNIFORM]], [[ACTIVE_LANE_MASK]]) -; INTERLEAVE-NEXT: [[TMP6:%.*]] = call @bar_uniform( [[WIDE_MASKED_LOAD3]], i32 [[UNIFORM]], [[ACTIVE_LANE_MASK2]]) -; INTERLEAVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds [8 x i8], ptr [[DST]], i64 [[INDEX]] -; INTERLEAVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds [8 x i8], ptr [[TMP7]], i64 [[TMP1]] -; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP5]], ptr align 8 [[TMP7]], [[ACTIVE_LANE_MASK]]) -; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP6]], ptr align 8 [[TMP8]], [[ACTIVE_LANE_MASK2]]) +; INTERLEAVE-NEXT: [[TMP5:%.*]] = extractelement [[BROADCAST_SPLAT]], i64 0 +; INTERLEAVE-NEXT: [[TMP6:%.*]] = call @bar_uniform( [[WIDE_MASKED_LOAD]], i32 [[TMP5]], [[ACTIVE_LANE_MASK]]) +; INTERLEAVE-NEXT: [[TMP7:%.*]] = call @bar_uniform( [[WIDE_MASKED_LOAD3]], i32 [[TMP5]], [[ACTIVE_LANE_MASK2]]) +; INTERLEAVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]] +; INTERLEAVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds double, ptr [[TMP8]], i64 [[TMP1]] +; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP6]], ptr align 8 [[TMP8]], [[ACTIVE_LANE_MASK]]) +; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP7]], ptr align 8 [[TMP9]], [[ACTIVE_LANE_MASK2]]) ; 
INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP2]] -; INTERLEAVE-NEXT: [[TMP9:%.*]] = add i64 [[INDEX_NEXT]], [[TMP1]] +; INTERLEAVE-NEXT: [[TMP10:%.*]] = add i64 [[INDEX_NEXT]], [[TMP1]] ; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 [[N]]) -; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT4]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP9]], i64 [[N]]) -; INTERLEAVE-NEXT: [[TMP10:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 -; INTERLEAVE-NEXT: br i1 [[TMP10]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP3:![0-9]+]] +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT4]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP10]], i64 [[N]]) +; INTERLEAVE-NEXT: [[TMP11:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; INTERLEAVE-NEXT: [[TMP12:%.*]] = xor i1 [[TMP11]], true +; INTERLEAVE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; INTERLEAVE: middle.block: ; INTERLEAVE-NEXT: br label [[FOR_COND_CLEANUP:%.*]] ; INTERLEAVE: for.cond.cleanup: @@ -176,10 +192,10 @@ define void @test_uniform_not_invariant(ptr noalias %dst, ptr readonly %src, i64 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[GEPSRC:%.*]] = getelementptr [8 x i8], ptr [[SRC]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[GEPSRC:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDVARS_IV]] ; CHECK-NEXT: [[DATA:%.*]] = load double, ptr [[GEPSRC]], align 8 ; CHECK-NEXT: [[CALL:%.*]] = call double @foo(double [[DATA]], i64 [[INDVARS_IV]]) #[[ATTR4:[0-9]+]] -; CHECK-NEXT: [[GEPDST:%.*]] = getelementptr inbounds nuw [8 x i8], ptr [[DST]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[GEPDST:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDVARS_IV]] ; CHECK-NEXT: store double [[CALL]], ptr [[GEPDST]], align 8 ; 
CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] @@ -192,37 +208,38 @@ define void @test_uniform_not_invariant(ptr noalias %dst, ptr readonly %src, i64 ; INTERLEAVE-NEXT: entry: ; INTERLEAVE-NEXT: br label [[VECTOR_PH:%.*]] ; INTERLEAVE: vector.ph: -; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = icmp ne i64 [[N]], 0 -; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = icmp ugt i64 [[N]], 1 +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = icmp ult i64 0, [[N]] +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = icmp ult i64 1, [[N]] ; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] ; INTERLEAVE: vector.body: ; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ] -; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ true, [[PRED_STORE_CONTINUE4]] ] +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_STORE_CONTINUE4]] ] ; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi i1 [ [[ACTIVE_LANE_MASK_ENTRY1]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT5:%.*]], [[PRED_STORE_CONTINUE4]] ] ; INTERLEAVE-NEXT: br i1 [[ACTIVE_LANE_MASK]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; INTERLEAVE: pred.store.if: -; INTERLEAVE-NEXT: [[TMP0:%.*]] = getelementptr [8 x i8], ptr [[SRC]], i64 [[INDEX]] +; INTERLEAVE-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]] ; INTERLEAVE-NEXT: [[TMP1:%.*]] = load double, ptr [[TMP0]], align 8 ; INTERLEAVE-NEXT: [[TMP2:%.*]] = call double @foo(double [[TMP1]], i64 [[INDEX]]) #[[ATTR4:[0-9]+]] -; INTERLEAVE-NEXT: [[TMP3:%.*]] = getelementptr inbounds [8 x i8], ptr [[DST]], i64 [[INDEX]] +; INTERLEAVE-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]] ; INTERLEAVE-NEXT: store double 
[[TMP2]], ptr [[TMP3]], align 8 ; INTERLEAVE-NEXT: br label [[PRED_STORE_CONTINUE]] ; INTERLEAVE: pred.store.continue: ; INTERLEAVE-NEXT: br i1 [[ACTIVE_LANE_MASK2]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4]] ; INTERLEAVE: pred.store.if3: -; INTERLEAVE-NEXT: [[TMP4:%.*]] = or disjoint i64 [[INDEX]], 1 -; INTERLEAVE-NEXT: [[TMP5:%.*]] = getelementptr [8 x i8], ptr [[SRC]], i64 [[TMP4]] +; INTERLEAVE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 1 +; INTERLEAVE-NEXT: [[TMP5:%.*]] = getelementptr double, ptr [[SRC]], i64 [[TMP4]] ; INTERLEAVE-NEXT: [[TMP6:%.*]] = load double, ptr [[TMP5]], align 8 ; INTERLEAVE-NEXT: [[TMP7:%.*]] = call double @foo(double [[TMP6]], i64 [[TMP4]]) #[[ATTR4]] -; INTERLEAVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds [8 x i8], ptr [[DST]], i64 [[TMP4]] +; INTERLEAVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[TMP4]] ; INTERLEAVE-NEXT: store double [[TMP7]], ptr [[TMP8]], align 8 ; INTERLEAVE-NEXT: br label [[PRED_STORE_CONTINUE4]] ; INTERLEAVE: pred.store.continue4: ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 -; INTERLEAVE-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 3 -; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]] +; INTERLEAVE-NEXT: [[TMP9:%.*]] = add i64 [[INDEX_NEXT]], 1 +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = icmp ult i64 [[INDEX_NEXT]], [[N]] ; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT5]] = icmp ult i64 [[TMP9]], [[N]] -; INTERLEAVE-NEXT: br i1 [[ACTIVE_LANE_MASK_NEXT]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP4:![0-9]+]] +; INTERLEAVE-NEXT: [[TMP10:%.*]] = xor i1 [[ACTIVE_LANE_MASK_NEXT]], true +; INTERLEAVE-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; INTERLEAVE: middle.block: ; INTERLEAVE-NEXT: br label [[FOR_COND_CLEANUP:%.*]] ; INTERLEAVE: for.cond.cleanup: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll 
b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll index aa32ac25b609c..efedb2b7ca5df 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll @@ -11,7 +11,7 @@ ; The test checks if the mask is being correctly created, reverted and used -; RUN: opt -passes=loop-vectorize,instcombine -mtriple aarch64-linux-gnu -S \ +; RUN: opt -passes=loop-vectorize -mtriple aarch64-linux-gnu -S \ ; RUN: -tail-folding-policy=dont-fold-tail < %s | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" @@ -26,31 +26,36 @@ define void @vector_reverse_mask_v4i1(ptr noalias %a, ptr noalias %cond, i64 %N) ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], 9223372036854775800 -; CHECK-NEXT: [[IND_END:%.*]] = and i64 [[N]], 7 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = sub i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = xor i64 [[INDEX]], -1 -; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[N]], [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [8 x i8], ptr [[COND:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 -24 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 -56 +; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[N]], [[INDEX]] +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], -1 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, ptr [[COND:%.*]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[TMP7]], i64 -3 +; 
CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, ptr [[TMP7]], i64 -7 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP3]], align 8 ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x double>, ptr [[TMP4]], align 8 ; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x double> [[WIDE_LOAD]], <4 x double> poison, <4 x i32> ; CHECK-NEXT: [[REVERSE2:%.*]] = shufflevector <4 x double> [[WIDE_LOAD1]], <4 x double> poison, <4 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = fcmp une <4 x double> [[REVERSE]], zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = fcmp une <4 x double> [[REVERSE2]], zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr [8 x i8], ptr [[A:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i64 -24 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP7]], i64 -56 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr double, ptr [[A:%.*]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr double, ptr [[TMP15]], i64 -3 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr double, ptr [[TMP15]], i64 -7 ; CHECK-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x i1> [[TMP5]], <4 x i1> poison, <4 x i32> ; CHECK-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x i1> [[TMP6]], <4 x i1> poison, <4 x i32> ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP8]], <4 x i1> [[REVERSE3]], <4 x double> poison) ; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr align 8 [[TMP9]], <4 x i1> [[REVERSE5]], <4 x double> poison) -; CHECK-NEXT: [[TMP10:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], splat (double 1.000000e+00) -; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD6]], splat (double 1.000000e+00) +; CHECK-NEXT: [[REVERSE6:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD]], <4 x double> poison, <4 x i32> +; CHECK-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD6]], <4 x double> poison, <4 x i32> +; CHECK-NEXT: 
[[TMP16:%.*]] = fadd <4 x double> [[REVERSE6]], splat (double 1.000000e+00) +; CHECK-NEXT: [[TMP17:%.*]] = fadd <4 x double> [[REVERSE7]], splat (double 1.000000e+00) +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x double> [[TMP16]], <4 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x double> [[TMP17]], <4 x double> poison, <4 x i32> ; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP10]], ptr align 8 [[TMP8]], <4 x i1> [[REVERSE3]]) ; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP11]], ptr align 8 [[TMP9]], <4 x i1> [[REVERSE5]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 @@ -69,12 +74,12 @@ define void @vector_reverse_mask_v4i1(ptr noalias %a, ptr noalias %cond, i64 %N) ; CHECK: for.body: ; CHECK-NEXT: [[I_08_IN:%.*]] = phi i64 [ [[I_08:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[I_08]] = add nsw i64 [[I_08_IN]], -1 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [8 x i8], ptr [[COND]], i64 [[I_08]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[COND]], i64 [[I_08]] ; CHECK-NEXT: [[TMP13:%.*]] = load double, ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[TOBOOL:%.*]] = fcmp une double [[TMP13]], 0.000000e+00 ; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; CHECK: if.then: -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [8 x i8], ptr [[A]], i64 [[I_08]] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, ptr [[A]], i64 [[I_08]] ; CHECK-NEXT: [[TMP14:%.*]] = load double, ptr [[ARRAYIDX1]], align 8 ; CHECK-NEXT: [[ADD:%.*]] = fadd double [[TMP14]], 1.000000e+00 ; CHECK-NEXT: store double [[ADD]], ptr [[ARRAYIDX1]], align 8 From 2a110fe9fe03d37b196ce4111543adb603a9495c Mon Sep 17 00:00:00 2001 From: Charles Zablit Date: Thu, 14 May 2026 16:16:39 +0200 Subject: [PATCH 84/95] [lldb][windows] fix x86_64 arg register mapping for lldb-server (#197663) --- 
.../Utility/RegisterContextWindows_x86_64.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lldb/source/Plugins/Process/Utility/RegisterContextWindows_x86_64.cpp b/lldb/source/Plugins/Process/Utility/RegisterContextWindows_x86_64.cpp index bd483707ef88f..b5e255ead06df 100644 --- a/lldb/source/Plugins/Process/Utility/RegisterContextWindows_x86_64.cpp +++ b/lldb/source/Plugins/Process/Utility/RegisterContextWindows_x86_64.cpp @@ -90,14 +90,14 @@ static RegisterInfo g_register_infos_x86_64[] = { // =========================== ================== ================ ========================= ==================== DEFINE_GPR(rax, nullptr, dwarf_rax_x86_64, dwarf_rax_x86_64, LLDB_INVALID_REGNUM, LLDB_INVALID_REGNUM), DEFINE_GPR(rbx, nullptr, dwarf_rbx_x86_64, dwarf_rbx_x86_64, LLDB_INVALID_REGNUM, LLDB_INVALID_REGNUM), - DEFINE_GPR(rcx, nullptr, dwarf_rcx_x86_64, dwarf_rcx_x86_64, LLDB_REGNUM_GENERIC_ARG4, LLDB_INVALID_REGNUM), - DEFINE_GPR(rdx, nullptr, dwarf_rdx_x86_64, dwarf_rdx_x86_64, LLDB_REGNUM_GENERIC_ARG3, LLDB_INVALID_REGNUM), - DEFINE_GPR(rdi, nullptr, dwarf_rdi_x86_64, dwarf_rdi_x86_64, LLDB_REGNUM_GENERIC_ARG1, LLDB_INVALID_REGNUM), - DEFINE_GPR(rsi, nullptr, dwarf_rsi_x86_64, dwarf_rsi_x86_64, LLDB_REGNUM_GENERIC_ARG2, LLDB_INVALID_REGNUM), + DEFINE_GPR(rcx, nullptr, dwarf_rcx_x86_64, dwarf_rcx_x86_64, LLDB_REGNUM_GENERIC_ARG1, LLDB_INVALID_REGNUM), + DEFINE_GPR(rdx, nullptr, dwarf_rdx_x86_64, dwarf_rdx_x86_64, LLDB_REGNUM_GENERIC_ARG2, LLDB_INVALID_REGNUM), + DEFINE_GPR(rdi, nullptr, dwarf_rdi_x86_64, dwarf_rdi_x86_64, LLDB_INVALID_REGNUM, LLDB_INVALID_REGNUM), + DEFINE_GPR(rsi, nullptr, dwarf_rsi_x86_64, dwarf_rsi_x86_64, LLDB_INVALID_REGNUM, LLDB_INVALID_REGNUM), DEFINE_GPR(rbp, nullptr, dwarf_rbp_x86_64, dwarf_rbp_x86_64, LLDB_REGNUM_GENERIC_FP, LLDB_INVALID_REGNUM), DEFINE_GPR(rsp, nullptr, dwarf_rsp_x86_64, dwarf_rsp_x86_64, LLDB_REGNUM_GENERIC_SP, LLDB_INVALID_REGNUM), - DEFINE_GPR(r8, nullptr, dwarf_r8_x86_64, 
dwarf_r8_x86_64, LLDB_REGNUM_GENERIC_ARG5, LLDB_INVALID_REGNUM), - DEFINE_GPR(r9, nullptr, dwarf_r9_x86_64, dwarf_r9_x86_64, LLDB_REGNUM_GENERIC_ARG6, LLDB_INVALID_REGNUM), + DEFINE_GPR(r8, nullptr, dwarf_r8_x86_64, dwarf_r8_x86_64, LLDB_REGNUM_GENERIC_ARG3, LLDB_INVALID_REGNUM), + DEFINE_GPR(r9, nullptr, dwarf_r9_x86_64, dwarf_r9_x86_64, LLDB_REGNUM_GENERIC_ARG4, LLDB_INVALID_REGNUM), DEFINE_GPR(r10, nullptr, dwarf_r10_x86_64, dwarf_r10_x86_64, LLDB_INVALID_REGNUM, LLDB_INVALID_REGNUM), DEFINE_GPR(r11, nullptr, dwarf_r11_x86_64, dwarf_r11_x86_64, LLDB_INVALID_REGNUM, LLDB_INVALID_REGNUM), DEFINE_GPR(r12, nullptr, dwarf_r12_x86_64, dwarf_r12_x86_64, LLDB_INVALID_REGNUM, LLDB_INVALID_REGNUM), From d1a6d7bc106274019e23cca4b256fc58de7c7889 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 14 May 2026 15:16:44 +0100 Subject: [PATCH 85/95] [DAG] SimplifyMultipleUseDemandedBits - fold (mul X, 1) -> X (#197677) Use DemandedElts + KnownBits to match hidden identity patterns - helps especially with reduction patterns padded by legalisation Once #197455 has landed, I'm intending to convert this (plus SMIN/SMAX/UMIN/UMAX and the existing ISD::ADD case) to use isIdentityElement directly. 
--- .../CodeGen/SelectionDAG/TargetLowering.cpp | 10 +++++ llvm/test/CodeGen/AArch64/aarch64-mulv.ll | 9 ++--- llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll | 37 +++++++------------ llvm/test/CodeGen/X86/dpbusd_const.ll | 4 +- llvm/test/CodeGen/X86/srem-vector-lkk.ll | 26 ++++++------- 5 files changed, 42 insertions(+), 44 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index ce1493200b9b1..7e43794ef224b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -831,6 +831,16 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits( return Op.getOperand(1); break; } + case ISD::MUL: { + RHSKnown = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); + if (RHSKnown.isConstant() && RHSKnown.getConstant().isOne()) + return Op.getOperand(0); + + LHSKnown = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); + if (LHSKnown.isConstant() && LHSKnown.getConstant().isOne()) + return Op.getOperand(1); + break; + } case ISD::SHL: { // If we are only demanding sign bits then we can use the shift source // directly. 
diff --git a/llvm/test/CodeGen/AArch64/aarch64-mulv.ll b/llvm/test/CodeGen/AArch64/aarch64-mulv.ll index f8d7cca916159..0740472c5e7ee 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-mulv.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-mulv.ll @@ -390,12 +390,9 @@ entry: define i32 @mulv_v3i32(<3 x i32> %a) { ; CHECK-LABEL: mulv_v3i32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov v1.16b, v0.16b -; CHECK-NEXT: mov w8, #1 // =0x1 -; CHECK-NEXT: mov v1.s[3], w8 -; CHECK-NEXT: mov d1, v1.d[1] -; CHECK-NEXT: mul v0.2s, v0.2s, v1.2s -; CHECK-NEXT: mul v0.2s, v0.2s, v0.s[1] +; CHECK-NEXT: mov d1, v0.d[1] +; CHECK-NEXT: mul v1.2s, v0.2s, v1.2s +; CHECK-NEXT: mul v0.2s, v1.2s, v0.s[1] ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll index 1d495f5a7f01c..13457b1e2f254 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-mul.ll @@ -1099,11 +1099,9 @@ define i16 @test_vector_reduce_mul_v3i16(<3 x i16> %v) { ; GFX9-SDAG-LABEL: test_vector_reduce_mul_v3i16: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 0x5040100 -; GFX9-SDAG-NEXT: v_perm_b32 v1, 1, v1, v2 -; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 +; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v1, v0, v1 ; GFX9-SDAG-NEXT: s_nop 0 -; GFX9-SDAG-NEXT: v_mul_lo_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-SDAG-NEXT: v_mul_lo_u16_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: test_vector_reduce_mul_v3i16: @@ -1116,10 +1114,9 @@ define i16 @test_vector_reduce_mul_v3i16(<3 x i16> %v) { ; GFX10-SDAG-LABEL: test_vector_reduce_mul_v3i16: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_perm_b32 v1, 1, v1, 
0x5040100 +; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX10-SDAG-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX10-SDAG-NEXT: v_mul_lo_u16 v0, v0, v2 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-GISEL-LABEL: test_vector_reduce_mul_v3i16: @@ -1133,21 +1130,18 @@ define i16 @test_vector_reduce_mul_v3i16(<3 x i16> %v) { ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_mul_v3i16: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 1 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-SDAG-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v1, v0, v1 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_mul_lo_u16 v0.l, v1.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_mul_v3i16: ; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry ; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-FAKE16-NEXT: v_perm_b32 v1, 1, v1, 0x5040100 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX11-SDAG-FAKE16-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v2 ; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_vector_reduce_mul_v3i16: @@ -1166,10 +1160,9 @@ define i16 @test_vector_reduce_mul_v3i16(<3 x i16> %v) { ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: 
s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 1 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX12-SDAG-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v0.h +; GFX12-SDAG-TRUE16-NEXT: v_pk_mul_lo_u16 v1, v0, v1 +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-TRUE16-NEXT: v_mul_lo_u16 v0.l, v1.l, v0.h ; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_mul_v3i16: @@ -1179,12 +1172,10 @@ define i16 @test_vector_reduce_mul_v3i16(<3 x i16> %v) { ; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-FAKE16-NEXT: v_perm_b32 v1, 1, v1, 0x5040100 -; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX12-SDAG-FAKE16-NEXT: v_pk_mul_lo_u16 v0, v0, v1 -; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v1 +; GFX12-SDAG-FAKE16-NEXT: v_mul_lo_u16 v0, v0, v2 ; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-GISEL-LABEL: test_vector_reduce_mul_v3i16: diff --git a/llvm/test/CodeGen/X86/dpbusd_const.ll b/llvm/test/CodeGen/X86/dpbusd_const.ll index bb47df59eefad..1d6c3f7c5c6e8 100644 --- a/llvm/test/CodeGen/X86/dpbusd_const.ll +++ b/llvm/test/CodeGen/X86/dpbusd_const.ll @@ -8,8 +8,8 @@ define i32 @mul_4xi8_zc_exceed(<4 x i8> %a, i32 %c) { ; CHECK-LABEL: mul_4xi8_zc_exceed: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero -; CHECK-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,1,0,2,0,128,0] ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-NEXT: vpmaddwd 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2,128,2,128] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 @@ -180,8 +180,8 @@ define i32 @mul_4xi8_cs_exceed(<4 x i8> %a, i32 %c) { ; CHECK-LABEL: mul_4xi8_cs_exceed: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vpmovsxbd %xmm0, %xmm0 -; CHECK-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [0,0,1,0,2,0,256,0] ; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; CHECK-NEXT: vpmaddwd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2,256,2,256] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/srem-vector-lkk.ll b/llvm/test/CodeGen/X86/srem-vector-lkk.ll index f9de4e18857c9..678515c3e572e 100644 --- a/llvm/test/CodeGen/X86/srem-vector-lkk.ll +++ b/llvm/test/CodeGen/X86/srem-vector-lkk.ll @@ -355,13 +355,13 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) { ; SSE4-NEXT: movq {{.*#+}} xmm2 = [0,32767,45591,12375,0,0,0,0] ; SSE4-NEXT: pmulhw %xmm0, %xmm2 ; SSE4-NEXT: paddw %xmm1, %xmm2 -; SSE4-NEXT: movdqa %xmm2, %xmm3 -; SSE4-NEXT: psrlw $15, %xmm3 -; SSE4-NEXT: pxor %xmm4, %xmm4 -; SSE4-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0],xmm3[1,2,3],xmm4[4,5,6,7] +; SSE4-NEXT: movdqa %xmm2, %xmm1 +; SSE4-NEXT: psrlw $15, %xmm1 +; SSE4-NEXT: pxor %xmm3, %xmm3 +; SSE4-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm1[1,2,3],xmm3[4,5,6,7] ; SSE4-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [u,4,4096,64,u,u,u,u] -; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7] -; SSE4-NEXT: paddw %xmm4, %xmm2 +; SSE4-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] +; SSE4-NEXT: paddw %xmm3, %xmm2 ; SSE4-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [1,32768,23,5423,u,u,u,u] ; SSE4-NEXT: psubw %xmm2, %xmm0 ; SSE4-NEXT: retq @@ -370,13 +370,13 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> 
%x) { ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [1,65535,1,0,u,u,u,u] ; AVX1OR2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 # [0,32767,45591,12375,u,u,u,u] -; AVX1OR2-NEXT: vpaddw %xmm1, %xmm2, %xmm2 -; AVX1OR2-NEXT: vpsrlw $15, %xmm2, %xmm3 -; AVX1OR2-NEXT: vpxor %xmm4, %xmm4, %xmm4 -; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1,2,3],xmm4[4,5,6,7] -; AVX1OR2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [u,4,4096,64,u,u,u,u] -; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7] -; AVX1OR2-NEXT: vpaddw %xmm3, %xmm1, %xmm1 +; AVX1OR2-NEXT: vpaddw %xmm1, %xmm2, %xmm1 +; AVX1OR2-NEXT: vpsrlw $15, %xmm1, %xmm2 +; AVX1OR2-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0],xmm2[1,2,3],xmm3[4,5,6,7] +; AVX1OR2-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [u,4,4096,64,u,u,u,u] +; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX1OR2-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ; AVX1OR2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,32768,23,5423,u,u,u,u] ; AVX1OR2-NEXT: vpsubw %xmm1, %xmm0, %xmm0 ; AVX1OR2-NEXT: retq From 7206901d53c4460f8fb2aa5e83fcbf7ec443b231 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Thu, 14 May 2026 07:19:57 -0700 Subject: [PATCH 86/95] [libc] Include correct headers in type_traits (#197691) Otherwise we end up with errors like the following when building with bazel: ```c++ In file included from external/+_repo_rules+llvm-project/libc/src/__support/CPP/type_traits/is_move_constructible.h:12: external/+_repo_rules+llvm-project/libc/src/__support/CPP/type_traits/is_constructible.h:32:14: error: no template named 'bool_constant' 32 | : public bool_constant<__is_constructible(T, Args...)> {}; ``` --- libc/src/__support/CPP/type_traits/is_assignable.h | 2 +- libc/src/__support/CPP/type_traits/is_constructible.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) 
diff --git a/libc/src/__support/CPP/type_traits/is_assignable.h b/libc/src/__support/CPP/type_traits/is_assignable.h index 0be3aa500590d..5978d358798c3 100644 --- a/libc/src/__support/CPP/type_traits/is_assignable.h +++ b/libc/src/__support/CPP/type_traits/is_assignable.h @@ -15,7 +15,7 @@ #ifndef LLVM_LIBC_SRC___SUPPORT_CPP_TYPE_TRAITS_IS_ASSIGNABLE_H #define LLVM_LIBC_SRC___SUPPORT_CPP_TYPE_TRAITS_IS_ASSIGNABLE_H -#include "src/__support/CPP/type_traits/integral_constant.h" +#include "src/__support/CPP/type_traits/bool_constant.h" #include "src/__support/CPP/utility/declval.h" #include "src/__support/macros/attributes.h" #include "src/__support/macros/config.h" diff --git a/libc/src/__support/CPP/type_traits/is_constructible.h b/libc/src/__support/CPP/type_traits/is_constructible.h index 316ba1acba33e..c3ab8afb5ca30 100644 --- a/libc/src/__support/CPP/type_traits/is_constructible.h +++ b/libc/src/__support/CPP/type_traits/is_constructible.h @@ -15,7 +15,7 @@ #ifndef LLVM_LIBC_SRC___SUPPORT_CPP_TYPE_TRAITS_IS_CONSTRUCTIBLE_H #define LLVM_LIBC_SRC___SUPPORT_CPP_TYPE_TRAITS_IS_CONSTRUCTIBLE_H -#include "src/__support/CPP/type_traits/integral_constant.h" +#include "src/__support/CPP/type_traits/bool_constant.h" #include "src/__support/CPP/utility/declval.h" #include "src/__support/macros/attributes.h" #include "src/__support/macros/config.h" From f2a9f41f701b34dd1cfa568d5e0f48ec9a9c31dd Mon Sep 17 00:00:00 2001 From: Mariusz Sikora Date: Thu, 14 May 2026 16:27:57 +0200 Subject: [PATCH 87/95] [AMDGPU][NFC] Remove redundant hasMadU64U32NoCarry helper (#197682) Use hasMadNC64_32Insts() (backed by SubtargetFeature) for MAD 64_32 no-carry and drop the old helper. 
--- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 4 ++-- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 2 +- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 4 ---- 3 files changed, 3 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index c2322bd922f31..ecf8d957fc80f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1232,7 +1232,7 @@ void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) { SDLoc SL(N); bool Signed = N->getOpcode() == AMDGPUISD::MAD_I64_I32; unsigned Opc; - bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() && !N->hasAnyUseOfValue(1); + bool UseNoCarry = Subtarget->hasMadNC64_32Insts() && !N->hasAnyUseOfValue(1); if (Subtarget->hasMADIntraFwdBug()) Opc = Signed ? AMDGPU::V_MAD_I64_I32_gfx11_e64 : AMDGPU::V_MAD_U64_U32_gfx11_e64; @@ -1262,7 +1262,7 @@ void AMDGPUDAGToDAGISel::SelectMUL_LOHI(SDNode *N) { bool Signed = N->getOpcode() == ISD::SMUL_LOHI; SDVTList VTList; unsigned Opc; - if (Subtarget->hasMadU64U32NoCarry()) { + if (Subtarget->hasMadNC64_32Insts()) { VTList = CurDAG->getVTList(MVT::i64); Opc = Signed ? 
AMDGPU::V_MAD_NC_I64_I32_e64 : AMDGPU::V_MAD_NC_U64_U32_e64; } else { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 9297c42754d17..4bfd56c0c4007 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -594,7 +594,7 @@ bool AMDGPUInstructionSelector::selectG_AMDGPU_MAD_64_32( MachineBasicBlock *BB = I.getParent(); MachineFunction *MF = BB->getParent(); const bool IsUnsigned = I.getOpcode() == AMDGPU::G_AMDGPU_MAD_U64_U32; - bool UseNoCarry = Subtarget->hasMadU64U32NoCarry() && + bool UseNoCarry = Subtarget->hasMadNC64_32Insts() && MRI->use_nodbg_empty(I.getOperand(1).getReg()); unsigned Opc; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 5f580ac0577d5..ddc30a6cf11cd 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -698,10 +698,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, // \returns true if the target has V_MUL_U64/V_MUL_I64 instructions. bool hasVectorMulU64() const { return HasGFX1250Insts; } - // \returns true if the target has V_MAD_NC_U64_U32/V_MAD_NC_I64_I32 - // instructions. - bool hasMadU64U32NoCarry() const { return HasGFX1250Insts; } - // \returns true if the target has V_{MIN|MAX}_{I|U}64 instructions. 
bool hasIntMinMax64() const { return HasGFX1250Insts; } From 0f79ba29f371a1acc0b592fdaf58a58a77f0496a Mon Sep 17 00:00:00 2001 From: Dmitry Sidorov Date: Thu, 14 May 2026 16:28:27 +0200 Subject: [PATCH 88/95] Adjust SPV_AMD_weak_linkage (#197484) Linkage was renamed + a capability added following review in https://github.com/KhronosGroup/SPIRV-Registry/pull/401 --- llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp | 4 +++- llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td | 3 ++- llvm/lib/Target/SPIRV/SPIRVUtils.cpp | 2 +- llvm/test/CodeGen/SPIRV/linkage/weak-linkage.ll | 11 ++++++----- 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index c3c07f2efd6dd..5198c0bf5c992 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -1029,8 +1029,10 @@ static void addOpDecorateReqs(const MachineInstr &MI, unsigned DecIndex, static_cast(LinkageOp); if (LnkType == SPIRV::LinkageType::LinkOnceODR) Reqs.addExtension(SPIRV::Extension::SPV_KHR_linkonce_odr); - else if (LnkType == SPIRV::LinkageType::Weak) + else if (LnkType == SPIRV::LinkageType::WeakAMD) { Reqs.addExtension(SPIRV::Extension::SPV_AMD_weak_linkage); + Reqs.addCapability(SPIRV::Capability::WeakLinkageAMD); + } } else if (Dec == SPIRV::Decoration::CacheControlLoadINTEL || Dec == SPIRV::Decoration::CacheControlStoreINTEL) { Reqs.addExtension(SPIRV::Extension::SPV_INTEL_cache_controls); diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td index 82932a6a52385..e80e70e423e01 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td +++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td @@ -635,6 +635,7 @@ defm TensorFloat32RoundingINTEL : CapabilityOperand<6425, 0, 0, [SPV_INTEL_tenso defm MaskedGatherScatterINTEL : CapabilityOperand<6427, 0, 0, [SPV_INTEL_masked_gather_scatter], []>; defm 
BFloat16DotProductKHR : CapabilityOperand<5117, 0, 0, [SPV_KHR_bfloat16], [BFloat16TypeKHR]>; defm BFloat16CooperativeMatrixKHR : CapabilityOperand<5118, 0, 0, [SPV_KHR_bfloat16], [BFloat16TypeKHR, CooperativeMatrixKHR]>; +defm WeakLinkageAMD : CapabilityOperand<5181, 0, 0, [SPV_AMD_weak_linkage], [Linkage]>; defm BlockingPipesALTERA : CapabilityOperand<5945, 0, 0, [SPV_ALTERA_blocking_pipes], []>; defm ArbitraryPrecisionFixedPointALTERA : CapabilityOperand<5922, 0, 0, [SPV_ALTERA_arbitrary_precision_fixed_point], []>; defm ArbitraryPrecisionFloatingPointALTERA : CapabilityOperand<5845, 0, 0,[SPV_ALTERA_arbitrary_precision_floating_point], []>; @@ -1264,7 +1265,7 @@ multiclass LinkageTypeOperand value, list reqCapabilities> defm Export : LinkageTypeOperand<0, [Linkage]>; defm Import : LinkageTypeOperand<1, [Linkage]>; defm LinkOnceODR : LinkageTypeOperand<2, [Linkage]>; -defm Weak : LinkageTypeOperand<3, [Linkage]>; +defm WeakAMD : LinkageTypeOperand<3, [WeakLinkageAMD]>; //===----------------------------------------------------------------------===// // Multiclass used to define AccessQualifier enum values and at the same time diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp index 6f8411f05cb52..0ef31f4182b4e 100644 --- a/llvm/lib/Target/SPIRV/SPIRVUtils.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVUtils.cpp @@ -1238,7 +1238,7 @@ getSpirvLinkageTypeFor(const SPIRVSubtarget &ST, const GlobalValue &GV) { if (GV.hasWeakLinkage() && ST.canUseExtension(SPIRV::Extension::SPV_AMD_weak_linkage)) - return SPIRV::LinkageType::Weak; + return SPIRV::LinkageType::WeakAMD; return SPIRV::LinkageType::Export; } diff --git a/llvm/test/CodeGen/SPIRV/linkage/weak-linkage.ll b/llvm/test/CodeGen/SPIRV/linkage/weak-linkage.ll index 338484e052be6..208985613b558 100644 --- a/llvm/test/CodeGen/SPIRV/linkage/weak-linkage.ll +++ b/llvm/test/CodeGen/SPIRV/linkage/weak-linkage.ll @@ -5,14 +5,15 @@ ; RUN: llc -O0 -verify-machineinstrs 
-mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV ; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %} -; CHECK-SPIRV-EXT: Capability Linkage +; CHECK-SPIRV-EXT-DAG: Capability Linkage +; CHECK-SPIRV-EXT-DAG: Capability WeakLinkageAMD ; CHECK-SPIRV-EXT: Extension "SPV_AMD_weak_linkage" -; CHECK-SPIRV-EXT-DAG: OpDecorate %[[#]] LinkageAttributes "GV" Weak -; CHECK-SPIRV-EXT-DAG: OpDecorate %[[#]] LinkageAttributes "square" Weak +; CHECK-SPIRV-EXT-DAG: OpDecorate %[[#]] LinkageAttributes "GV" WeakAMD +; CHECK-SPIRV-EXT-DAG: OpDecorate %[[#]] LinkageAttributes "square" WeakAMD ; CHECK-SPIRV-NOT: OpExtension "SPV_AMD_weak_linkage" -; CHECK-SPIRV-NOT: OpDecorate %[[#]] LinkageAttributes "GV" Weak -; CHECK-SPIRV-NOT: OpDecorate %[[#]] LinkageAttributes "square" Weak +; CHECK-SPIRV-NOT: OpDecorate %[[#]] LinkageAttributes "GV" WeakAMD +; CHECK-SPIRV-NOT: OpDecorate %[[#]] LinkageAttributes "square" WeakAMD ; CHECK-SPIRV-DAG: OpDecorate %[[#]] LinkageAttributes "GV" Export ; CHECK-SPIRV-DAG: OpDecorate %[[#]] LinkageAttributes "square" Export From 692b8fd0a434fb6b71c49772487e62d6e2d2864f Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Thu, 14 May 2026 16:34:51 +0200 Subject: [PATCH 89/95] [libc++] Replace ranges::find_first_of with std::find_first_of in __try_constant_folding (#197641) This reduces the time it takes to instantiate `std::format` from ~160ms to ~120ms in my testing. 
--- libcxx/include/__format/format_functions.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/libcxx/include/__format/format_functions.h b/libcxx/include/__format/format_functions.h index 7cf259d0e1db7..b58e487d7511e 100644 --- a/libcxx/include/__format/format_functions.h +++ b/libcxx/include/__format/format_functions.h @@ -11,7 +11,7 @@ #define _LIBCPP___FORMAT_FORMAT_FUNCTIONS #include <__algorithm/clamp.h> -#include <__algorithm/ranges_find_first_of.h> +#include <__algorithm/find_first_of.h> #include <__chrono/statically_widen.h> #include <__concepts/convertible_to.h> #include <__concepts/same_as.h> @@ -459,8 +459,12 @@ template basic_string_view<_CharT> __fmt, basic_format_args>, _CharT>> __args) { // Fold strings not containing '{' or '}' to just return the string - if (bool __is_identity = [&] [[__gnu__::__pure__]] // Make sure the compiler knows this call can be eliminated - { return std::ranges::find_first_of(__fmt, array{'{', '}'}) == __fmt.end(); }(); + if (bool __is_identity = + [&] [[__gnu__::__pure__]] // Make sure the compiler knows this call can be eliminated + { + char __vals[] = {'{', '}'}; + return std::find_first_of(__fmt.begin(), __fmt.end(), std::begin(__vals), std::end(__vals)) == __fmt.end(); + }(); __builtin_constant_p(__is_identity) && __is_identity) return basic_string<_CharT>{__fmt}; From 0ac83dccaf53f3a51714fd53b151314de1a13e48 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Thu, 14 May 2026 15:38:43 +0100 Subject: [PATCH 90/95] [clang][AArch64] Use structured bindings in feature parsing code (#197689) Clearer than having to know that first is a CPU and second is the feature list. 
--- clang/lib/Driver/ToolChains/Arch/AArch64.cpp | 21 ++++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Arch/AArch64.cpp b/clang/lib/Driver/ToolChains/Arch/AArch64.cpp index 7ed4002e53420..12f7e382cc99a 100644 --- a/clang/lib/Driver/ToolChains/Arch/AArch64.cpp +++ b/clang/lib/Driver/ToolChains/Arch/AArch64.cpp @@ -180,8 +180,7 @@ static bool DecodeAArch64HostFeatures(llvm::AArch64::ExtensionSet &Extensions) { static bool DecodeAArch64Mcpu(const Driver &D, StringRef Mcpu, llvm::AArch64::ExtensionSet &Extensions, std::optional &InvalidArg) { - std::pair Split = Mcpu.split("+"); - StringRef CPU = Split.first; + auto [CPU, features] = Mcpu.split("+"); const bool IsNative = CPU == "native"; if (IsNative) @@ -190,7 +189,7 @@ static bool DecodeAArch64Mcpu(const Driver &D, StringRef Mcpu, const std::optional CpuInfo = llvm::AArch64::parseCpu(CPU); if (!CpuInfo) { - InvalidArg.emplace(Split.first.str()); + InvalidArg.emplace(CPU.str()); return false; } @@ -199,8 +198,8 @@ static bool DecodeAArch64Mcpu(const Driver &D, StringRef Mcpu, if (IsNative && !DecodeAArch64HostFeatures(Extensions)) return false; - if (Split.second.size() && - !DecodeAArch64Features(D, Split.second, Extensions, InvalidArg)) + if (features.size() && + !DecodeAArch64Features(D, features, Extensions, InvalidArg)) return false; return true; @@ -212,22 +211,22 @@ getAArch64ArchFeaturesFromMarch(const Driver &D, StringRef March, llvm::AArch64::ExtensionSet &Extensions, std::optional &InvalidArg) { std::string MarchLowerCase = March.lower(); - std::pair Split = StringRef(MarchLowerCase).split("+"); + auto [CPU, features] = StringRef(MarchLowerCase).split("+"); - if (Split.first == "native") + if (CPU == "native") return DecodeAArch64Mcpu(D, MarchLowerCase, Extensions, InvalidArg); const llvm::AArch64::ArchInfo *ArchInfo = - llvm::AArch64::parseArch(Split.first); + llvm::AArch64::parseArch(CPU); if (!ArchInfo) { - 
InvalidArg.emplace(Split.first.str()); + InvalidArg.emplace(CPU.str()); return false; } Extensions.addArchDefaults(*ArchInfo); - if ((Split.second.size() && - !DecodeAArch64Features(D, Split.second, Extensions, InvalidArg))) + if ((features.size() && + !DecodeAArch64Features(D, features, Extensions, InvalidArg))) return false; return true; From 31ec3d81f4597c9dd4c4e377af8186a3ccf13750 Mon Sep 17 00:00:00 2001 From: David Green Date: Thu, 14 May 2026 15:42:19 +0100 Subject: [PATCH 91/95] [ARM][GlobalISel] Remove dependency on legal ruleset. (#197370) This fills in always legal rules, to remove the dependency on the legacy ruleset. The trunc rule might make some differences but it looks like i64 zext / sext are not well supported at the moment. This is not guaranteed to be all the rules, just the ones that appear in tests. --- llvm/lib/Target/ARM/ARMLegalizerInfo.cpp | 4 +++ .../ARM/GlobalISel/arm-regbankselect.mir | 25 --------------- .../test/CodeGen/ARM/GlobalISel/select-fp.mir | 32 ------------------- 3 files changed, 4 insertions(+), 57 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp b/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp index 8b334fe84be45..f7d03119d9b93 100644 --- a/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp +++ b/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp @@ -51,6 +51,9 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) : ST(ST) { getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) .legalForCartesianProduct({s8, s16, s32}, {s1, s8, s16}); + getActionDefinitionsBuilder(G_TRUNC).legalForCartesianProduct({s1, s8, s16}, + {s8, s16, s32}); + getActionDefinitionsBuilder(G_SEXT_INREG).lower(); getActionDefinitionsBuilder({G_MUL, G_AND, G_OR, G_XOR}) @@ -133,6 +136,7 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) : ST(ST) { .legalFor({{p0, s32}}) .minScalar(1, s32); + getActionDefinitionsBuilder(G_BR).alwaysLegal(); getActionDefinitionsBuilder(G_BRCOND).legalFor({s1}); if (!ST.useSoftFloat() && ST.hasVFP2Base()) { 
diff --git a/llvm/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir b/llvm/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir index 87d1785809b90..7b530cef67ebc 100644 --- a/llvm/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir +++ b/llvm/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir @@ -37,7 +37,6 @@ define void @test_anyext_s16_32() { ret void } define void @test_trunc_s32_16() { ret void } - define void @test_trunc_s64_32() #0 { ret void } define void @test_icmp_eq_s32() { ret void } define void @test_fcmp_one_s32() #0 { ret void } @@ -677,30 +676,6 @@ body: | BX_RET 14, $noreg ... --- -name: test_trunc_s64_32 -# CHECK-LABEL: name: test_trunc_s64_32 -legalized: true -regBankSelected: false -selected: false -# CHECK: registers: -# CHECK: - { id: 0, class: fprb, preferred-register: '', flags: [ ] } -# CHECK: - { id: 1, class: gprb, preferred-register: '', flags: [ ] } -# CHECK: - { id: 2, class: gprb, preferred-register: '', flags: [ ] } -registers: - - { id: 0, class: _ } - - { id: 1, class: _ } - - { id: 2, class: _ } -body: | - bb.0: - liveins: $r0, $d0 - - %0(s64) = COPY $d0 - %2(p0) = COPY $r0 - %1(s32) = G_TRUNC %0(s64) - G_STORE %1(s32), %2 :: (store (s32)) - BX_RET 14, $noreg -... ---- name: test_icmp_eq_s32 # CHECK-LABEL: name: test_icmp_eq_s32 legalized: true diff --git a/llvm/test/CodeGen/ARM/GlobalISel/select-fp.mir b/llvm/test/CodeGen/ARM/GlobalISel/select-fp.mir index 4517fe6dd4f13..cc49e2c57f7f1 100644 --- a/llvm/test/CodeGen/ARM/GlobalISel/select-fp.mir +++ b/llvm/test/CodeGen/ARM/GlobalISel/select-fp.mir @@ -2,8 +2,6 @@ # RUN: llc -O0 -mtriple arm-- -mattr=+vfp4,-neonfp -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s # RUN: llc -O0 -mtriple thumb-- -mattr=+v6t2,+vfp4,-neonfp -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --- | - define void @test_trunc_s64() { ret void } - define void @test_fadd_s32() { ret void } define void @test_fadd_s64() { ret void } @@ -46,36 +44,6 @@ ... 
--- -name: test_trunc_s64 -# CHECK-LABEL: name: test_trunc_s64 -legalized: true -regBankSelected: true -selected: false -# CHECK: selected: true -registers: - - { id: 0, class: fprb } - - { id: 1, class: gprb } - - { id: 2, class: gprb } -body: | - bb.0: - liveins: $r0, $d0 - - %0(s64) = COPY $d0 - ; CHECK: [[VREG:%[0-9]+]]:dpr = COPY $d0 - - %2(p0) = COPY $r0 - ; CHECK: [[PTR:%[0-9]+]]:gpr = COPY $r0 - - %1(s32) = G_TRUNC %0(s64) - ; CHECK: [[VREGTRUNC:%[0-9]+]]:gpr, [[UNINTERESTING:%[0-9]+]]:gpr = VMOVRRD [[VREG]] - - G_STORE %1(s32), %2 :: (store (s32)) - ; CHECK: STRi12 [[VREGTRUNC]], [[PTR]], 0, 14 /* CC::al */, $noreg - - BX_RET 14, $noreg - ; CHECK: BX_RET 14 /* CC::al */, $noreg -... ---- name: test_fadd_s32 legalized: true regBankSelected: true From 277372b7d05b5eb026fb6e804af4213f3691c047 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20Benics?= Date: Thu, 14 May 2026 15:42:47 +0100 Subject: [PATCH 92/95] [docs] Add "LLVM Memory Safety" and "Lifetime Safety" working Groups (#197692) --- llvm/docs/GettingInvolved.rst | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/llvm/docs/GettingInvolved.rst b/llvm/docs/GettingInvolved.rst index 9d6efc7673edf..aec6c87a1424e 100644 --- a/llvm/docs/GettingInvolved.rst +++ b/llvm/docs/GettingInvolved.rst @@ -236,7 +236,16 @@ what to add to your calendar invite. - `ics `__ `gcal `__ - `Meeting notes `__ - + * - LLVM Memory Safety Working Group + - Every 4 weeks on Thursdays + - `ics `__ + `gcal `__ + - `Meeting notes `__ + * - `Lifetime Safety Breakout Group `__ + - Every 2 weeks on Wednesdays + - `ics `__ + `gcal `__ + - `Meeting notes `__ For event owners, our Discord bot also supports sending automated announcements of upcoming sync-ups. 
Please see the :ref:`discord-bot-event-pings` section for From 00559c249a8c6aa4d0746839da312ced5031dce6 Mon Sep 17 00:00:00 2001 From: Charles Zablit Date: Thu, 14 May 2026 16:54:15 +0200 Subject: [PATCH 93/95] [lldb][windows] Keep int3 breakpoints inside the debugger on lldb-server (#197669) --- .../Windows/Common/NativeProcessWindows.cpp | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/lldb/source/Plugins/Process/Windows/Common/NativeProcessWindows.cpp b/lldb/source/Plugins/Process/Windows/Common/NativeProcessWindows.cpp index c5698b1a8368d..4fdb286c93e1e 100644 --- a/lldb/source/Plugins/Process/Windows/Common/NativeProcessWindows.cpp +++ b/lldb/source/Plugins/Process/Windows/Common/NativeProcessWindows.cpp @@ -564,7 +564,20 @@ NativeProcessWindows::OnDebugException(bool first_chance, return ExceptionResult::BreakInDebugger; } - [[fallthrough]]; + { + // Any remaining STATUS_BREAKPOINT is a breakpoint instruction in the + // program's own code (e.g. `__debugbreak()` or `__builtin_debugtrap()`). + // Stop the debugger and let the user decide what to do. + std::string desc = + formatv("Exception {0:x8} encountered at address {1:x8}", + record.GetExceptionCode(), record.GetExceptionAddress()) + .str(); + StopThread(record.GetThreadID(), StopReason::eStopReasonException, + std::move(desc)); + SetState(eStateStopped, true); + } + + return ExceptionResult::MaskException; default: LLDB_LOG(log, "Debugger thread reported exception {0:x} at address {1:x} " From 721db09983c8dc56272d55bced3fa7ed61eb30bf Mon Sep 17 00:00:00 2001 From: Matheus Izvekov Date: Thu, 14 May 2026 12:07:40 -0300 Subject: [PATCH 94/95] [clang] NFC: add asserts and fixes for enforcing template parameters have valid positions (#197598) Some tests are violating these assertions, so they are commented out. For the test in `clang/test/SemaTemplate/concepts.cpp`, that was broken by #195995 and needs a partial revert at least. 
--- clang/include/clang/AST/ASTContext.h | 3 +- clang/include/clang/AST/DeclTemplate.h | 59 +++++++++------------ clang/include/clang/AST/TypeBase.h | 9 +++- clang/lib/AST/ASTContext.cpp | 9 ++-- clang/lib/AST/DeclTemplate.cpp | 29 +++++----- clang/lib/Sema/SemaTemplateDeduction.cpp | 4 +- clang/test/SemaTemplate/concepts-lambda.cpp | 7 ++- clang/test/SemaTemplate/concepts.cpp | 4 +- 8 files changed, 64 insertions(+), 60 deletions(-) diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h index c45d54fdd2e88..b2fd522e6865c 100644 --- a/clang/include/clang/AST/ASTContext.h +++ b/clang/include/clang/AST/ASTContext.h @@ -1998,8 +1998,7 @@ class ASTContext : public RefCountedBase { QualType getSubstBuiltinTemplatePack(const TemplateArgument &ArgPack); QualType - getTemplateTypeParmType(unsigned Depth, unsigned Index, - bool ParameterPack, + getTemplateTypeParmType(int Depth, int Index, bool ParameterPack, TemplateTypeParmDecl *ParmDecl = nullptr) const; QualType getCanonicalTemplateSpecializationType( diff --git a/clang/include/clang/AST/DeclTemplate.h b/clang/include/clang/AST/DeclTemplate.h index 9fb41c87da732..914bfb529302e 100644 --- a/clang/include/clang/AST/DeclTemplate.h +++ b/clang/include/clang/AST/DeclTemplate.h @@ -1128,16 +1128,9 @@ class TemplateParmPosition { unsigned Depth : DepthWidth; unsigned Position : PositionWidth; - static constexpr unsigned MaxDepth = (1U << DepthWidth) - 1; - static constexpr unsigned MaxPosition = (1U << PositionWidth) - 1; - - TemplateParmPosition(unsigned D, unsigned P) : Depth(D), Position(P) { - // The input may fill maximum values to show that it is invalid. - // Add one here to convert it to zero. 
- assert((D + 1) <= MaxDepth && - "The depth of template parmeter position is more than 2^20!"); - assert((P + 1) <= MaxPosition && - "The position of template parmeter position is more than 2^12!"); + TemplateParmPosition(int D, int P) { + setDepth(D); + setPosition(P); } public: @@ -1145,17 +1138,17 @@ class TemplateParmPosition { /// Get the nesting depth of the template parameter. unsigned getDepth() const { return Depth; } - void setDepth(unsigned D) { - assert((D + 1) <= MaxDepth && - "The depth of template parmeter position is more than 2^20!"); + void setDepth(int D) { + assert(D >= 0 && "The depth cannot be negative"); + assert(D < (1 << DepthWidth) && "The depth is too large"); Depth = D; } /// Get the position of the template parameter within its parameter list. unsigned getPosition() const { return Position; } - void setPosition(unsigned P) { - assert((P + 1) <= MaxPosition && - "The position of template parmeter position is more than 2^12!"); + void setPosition(int P) { + assert(P >= 0 && "The position cannot be negative"); + assert(P < (1 << PositionWidth) && "The position is too large"); Position = P; } @@ -1208,7 +1201,7 @@ class TemplateTypeParmDecl final : public TypeDecl, public: static TemplateTypeParmDecl * Create(const ASTContext &C, DeclContext *DC, SourceLocation KeyLoc, - SourceLocation NameLoc, unsigned D, unsigned P, IdentifierInfo *Id, + SourceLocation NameLoc, int D, int P, IdentifierInfo *Id, bool Typename, bool ParameterPack, bool HasTypeConstraint = false, UnsignedOrNone NumExpanded = std::nullopt); static TemplateTypeParmDecl *CreateDeserialized(const ASTContext &C, @@ -1389,14 +1382,14 @@ class NonTypeTemplateParmDecl final } NonTypeTemplateParmDecl(DeclContext *DC, SourceLocation StartLoc, - SourceLocation IdLoc, unsigned D, unsigned P, + SourceLocation IdLoc, int D, int P, const IdentifierInfo *Id, QualType T, bool ParameterPack, TypeSourceInfo *TInfo) : DeclaratorDecl(NonTypeTemplateParm, DC, IdLoc, Id, T, TInfo, StartLoc), 
TemplateParmPosition(D, P), ParameterPack(ParameterPack) {} NonTypeTemplateParmDecl(DeclContext *DC, SourceLocation StartLoc, - SourceLocation IdLoc, unsigned D, unsigned P, + SourceLocation IdLoc, int D, int P, const IdentifierInfo *Id, QualType T, TypeSourceInfo *TInfo, ArrayRef ExpandedTypes, @@ -1405,12 +1398,12 @@ class NonTypeTemplateParmDecl final public: static NonTypeTemplateParmDecl * Create(const ASTContext &C, DeclContext *DC, SourceLocation StartLoc, - SourceLocation IdLoc, unsigned D, unsigned P, const IdentifierInfo *Id, + SourceLocation IdLoc, int D, int P, const IdentifierInfo *Id, QualType T, bool ParameterPack, TypeSourceInfo *TInfo); static NonTypeTemplateParmDecl * Create(const ASTContext &C, DeclContext *DC, SourceLocation StartLoc, - SourceLocation IdLoc, unsigned D, unsigned P, const IdentifierInfo *Id, + SourceLocation IdLoc, int D, int P, const IdentifierInfo *Id, QualType T, TypeSourceInfo *TInfo, ArrayRef ExpandedTypes, ArrayRef ExpandedTInfos); @@ -1609,8 +1602,8 @@ class TemplateTemplateParmDecl final /// The number of parameters in an expanded parameter pack. 
unsigned NumExpandedParams = 0; - TemplateTemplateParmDecl(DeclContext *DC, SourceLocation L, unsigned D, - unsigned P, bool ParameterPack, IdentifierInfo *Id, + TemplateTemplateParmDecl(DeclContext *DC, SourceLocation L, int D, int P, + bool ParameterPack, IdentifierInfo *Id, TemplateNameKind ParameterKind, bool Typename, TemplateParameterList *Params) : TemplateDecl(TemplateTemplateParm, DC, L, Id, Params), @@ -1618,10 +1611,9 @@ class TemplateTemplateParmDecl final Typename(Typename), ParameterPack(ParameterPack), ExpandedParameterPack(false) {} - TemplateTemplateParmDecl(DeclContext *DC, SourceLocation L, unsigned D, - unsigned P, IdentifierInfo *Id, - TemplateNameKind ParameterKind, bool Typename, - TemplateParameterList *Params, + TemplateTemplateParmDecl(DeclContext *DC, SourceLocation L, int D, int P, + IdentifierInfo *Id, TemplateNameKind ParameterKind, + bool Typename, TemplateParameterList *Params, ArrayRef Expansions); void anchor() override; @@ -1632,15 +1624,14 @@ class TemplateTemplateParmDecl final friend TrailingObjects; static TemplateTemplateParmDecl * - Create(const ASTContext &C, DeclContext *DC, SourceLocation L, unsigned D, - unsigned P, bool ParameterPack, IdentifierInfo *Id, - TemplateNameKind ParameterKind, bool Typename, - TemplateParameterList *Params); + Create(const ASTContext &C, DeclContext *DC, SourceLocation L, int D, int P, + bool ParameterPack, IdentifierInfo *Id, TemplateNameKind ParameterKind, + bool Typename, TemplateParameterList *Params); static TemplateTemplateParmDecl * - Create(const ASTContext &C, DeclContext *DC, SourceLocation L, unsigned D, - unsigned P, IdentifierInfo *Id, TemplateNameKind ParameterKind, - bool Typename, TemplateParameterList *Params, + Create(const ASTContext &C, DeclContext *DC, SourceLocation L, int D, int P, + IdentifierInfo *Id, TemplateNameKind ParameterKind, bool Typename, + TemplateParameterList *Params, ArrayRef Expansions); static TemplateTemplateParmDecl *CreateDeserialized(ASTContext &C, 
diff --git a/clang/include/clang/AST/TypeBase.h b/clang/include/clang/AST/TypeBase.h index e90aa9fb09012..c64eee11fd91e 100644 --- a/clang/include/clang/AST/TypeBase.h +++ b/clang/include/clang/AST/TypeBase.h @@ -2226,6 +2226,9 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase { unsigned hasTypeDifferentFromDecl : 1; }; + static constexpr unsigned TemplateTypeParmTypeDepthBits = 15; + static constexpr unsigned TemplateTypeParmTypeIndexBits = 16; + class TemplateTypeParmTypeBitfields { friend class TemplateTypeParmType; @@ -2233,14 +2236,14 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase { unsigned : NumTypeBits; /// The depth of the template parameter. - unsigned Depth : 15; + unsigned Depth : TemplateTypeParmTypeDepthBits; /// Whether this is a template parameter pack. LLVM_PREFERRED_TYPE(bool) unsigned ParameterPack : 1; /// The index of the template parameter. - unsigned Index : 16; + unsigned Index : TemplateTypeParmTypeIndexBits; }; class SubstTemplateTypeParmTypeBitfields { @@ -7062,6 +7065,8 @@ class TemplateTypeParmType : public Type, public llvm::FoldingSetNode { (PP ? TypeDependence::UnexpandedPack : TypeDependence::None)), TTPDecl(TTPDecl) { assert(!TTPDecl == Canon.isNull()); + assert(D < (1 << TemplateTypeParmTypeDepthBits) && "Depth too large"); + assert(I < (1 << TemplateTypeParmTypeIndexBits) && "Index too large"); TemplateTypeParmTypeBits.Depth = D; TemplateTypeParmTypeBits.Index = I; TemplateTypeParmTypeBits.ParameterPack = PP; diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index a0894318dbd53..bc4771aec77d1 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -5952,9 +5952,12 @@ ASTContext::getSubstBuiltinTemplatePack(const TemplateArgument &ArgPack) { /// Retrieve the template type parameter type for a template /// parameter or parameter pack with the given depth, index, and (optionally) /// name. 
-QualType ASTContext::getTemplateTypeParmType(unsigned Depth, unsigned Index, - bool ParameterPack, - TemplateTypeParmDecl *TTPDecl) const { +QualType +ASTContext::getTemplateTypeParmType(int Depth, int Index, bool ParameterPack, + TemplateTypeParmDecl *TTPDecl) const { + assert(Depth >= 0 && "Depth must be non-negative"); + assert(Index >= 0 && "Index must be non-negative"); + llvm::FoldingSetNodeID ID; TemplateTypeParmType::Profile(ID, Depth, Index, ParameterPack, TTPDecl); void *InsertPos = nullptr; diff --git a/clang/lib/AST/DeclTemplate.cpp b/clang/lib/AST/DeclTemplate.cpp index 08e6512a1c74d..a6d7e1473cd5a 100644 --- a/clang/lib/AST/DeclTemplate.cpp +++ b/clang/lib/AST/DeclTemplate.cpp @@ -672,9 +672,8 @@ CanQualType ClassTemplateDecl::getCanonicalInjectedSpecializationType( TemplateTypeParmDecl *TemplateTypeParmDecl::Create( const ASTContext &C, DeclContext *DC, SourceLocation KeyLoc, - SourceLocation NameLoc, unsigned D, unsigned P, IdentifierInfo *Id, - bool Typename, bool ParameterPack, bool HasTypeConstraint, - UnsignedOrNone NumExpanded) { + SourceLocation NameLoc, int D, int P, IdentifierInfo *Id, bool Typename, + bool ParameterPack, bool HasTypeConstraint, UnsignedOrNone NumExpanded) { auto *TTPDecl = new (C, DC, additionalSizeToAlloc(HasTypeConstraint ? 
1 : 0)) @@ -756,8 +755,8 @@ void TemplateTypeParmDecl::setTypeConstraint( //===----------------------------------------------------------------------===// NonTypeTemplateParmDecl::NonTypeTemplateParmDecl( - DeclContext *DC, SourceLocation StartLoc, SourceLocation IdLoc, unsigned D, - unsigned P, const IdentifierInfo *Id, QualType T, TypeSourceInfo *TInfo, + DeclContext *DC, SourceLocation StartLoc, SourceLocation IdLoc, int D, + int P, const IdentifierInfo *Id, QualType T, TypeSourceInfo *TInfo, ArrayRef ExpandedTypes, ArrayRef ExpandedTInfos) : DeclaratorDecl(NonTypeTemplateParm, DC, IdLoc, Id, T, TInfo, StartLoc), TemplateParmPosition(D, P), ParameterPack(true), @@ -774,8 +773,8 @@ NonTypeTemplateParmDecl::NonTypeTemplateParmDecl( NonTypeTemplateParmDecl *NonTypeTemplateParmDecl::Create( const ASTContext &C, DeclContext *DC, SourceLocation StartLoc, - SourceLocation IdLoc, unsigned D, unsigned P, const IdentifierInfo *Id, - QualType T, bool ParameterPack, TypeSourceInfo *TInfo) { + SourceLocation IdLoc, int D, int P, const IdentifierInfo *Id, QualType T, + bool ParameterPack, TypeSourceInfo *TInfo) { AutoType *AT = C.getLangOpts().CPlusPlus20 ? 
T->getContainedAutoType() : nullptr; const bool HasConstraint = AT && AT->isConstrained(); @@ -792,8 +791,8 @@ NonTypeTemplateParmDecl *NonTypeTemplateParmDecl::Create( NonTypeTemplateParmDecl *NonTypeTemplateParmDecl::Create( const ASTContext &C, DeclContext *DC, SourceLocation StartLoc, - SourceLocation IdLoc, unsigned D, unsigned P, const IdentifierInfo *Id, - QualType T, TypeSourceInfo *TInfo, ArrayRef ExpandedTypes, + SourceLocation IdLoc, int D, int P, const IdentifierInfo *Id, QualType T, + TypeSourceInfo *TInfo, ArrayRef ExpandedTypes, ArrayRef ExpandedTInfos) { AutoType *AT = TInfo->getType()->getContainedAutoType(); const bool HasConstraint = AT && AT->isConstrained(); @@ -865,9 +864,9 @@ void NonTypeTemplateParmDecl::setDefaultArgument( void TemplateTemplateParmDecl::anchor() {} TemplateTemplateParmDecl::TemplateTemplateParmDecl( - DeclContext *DC, SourceLocation L, unsigned D, unsigned P, - IdentifierInfo *Id, TemplateNameKind Kind, bool Typename, - TemplateParameterList *Params, ArrayRef Expansions) + DeclContext *DC, SourceLocation L, int D, int P, IdentifierInfo *Id, + TemplateNameKind Kind, bool Typename, TemplateParameterList *Params, + ArrayRef Expansions) : TemplateDecl(TemplateTemplateParm, DC, L, Id, Params), TemplateParmPosition(D, P), ParameterKind(Kind), Typename(Typename), ParameterPack(true), ExpandedParameterPack(true), @@ -876,8 +875,8 @@ TemplateTemplateParmDecl::TemplateTemplateParmDecl( } TemplateTemplateParmDecl *TemplateTemplateParmDecl::Create( - const ASTContext &C, DeclContext *DC, SourceLocation L, unsigned D, - unsigned P, bool ParameterPack, IdentifierInfo *Id, TemplateNameKind Kind, + const ASTContext &C, DeclContext *DC, SourceLocation L, int D, int P, + bool ParameterPack, IdentifierInfo *Id, TemplateNameKind Kind, bool Typename, TemplateParameterList *Params) { assert(!Params->empty() && "template with no template parameters"); return new (C, DC) TemplateTemplateParmDecl(DC, L, D, P, ParameterPack, Id, @@ -886,7 +885,7 @@ 
TemplateTemplateParmDecl *TemplateTemplateParmDecl::Create( TemplateTemplateParmDecl * TemplateTemplateParmDecl::Create(const ASTContext &C, DeclContext *DC, - SourceLocation L, unsigned D, unsigned P, + SourceLocation L, int D, int P, IdentifierInfo *Id, TemplateNameKind Kind, bool Typename, TemplateParameterList *Params, ArrayRef Expansions) { diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index ac6dc5bcefb7e..c04fff6cbd964 100644 --- a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -3005,7 +3005,9 @@ ConvertDeducedTemplateArgument(Sema &S, NamedDecl *Param, Sema::InstantiatingTemplate Inst(S, Template->getLocation(), Template, TTP, CTAI.SugaredConverted, Template->getSourceRange()); - if (Inst.isInvalid() || !S.SubstDecl(TTP, S.CurContext, Args)) + if (Inst.isInvalid() || + !S.SubstTemplateParams(TTP->getTemplateParameters(), S.CurContext, + Args)) return true; } // For type parameters, no substitution is ever required. diff --git a/clang/test/SemaTemplate/concepts-lambda.cpp b/clang/test/SemaTemplate/concepts-lambda.cpp index a583589340bd0..a08cd52843c73 100644 --- a/clang/test/SemaTemplate/concepts-lambda.cpp +++ b/clang/test/SemaTemplate/concepts-lambda.cpp @@ -158,18 +158,21 @@ static_assert(E); // previously Asserted. // ensure we properly diagnose when "D" is false. 
namespace DIsFalse { -template concept C = requires { Q.template operator()(); }; +template concept C = requires { Q.template operator()(); }; // #GH60642-C template concept D = false; +// FIXME: Crashes because it produces a template type parameter with invalid depth +#if 0 static_assert(C<[]{}>); // expected-error@-1{{static assertion failed}} // expected-note@-2{{does not satisfy 'C'}} // expected-note@-5{{because 'Q.template operator()()' would be invalid: no matching member function for call to 'operator()'}} +#endif template concept E = C<[]{}>; static_assert(E); // expected-error@-1{{static assertion failed}} // expected-note@-2{{because 'int' does not satisfy 'E'}} // expected-note@-4{{does not satisfy 'C'}} -// expected-note@-11{{because 'Q.template operator()()' would be invalid: no matching member function for call to 'operator()'}} +// expected-note@#GH60642-C{{because 'Q.template operator()()' would be invalid: no matching member function for call to 'operator()'}} } } diff --git a/clang/test/SemaTemplate/concepts.cpp b/clang/test/SemaTemplate/concepts.cpp index 72a2fab99c581..6ac934ee2a629 100644 --- a/clang/test/SemaTemplate/concepts.cpp +++ b/clang/test/SemaTemplate/concepts.cpp @@ -1551,10 +1551,12 @@ template() {}> T> void x() {} +// FIXME: Crashes because it produces a template type parameter with invalid depth +#if 0 void foo() { x>(); } - +#endif } namespace GH162770 { From 7bfb4d90fffc0188780bbf07c863a4b90cd7bd3b Mon Sep 17 00:00:00 2001 From: Mel Chen Date: Thu, 14 May 2026 23:12:54 +0800 Subject: [PATCH 95/95] [LV] Add store to test case to prevent dead code. 
nfc (#197703) --- .../LoopVectorize/pr39417-optsize-scevchecks.ll | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll b/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll index 7d4c1d35ffc9b..612f4e1499de7 100644 --- a/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll +++ b/llvm/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll @@ -6,14 +6,17 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; PR39417 ; Check that the need for overflow check prevents vectorizing a loop with tiny ; trip count (which implies opt for size). -define void @func_34() { -; CHECK-LABEL: define void @func_34() { +define void @func_34(ptr %dst) { +; CHECK-LABEL: define void @func_34( +; CHECK-SAME: ptr [[DST:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: br label %[[LOOP:.*]] ; CHECK: [[LOOP]]: ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] ; CHECK-NEXT: [[SEXT:%.*]] = shl i32 [[IV]], 16 ; CHECK-NEXT: [[STEP:%.*]] = ashr exact i32 [[SEXT]], 16 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[IV]] +; CHECK-NEXT: store i32 [[STEP]], ptr [[GEP]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nsw i32 [[STEP]], 1 ; CHECK-NEXT: [[IV_NEXT_TRUNC:%.*]] = trunc i32 [[IV_NEXT]] to i16 ; CHECK-NEXT: [[EC:%.*]] = icmp slt i16 [[IV_NEXT_TRUNC]], 3 @@ -28,6 +31,8 @@ loop: %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ] %sext = shl i32 %iv, 16 %step = ashr exact i32 %sext, 16 + %gep = getelementptr inbounds i32, ptr %dst, i32 %iv + store i32 %step, ptr %gep, align 4 %iv.next = add nsw i32 %step, 1 %iv.next.trunc = trunc i32 %iv.next to i16 %ec = icmp slt i16 %iv.next.trunc, 3